Skip to content

Feature/unicode only #39

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions mcm/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.pyc
.DS_Store
27 changes: 2 additions & 25 deletions mcm/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,10 @@ class CSVParser(object):
reader.seek_to_beginning()
# rows.next() will return the first row
"""
# Character escape sequences to replace
CLEAN_SUPER = [u'\ufffd', u'\xb2']

def __init__(self, csvfile, *args, **kwargs):
self.csvfile = csvfile
self.csvreader = self._get_csv_reader(csvfile, **kwargs)
self.clean_super_scripts()

def _get_csv_reader(self, *args, **kwargs):
"""Guess CSV dialect, and return CSV reader."""
Expand All @@ -192,27 +189,6 @@ def _get_csv_reader(self, *args, **kwargs):
del kwargs['reader_type']
return reader_type(self.csvfile, dialect, **kwargs)

def _clean_super(self, col, replace=u'2'):
"""Cleans up various superscript unicode escapes.

:param col: str, column name as read from the file.
:param replace: (optional) str, string to replace superscripts with.
:rtype: str, cleaned row name.

"""
for item in self.CLEAN_SUPER:
col = col.replace(item, unicode(replace))

return col

def clean_super_scripts(self):
"""Replaces column names with clean ones."""
new_fields = []
for col in self.csvreader.unicode_fieldnames:
new_fields.append(self._clean_super(col))

self.csvreader.unicode_fieldnames = new_fields

def next(self):
"""Wouldn't it be nice to get iterables from csvreader?"""
while 1:
Expand All @@ -233,7 +209,7 @@ def num_columns(self):

def headers(self):
"""original ordered list of spreadsheet headers"""
return self.csvreader.fieldnames
return self.csvreader.unicode_fieldnames


class MCMParser(object):
Expand All @@ -257,6 +233,7 @@ class MCMParser(object):
# rows.next() will return the first row

"""

def __init__(self, import_file, *args, **kwargs):
self.reader = self._get_reader(import_file)
self.import_file = import_file
Expand Down
Binary file modified mcm/tests/test_data/test_espm.xls
Binary file not shown.
Binary file modified mcm/tests/test_data/test_espm.xlsx
Binary file not shown.
Binary file modified mcm/tests/test_data/test_espm_blank_rows.xls
Binary file not shown.
76 changes: 49 additions & 27 deletions mcm/tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import unicodecsv

from mcm import reader
from mcm.tests import utils


class TestCSVParser(TestCase):
Expand All @@ -24,32 +23,6 @@ def test_get_csv_reader(self):
isinstance(self.parser.csvreader, unicodecsv.DictReader)
)

def test_clean_super(self):
"""Make sure we clean out unicode escaped super scripts."""
expected = u'Testing 2. And 2.'
test = u'Testing \xb2. And \ufffd.'
self.assertEqual(
self.parser._clean_super(test),
expected
)

# Test that our replace keyword works
new_expected = expected.replace('2', '3')
self.assertEqual(
self.parser._clean_super(test, replace=u'3'),
new_expected
)

def test_clean_super_scripts(self):
"""Call _clean_super on all fieldnames."""
escape = u'\xb2'
# We know we have one of these escapes in our columns...

# self.parser.clean_super_scripts() is run by __init__ now
self.assertFalse(utils.list_has_substring(
escape, self.parser.csvreader.unicode_fieldnames
))


class TestMCMParserCSV(TestCase):
def setUp(self):
Expand Down Expand Up @@ -205,3 +178,52 @@ def test_headers(self):
self.parser.headers()[-1],
'Release Date'
)


class TestMCMParserConsistency(TestCase):
    """Check that the CSV, XLS and XLSX test fixtures parse consistently."""

    def _open_fixture(self, path):
        """Open ``path`` in binary mode and close it when the test finishes.

        :param path: str, location of the fixture file.
        :rtype: file object opened with mode ``'rb'``.

        """
        handle = open(path, 'rb')
        # tearDown is skipped when setUp raises; cleanups registered here
        # still run, so a failure part-way through setUp cannot leak handles.
        self.addCleanup(handle.close)
        return handle

    def setUp(self):
        """Open the three fixture files and build one parser per format."""
        self.csv_f = self._open_fixture('test_data/test_espm.csv')
        self.xls_f = self._open_fixture('test_data/test_espm.xls')
        self.xlsx_f = self._open_fixture('test_data/test_espm.xlsx')

        self.parser_csv = reader.MCMParser(self.csv_f)
        self.parser_xls = reader.MCMParser(self.xls_f)
        self.parser_xlsx = reader.MCMParser(self.xlsx_f)
        self.total_callbacks = 0

    def my_callback(self, rows):
        """Count invocations; ``rows`` is accepted but not inspected."""
        self.total_callbacks += 1

    def test_header_consistency(self):
        """Assert that headers are equivalent regardless of file format."""
        csv_headers = self.parser_csv.headers()
        xls_headers = self.parser_xls.headers()
        xlsx_headers = self.parser_xlsx.headers()

        # Compare every pair so a failure names the two formats that differ.
        self.assertEqual(csv_headers, xls_headers)
        self.assertEqual(xls_headers, xlsx_headers)
        self.assertEqual(csv_headers, xlsx_headers)

    def test_header_unicode(self):
        """Assert that the superscript-two character survives in CSV headers."""
        headers = self.parser_csv.headers()

        # NOTE(review): index 7 is presumably the fixture column whose name
        # contains the superscript -- confirm against test_espm.csv.
        self.assertIn(u'\xb2', headers[7])