diff --git a/mcm/.gitignore b/mcm/.gitignore new file mode 100644 index 0000000..0205d62 --- /dev/null +++ b/mcm/.gitignore @@ -0,0 +1,2 @@ +*.pyc +.DS_Store diff --git a/mcm/reader.py b/mcm/reader.py index 8c69f91..26ea643 100644 --- a/mcm/reader.py +++ b/mcm/reader.py @@ -165,13 +165,10 @@ class CSVParser(object): reader.seek_to_beginning() # rows.next() will return the first row """ - # Character escape sequences to replace - CLEAN_SUPER = [u'\ufffd', u'\xb2'] def __init__(self, csvfile, *args, **kwargs): self.csvfile = csvfile self.csvreader = self._get_csv_reader(csvfile, **kwargs) - self.clean_super_scripts() def _get_csv_reader(self, *args, **kwargs): """Guess CSV dialect, and return CSV reader.""" @@ -192,27 +189,6 @@ def _get_csv_reader(self, *args, **kwargs): del kwargs['reader_type'] return reader_type(self.csvfile, dialect, **kwargs) - def _clean_super(self, col, replace=u'2'): - """Cleans up various superscript unicode escapes. - - :param col: str, column name as read from the file. - :param replace: (optional) str, string to replace superscripts with. - :rtype: str, cleaned row name. 
- - """ - for item in self.CLEAN_SUPER: - col = col.replace(item, unicode(replace)) - - return col - - def clean_super_scripts(self): - """Replaces column names with clean ones.""" - new_fields = [] - for col in self.csvreader.unicode_fieldnames: - new_fields.append(self._clean_super(col)) - - self.csvreader.unicode_fieldnames = new_fields - def next(self): """Wouldn't it be nice to get iterables form csvreader?""" while 1: @@ -233,7 +209,7 @@ def num_columns(self): def headers(self): """original ordered list of spreadsheet headers""" - return self.csvreader.fieldnames + return self.csvreader.unicode_fieldnames class MCMParser(object): @@ -257,6 +233,7 @@ class MCMParser(object): # rows.next() will return the first row """ + def __init__(self, import_file, *args, **kwargs): self.reader = self._get_reader(import_file) self.import_file = import_file diff --git a/mcm/tests/test_data/test_espm.xls b/mcm/tests/test_data/test_espm.xls index 2e46ba2..b76ac58 100644 Binary files a/mcm/tests/test_data/test_espm.xls and b/mcm/tests/test_data/test_espm.xls differ diff --git a/mcm/tests/test_data/test_espm.xlsx b/mcm/tests/test_data/test_espm.xlsx index 6b43a4f..ea74faf 100644 Binary files a/mcm/tests/test_data/test_espm.xlsx and b/mcm/tests/test_data/test_espm.xlsx differ diff --git a/mcm/tests/test_data/test_espm_blank_rows.xls b/mcm/tests/test_data/test_espm_blank_rows.xls index 49c1714..931e2dc 100644 Binary files a/mcm/tests/test_data/test_espm_blank_rows.xls and b/mcm/tests/test_data/test_espm_blank_rows.xls differ diff --git a/mcm/tests/test_reader.py b/mcm/tests/test_reader.py index 96931e9..1159168 100644 --- a/mcm/tests/test_reader.py +++ b/mcm/tests/test_reader.py @@ -7,7 +7,6 @@ import unicodecsv from mcm import reader -from mcm.tests import utils class TestCSVParser(TestCase): @@ -24,32 +23,6 @@ def test_get_csv_reader(self): isinstance(self.parser.csvreader, unicodecsv.DictReader) ) - def test_clean_super(self): - """Make sure we clean out unicode escaped super 
scripts.""" - expected = u'Testing 2. And 2.' - test = u'Testing \xb2. And \ufffd.' - self.assertEqual( - self.parser._clean_super(test), - expected - ) - - # Test that our replace keyword works - new_expected = expected.replace('2', '3') - self.assertEqual( - self.parser._clean_super(test, replace=u'3'), - new_expected - ) - - def test_clean_super_scripts(self): - """Call _clean_super on all fieldnames.""" - escape = u'\xb2' - # We know we have one of these escapes in our columns... - - # self.parser.clean_super_scripts() is run by __init__ now - self.assertFalse(utils.list_has_substring( - escape, self.parser.csvreader.unicode_fieldnames - )) - class TestMCMParserCSV(TestCase): def setUp(self): @@ -205,3 +178,52 @@ def test_headers(self): self.parser.headers()[-1], 'Release Date' ) + + +class TestMCMParserConsistency(TestCase): + def setUp(self): + + self.csv_f = open('test_data/test_espm.csv', 'rb') + self.xls_f = open('test_data/test_espm.xls', 'rb') + self.xlsx_f = open('test_data/test_espm.xlsx', 'rb') + + self.parser_csv = reader.MCMParser(self.csv_f) + self.parser_xls = reader.MCMParser(self.xls_f) + self.parser_xlsx = reader.MCMParser(self.xlsx_f) + self.total_callbacks = 0 + + def my_callback(self, rows): + self.total_callbacks += 1 + + def tearDown(self): + self.csv_f.close() + self.xls_f.close() + self.xlsx_f.close() + + def test_header_consistency(self): + """ Assert that headers are equivalent regardless of file format """ + header_sets = ( + self.parser_csv.headers(), + self.parser_xls.headers(), + self.parser_xlsx.headers(), + ) + + self.assertEqual( + header_sets[0], + header_sets[1], + ) + self.assertEqual( + header_sets[1], + header_sets[2], + ) + self.assertEqual( + header_sets[0], + header_sets[2], + ) + + def test_header_unicode(self): + """ Assert that the CSV headers keep their unicode superscript characters """ + headers = self.parser_csv.headers() + + # assert that the superscript shows up + self.assertIn(u'\xb2', headers[7])