From c8748b2389657f19bbff31228388ee1fc1c334a1 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Peeters Date: Thu, 26 Jun 2025 11:25:59 +0200 Subject: [PATCH 1/4] Migrate tests to pytest --- pyexcel_xlsxr/__init__.py | 11 +- pyexcel_xlsxr/_version.py | 4 +- pyexcel_xlsxr/messy_xlsx.py | 4 +- requirements-dev.txt | 4 + test.bat | 2 +- test.sh | 2 +- tests/test_bug_fixes.py | 221 ++++++++++++++++++------------------ tests/test_filter.py | 17 ++- tests/test_formatters.py | 19 ++-- tests/test_messy_xlsx.py | 33 +++--- tests/test_reading.py | 4 +- tests/test_stringio.py | 6 +- 12 files changed, 158 insertions(+), 169 deletions(-) create mode 100644 requirements-dev.txt diff --git a/pyexcel_xlsxr/__init__.py b/pyexcel_xlsxr/__init__.py index ecd1f09..1ddd3c8 100644 --- a/pyexcel_xlsxr/__init__.py +++ b/pyexcel_xlsxr/__init__.py @@ -1,10 +1,11 @@ """ - pyexcel_xlsxr - ~~~~~~~~~~~~~~~~~~~ - The lower level xlsx file format handler using lxml - :copyright: (c) 2015-2020 by Onni Software Ltd & its contributors - :license: New BSD License +pyexcel_xlsxr +~~~~~~~~~~~~~~~~~~~ +The lower level xlsx file format handler using lxml +:copyright: (c) 2015-2020 by Onni Software Ltd & its contributors +:license: New BSD License """ + from pyexcel_io.io import get_data as read_data from pyexcel_io.io import isstream from pyexcel_io.plugins import IOPluginInfoChainV2 diff --git a/pyexcel_xlsxr/_version.py b/pyexcel_xlsxr/_version.py index 91ed185..5bf313f 100644 --- a/pyexcel_xlsxr/_version.py +++ b/pyexcel_xlsxr/_version.py @@ -1,2 +1,2 @@ -__version__ = '0.6.1' -__author__ = 'C.W.' +__version__ = "0.6.1" +__author__ = "C.W." diff --git a/pyexcel_xlsxr/messy_xlsx.py b/pyexcel_xlsxr/messy_xlsx.py index 22193be..77efc66 100644 --- a/pyexcel_xlsxr/messy_xlsx.py +++ b/pyexcel_xlsxr/messy_xlsx.py @@ -338,10 +338,10 @@ def parse_book_properties(book_content): ) namespaces = {"r": ns} - xlsx_header = u"".format( + xlsx_header = "".format( " ".join('xmlns:{0}="{1}"'.format(k, v) for k, v in namespaces.items()) ).encode("utf-8") - xlsx_footer = u"".encode("utf-8") + xlsx_footer = "".encode("utf-8") sheets = SHEET_FMT_MATCHER.findall(book_content) for sheet in sheets: block = xlsx_header + sheet + xlsx_footer diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..a640608 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,4 @@ +black~=25.1 +coverage~=7.9 +pytest~=8.4 +pyexcel~=0.7.3 diff --git a/test.bat b/test.bat index d9bf7b2..ba61639 100644 --- a/test.bat +++ b/test.bat @@ -1,2 +1,2 @@ pip freeze -nosetests --with-coverage --cover-package pyexcel_xlsxr --cover-package tests tests --with-doctest --doctest-extension=.rst README.rst pyexcel_xlsxr +coverage run -m --source=pyexcel_xlsxr pytest && coverage report --show-missing diff --git a/test.sh b/test.sh index 9c34733..b926de3 100644 --- a/test.sh +++ b/test.sh @@ -1,3 +1,3 @@ #/bin/bash pip freeze -nosetests --with-coverage --cover-package pyexcel_xlsxr --cover-package tests tests --with-doctest --doctest-extension=.rst README.rst pyexcel_xlsxr +coverage run -m --source=pyexcel_xlsxr pytest && coverage report --show-missing diff --git a/tests/test_bug_fixes.py b/tests/test_bug_fixes.py index 3448b2b..2e43b27 100644 --- a/tests/test_bug_fixes.py +++ b/tests/test_bug_fixes.py @@ -4,123 +4,118 @@ from pyexcel_xlsxr import get_data from pyexcel_io.reader import EncapsulatedSheetReader -from nose.tools import eq_ - def test_issue_1(): test_file = get_fixture("issue_1.xlsx") data = get_data(test_file) data_array = [list(map(str, row)) for row in data["dataSheet1"]] - eq_( - data_array, - [ - ["", "D0"], - ["Pads", "PADA"], - ["Timestamp", "13:26:26.375087"], - ["I", "V"], - ["0.0", "0.7830809999999999"], - ["1.0", "1.11145"], - ["2.0", "1.176147"], - ["3.0", "1.222229"], - ["4.0", "1.25946"], - ["5.0", "1.293334"], - ["6.0", "1.323852"], - ["7.0", "1.351623"], - ["8.0", "1.3778679999999999"], - ["9.0", "1.402893"], - ["10.0", "1.427001"], - ["11.0", "1.449279"], - ["12.0", "1.471252"], - ["13.0", "1.4923089999999999"], - ["14.0", "1.512451"], - ["15.0", "1.531982"], - ["16.0", "1.551513"], - ["17.0", "1.5701289999999999"], - ["18.0", "1.588134"], - ["19.0", "1.606445"], - ["20.0", "1.623535"], - ["21.0", "1.64093"], - ["22.0", "1.657714"], - ["23.0", "1.674804"], - ["24.0", "1.6906729999999999"], - ["25.0", "1.707153"], - ["26.0", "1.7233269999999998"], - ["27.0", "1.738586"], - ["28.0", "1.7544549999999999"], - ["29.0", "1.769104"], - ["30.0", "1.784667"], - ["31.0", "1.799316"], - ["32.0", "1.8148799999999998"], - ["33.0", "1.8286129999999998"], - ["34.0", "1.8432609999999998"], - ["35.0", "1.85791"], - ["36.0", "1.871948"], - ["37.0", "1.885986"], - ["38.0", "1.900329"], - ["39.0", "1.913452"], - ["40.0", "1.92749"], - ["41.0", "1.941223"], - ["42.0", "1.954345"], - ["43.0", "1.967773"], - ["44.0", "1.9808949999999999"], - ["45.0", "1.9940179999999998"], - ["46.0", "2.007446"], - ["47.0", "2.019958"], - ["48.0", "2.03247"], - ["49.0", "2.0455929999999998"], - ["50.0", "2.05841"], - ["51.0", "2.071228"], - ["52.0", "2.083129"], - ["53.0", "2.095336"], - ["54.0", "2.1072379999999997"], - ["55.0", "2.120056"], - ["56.0", "2.131652"], - ["57.0", "2.143859"], - ["58.0", "2.156066"], - ["59.0", "2.167663"], - ["60.0", "2.1795649999999998"], - ["61.0", "2.191162"], - ["62.0", "2.2021479999999998"], - ["63.0", "2.214355"], - ["64.0", "2.225646"], - ["65.0", "2.236633"], - ["66.0", "2.247009"], - ["67.0", "2.258911"], - ["68.0", "2.269897"], - ["69.0", "2.2808829999999998"], - ["70.0", "2.2915639999999997"], - ["71.0", "2.302246"], - ["72.0", "2.3138419999999997"], - ["73.0", "2.3245229999999997"], - ["74.0", "2.334899"], - ["75.0", "2.3455809999999997"], - ["76.0", "2.356262"], - ["77.0", "2.366333"], - ["78.0", "2.376708"], - ["79.0", "2.3864739999999998"], - ["80.0", "2.3971549999999997"], - ["81.0", "2.407531"], - ["82.0", "2.417602"], - ["83.0", "2.427673"], - ["84.0", "2.438354"], - ["85.0", "2.4472039999999997"], - ["86.0", "2.457885"], - ["87.0", "2.467956"], - ["88.0", "2.477722"], - ["89.0", "2.487487"], - ["90.0", "2.4978629999999997"], - ["91.0", "2.506408"], - ["92.0", "2.515869"], - ["93.0", "2.5256339999999997"], - ["94.0", "2.535095"], - ["95.0", "2.54425"], - ["96.0", "2.5537099999999997"], - ["97.0", "2.562866"], - ["98.0", "2.572021"], - ["99.0", "2.5805659999999997"], - ["100.0", "2.589721"], - ], - ) + assert data_array == [ + ["", "D0"], + ["Pads", "PADA"], + ["Timestamp", "13:26:26.375087"], + ["I", "V"], + ["0.0", "0.7830809999999999"], + ["1.0", "1.11145"], + ["2.0", "1.176147"], + ["3.0", "1.222229"], + ["4.0", "1.25946"], + ["5.0", "1.293334"], + ["6.0", "1.323852"], + ["7.0", "1.351623"], + ["8.0", "1.3778679999999999"], + ["9.0", "1.402893"], + ["10.0", "1.427001"], + ["11.0", "1.449279"], + ["12.0", "1.471252"], + ["13.0", "1.4923089999999999"], + ["14.0", "1.512451"], + ["15.0", "1.531982"], + ["16.0", "1.551513"], + ["17.0", "1.5701289999999999"], + ["18.0", "1.588134"], + ["19.0", "1.606445"], + ["20.0", "1.623535"], + ["21.0", "1.64093"], + ["22.0", "1.657714"], + ["23.0", "1.674804"], + ["24.0", "1.6906729999999999"], + ["25.0", "1.707153"], + ["26.0", "1.7233269999999998"], + ["27.0", "1.738586"], + ["28.0", "1.7544549999999999"], + ["29.0", "1.769104"], + ["30.0", "1.784667"], + ["31.0", "1.799316"], + ["32.0", "1.8148799999999998"], + ["33.0", "1.8286129999999998"], + ["34.0", "1.8432609999999998"], + ["35.0", "1.85791"], + ["36.0", "1.871948"], + ["37.0", "1.885986"], + ["38.0", "1.900329"], + ["39.0", "1.913452"], + ["40.0", "1.92749"], + ["41.0", "1.941223"], + ["42.0", "1.954345"], + ["43.0", "1.967773"], + ["44.0", "1.9808949999999999"], + ["45.0", "1.9940179999999998"], + ["46.0", "2.007446"], + ["47.0", "2.019958"], + ["48.0", "2.03247"], + ["49.0", "2.0455929999999998"], + ["50.0", "2.05841"], + ["51.0", "2.071228"], + ["52.0", "2.083129"], + ["53.0", "2.095336"], + ["54.0", "2.1072379999999997"], + ["55.0", "2.120056"], + ["56.0", "2.131652"], + ["57.0", "2.143859"], + ["58.0", "2.156066"], + ["59.0", "2.167663"], + ["60.0", "2.1795649999999998"], + ["61.0", "2.191162"], + ["62.0", "2.2021479999999998"], + ["63.0", "2.214355"], + ["64.0", "2.225646"], + ["65.0", "2.236633"], + ["66.0", "2.247009"], + ["67.0", "2.258911"], + ["68.0", "2.269897"], + ["69.0", "2.2808829999999998"], + ["70.0", "2.2915639999999997"], + ["71.0", "2.302246"], + ["72.0", "2.3138419999999997"], + ["73.0", "2.3245229999999997"], + ["74.0", "2.334899"], + ["75.0", "2.3455809999999997"], + ["76.0", "2.356262"], + ["77.0", "2.366333"], + ["78.0", "2.376708"], + ["79.0", "2.3864739999999998"], + ["80.0", "2.3971549999999997"], + ["81.0", "2.407531"], + ["82.0", "2.417602"], + ["83.0", "2.427673"], + ["84.0", "2.438354"], + ["85.0", "2.4472039999999997"], + ["86.0", "2.457885"], + ["87.0", "2.467956"], + ["88.0", "2.477722"], + ["89.0", "2.487487"], + ["90.0", "2.4978629999999997"], + ["91.0", "2.506408"], + ["92.0", "2.515869"], + ["93.0", "2.5256339999999997"], + ["94.0", "2.535095"], + ["95.0", "2.54425"], + ["96.0", "2.5537099999999997"], + ["97.0", "2.562866"], + ["98.0", "2.572021"], + ["99.0", "2.5805659999999997"], + ["100.0", "2.589721"], + ] def test_issue_5(): @@ -131,7 +126,7 @@ def test_issue_5(): sheet = EncapsulatedSheetReader(XLSXSheet(native_sheet)) data = sheet.to_array() - eq_(list(data), [[None, 11, 11]]) + assert list(data) == [[None, 11, 11]] def get_fixture(file_name): diff --git a/tests/test_filter.py b/tests/test_filter.py index d5c0f95..55f8e6a 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -1,11 +1,10 @@ import os +import unittest from pyexcel_io import get_data, save_data -from nose.tools import eq_ - -class TestFilter: +class TestFilter(unittest.TestCase): def setUp(self): self.test_file = "test_filter.xlsx" sample = [ @@ -24,21 +23,21 @@ def test_filter_row(self): self.test_file, start_row=3, library="pyexcel-xlsxr" ) expected = [[4, 24, 34], [5, 25, 35], [6, 26, 36]] - eq_(filtered_data[self.sheet_name], expected) + assert filtered_data[self.sheet_name] == expected def test_filter_row_2(self): filtered_data = get_data( self.test_file, start_row=3, row_limit=1, library="pyexcel-xlsxr" ) expected = [[4, 24, 34]] - eq_(filtered_data[self.sheet_name], expected) + assert filtered_data[self.sheet_name] == expected def test_filter_column(self): filtered_data = get_data( self.test_file, start_column=1, library="pyexcel-xlsxr" ) expected = [[21, 31], [22, 32], [23, 33], [24, 34], [25, 35], [26, 36]] - eq_(filtered_data[self.sheet_name], expected) + assert filtered_data[self.sheet_name] == expected def test_filter_column_2(self): filtered_data = get_data( @@ -48,7 +47,7 @@ def test_filter_column_2(self): library="pyexcel-xlsxr", ) expected = [[21], [22], [23], [24], [25], [26]] - eq_(filtered_data[self.sheet_name], expected) + assert filtered_data[self.sheet_name] == expected def test_filter_both_ways(self): filtered_data = get_data( @@ -58,7 +57,7 @@ def test_filter_both_ways(self): library="pyexcel-xlsxr", ) expected = [[24, 34], [25, 35], [26, 36]] - eq_(filtered_data[self.sheet_name], expected) + assert filtered_data[self.sheet_name] == expected def test_filter_both_ways_2(self): filtered_data = get_data( @@ -70,7 +69,7 @@ def test_filter_both_ways_2(self): library="pyexcel-xlsxr", ) expected = [[24]] - eq_(filtered_data[self.sheet_name], expected) + assert filtered_data[self.sheet_name] == expected def tearDown(self): os.unlink(self.test_file) diff --git a/tests/test_formatters.py b/tests/test_formatters.py index 6ff84e6..a9e39b7 100644 --- a/tests/test_formatters.py +++ b/tests/test_formatters.py @@ -1,10 +1,9 @@ import os +import unittest from textwrap import dedent import pyexcel as pe -from nose.tools import eq_ - class TestDateFormat: def test_reading_date_format(self): @@ -22,15 +21,15 @@ def test_reading_date_format(self): library="pyexcel-xlsxr", ) assert isinstance(r[1, 0], datetime.date) - eq_(r[1, 0].strftime("%d/%m/%y"), "25/12/14") + assert r[1, 0].strftime("%d/%m/%y") == "25/12/14" assert isinstance(r[1, 1], datetime.time) is True assert r[1, 1].strftime("%H:%M:%S") == "11:11:11" value = r[4, 0].isoformat() - eq_(value, "1899-12-30T00:00:00") - eq_(r[4, 1].isoformat(), "00:00:00") + assert value == "1899-12-30T00:00:00" + assert r[4, 1].isoformat() == "00:00:00" -class TestAutoDetectInt: +class TestAutoDetectInt(unittest.TestCase): def setUp(self): self.content = [[1, 2, 3.1]] self.test_file = "test_auto_detect_init.xlsx" @@ -45,7 +44,7 @@ def test_auto_detect_int(self): | 1 | 2 | 3.1 | +---+---+-----+""" ).strip() - eq_(str(sheet), expected) + assert str(sheet) == expected def test_get_book_auto_detect_int(self): book = pe.get_book(file_name=self.test_file, library="pyexcel-xlsxr") @@ -56,7 +55,7 @@ def test_get_book_auto_detect_int(self): | 1 | 2 | 3.1 | +---+---+-----+""" ).strip() - eq_(str(book), expected) + assert str(book) == expected def test_auto_detect_int_false(self): sheet = pe.get_sheet( @@ -71,7 +70,7 @@ def test_auto_detect_int_false(self): | 1.0 | 2.0 | 3.1 | +-----+-----+-----+""" ).strip() - eq_(str(sheet), expected) + assert str(sheet) == expected def test_get_book_auto_detect_int_false(self): book = pe.get_book( @@ -86,7 +85,7 @@ def test_get_book_auto_detect_int_false(self): | 1.0 | 2.0 | 3.1 | +-----+-----+-----+""" ).strip() - eq_(str(book), expected) + assert str(book) == expected def tearDown(self): os.unlink(self.test_file) diff --git a/tests/test_messy_xlsx.py b/tests/test_messy_xlsx.py index e4f31fe..f6f5c7c 100644 --- a/tests/test_messy_xlsx.py +++ b/tests/test_messy_xlsx.py @@ -10,8 +10,6 @@ parse_book_properties, ) -from nose.tools import eq_ - def test_get_sheet_index(): samples = [ @@ -22,7 +20,7 @@ def test_get_sheet_index(): ] expected = [0, 1, 0, 0] actual = [get_sheet_index(file_name) for file_name in samples] - eq_(actual, expected) + assert actual == expected def test_list_one(): @@ -47,7 +45,7 @@ def test_list_one(): ] sheet_files = find_sheets(test_sample) - eq_(sheet_files, expected) + assert sheet_files == expected def test_alternative_file_list(): @@ -72,7 +70,7 @@ def test_alternative_file_list(): ] sheet_files = find_sheets(test_sample) - eq_(sheet_files, expected) + assert sheet_files == expected def test_single_sheet(): @@ -91,7 +89,7 @@ def test_single_sheet(): expected = ["xl/worksheets/sheet.xml"] sheet_files = find_sheets(test_sample) - eq_(sheet_files, expected) + assert sheet_files == expected def test_alternative_single_sheet(): @@ -110,7 +108,7 @@ def test_alternative_single_sheet(): expected = ["xl/worksheets/worksheet.xml"] sheet_files = find_sheets(test_sample) - eq_(sheet_files, expected) + assert sheet_files == expected def test_parse_row(): @@ -131,13 +129,10 @@ def __init__(self): self.properties = {"date1904": False} data = parse_row(xml_string, Book()) - eq_( - [cell for cell in data], - [ - datetime(year=2015, month=1, day=1), - time(hour=13, minute=13, second=13), - ], - ) + assert [cell for cell in data] == [ + datetime(year=2015, month=1, day=1), + time(hour=13, minute=13, second=13), + ] def test_parse_styles(): @@ -152,7 +147,7 @@ def test_parse_styles(): b"\n", b" " ) styles = parse_styles(sample) - eq_(list(styles.values()), ["general", "dd/mm/yy", "h:mm:ss;@"]) + assert list(styles.values()) == ["general", "dd/mm/yy", "h:mm:ss;@"] def test_parse_properties(): @@ -166,7 +161,7 @@ def test_parse_properties(): b"\n", b" " ) properties = parse_book_properties(sample) - eq_(properties, {"date1904": False, "sheets": []}) + assert properties == {"date1904": False, "sheets": []} def test_parse_sheet_properties(): @@ -181,7 +176,7 @@ def test_parse_sheet_properties(): b"\n", b" " ) properties = parse_book_properties(sample) - eq_(properties, {"sheets": ["Sheet1", "Sheet2", "Sheet3"]}) + assert properties == {"sheets": ["Sheet1", "Sheet2", "Sheet3"]} def test_parse_xfs_styles(): @@ -206,7 +201,7 @@ def test_parse_xfs_styles(): b"\n", b" " ) xfs_styles = parse_xfs_styles(sample) - eq_(xfs_styles, [164, 165, 166]) + assert xfs_styles == [164, 165, 166] def test_parse_shared_strings(): @@ -217,4 +212,4 @@ def test_parse_shared_strings(): b"\n", b" " ) content = parse_shared_strings(sample) - eq_(list(content), ["Date", "Time"]) + assert list(content) == ["Date", "Time"] diff --git a/tests/test_reading.py b/tests/test_reading.py index e80ccca..a8fcfe9 100644 --- a/tests/test_reading.py +++ b/tests/test_reading.py @@ -4,8 +4,6 @@ from pyexcel_xlsxr import get_data from pyexcel_io._compact import OrderedDict -from nose.tools import eq_ - def test_reading(): data = get_data( @@ -38,4 +36,4 @@ def test_reading(): ) expected.update({"Sheet2": []}) expected.update({"Sheet3": []}) - eq_(data, expected) + assert data == expected diff --git a/tests/test_stringio.py b/tests/test_stringio.py index a93805b..babb013 100644 --- a/tests/test_stringio.py +++ b/tests/test_stringio.py @@ -3,8 +3,6 @@ import pyexcel from base import create_sample_file1 -from nose.tools import eq_ - class TestStringIO: def test_xlsx_stringio(self): @@ -17,7 +15,7 @@ def test_xlsx_stringio(self): ) result = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", 1.1, 1] actual = list(r.enumerate()) - eq_(result, actual) + assert result == actual if os.path.exists(testfile): os.unlink(testfile) @@ -31,4 +29,4 @@ def test_xlsx_output_stringio(self): ) result = [1, 2, 3, 4, 5, 6] actual = list(r.enumerate()) - eq_(result, actual) + assert result == actual From 6fc737e5aa0de7059d7384b2ce533e872195d361 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Peeters Date: Thu, 26 Jun 2025 10:40:40 +0200 Subject: [PATCH 2/4] Fix row regex to prevent catastrophic backtracking --- pyexcel_xlsxr/messy_xlsx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyexcel_xlsxr/messy_xlsx.py b/pyexcel_xlsxr/messy_xlsx.py index 77efc66..dbc0f82 100644 --- a/pyexcel_xlsxr/messy_xlsx.py +++ b/pyexcel_xlsxr/messy_xlsx.py @@ -11,7 +11,7 @@ WORK_BOOK = "xl/workbook.xml" SHEET_MATCHER = "xl/worksheets/(work)?sheet([0-9]+)?.xml" SHEET_INDEX_MATCHER = "xl/worksheets/(work)?sheet(([0-9]+)?).xml" -XLSX_ROW_MATCH = re.compile(rb".*?().*?", re.MULTILINE) +XLSX_ROW_MATCH = re.compile(rb"]*>.*?", re.DOTALL) NUMBER_FMT_MATCHER = re.compile( rb".*?().*?", re.MULTILINE ) From c0b2f5c771a6c88e384a1919040cba4fff308c80 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Peeters Date: Thu, 26 Jun 2025 10:57:01 +0200 Subject: [PATCH 3/4] Improve regular expressions --- pyexcel_xlsxr/messy_xlsx.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/pyexcel_xlsxr/messy_xlsx.py b/pyexcel_xlsxr/messy_xlsx.py index dbc0f82..ea9d6c8 100644 --- a/pyexcel_xlsxr/messy_xlsx.py +++ b/pyexcel_xlsxr/messy_xlsx.py @@ -9,17 +9,12 @@ STYLE_FILENAME = "xl/styles.xml" SHARED_STRING = "xl/sharedStrings.xml" WORK_BOOK = "xl/workbook.xml" -SHEET_MATCHER = "xl/worksheets/(work)?sheet([0-9]+)?.xml" -SHEET_INDEX_MATCHER = "xl/worksheets/(work)?sheet(([0-9]+)?).xml" +SHEET_MATCHER = re.compile(r"xl/worksheets/(?:work)?sheet([0-9]+)?.xml") XLSX_ROW_MATCH = re.compile(rb"]*>.*?", re.DOTALL) -NUMBER_FMT_MATCHER = re.compile( - rb".*?().*?", re.MULTILINE -) -XFS_FMT_MATCHER = re.compile( - rb".*?().*?", re.MULTILINE -) -SHEET_FMT_MATCHER = re.compile(rb".*?().*?", re.MULTILINE) -DATE_1904_MATCHER = re.compile(rb".*?().*?", re.MULTILINE) +NUMBER_FMT_MATCHER = re.compile(rb"]*>.*?", re.DOTALL) +XFS_FMT_MATCHER = re.compile(rb"]*>.*?", re.DOTALL) +SHEET_FMT_MATCHER = re.compile(rb"", re.DOTALL) +DATE_1904_MATCHER = re.compile(rb"", re.DOTALL) # "xmlns:x14ac="http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac" # But it not used for now X14AC_NAMESPACE = b'xmlns:x14ac="http://not.used.com/"' @@ -158,14 +153,15 @@ def find_sheets(file_list): return [ sheet_file for sheet_file in file_list - if re.match(SHEET_MATCHER, sheet_file) + if SHEET_MATCHER.match(sheet_file) ] def get_sheet_index(file_name): - if re.match(SHEET_MATCHER, file_name): - result = re.search(SHEET_INDEX_MATCHER, file_name) - index = int(result.group(3)) if result.group(3) else 1 + sheet_match = SHEET_MATCHER.match(file_name) + + if sheet_match: + index = int(sheet_match.group(1)) if sheet_match.group(1) else 1 return index - 1 else: raise Exception("Invalid sheet file name") From 71acae8719c94c5cb76c1a33ac5b31cfaaef6305 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Peeters Date: Thu, 26 Jun 2025 11:32:39 +0200 Subject: [PATCH 4/4] Update changelog --- CONTRIBUTORS.rst | 1 + changelog.yml | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst index 16c04e4..5adc869 100644 --- a/CONTRIBUTORS.rst +++ b/CONTRIBUTORS.rst @@ -6,3 +6,4 @@ In alphabetical order: * `Mark Skelton `_ +* `Pierre-Louis Peeters `_ diff --git a/changelog.yml b/changelog.yml index 37d7ac6..76e745b 100644 --- a/changelog.yml +++ b/changelog.yml @@ -2,7 +2,12 @@ name: pyexcel-xlsxr organisation: pyexcel releases: - changes: - - action: Updated +- action: Updated + details: + - 'Fix freeze when parsing certain corrupt XLSX files' + date: 26.06.2025 + version: 0.6.2 +- action: Updated details: - '#9: Potential fix for incorrect reading of data with empty cells when used with pyexcel ' date: 11.11.2024