From 132592bfd72f8681fb82b11a77ca3752c5c7e603 Mon Sep 17 00:00:00 2001 From: tharunkumar4562 Date: Wed, 11 Mar 2026 15:11:09 +0530 Subject: [PATCH 1/4] Handle malformed XML in extract_text_from_xml and add edge case test --- openverifiablellm/utils.py | 34 +++++++++++++++++++++------------- tests/test_util.py | 20 ++++++++++++++++++++ 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py index ad9b7de..feb7b23 100644 --- a/openverifiablellm/utils.py +++ b/openverifiablellm/utils.py @@ -222,20 +222,28 @@ def extract_text_from_xml(input_path, *, write_manifest: bool = False): open_func = bz2.open if is_bz2 else open - with open_func(input_path, "rb") as f: - context = ET.iterparse(f, events=("end",)) - - with open(output_path, "w", encoding="utf-8") as out: - for _, elem in context: - if elem.tag.endswith("page"): - text_elem = elem.find(".//{*}text") - - if text_elem is not None and text_elem.text: - cleaned = clean_wikitext(text_elem.text) - if cleaned: - out.write(cleaned + "\n\n") + try: + with open_func(input_path, "rb") as f: + context = ET.iterparse(f, events=("end",)) + + with open(output_path, "w", encoding="utf-8") as out: + for _, elem in context: + if elem.tag.endswith("page"): + text_elem = elem.find(".//{*}text") + + if text_elem is not None and text_elem.text: + cleaned = clean_wikitext(text_elem.text) + if cleaned: + out.write(cleaned + "\n\n") + + elem.clear() + except ET.ParseError as e: + # provide context about which file failed to parse + msg = f"Failed to parse XML dump '{input_path}': {e}" + logger.error(msg) + # re-raise a new ParseError containing context + raise ET.ParseError(msg) from e - elem.clear() logger.info("Preprocessing complete. Output saved to %s", output_path) if write_manifest: generate_manifest(input_path, output_path) diff --git a/tests/test_util.py b/tests/test_util.py index 43e6c1f..66a671d 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -179,6 +179,26 @@ def test_extract_text_from_xml_uncompressed(tmp_path, monkeypatch): assert "Hello Uncompressed" in processed_file.read_text() +def test_extract_text_from_xml_malformed(tmp_path, monkeypatch): + # create a file missing closing tags + xml_content = """ + + + Broken XML + """ + + input_file = tmp_path / "malformed.xml" + input_file.write_text(xml_content, encoding="utf-8") + + monkeypatch.chdir(tmp_path) + + # ensure the parse error bubbles up + with pytest.raises(Exception) as excinfo: + utils.extract_text_from_xml(input_file) + + # elementtree ParseError is expected + assert "Failed to parse XML" in str(excinfo.value) or "ParseError" in str(excinfo.value) + # --------------- manifest includes merkle fields ------------------------------------ From 723674a9815b275dd475a9975a6142c2487722d8 Mon Sep 17 00:00:00 2001 From: tharunkumar4562 <tharunkumarmalepati3@gmail.com> Date: Thu, 12 Mar 2026 22:35:12 +0530 Subject: [PATCH 2/4] Apply Ruff formatting --- tests/test_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_util.py b/tests/test_util.py index 66a671d..c232da2 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -199,6 +199,7 @@ def test_extract_text_from_xml_malformed(tmp_path, monkeypatch): # elementtree ParseError is expected assert "Failed to parse XML" in str(excinfo.value) or "ParseError" in str(excinfo.value) + # --------------- manifest includes merkle fields ------------------------------------ From f7a229aa1db79162d9175c6d5871406f8c55dc75 Mon Sep 17 00:00:00 2001 From: tharunkumar4562 <tharunkumarmalepati3@gmail.com> Date: Mon, 16 Mar 2026 01:30:29 +0530 Subject: [PATCH 3/4] Fix atomic write in extract_text_from_xml and tighten ParseError test --- openverifiablellm/utils.py | 28 ++++++++++++++++++++-------- tests/test_util.py | 7 ++++--- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py index feb7b23..a9336fd 100644 --- a/openverifiablellm/utils.py +++ b/openverifiablellm/utils.py @@ -7,6 +7,7 @@ import platform import re import sys +import tempfile import time import tracemalloc from pathlib import Path @@ -222,11 +223,15 @@ def extract_text_from_xml(input_path, *, write_manifest: bool = False): open_func = bz2.open if is_bz2 else open + temp_output_fd, temp_output_path = tempfile.mkstemp(suffix=".tmp", dir=output_dir) + os.close(temp_output_fd) + temp_output_path = Path(temp_output_path) + try: with open_func(input_path, "rb") as f: context = ET.iterparse(f, events=("end",)) - with open(output_path, "w", encoding="utf-8") as out: + with temp_output_path.open("w", encoding="utf-8") as out: for _, elem in context: if elem.tag.endswith("page"): text_elem = elem.find(".//{*}text") @@ -235,18 +240,25 @@ def extract_text_from_xml(input_path, *, write_manifest: bool = False): cleaned = clean_wikitext(text_elem.text) if cleaned: out.write(cleaned + "\n\n") + out.flush() + os.fsync(out.fileno()) - elem.clear() + os.replace(temp_output_path, output_path) except ET.ParseError as e: - # provide context about which file failed to parse msg = f"Failed to parse XML dump '{input_path}': {e}" logger.error(msg) - # re-raise a new ParseError containing context raise ET.ParseError(msg) from e - - logger.info("Preprocessing complete. Output saved to %s", output_path) - if write_manifest: - generate_manifest(input_path, output_path) + except Exception: + if temp_output_path.exists(): + temp_output_path.unlink(missing_ok=True) + raise + else: + logger.info("Preprocessing complete. Output saved to %s", output_path) + if write_manifest: + generate_manifest(input_path, output_path) + finally: + if temp_output_path.exists() and temp_output_path != output_path: + temp_output_path.unlink(missing_ok=True) # generate data manifest diff --git a/tests/test_util.py b/tests/test_util.py index c232da2..457fa54 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,6 +1,7 @@ import bz2 import hashlib import json +from defusedxml.ElementTree import ParseError import pytest @@ -193,11 +194,11 @@ def test_extract_text_from_xml_malformed(tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) # ensure the parse error bubbles up - with pytest.raises(Exception) as excinfo: + with pytest.raises(ParseError) as excinfo: utils.extract_text_from_xml(input_file) - # elementtree ParseError is expected - assert "Failed to parse XML" in str(excinfo.value) or "ParseError" in str(excinfo.value) + # elementtree ParseError with context is expected + assert "Failed to parse XML dump" in str(excinfo.value) # --------------- manifest includes merkle fields ------------------------------------ From d2f196004251c084bae6f153a663e683352dbcaf Mon Sep 17 00:00:00 2001 From: tharunkumar4562 <tharunkumarmalepati3@gmail.com> Date: Mon, 16 Mar 2026 01:47:18 +0530 Subject: [PATCH 4/4] Fix XML parsing cleanup and tighten malformed XML test --- openverifiablellm/utils.py | 2 ++ tests/test_util.py | 9 +++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py index a9336fd..fee4626 100644 --- a/openverifiablellm/utils.py +++ b/openverifiablellm/utils.py @@ -240,6 +240,8 @@ def extract_text_from_xml(input_path, *, write_manifest: bool = False): cleaned = clean_wikitext(text_elem.text) if cleaned: out.write(cleaned + "\n\n") + + elem.clear() out.flush() os.fsync(out.fileno()) diff --git a/tests/test_util.py b/tests/test_util.py index 457fa54..88f291f 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,7 +1,7 @@ import bz2 import hashlib import json -from defusedxml.ElementTree import ParseError +import xml.etree.ElementTree as ET import pytest @@ -193,13 +193,10 @@ def test_extract_text_from_xml_malformed(tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) - # ensure the parse error bubbles up - with pytest.raises(ParseError) as excinfo: + # ensure the parse error bubbles up with the contextual message + with pytest.raises(ET.ParseError, match=r"^Failed to parse XML"): utils.extract_text_from_xml(input_file) - # elementtree ParseError with context is expected - assert "Failed to parse XML dump" in str(excinfo.value) - # --------------- manifest includes merkle fields ------------------------------------