From 132592bfd72f8681fb82b11a77ca3752c5c7e603 Mon Sep 17 00:00:00 2001
From: tharunkumar4562 <tharunkumarmalepati3@gmail.com>
Date: Wed, 11 Mar 2026 15:11:09 +0530
Subject: [PATCH 1/4] Handle malformed XML in extract_text_from_xml and add
 edge case test

---
 openverifiablellm/utils.py | 34 +++++++++++++++++++++-------------
 tests/test_util.py         | 20 ++++++++++++++++++++
 2 files changed, 41 insertions(+), 13 deletions(-)
diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py
index ad9b7de..feb7b23 100644
--- a/openverifiablellm/utils.py
+++ b/openverifiablellm/utils.py
@@ -222,20 +222,28 @@ def extract_text_from_xml(input_path, *, write_manifest: bool = False):
 
     open_func = bz2.open if is_bz2 else open
 
-    with open_func(input_path, "rb") as f:
-        context = ET.iterparse(f, events=("end",))
-
-        with open(output_path, "w", encoding="utf-8") as out:
-            for _, elem in context:
-                if elem.tag.endswith("page"):
-                    text_elem = elem.find(".//{*}text")
-
-                    if text_elem is not None and text_elem.text:
-                        cleaned = clean_wikitext(text_elem.text)
-                        if cleaned:
-                            out.write(cleaned + "\n\n")
+    try:
+        with open_func(input_path, "rb") as f:
+            context = ET.iterparse(f, events=("end",))
+
+            with open(output_path, "w", encoding="utf-8") as out:
+                for _, elem in context:
+                    if elem.tag.endswith("page"):
+                        text_elem = elem.find(".//{*}text")
+
+                        if text_elem is not None and text_elem.text:
+                            cleaned = clean_wikitext(text_elem.text)
+                            if cleaned:
+                                out.write(cleaned + "\n\n")
+
+                        elem.clear()
+    except ET.ParseError as e:
+        # provide context about which file failed to parse
+        msg = f"Failed to parse XML dump '{input_path}': {e}"
+        logger.error(msg)
+        # re-raise a new ParseError containing context
+        raise ET.ParseError(msg) from e
 
-                    elem.clear()
     logger.info("Preprocessing complete. Output saved to %s", output_path)
     if write_manifest:
         generate_manifest(input_path, output_path)
diff --git a/tests/test_util.py b/tests/test_util.py
index 43e6c1f..66a671d 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -179,6 +179,26 @@ def test_extract_text_from_xml_uncompressed(tmp_path, monkeypatch):
     assert "Hello Uncompressed" in processed_file.read_text()
 
 
+def test_extract_text_from_xml_malformed(tmp_path, monkeypatch):
+    # create a file missing closing tags
+    xml_content = """<?xml version=\"1.0\"?>
+    <mediawiki>
+      <page>
+        <title>Broken XML
+    """
+
+    input_file = tmp_path / "malformed.xml"
+    input_file.write_text(xml_content, encoding="utf-8")
+
+    monkeypatch.chdir(tmp_path)
+
+    # ensure the parse error bubbles up
+    with pytest.raises(Exception) as excinfo:
+        utils.extract_text_from_xml(input_file)
+
+    # elementtree ParseError is expected
+    assert "Failed to parse XML" in str(excinfo.value) or "ParseError" in str(excinfo.value)
+
 # --------------- manifest includes merkle fields ------------------------------------
 
 

From 723674a9815b275dd475a9975a6142c2487722d8 Mon Sep 17 00:00:00 2001
From: tharunkumar4562 <tharunkumarmalepati3@gmail.com>
Date: Thu, 12 Mar 2026 22:35:12 +0530
Subject: [PATCH 2/4] Apply Ruff formatting

---
 tests/test_util.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_util.py b/tests/test_util.py
index 66a671d..c232da2 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -199,6 +199,7 @@ def test_extract_text_from_xml_malformed(tmp_path, monkeypatch):
     # elementtree ParseError is expected
     assert "Failed to parse XML" in str(excinfo.value) or "ParseError" in str(excinfo.value)
 
+
 # --------------- manifest includes merkle fields ------------------------------------
 
 

From f7a229aa1db79162d9175c6d5871406f8c55dc75 Mon Sep 17 00:00:00 2001
From: tharunkumar4562 <tharunkumarmalepati3@gmail.com>
Date: Mon, 16 Mar 2026 01:30:29 +0530
Subject: [PATCH 3/4] Fix atomic write in extract_text_from_xml and tighten
 ParseError test

---
 openverifiablellm/utils.py | 28 ++++++++++++++++++++--------
 tests/test_util.py         |  7 ++++---
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py
index feb7b23..a9336fd 100644
--- a/openverifiablellm/utils.py
+++ b/openverifiablellm/utils.py
@@ -7,6 +7,7 @@
 import platform
 import re
 import sys
+import tempfile
 import time
 import tracemalloc
 from pathlib import Path
@@ -222,11 +223,15 @@ def extract_text_from_xml(input_path, *, write_manifest: bool = False):
 
     open_func = bz2.open if is_bz2 else open
 
+    temp_output_fd, temp_output_path = tempfile.mkstemp(suffix=".tmp", dir=output_dir)
+    os.close(temp_output_fd)
+    temp_output_path = Path(temp_output_path)
+
     try:
         with open_func(input_path, "rb") as f:
             context = ET.iterparse(f, events=("end",))
 
-            with open(output_path, "w", encoding="utf-8") as out:
+            with temp_output_path.open("w", encoding="utf-8") as out:
                 for _, elem in context:
                     if elem.tag.endswith("page"):
                         text_elem = elem.find(".//{*}text")
@@ -235,18 +240,25 @@ def extract_text_from_xml(input_path, *, write_manifest: bool = False):
                             cleaned = clean_wikitext(text_elem.text)
                             if cleaned:
                                 out.write(cleaned + "\n\n")
+                out.flush()
+                os.fsync(out.fileno())
 
-                        elem.clear()
+        os.replace(temp_output_path, output_path)
     except ET.ParseError as e:
-        # provide context about which file failed to parse
         msg = f"Failed to parse XML dump '{input_path}': {e}"
         logger.error(msg)
-        # re-raise a new ParseError containing context
         raise ET.ParseError(msg) from e
-
-    logger.info("Preprocessing complete. Output saved to %s", output_path)
-    if write_manifest:
-        generate_manifest(input_path, output_path)
+    except Exception:
+        if temp_output_path.exists():
+            temp_output_path.unlink(missing_ok=True)
+        raise
+    else:
+        logger.info("Preprocessing complete. Output saved to %s", output_path)
+        if write_manifest:
+            generate_manifest(input_path, output_path)
+    finally:
+        if temp_output_path.exists() and temp_output_path != output_path:
+            temp_output_path.unlink(missing_ok=True)
 
 
 # generate data manifest
diff --git a/tests/test_util.py b/tests/test_util.py
index c232da2..457fa54 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,6 +1,7 @@
 import bz2
 import hashlib
 import json
+from defusedxml.ElementTree import ParseError
 
 import pytest
 
@@ -193,11 +194,11 @@ def test_extract_text_from_xml_malformed(tmp_path, monkeypatch):
     monkeypatch.chdir(tmp_path)
 
     # ensure the parse error bubbles up
-    with pytest.raises(Exception) as excinfo:
+    with pytest.raises(ParseError) as excinfo:
         utils.extract_text_from_xml(input_file)
 
-    # elementtree ParseError is expected
-    assert "Failed to parse XML" in str(excinfo.value) or "ParseError" in str(excinfo.value)
+    # elementtree ParseError with context is expected
+    assert "Failed to parse XML dump" in str(excinfo.value)
 
 
 # --------------- manifest includes merkle fields ------------------------------------

From d2f196004251c084bae6f153a663e683352dbcaf Mon Sep 17 00:00:00 2001
From: tharunkumar4562 <tharunkumarmalepati3@gmail.com>
Date: Mon, 16 Mar 2026 01:47:18 +0530
Subject: [PATCH 4/4] Fix XML parsing cleanup and tighten malformed XML test

---
 openverifiablellm/utils.py | 2 ++
 tests/test_util.py         | 9 +++------
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py
index a9336fd..fee4626 100644
--- a/openverifiablellm/utils.py
+++ b/openverifiablellm/utils.py
@@ -240,6 +240,8 @@ def extract_text_from_xml(input_path, *, write_manifest: bool = False):
                             cleaned = clean_wikitext(text_elem.text)
                             if cleaned:
                                 out.write(cleaned + "\n\n")
+
+                        elem.clear()
                 out.flush()
                 os.fsync(out.fileno())
 
diff --git a/tests/test_util.py b/tests/test_util.py
index 457fa54..88f291f 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,7 +1,7 @@
 import bz2
 import hashlib
 import json
-from defusedxml.ElementTree import ParseError
+import xml.etree.ElementTree as ET
 
 import pytest
 
@@ -193,13 +193,10 @@ def test_extract_text_from_xml_malformed(tmp_path, monkeypatch):
 
     monkeypatch.chdir(tmp_path)
 
-    # ensure the parse error bubbles up
-    with pytest.raises(ParseError) as excinfo:
+    # ensure the parse error bubbles up with the contextual message
+    with pytest.raises(ET.ParseError, match=r"^Failed to parse XML"):
         utils.extract_text_from_xml(input_file)
 
-    # elementtree ParseError with context is expected
-    assert "Failed to parse XML dump" in str(excinfo.value)
-
 
 # --------------- manifest includes merkle fields ------------------------------------