AOSSIE-Org · tharunkumar4562 · Mar 11, 2026 · Mar 12, 2026 · Mar 15, 2026 · Mar 15, 2026
diff --git a/openverifiablellm/utils.py b/openverifiablellm/utils.py
@@ -7,6 +7,7 @@
 import platform
 import re
 import sys
+import tempfile
 import time
 import tracemalloc
 from pathlib import Path
@@ -222,23 +223,44 @@ def extract_text_from_xml(input_path, *, write_manifest: bool = False):
 
     open_func = bz2.open if is_bz2 else open
 
-    with open_func(input_path, "rb") as f:
-        context = ET.iterparse(f, events=("end",))
+    temp_output_fd, temp_output_path = tempfile.mkstemp(suffix=".tmp", dir=output_dir)
+    os.close(temp_output_fd)
+    temp_output_path = Path(temp_output_path)
 
-        with open(output_path, "w", encoding="utf-8") as out:
-            for _, elem in context:
-                if elem.tag.endswith("page"):
-                    text_elem = elem.find(".//{*}text")
-
-                    if text_elem is not None and text_elem.text:
-                        cleaned = clean_wikitext(text_elem.text)
-                        if cleaned:
-                            out.write(cleaned + "\n\n")
-
-                    elem.clear()
-    logger.info("Preprocessing complete. Output saved to %s", output_path)
-    if write_manifest:
-        generate_manifest(input_path, output_path)
+    try:
+        with open_func(input_path, "rb") as f:
+            context = ET.iterparse(f, events=("end",))
+
+            with temp_output_path.open("w", encoding="utf-8") as out:
+                for _, elem in context:
+                    if elem.tag.endswith("page"):
+                        text_elem = elem.find(".//{*}text")
+
+                        if text_elem is not None and text_elem.text:
+                            cleaned = clean_wikitext(text_elem.text)
+                            if cleaned:
+                                out.write(cleaned + "\n\n")
+
+                        elem.clear()
+                out.flush()
+                os.fsync(out.fileno())
+
+        os.replace(temp_output_path, output_path)
+    except ET.ParseError as e:
+        msg = f"Failed to parse XML dump '{input_path}': {e}"
+        logger.error(msg)
+        raise ET.ParseError(msg) from e
+    except Exception:
+        if temp_output_path.exists():
+            temp_output_path.unlink(missing_ok=True)
+        raise
+    else:
+        logger.info("Preprocessing complete. Output saved to %s", output_path)
+        if write_manifest:
+            generate_manifest(input_path, output_path)
+    finally:
+        if temp_output_path.exists() and temp_output_path != output_path:
+            temp_output_path.unlink(missing_ok=True)
 
 
 # generate data manifest

diff --git a/tests/test_util.py b/tests/test_util.py
@@ -1,6 +1,7 @@
 import bz2
 import hashlib
 import json
+import xml.etree.ElementTree as ET
 
 import pytest
 
@@ -179,6 +180,24 @@ def test_extract_text_from_xml_uncompressed(tmp_path, monkeypatch):
     assert "Hello Uncompressed" in processed_file.read_text()
 
 
+def test_extract_text_from_xml_malformed(tmp_path, monkeypatch):
+    # Truncated dump: <mediawiki>, <page> and <title> are opened but never closed.
+    xml_content = """<?xml version=\"1.0\"?>
+    <mediawiki>
+      <page>
+        <title>Broken XML
+    """
+
+    input_file = tmp_path / "malformed.xml"
+    input_file.write_text(xml_content, encoding="utf-8")
+
+    monkeypatch.chdir(tmp_path)
+
+    # The ET.ParseError must propagate, and its message must begin with "Failed to parse XML".
+    with pytest.raises(ET.ParseError, match=r"^Failed to parse XML"):
+        utils.extract_text_from_xml(input_file)
+
+
 # --------------- manifest includes merkle fields ------------------------------------