Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 38 additions & 16 deletions openverifiablellm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import platform
import re
import sys
import tempfile
import time
import tracemalloc
from pathlib import Path
Expand Down Expand Up @@ -222,23 +223,44 @@ def extract_text_from_xml(input_path, *, write_manifest: bool = False):

open_func = bz2.open if is_bz2 else open

with open_func(input_path, "rb") as f:
context = ET.iterparse(f, events=("end",))
temp_output_fd, temp_output_path = tempfile.mkstemp(suffix=".tmp", dir=output_dir)
os.close(temp_output_fd)
temp_output_path = Path(temp_output_path)

with open(output_path, "w", encoding="utf-8") as out:
for _, elem in context:
if elem.tag.endswith("page"):
text_elem = elem.find(".//{*}text")

if text_elem is not None and text_elem.text:
cleaned = clean_wikitext(text_elem.text)
if cleaned:
out.write(cleaned + "\n\n")

elem.clear()
logger.info("Preprocessing complete. Output saved to %s", output_path)
if write_manifest:
generate_manifest(input_path, output_path)
try:
with open_func(input_path, "rb") as f:
context = ET.iterparse(f, events=("end",))

with temp_output_path.open("w", encoding="utf-8") as out:
for _, elem in context:
if elem.tag.endswith("page"):
text_elem = elem.find(".//{*}text")

if text_elem is not None and text_elem.text:
cleaned = clean_wikitext(text_elem.text)
if cleaned:
out.write(cleaned + "\n\n")

elem.clear()
out.flush()
os.fsync(out.fileno())

os.replace(temp_output_path, output_path)
except ET.ParseError as e:
msg = f"Failed to parse XML dump '{input_path}': {e}"
logger.error(msg)
raise ET.ParseError(msg) from e
except Exception:
if temp_output_path.exists():
temp_output_path.unlink(missing_ok=True)
raise
Comment on lines +253 to +256
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Redundant cleanup that may mask the original exception.

The explicit unlink in the generic Exception handler is unnecessary since the finally block (lines 261-263) already handles cleanup. If unlink() raises an unexpected error (e.g., PermissionError), it will mask the original exception.

The ParseError handler correctly relies on finally for cleanup—this block should follow the same pattern.

♻️ Proposed fix: remove redundant cleanup
     except ET.ParseError as e:
         msg = f"Failed to parse XML dump '{input_path}': {e}"
         logger.error(msg)
         raise ET.ParseError(msg) from e
     except Exception:
-        if temp_output_path.exists():
-            temp_output_path.unlink(missing_ok=True)
         raise
     else:
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
except Exception:
if temp_output_path.exists():
temp_output_path.unlink(missing_ok=True)
raise
except Exception:
raise
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@openverifiablellm/utils.py` around lines 253 - 256, The generic "except
Exception" block currently calls temp_output_path.unlink(missing_ok=True) which
duplicates cleanup performed in the finally block and can mask the original
exception if unlink itself fails; remove the unlink call from the "except
Exception" handler so it simply re-raises the exception (matching the ParseError
handler), leaving file removal to the existing finally block that uses
temp_output_path.unlink(missing_ok=True).

else:
logger.info("Preprocessing complete. Output saved to %s", output_path)
if write_manifest:
generate_manifest(input_path, output_path)
finally:
if temp_output_path.exists() and temp_output_path != output_path:
temp_output_path.unlink(missing_ok=True)


# generate data manifest
Expand Down
19 changes: 19 additions & 0 deletions tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import bz2
import hashlib
import json
import xml.etree.ElementTree as ET

import pytest

Expand Down Expand Up @@ -179,6 +180,24 @@ def test_extract_text_from_xml_uncompressed(tmp_path, monkeypatch):
assert "Hello Uncompressed" in processed_file.read_text()


def test_extract_text_from_xml_malformed(tmp_path, monkeypatch):
# create a file missing closing tags
xml_content = """<?xml version=\"1.0\"?>
<mediawiki>
<page>
<title>Broken XML
"""

input_file = tmp_path / "malformed.xml"
input_file.write_text(xml_content, encoding="utf-8")

monkeypatch.chdir(tmp_path)

# ensure the parse error bubbles up with the contextual message
with pytest.raises(ET.ParseError, match=r"^Failed to parse XML"):
utils.extract_text_from_xml(input_file)


# --------------- manifest includes merkle fields ------------------------------------


Expand Down
Loading