-
-
Notifications
You must be signed in to change notification settings - Fork 28
Handle malformed XML in extract_text_from_xml and add edge case test #71
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
132592b
723674a
f7a229a
d2f1960
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -7,6 +7,7 @@ | |||||||||||||
| import platform | ||||||||||||||
| import re | ||||||||||||||
| import sys | ||||||||||||||
| import tempfile | ||||||||||||||
| import time | ||||||||||||||
| import tracemalloc | ||||||||||||||
| from pathlib import Path | ||||||||||||||
|
|
@@ -222,23 +223,44 @@ def extract_text_from_xml(input_path, *, write_manifest: bool = False): | |||||||||||||
|
|
||||||||||||||
| open_func = bz2.open if is_bz2 else open | ||||||||||||||
|
|
||||||||||||||
| with open_func(input_path, "rb") as f: | ||||||||||||||
| context = ET.iterparse(f, events=("end",)) | ||||||||||||||
| temp_output_fd, temp_output_path = tempfile.mkstemp(suffix=".tmp", dir=output_dir) | ||||||||||||||
| os.close(temp_output_fd) | ||||||||||||||
| temp_output_path = Path(temp_output_path) | ||||||||||||||
|
|
||||||||||||||
| with open(output_path, "w", encoding="utf-8") as out: | ||||||||||||||
| for _, elem in context: | ||||||||||||||
| if elem.tag.endswith("page"): | ||||||||||||||
| text_elem = elem.find(".//{*}text") | ||||||||||||||
|
|
||||||||||||||
| if text_elem is not None and text_elem.text: | ||||||||||||||
| cleaned = clean_wikitext(text_elem.text) | ||||||||||||||
| if cleaned: | ||||||||||||||
| out.write(cleaned + "\n\n") | ||||||||||||||
|
|
||||||||||||||
| elem.clear() | ||||||||||||||
| logger.info("Preprocessing complete. Output saved to %s", output_path) | ||||||||||||||
| if write_manifest: | ||||||||||||||
| generate_manifest(input_path, output_path) | ||||||||||||||
| try: | ||||||||||||||
| with open_func(input_path, "rb") as f: | ||||||||||||||
| context = ET.iterparse(f, events=("end",)) | ||||||||||||||
|
|
||||||||||||||
| with temp_output_path.open("w", encoding="utf-8") as out: | ||||||||||||||
| for _, elem in context: | ||||||||||||||
| if elem.tag.endswith("page"): | ||||||||||||||
| text_elem = elem.find(".//{*}text") | ||||||||||||||
|
|
||||||||||||||
| if text_elem is not None and text_elem.text: | ||||||||||||||
| cleaned = clean_wikitext(text_elem.text) | ||||||||||||||
| if cleaned: | ||||||||||||||
| out.write(cleaned + "\n\n") | ||||||||||||||
|
|
||||||||||||||
| elem.clear() | ||||||||||||||
| out.flush() | ||||||||||||||
| os.fsync(out.fileno()) | ||||||||||||||
|
|
||||||||||||||
| os.replace(temp_output_path, output_path) | ||||||||||||||
| except ET.ParseError as e: | ||||||||||||||
| msg = f"Failed to parse XML dump '{input_path}': {e}" | ||||||||||||||
| logger.error(msg) | ||||||||||||||
| raise ET.ParseError(msg) from e | ||||||||||||||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||
| except Exception: | ||||||||||||||
| if temp_output_path.exists(): | ||||||||||||||
| temp_output_path.unlink(missing_ok=True) | ||||||||||||||
| raise | ||||||||||||||
|
Comment on lines
+253
to
+256
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Redundant cleanup that may mask the original exception.

The explicit cleanup in the `except Exception` handler duplicates the `finally` block, which already removes the temporary file if it still exists. The handler can be reduced to a bare `raise`.

♻️ Proposed fix: remove redundant cleanup

```diff
 except ET.ParseError as e:
     msg = f"Failed to parse XML dump '{input_path}': {e}"
     logger.error(msg)
     raise ET.ParseError(msg) from e
 except Exception:
-    if temp_output_path.exists():
-        temp_output_path.unlink(missing_ok=True)
     raise
 else:
```

📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||
| else: | ||||||||||||||
| logger.info("Preprocessing complete. Output saved to %s", output_path) | ||||||||||||||
| if write_manifest: | ||||||||||||||
| generate_manifest(input_path, output_path) | ||||||||||||||
| finally: | ||||||||||||||
| if temp_output_path.exists() and temp_output_path != output_path: | ||||||||||||||
| temp_output_path.unlink(missing_ok=True) | ||||||||||||||
|
|
||||||||||||||
|
|
||||||||||||||
| # generate data manifest | ||||||||||||||
|
|
||||||||||||||
Uh oh!
There was an error while loading. Please reload this page.