Skip to content

Commit

Permalink
fix: Fix JSONConverter to properly skip files that are not utf-8 enco…
Browse files Browse the repository at this point in the history
…ded (#8775)

* Small fix

* Add reno

* Trying out license header fix here
  • Loading branch information
sjrl authored Jan 28, 2025
1 parent e3dc164 commit bba84e5
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 0 deletions.
1 change: 1 addition & 0 deletions haystack/components/converters/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ def _get_content_and_meta(self, source: ByteStream) -> List[Tuple[str, Dict[str,
source=source.meta["file_path"],
error=exc,
)
return []

meta_fields = self._meta_fields or set()

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
fixes:
- |
Fixed JSONConverter to properly skip converting JSON files that are not UTF-8 encoded.
19 changes: 19 additions & 0 deletions test/components/converters/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,25 @@ def test_run_with_bad_filter(tmpdir, caplog):
assert result == {"documents": []}


def test_run_with_bad_encoding(tmpdir, caplog):
    """Check that a UTF-16 encoded JSON source is skipped with one warning.

    The file is written with the "utf-16" encoding, then passed to the
    converter. The run is expected to log exactly one record whose message
    starts with the "Failed to extract text" prefix and to return an empty
    list of documents.
    """
    test_file = Path(tmpdir / "test_file.json")
    payload = json.dumps(test_data[0])
    test_file.write_text(payload, "utf-16")

    json_converter = JSONConverter(".laureates")

    # Start from an empty capture so only this run's records are counted.
    caplog.clear()
    with caplog.at_level(logging.WARNING):
        result = json_converter.run(sources=[test_file])

    expected_prefix = (
        f"Failed to extract text from {test_file}. Skipping it. Error: 'utf-8' codec can't decode byte"
    )
    captured = caplog.records
    assert len(captured) == 1
    assert captured[0].msg.startswith(expected_prefix)
    assert result == {"documents": []}


def test_run_with_single_meta(tmpdir):
first_test_file = Path(tmpdir / "first_test_file.json")
second_test_file = Path(tmpdir / "second_test_file.json")
Expand Down

0 comments on commit bba84e5

Please sign in to comment.