Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions graphify/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ def load_cached(path: Path, root: Path = Path(".")) -> dict | None:
if not entry.exists():
return None
try:
return json.loads(entry.read_text())
except (json.JSONDecodeError, OSError):
return json.loads(entry.read_text(encoding="utf-8"))
except (json.JSONDecodeError, OSError, UnicodeDecodeError):
return None


Expand All @@ -70,7 +70,10 @@ def save_cached(path: Path, result: dict, root: Path = Path(".")) -> None:
entry = cache_dir(root) / f"{h}.json"
tmp = entry.with_suffix(".tmp")
try:
tmp.write_text(json.dumps(result))
# ensure_ascii=False keeps non-ASCII identifiers/labels readable in
# the cache file; encoding="utf-8" prevents UnicodeEncodeError on
# Windows where the default cp1252 codec cannot encode CJK/emoji.
tmp.write_text(json.dumps(result, ensure_ascii=False), encoding="utf-8")
os.replace(tmp, entry)
except Exception:
tmp.unlink(missing_ok=True)
Expand Down
10 changes: 8 additions & 2 deletions graphify/detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ def detect(root: Path, *, follow_symlinks: bool = False) -> dict:
def load_manifest(manifest_path: str = _MANIFEST_PATH) -> dict[str, float]:
    """Load the file modification time manifest from a previous run.

    Args:
        manifest_path: Location of the JSON manifest file.

    Returns:
        The parsed manifest, or an empty dict when the file cannot be read
        or parsed.
    """
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default text codec.
        return json.loads(Path(manifest_path).read_text(encoding="utf-8"))
    except Exception:
        # Best-effort load: any failure to read or parse yields an empty
        # manifest rather than an error.
        return {}

Expand All @@ -442,7 +442,13 @@ def save_manifest(files: dict[str, list[str]], manifest_path: str = _MANIFEST_PA
except OSError:
pass # file deleted between detect() and manifest write - skip it
Path(manifest_path).parent.mkdir(parents=True, exist_ok=True)
Path(manifest_path).write_text(json.dumps(manifest, indent=2))
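# ensure_ascii=False keeps file paths with non-ASCII characters (CJK, accented
# Latin) human-readable in the manifest instead of \uXXXX escapes; the explicit
# encoding="utf-8" is then required so the write does not raise
# UnicodeEncodeError on Windows, where the default codec (typically cp1252)
# cannot encode characters such as CJK.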
Path(manifest_path).write_text(
json.dumps(manifest, indent=2, ensure_ascii=False),
encoding="utf-8",
)


def detect_incremental(root: Path, manifest_path: str = _MANIFEST_PATH) -> dict:
Expand Down
116 changes: 116 additions & 0 deletions tests/test_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
"""Verify cache and manifest I/O survive non-ASCII content cross-platform.

On Windows the default text codec is typically cp1252, which raises
UnicodeEncodeError when writing characters it cannot represent (CJK, emoji)
unless an explicit encoding="utf-8" is passed. Accented Latin text is included
to cover non-ASCII characters in general. These tests pin the behavior so any
future regression is caught immediately.
"""
from __future__ import annotations

import json
from pathlib import Path

import pytest

from graphify.cache import load_cached, save_cached
from graphify.detect import load_manifest, save_manifest


# Sample labels fed to the parametrized cache round-trip test: one ASCII-only
# control value followed by several non-ASCII scripts. The CJK and emoji
# entries are the ones a cp1252 default codec cannot encode.
NON_ASCII_PAYLOADS = [
    "ascii_only",
    "中文测试", # CJK
    "日本語コメント", # Japanese
    "한글 테스트", # Korean
    "café résumé naïve", # Accented Latin
    "emoji 🚀 mixed 中 with 한", # Mixed scripts + emoji
]


def _make_source_file(tmp_path: Path, name: str = "module.py") -> Path:
"""Create a temporary source file we can hash and cache."""
source = tmp_path / name
source.write_text("x = 1\n", encoding="utf-8")
return source


@pytest.mark.parametrize("payload", NON_ASCII_PAYLOADS)
def test_cache_roundtrip_preserves_non_ascii_labels(tmp_path, payload):
    """A node label must come back from the cache exactly as it was saved."""
    module_file = _make_source_file(tmp_path)
    node = {"id": "n1", "label": payload, "source_file": str(module_file)}
    save_cached(module_file, {"nodes": [node], "edges": []}, root=tmp_path)

    restored = load_cached(module_file, root=tmp_path)

    assert restored is not None
    first_node = restored["nodes"][0]
    assert first_node["label"] == payload


def test_cache_roundtrip_preserves_non_ascii_in_source_file_path(tmp_path):
    """A ``source_file`` value with non-ASCII characters must survive caching."""
    # Both the directory and the file carry CJK characters in their names.
    module_file = tmp_path / "中文目录" / "模块.py"
    module_file.parent.mkdir()
    module_file.write_text("x = 1\n", encoding="utf-8")
    expected_path = str(module_file)

    node = {"id": "n1", "label": "MyClass", "source_file": expected_path}
    save_cached(module_file, {"nodes": [node], "edges": []}, root=tmp_path)
    restored = load_cached(module_file, root=tmp_path)

    assert restored is not None
    assert restored["nodes"][0]["source_file"] == expected_path


def test_cache_file_is_valid_utf8_on_disk(tmp_path):
    """The cache file on disk must hold the label as real UTF-8 text.

    Decoding alone is not enough: JSON written with backslash-u escapes is
    pure ASCII and decodes as UTF-8 whatever codec wrote it, so the raw bytes
    are also checked for the UTF-8 encoded label.
    """
    source = _make_source_file(tmp_path)
    payload = "中文 emoji 🚀"
    result = {"nodes": [{"id": "n1", "label": payload}], "edges": []}
    save_cached(source, result, root=tmp_path)

    # Find the single cache entry written for this source file.
    cache_files = list((tmp_path / "graphify-out" / "cache").glob("*.json"))
    assert len(cache_files) == 1
    raw = cache_files[0].read_bytes()

    # The label must appear unescaped, encoded as UTF-8, in the file bytes.
    assert payload.encode("utf-8") in raw

    # Round-trip via UTF-8 decode + JSON parse must also succeed.
    parsed = json.loads(raw.decode("utf-8"))
    assert parsed["nodes"][0]["label"] == payload


@pytest.mark.parametrize(
    "path_name",
    ["ascii_module.py", "中文模块.py", "日本語.py", "café.py"],
)
def test_manifest_roundtrip_preserves_non_ascii_paths(tmp_path, path_name):
    """A path saved to the manifest must load back with its recorded mtime."""
    tracked = tmp_path / path_name
    tracked.write_text("x = 1\n", encoding="utf-8")
    manifest_file = str(tmp_path / "manifest.json")
    key = str(tracked)

    save_manifest({"code": [key]}, manifest_path=manifest_file)
    restored = load_manifest(manifest_path=manifest_file)

    assert key in restored
    assert restored[key] == tracked.stat().st_mtime


def test_manifest_file_is_valid_utf8_on_disk(tmp_path):
    """The manifest on disk must hold non-ASCII paths as real UTF-8 text.

    A manifest written with backslash-u escapes is pure ASCII and would decode
    as UTF-8 regardless of the codec used, so the raw bytes are also checked
    for the UTF-8 encoded file name.
    """
    file_name = "中文.py"
    source = tmp_path / file_name
    source.write_text("x = 1\n", encoding="utf-8")

    manifest_path = str(tmp_path / "manifest.json")
    save_manifest({"code": [str(source)]}, manifest_path=manifest_path)

    raw = Path(manifest_path).read_bytes()

    # Check only the file name, not the full path: JSON escapes backslashes,
    # so a Windows path would not appear verbatim in the raw bytes.
    assert file_name.encode("utf-8") in raw

    parsed = json.loads(raw.decode("utf-8"))
    assert str(source) in parsed
Loading