Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions every_eval_ever/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
import importlib
from typing import Any

__all__ = ["eval_types", "instance_level_types"]
__all__ = ['eval_types', 'instance_level_types']


def __getattr__(name: str) -> Any:
if name in {"eval_types", "instance_level_types"}:
module = importlib.import_module(f".{name}", __name__)
if name in {'eval_types', 'instance_level_types'}:
module = importlib.import_module(f'.{name}', __name__)
globals()[name] = module
return module
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
2 changes: 1 addition & 1 deletion every_eval_ever/__main__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .cli import main

if __name__ == "__main__":
if __name__ == '__main__':
raise SystemExit(main())
74 changes: 40 additions & 34 deletions every_eval_ever/check_duplicate_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,32 @@
import os
from typing import Any, Dict, List

IGNORE_KEYS = {"retrieved_timestamp", "evaluation_id"}
IGNORE_KEYS = {'retrieved_timestamp', 'evaluation_id'}


def expand_paths(paths: List[str]) -> List[str]:
"""Expand folders to file paths."""
file_paths: List[str] = []
for path in paths:
if os.path.isfile(path) and path.endswith(".json"):
if os.path.isfile(path) and path.endswith('.json'):
file_paths.append(path)
elif os.path.isdir(path):
for root, _, file_names in os.walk(path):
for file_name in file_names:
if file_name.endswith(".json"):
if file_name.endswith('.json'):
file_paths.append(os.path.join(root, file_name))
else:
raise Exception(f"Could not find file or directory at path: {path}")
raise Exception(f'Could not find file or directory at path: {path}')
return file_paths


def annotate_error(file_path: str, message: str, **kwargs) -> None:
"""If run in GitHub Actions, annotate errors."""
if os.environ.get("GITHUB_ACTION"):
joined_kwargs = "".join(f",{key}={value}" for key, value in kwargs.items())
print(f"::error file={file_path}{joined_kwargs}::{message}")
if os.environ.get('GITHUB_ACTION'):
joined_kwargs = ''.join(
f',{key}={value}' for key, value in kwargs.items()
)
print(f'::error file={file_path}{joined_kwargs}::{message}')


def normalize_list(items: List[Any]) -> List[Any]:
Expand All @@ -36,7 +38,7 @@ def normalize_list(items: List[Any]) -> List[Any]:
return sorted(
normalized_items,
key=lambda item: json.dumps(
item, sort_keys=True, separators=(",", ":"), ensure_ascii=True
item, sort_keys=True, separators=(',', ':'), ensure_ascii=True
),
)

Expand All @@ -58,80 +60,84 @@ def normalized_hash(payload: Dict[str, Any]) -> str:
encoded = json.dumps(
normalized,
sort_keys=True,
separators=(",", ":"),
separators=(',', ':'),
ensure_ascii=True,
)
return hashlib.sha256(encoded.encode("utf-8")).hexdigest()
return hashlib.sha256(encoded.encode('utf-8')).hexdigest()


def main(argv: List[str] | None = None) -> int:
parser = argparse.ArgumentParser(
prog="check_duplicate_entries",
description="Detects duplicate evaluation entries ignoring scrape timestamp fields.",
prog='check_duplicate_entries',
description='Detects duplicate evaluation entries ignoring scrape timestamp fields.',
)
parser.add_argument(
"paths", nargs="+", type=str, help="File or folder paths to JSON data"
'paths', nargs='+', type=str, help='File or folder paths to JSON data'
)
args = parser.parse_args(argv)

file_paths = expand_paths(args.paths)
print()
print(f"Checking {len(file_paths)} JSON files for duplicates...")
print(f'Checking {len(file_paths)} JSON files for duplicates...')
print()

groups: Dict[str, List[Dict[str, Any]]] = {}
for file_path in file_paths:
try:
with open(file_path, "r") as f:
with open(file_path, 'r') as f:
payload = json.load(f)
except json.JSONDecodeError as e:
message = f"JSONDecodeError: {str(e)}"
message = f'JSONDecodeError: {str(e)}'
annotate_error(
file_path,
message,
title="JSONDecodeError",
title='JSONDecodeError',
col=e.colno,
line=e.lineno,
)
print(f"{file_path}")
print(" " + message)
print(f'{file_path}')
print(' ' + message)
print()
raise

entry_hash = normalized_hash(payload)
groups.setdefault(entry_hash, []).append(
{
"path": file_path,
"evaluation_id": payload.get("evaluation_id"),
"retrieved_timestamp": payload.get("retrieved_timestamp"),
'path': file_path,
'evaluation_id': payload.get('evaluation_id'),
'retrieved_timestamp': payload.get('retrieved_timestamp'),
}
)

duplicate_groups = [entries for entries in groups.values() if len(entries) > 1]
duplicate_groups = [
entries for entries in groups.values() if len(entries) > 1
]
if not duplicate_groups:
print("No duplicates found.")
print('No duplicates found.')
print()
return 0

ignore_label = ", ".join(f"`{key}`" for key in sorted(IGNORE_KEYS))
print(f"Found duplicate entries (ignoring keys: {ignore_label}).")
ignore_label = ', '.join(f'`{key}`' for key in sorted(IGNORE_KEYS))
print(f'Found duplicate entries (ignoring keys: {ignore_label}).')
print()

for index, entries in enumerate(duplicate_groups, start=1):
print(f"Duplicate group {index} ({len(entries)} files):")
print(f'Duplicate group {index} ({len(entries)} files):')
for entry in entries:
print(f" - {entry['path']}")
print(f" evaluation_id: {entry.get('evaluation_id')}")
print(f" retrieved_timestamp: {entry.get('retrieved_timestamp')}")
print(f' - {entry["path"]}')
print(f' evaluation_id: {entry.get("evaluation_id")}')
print(
f' retrieved_timestamp: {entry.get("retrieved_timestamp")}'
)
annotate_error(
entry["path"],
"Duplicate entry detected (ignoring `evaluation_id` and `retrieved_timestamp`).",
title="DuplicateEntry",
entry['path'],
'Duplicate entry detected (ignoring `evaluation_id` and `retrieved_timestamp`).',
title='DuplicateEntry',
)
print()

return 1


if __name__ == "__main__":
if __name__ == '__main__':
raise SystemExit(main())
Loading
Loading