diff --git a/every_eval_ever/__init__.py b/every_eval_ever/__init__.py index 5e49a619a..7d41cde78 100644 --- a/every_eval_ever/__init__.py +++ b/every_eval_ever/__init__.py @@ -5,12 +5,12 @@ import importlib from typing import Any -__all__ = ["eval_types", "instance_level_types"] +__all__ = ['eval_types', 'instance_level_types'] def __getattr__(name: str) -> Any: - if name in {"eval_types", "instance_level_types"}: - module = importlib.import_module(f".{name}", __name__) + if name in {'eval_types', 'instance_level_types'}: + module = importlib.import_module(f'.{name}', __name__) globals()[name] = module return module - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + raise AttributeError(f'module {__name__!r} has no attribute {name!r}') diff --git a/every_eval_ever/__main__.py b/every_eval_ever/__main__.py index bfdcd0c11..03f036e6d 100644 --- a/every_eval_ever/__main__.py +++ b/every_eval_ever/__main__.py @@ -1,4 +1,4 @@ from .cli import main -if __name__ == "__main__": +if __name__ == '__main__': raise SystemExit(main()) diff --git a/every_eval_ever/check_duplicate_entries.py b/every_eval_ever/check_duplicate_entries.py index a9bb862eb..5f6c53342 100644 --- a/every_eval_ever/check_duplicate_entries.py +++ b/every_eval_ever/check_duplicate_entries.py @@ -4,30 +4,32 @@ import os from typing import Any, Dict, List -IGNORE_KEYS = {"retrieved_timestamp", "evaluation_id"} +IGNORE_KEYS = {'retrieved_timestamp', 'evaluation_id'} def expand_paths(paths: List[str]) -> List[str]: """Expand folders to file paths.""" file_paths: List[str] = [] for path in paths: - if os.path.isfile(path) and path.endswith(".json"): + if os.path.isfile(path) and path.endswith('.json'): file_paths.append(path) elif os.path.isdir(path): for root, _, file_names in os.walk(path): for file_name in file_names: - if file_name.endswith(".json"): + if file_name.endswith('.json'): file_paths.append(os.path.join(root, file_name)) else: - raise Exception(f"Could not find file or directory at path: {path}") + raise Exception(f'Could not find file or directory at path: {path}') return file_paths def annotate_error(file_path: str, message: str, **kwargs) -> None: """If run in GitHub Actions, annotate errors.""" - if os.environ.get("GITHUB_ACTION"): - joined_kwargs = "".join(f",{key}={value}" for key, value in kwargs.items()) - print(f"::error file={file_path}{joined_kwargs}::{message}") + if os.environ.get('GITHUB_ACTION'): + joined_kwargs = ''.join( + f',{key}={value}' for key, value in kwargs.items() + ) + print(f'::error file={file_path}{joined_kwargs}::{message}') def normalize_list(items: List[Any]) -> List[Any]: @@ -36,7 +38,7 @@ def normalize_list(items: List[Any]) -> List[Any]: return sorted( normalized_items, key=lambda item: json.dumps( - item, sort_keys=True, separators=(",", ":"), ensure_ascii=True + item, sort_keys=True, separators=(',', ':'), ensure_ascii=True ), ) @@ -58,80 +60,84 @@ def normalized_hash(payload: Dict[str, Any]) -> str: encoded = json.dumps( normalized, sort_keys=True, - separators=(",", ":"), + separators=(',', ':'), ensure_ascii=True, ) - return hashlib.sha256(encoded.encode("utf-8")).hexdigest() + return hashlib.sha256(encoded.encode('utf-8')).hexdigest() def main(argv: List[str] | None = None) -> int: parser = argparse.ArgumentParser( - prog="check_duplicate_entries", - description="Detects duplicate evaluation entries ignoring scrape timestamp fields.", + prog='check_duplicate_entries', + description='Detects duplicate evaluation entries ignoring scrape timestamp fields.', ) parser.add_argument( - "paths", nargs="+", type=str, help="File or folder paths to JSON data" + 'paths', nargs='+', type=str, help='File or folder paths to JSON data' ) args = parser.parse_args(argv) file_paths = expand_paths(args.paths) print() - print(f"Checking {len(file_paths)} JSON files for duplicates...") + print(f'Checking {len(file_paths)} JSON files for duplicates...') print() groups: Dict[str, List[Dict[str, Any]]] = {} for file_path in file_paths: try: - with open(file_path, "r") as f: + with open(file_path, 'r') as f: payload = json.load(f) except json.JSONDecodeError as e: - message = f"JSONDecodeError: {str(e)}" + message = f'JSONDecodeError: {str(e)}' annotate_error( file_path, message, - title="JSONDecodeError", + title='JSONDecodeError', col=e.colno, line=e.lineno, ) - print(f"{file_path}") - print(" " + message) + print(f'{file_path}') + print(' ' + message) print() raise entry_hash = normalized_hash(payload) groups.setdefault(entry_hash, []).append( { - "path": file_path, - "evaluation_id": payload.get("evaluation_id"), - "retrieved_timestamp": payload.get("retrieved_timestamp"), + 'path': file_path, + 'evaluation_id': payload.get('evaluation_id'), + 'retrieved_timestamp': payload.get('retrieved_timestamp'), } ) - duplicate_groups = [entries for entries in groups.values() if len(entries) > 1] + duplicate_groups = [ + entries for entries in groups.values() if len(entries) > 1 + ] if not duplicate_groups: - print("No duplicates found.") + print('No duplicates found.') print() return 0 - ignore_label = ", ".join(f"`{key}`" for key in sorted(IGNORE_KEYS)) - print(f"Found duplicate entries (ignoring keys: {ignore_label}).") + ignore_label = ', '.join(f'`{key}`' for key in sorted(IGNORE_KEYS)) + print(f'Found duplicate entries (ignoring keys: {ignore_label}).') print() for index, entries in enumerate(duplicate_groups, start=1): - print(f"Duplicate group {index} ({len(entries)} files):") + print(f'Duplicate group {index} ({len(entries)} files):') for entry in entries: - print(f" - {entry['path']}") - print(f" evaluation_id: {entry.get('evaluation_id')}") - print(f" retrieved_timestamp: {entry.get('retrieved_timestamp')}") + print(f' - {entry["path"]}') + print(f' evaluation_id: {entry.get("evaluation_id")}') + print( + f' retrieved_timestamp: {entry.get("retrieved_timestamp")}' + ) annotate_error( - entry["path"], - "Duplicate entry detected (ignoring `evaluation_id` and `retrieved_timestamp`).", - title="DuplicateEntry", + entry['path'], + 'Duplicate entry detected (ignoring `evaluation_id` and `retrieved_timestamp`).', + title='DuplicateEntry', ) print() return 1 -if __name__ == "__main__": +if __name__ == '__main__': raise SystemExit(main()) diff --git a/every_eval_ever/cli.py b/every_eval_ever/cli.py index 104e8ec9a..41da376de 100644 --- a/every_eval_ever/cli.py +++ b/every_eval_ever/cli.py @@ -10,44 +10,50 @@ from typing import Any EVALUATOR_RELATIONSHIP_CHOICES = [ - "first_party", - "third_party", - "collaborative", - "other", + 'first_party', + 'third_party', + 'collaborative', + 'other', ] def _common_metadata(args: argparse.Namespace) -> dict[str, Any]: return { - "source_organization_name": args.source_organization_name, - "evaluator_relationship": args.evaluator_relationship, - "source_organization_url": args.source_organization_url, - "source_organization_logo_url": args.source_organization_logo_url, - "eval_library_name": args.eval_library_name, - "eval_library_version": args.eval_library_version, - "parent_eval_output_dir": args.output_dir, + 'source_organization_name': args.source_organization_name, + 'evaluator_relationship': args.evaluator_relationship, + 'source_organization_url': args.source_organization_url, + 'source_organization_logo_url': args.source_organization_logo_url, + 'eval_library_name': args.eval_library_name, + 'eval_library_version': args.eval_library_version, + 'parent_eval_output_dir': args.output_dir, } def _output_dir_for_log(base_output: Path, log: Any) -> Path: - dataset = "unknown" + dataset = 'unknown' if log.evaluation_results and log.evaluation_results[0].source_data: - dataset = log.evaluation_results[0].source_data.dataset_name or "unknown" - model_id = log.model_info.id or "unknown" - parts = model_id.split("/", 1) - developer = parts[0] if len(parts) == 2 else "unknown" + dataset = ( + log.evaluation_results[0].source_data.dataset_name or 'unknown' + ) + model_id = log.model_info.id or 'unknown' + parts = model_id.split('/', 1) + developer = parts[0] if len(parts) == 2 else 'unknown' model_name = parts[1] if len(parts) == 2 else model_id out_dir = base_output / dataset / developer / model_name out_dir.mkdir(parents=True, exist_ok=True) return out_dir -def _write_log(log: Any, base_output: Path, eval_uuid: str | None = None) -> Path: +def _write_log( + log: Any, base_output: Path, eval_uuid: str | None = None +) -> Path: out_dir = _output_dir_for_log(base_output, log) eval_uuid = eval_uuid or str(uuid.uuid4()) - out_file = out_dir / f"{eval_uuid}.json" - with out_file.open("w", encoding="utf-8") as file: - json.dump(log.model_dump(mode="json", exclude_none=True), file, indent=2) + out_file = out_dir / f'{eval_uuid}.json' + with out_file.open('w', encoding='utf-8') as file: + json.dump( + log.model_dump(mode='json', exclude_none=True), file, indent=2 + ) return out_file @@ -61,26 +67,28 @@ def _cmd_convert_lm_eval(args: argparse.Namespace) -> int: adapter = LMEvalAdapter() metadata = _common_metadata(args) if args.inference_engine: - metadata["inference_engine"] = args.inference_engine + metadata['inference_engine'] = args.inference_engine if args.inference_engine_version: - metadata["inference_engine_version"] = args.inference_engine_version + metadata['inference_engine_version'] = args.inference_engine_version log_path = Path(args.log_path) - metadata["parent_eval_output_dir"] = str(log_path.parent if log_path.is_file() else log_path) + metadata['parent_eval_output_dir'] = str( + log_path.parent if log_path.is_file() else log_path + ) if log_path.is_file(): logs = adapter.transform_from_file(log_path, metadata) elif log_path.is_dir(): logs = adapter.transform_from_directory(log_path, metadata) else: - raise FileNotFoundError(f"Path is not a file or directory: {log_path}") + raise FileNotFoundError(f'Path is not a file or directory: {log_path}') output_dir = Path(args.output_dir) for log in logs: eval_uuid = str(uuid.uuid4()) if args.include_samples: meta = adapter.get_eval_metadata(log.evaluation_id) - parent_dir = meta.get("parent_dir") - task_name = meta.get("task_name") + parent_dir = meta.get('parent_dir') + task_name = meta.get('task_name') if parent_dir and task_name: samples_file = find_samples_file(Path(parent_dir), task_name) if samples_file: @@ -95,7 +103,7 @@ def _cmd_convert_lm_eval(args: argparse.Namespace) -> int: log.detailed_evaluation_results = detailed print(_write_log(log, output_dir, eval_uuid=eval_uuid)) - print(f"Converted {len(logs)} evaluation log(s).") + print(f'Converted {len(logs)} evaluation log(s).') return 0 @@ -104,7 +112,7 @@ def _cmd_convert_inspect(args: argparse.Namespace) -> int: adapter = InspectAIAdapter() metadata = _common_metadata(args) - metadata["file_uuid"] = str(uuid.uuid4()) + metadata['file_uuid'] = str(uuid.uuid4()) log_path = Path(args.log_path) if log_path.is_file(): @@ -112,13 +120,13 @@ def _cmd_convert_inspect(args: argparse.Namespace) -> int: elif log_path.is_dir(): logs = adapter.transform_from_directory(log_path, metadata) else: - raise FileNotFoundError(f"Path is not a file or directory: {log_path}") + raise FileNotFoundError(f'Path is not a file or directory: {log_path}') output_dir = Path(args.output_dir) for log in logs: print(_write_log(log, output_dir)) - print(f"Converted {len(logs)} evaluation log(s).") + print(f'Converted {len(logs)} evaluation log(s).') return 0 @@ -127,160 +135,162 @@ def _cmd_convert_helm(args: argparse.Namespace) -> int: adapter = HELMAdapter() metadata = _common_metadata(args) - metadata["file_uuid"] = str(uuid.uuid4()) + metadata['file_uuid'] = str(uuid.uuid4()) logs = adapter.transform_from_directory( Path(args.log_path), - output_path=str(Path(args.output_dir) / "helm_output"), + output_path=str(Path(args.output_dir) / 'helm_output'), metadata_args=metadata, ) output_dir = Path(args.output_dir) for log in logs: print(_write_log(log, output_dir)) - print(f"Converted {len(logs)} evaluation log(s).") + print(f'Converted {len(logs)} evaluation log(s).') return 0 def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( - prog="every_eval_ever", + prog='every_eval_ever', description=( - "CLI for validating and converting evaluation results into the " - "Every Eval Ever schema." + 'CLI for validating and converting evaluation results into the ' + 'Every Eval Ever schema.' ), epilog=( - "Examples:\n" - " every_eval_ever convert lm_eval --log_path results.json --output_dir data\n" - " every_eval_ever convert inspect --log_path inspect_log.json --output_dir data\n" - " every_eval_ever convert helm --log_path helm_run_dir --output_dir data" + 'Examples:\n' + ' every_eval_ever convert lm_eval --log_path results.json --output_dir data\n' + ' every_eval_ever convert inspect --log_path inspect_log.json --output_dir data\n' + ' every_eval_ever convert helm --log_path helm_run_dir --output_dir data' ), formatter_class=argparse.RawDescriptionHelpFormatter, ) - subparsers = parser.add_subparsers(dest="command", required=True) + subparsers = parser.add_subparsers(dest='command', required=True) validate_parser = subparsers.add_parser( - "validate", - help="Validate JSON and JSONL files with Pydantic models", + 'validate', + help='Validate JSON and JSONL files with Pydantic models', description=( - "Validate aggregate .json and instance-level .jsonl files " - "using the bundled Pydantic models." + 'Validate aggregate .json and instance-level .jsonl files ' + 'using the bundled Pydantic models.' ), ) validate_parser.add_argument( - "paths", - nargs="+", - help="One or more files or directories containing .json/.jsonl files.", + 'paths', + nargs='+', + help='One or more files or directories containing .json/.jsonl files.', ) validate_parser.add_argument( - "--max-errors", + '--max-errors', type=int, default=50, - help="Maximum errors to report per JSONL file.", + help='Maximum errors to report per JSONL file.', ) validate_parser.add_argument( - "--format", - choices=["rich", "json", "github"], - default="rich", - dest="output_format", - help="Output format.", + '--format', + choices=['rich', 'json', 'github'], + default='rich', + dest='output_format', + help='Output format.', ) check_duplicates_parser = subparsers.add_parser( - "check-duplicates", - help="Detect duplicate evaluation JSON entries", + 'check-duplicates', + help='Detect duplicate evaluation JSON entries', description=( - "Detect duplicate evaluation entries while ignoring scrape-specific " - "keys (evaluation_id and retrieved_timestamp)." + 'Detect duplicate evaluation entries while ignoring scrape-specific ' + 'keys (evaluation_id and retrieved_timestamp).' ), ) check_duplicates_parser.add_argument( - "paths", - nargs="+", - help="One or more JSON files or directories containing JSON files.", + 'paths', + nargs='+', + help='One or more JSON files or directories containing JSON files.', ) convert_parser = subparsers.add_parser( - "convert", - help="Convert source eval logs to every_eval_ever", - description="Convert outputs from supported eval frameworks into Every Eval Ever JSON.", + 'convert', + help='Convert source eval logs to every_eval_ever', + description='Convert outputs from supported eval frameworks into Every Eval Ever JSON.', + ) + convert_subparsers = convert_parser.add_subparsers( + dest='source', required=True ) - convert_subparsers = convert_parser.add_subparsers(dest="source", required=True) - for source in ["lm_eval", "inspect", "helm"]: + for source in ['lm_eval', 'inspect', 'helm']: source_parser = convert_subparsers.add_parser( source, - help=f"Convert {source} logs", - description=f"Convert {source} evaluation outputs to Every Eval Ever format.", + help=f'Convert {source} logs', + description=f'Convert {source} evaluation outputs to Every Eval Ever format.', ) source_parser.add_argument( - "--log_path", - "--log-path", + '--log_path', + '--log-path', required=True, - help="Path to source log file or directory to convert.", + help='Path to source log file or directory to convert.', ) source_parser.add_argument( - "--output_dir", - "--output-dir", - default="data", - help="Base output directory where converted files are written.", + '--output_dir', + '--output-dir', + default='data', + help='Base output directory where converted files are written.', ) source_parser.add_argument( - "--source_organization_name", - "--source-organization-name", - default="unknown", - help="Organization name for source_metadata.source_organization_name.", + '--source_organization_name', + '--source-organization-name', + default='unknown', + help='Organization name for source_metadata.source_organization_name.', ) source_parser.add_argument( - "--evaluator_relationship", - "--evaluator-relationship", - default="third_party", + '--evaluator_relationship', + '--evaluator-relationship', + default='third_party', choices=EVALUATOR_RELATIONSHIP_CHOICES, - help="Relationship between evaluator and model developer.", + help='Relationship between evaluator and model developer.', ) source_parser.add_argument( - "--source_organization_url", - "--source-organization-url", + '--source_organization_url', + '--source-organization-url', default=None, - help="Optional organization URL for source metadata.", + help='Optional organization URL for source metadata.', ) source_parser.add_argument( - "--source_organization_logo_url", - "--source-organization-logo-url", + '--source_organization_logo_url', + '--source-organization-logo-url', default=None, - help="Optional organization logo URL for source metadata.", + help='Optional organization logo URL for source metadata.', ) source_parser.add_argument( - "--eval_library_name", - "--eval-library-name", + '--eval_library_name', + '--eval-library-name', default=source, - help="Evaluation library name recorded in eval_library.name.", + help='Evaluation library name recorded in eval_library.name.', ) source_parser.add_argument( - "--eval_library_version", - "--eval-library-version", - default="unknown", - help="Evaluation library version recorded in eval_library.version.", + '--eval_library_version', + '--eval-library-version', + default='unknown', + help='Evaluation library version recorded in eval_library.version.', ) - if source == "lm_eval": + if source == 'lm_eval': source_parser.add_argument( - "--include_samples", - "--include-samples", - action="store_true", - help="Also convert lm-eval sample JSONL into instance-level output.", + '--include_samples', + '--include-samples', + action='store_true', + help='Also convert lm-eval sample JSONL into instance-level output.', ) source_parser.add_argument( - "--inference_engine", - "--inference-engine", + '--inference_engine', + '--inference-engine', default=None, - help="Override inferred inference engine (e.g. vllm, transformers).", + help='Override inferred inference engine (e.g. vllm, transformers).', ) source_parser.add_argument( - "--inference_engine_version", - "--inference-engine-version", + '--inference_engine_version', + '--inference-engine-version', default=None, - help="Inference engine version to record in model_info.inference_engine.version.", + help='Inference engine version to record in model_info.inference_engine.version.', ) return parser @@ -290,37 +300,37 @@ def main(argv: list[str] | None = None) -> int: parser = build_parser() args = parser.parse_args(argv) - if args.command == "validate": + if args.command == 'validate': from every_eval_ever.validate import main as validate_main return validate_main( [ - *args.paths, - "--max-errors", + *args.paths, + '--max-errors', str(args.max_errors), - "--format", + '--format', args.output_format, ] ) - if args.command == "check-duplicates": + if args.command == 'check-duplicates': from every_eval_ever.check_duplicate_entries import ( main as check_duplicates_main, ) return check_duplicates_main(args.paths) - if args.command == "convert": - if args.source == "lm_eval": + if args.command == 'convert': + if args.source == 'lm_eval': return _cmd_convert_lm_eval(args) - if args.source == "inspect": + if args.source == 'inspect': return _cmd_convert_inspect(args) - if args.source == "helm": + if args.source == 'helm': return _cmd_convert_helm(args) parser.print_help() return 1 -if __name__ == "__main__": +if __name__ == '__main__': raise SystemExit(main(sys.argv[1:])) diff --git a/every_eval_ever/converters/common/adapter.py b/every_eval_ever/converters/common/adapter.py index 6c7712f09..4598ee6de 100644 --- a/every_eval_ever/converters/common/adapter.py +++ b/every_eval_ever/converters/common/adapter.py @@ -3,91 +3,95 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum -from huggingface_hub import model_info from pathlib import Path -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Dict, List, Union + +from huggingface_hub import model_info -from every_eval_ever.converters.common.error import AdapterError, TransformationError +from every_eval_ever.converters.common.error import ( + AdapterError, + TransformationError, +) from every_eval_ever.eval_types import EvaluationLog + @dataclass class AdapterMetadata: """Metadata about the adapter""" + name: str version: str supported_library_versions: List[str] description: str - + class SupportedLibrary(Enum): """Supported evaluation libraries""" - LM_EVAL = "lm-evaluation-harness" - INSPECT_AI = "inspect-ai" - HELM = "helm" - CUSTOM = "custom" + + LM_EVAL = 'lm-evaluation-harness' + INSPECT_AI = 'inspect-ai' + HELM = 'helm' + CUSTOM = 'custom' class BaseEvaluationAdapter(ABC): """ Base class for all evaluation adapters. - + Each adapter is responsible for transforming evaluation outputs from a specific library into the unified schema format. """ - + def __init__(self, strict_validation: bool = True): """ Initialize the adapter. - + Args: strict_validation: If True, raise errors on validation failures. If False, log warnings and continue. """ self.strict_validation = strict_validation self.logger = logging.getLogger(self.__class__.__name__) - + @property @abstractmethod def metadata(self) -> AdapterMetadata: """Return metadata about this adapter""" pass - + @property @abstractmethod def supported_library(self) -> SupportedLibrary: """Return the library this adapter supports""" pass - + @abstractmethod def _transform_single( self, raw_data: Any, metadata_args: Dict[str, Any] ) -> EvaluationLog: """ Transform a single evaluation record. - + Args: raw_data: Single evaluation record in library-specific format - + Returns: EvaluationLog in unified schema format - + Raises: TransformationError: If transformation fails """ pass - + def transform( self, data: Any, metadata_args: Dict[str, Any] - ) -> Union[ - EvaluationLog, - List[EvaluationLog] - ]: + ) -> Union[EvaluationLog, List[EvaluationLog]]: """ Transform evaluation data to unified schema format. - + Args: data: Raw evaluation output (single record or list) - + Returns: Transformed data in unified schema format """ @@ -100,63 +104,55 @@ def transform( result = self._transform_single(item, metadata_args) results.append(result) except Exception as e: - self._handle_transformation_error(e, f"item {i}") + self._handle_transformation_error(e, f'item {i}') return results else: return self._transform_single(data, metadata_args) - + except Exception as e: - self._handle_transformation_error(e, "data transformation") - + self._handle_transformation_error(e, 'data transformation') + def transform_from_file( self, file_path: Union[str, Path], metadata_args: Dict[str, Any] - ) -> Union[ - EvaluationLog, - List[EvaluationLog] - ]: + ) -> Union[EvaluationLog, List[EvaluationLog]]: """ Load and transform evaluation data from file. - + Args: file_path: Path to the evaluation output file - + Returns: Transformed data in unified schema format """ file_path = Path(file_path) - + if not file_path.exists(): - raise AdapterError(f"File not found: {file_path}") - + raise AdapterError(f'File not found: {file_path}') + try: data = self._load_file(file_path) return self.transform(data, metadata_args) except Exception as e: - raise AdapterError(f"Failed to load file {file_path}: {str(e)}") - + raise AdapterError(f'Failed to load file {file_path}: {str(e)}') + @abstractmethod def transform_from_directory( - self, - dir_path: Union[str, Path], - metadata_args: Dict[str, Any] = None - ) -> Union[ - EvaluationLog, - List[EvaluationLog] - ]: + self, dir_path: Union[str, Path], metadata_args: Dict[str, Any] = None + ) -> Union[EvaluationLog, List[EvaluationLog]]: """ Load and transform evaluation data from all files in a directory. - + Args: dir_path: Path to the directory containing evaluation output files - + Returns: Transformed data in unified schema format """ dir_path = Path(dir_path) - + if not dir_path.is_dir(): - raise AdapterError(f"Path is not a directory: {dir_path}") - + raise AdapterError(f'Path is not a directory: {dir_path}') + # Subclass must implement this part # e.g., how to iterate through files and process them pass @@ -164,10 +160,10 @@ def transform_from_directory( def _load_file(self, file_path: Path) -> Any: """ Load data from file. Override for custom file formats. - + Args: file_path: Path to the file - + Returns: Loaded data """ @@ -183,12 +179,12 @@ def _load_file(self, file_path: Path) -> Any: data.append(json.loads(line)) return data else: - raise AdapterError(f"Unsupported file format: {file_path.suffix}") - + raise AdapterError(f'Unsupported file format: {file_path.suffix}') + def _handle_transformation_error(self, error: Exception, context: str): """Handle transformation errors based on strict_validation setting""" - error_msg = f"Transformation error in {context}: {str(error)}" - + error_msg = f'Transformation error in {context}: {str(error)}' + if self.strict_validation: raise TransformationError(error_msg) from error else: @@ -200,4 +196,4 @@ def _check_if_model_is_on_huggingface(self, model_path): return info except Exception: # self.logger.warning(f"Model '{model_path}' not found on Hugging Face.") - pass \ No newline at end of file + pass diff --git a/every_eval_ever/converters/common/error.py b/every_eval_ever/converters/common/error.py index 7fe041bf7..45f0c5e59 100644 --- a/every_eval_ever/converters/common/error.py +++ b/every_eval_ever/converters/common/error.py @@ -1,7 +1,10 @@ class AdapterError(Exception): """Base exception for adapter errors""" + pass + class TransformationError(AdapterError): """Raised when transformation logic fails""" - pass \ No newline at end of file + + pass diff --git a/every_eval_ever/converters/common/utils.py b/every_eval_ever/converters/common/utils.py index 31bd1a56c..17b98ccd1 100644 --- a/every_eval_ever/converters/common/utils.py +++ b/every_eval_ever/converters/common/utils.py @@ -1,60 +1,61 @@ import hashlib from datetime import datetime -from huggingface_hub import HfApi from typing import Dict +from huggingface_hub import HfApi + + def convert_timestamp_to_unix_format(timestamp: str) -> str: dt = datetime.fromisoformat(timestamp) return str(dt.timestamp()) + def get_current_unix_timestamp() -> str: return str(datetime.now().timestamp()) + def get_model_organization_info(model_base_name: str) -> Dict: """ - Searches the Hugging Face Hub for a model based on its base name + Searches the Hugging Face Hub for a model based on its base name and attempts to find the organization that published the most relevant/original version. Args: model_base_name: The model name without an organization (e.g., 'deepseek-coder-6.7b-base'). Returns: - A dictionary containing the best-guess organization and full repository ID, + A dictionary containing the best-guess organization and full repository ID, or an error message. """ - + api = HfApi() - + try: models = api.list_models( - search=model_base_name, - sort="downloads", - direction=-1, - limit=50 + search=model_base_name, sort='downloads', direction=-1, limit=50 ) models_list = list(models) except Exception as e: - return f"Failed to connect to Hugging Face Hub: {e}" + return f'Failed to connect to Hugging Face Hub: {e}' if not models_list: return 'not_found' # Heuristic to find the 'Original' Organization: - # The original model is usually the one with the shortest repo_id + # The original model is usually the one with the shortest repo_id # that includes the base model name (e.g., 'deepseek-ai/deepseek-coder-6.7b-base'). # We also prioritize the one with the highest downloads. - - best_match = models_list[0] # Start with the most downloaded model + + best_match = models_list[0] # Start with the most downloaded model for model in models_list: repo_id = model.modelId - + parts = repo_id.split('/') if len(parts) != 2: - continue - + continue + org, name = parts - + # A good heuristic: the model name part (name) should exactly match the base name, # or be a very close variant (e.g., -instruct) with the highest download count. if model_base_name in name and name == model_base_name: @@ -66,12 +67,14 @@ def get_model_organization_info(model_base_name: str) -> Dict: return organization + def sha256_file(path, chunk_size=8192): sha256 = hashlib.sha256() - with open(path, "rb") as f: - for chunk in iter(lambda: f.read(chunk_size), b""): + with open(path, 'rb') as f: + for chunk in iter(lambda: f.read(chunk_size), b''): sha256.update(chunk) return sha256.hexdigest() + def sha256_string(text: str, chunk_size=8192): - return hashlib.sha256(text.encode('utf-8')).hexdigest() \ No newline at end of file + return hashlib.sha256(text.encode('utf-8')).hexdigest() diff --git a/every_eval_ever/converters/helm/__main__.py b/every_eval_ever/converters/helm/__main__.py index d09707a2f..33129c341 100644 --- a/every_eval_ever/converters/helm/__main__.py +++ b/every_eval_ever/converters/helm/__main__.py @@ -1,7 +1,8 @@ from __future__ import annotations -from argparse import ArgumentParser -import uuid + import json +import uuid +from argparse import ArgumentParser from enum import Enum from pathlib import Path from typing import Any, Dict, List, Union @@ -11,23 +12,50 @@ except ImportError as exc: raise SystemExit( "The 'crfm-helm' package is required to use the HELM converter.\n" - "Install it with: uv sync --extra helm" + 'Install it with: uv sync --extra helm' ) from exc -from every_eval_ever.eval_types import EvaluatorRelationship, EvaluationLog +from every_eval_ever.eval_types import EvaluationLog, EvaluatorRelationship + def parse_args(): parser = ArgumentParser() - parser.add_argument('--log_path', type=str, default='tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', help="Path to directory with single evaluaion or multiple evaluations to convert") + parser.add_argument( + '--log_path', + type=str, + default='tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + help='Path to directory with single evaluaion or multiple evaluations to convert', + ) parser.add_argument('--output_dir', type=str, default='data') - parser.add_argument('--source_organization_name', type=str, help='Orgnization which pushed evaluation.') - parser.add_argument('--evaluator_relationship', type=str, default='other', help='Relationship of evaluation author to the model', choices=['first_party', 'third_party', 'collaborative', 'other']) + parser.add_argument( + '--source_organization_name', + type=str, + help='Orgnization which pushed evaluation.', + ) + parser.add_argument( + '--evaluator_relationship', + type=str, + default='other', + help='Relationship of evaluation author to the model', + choices=['first_party', 'third_party', 'collaborative', 'other'], + ) parser.add_argument('--source_organization_url', type=str, default=None) - parser.add_argument('--source_organization_logo_url', type=str, default=None) - parser.add_argument('--eval_library_name', type=str, default='helm', help='Name of the evaluation library (e.g. inspect_ai, lm_eval, helm)') - parser.add_argument('--eval_library_version', type=str, default='unknown', help='Version of the evaluation library') - + parser.add_argument( + '--source_organization_logo_url', type=str, default=None + ) + parser.add_argument( + '--eval_library_name', + type=str, + default='helm', + help='Name of the evaluation library (e.g. inspect_ai, lm_eval, helm)', + ) + parser.add_argument( + '--eval_library_version', + type=str, + default='unknown', + help='Version of the evaluation library', + ) args = parser.parse_args() return args @@ -39,22 +67,36 @@ def default(self, obj): return obj.value return super().default(obj) + class HELMEvalLogConverter: - def __init__(self, log_path: str | Path, output_dir: str = 'unified_schema/helm'): - ''' + def __init__( + self, log_path: str | Path, output_dir: str = 'unified_schema/helm' + ): + """ HELM generates log file for an evaluation. - ''' + """ self.log_path = Path(log_path) - + self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) - def convert_to_unified_schema(self, metadata_args: Dict[str, Any] = None) -> Union[EvaluationLog, List[EvaluationLog]]: - return HELMAdapter().transform_from_directory(self.log_path, self.output_dir, metadata_args=metadata_args) + def convert_to_unified_schema( + self, metadata_args: Dict[str, Any] = None + ) -> Union[EvaluationLog, List[EvaluationLog]]: + return HELMAdapter().transform_from_directory( + self.log_path, self.output_dir, metadata_args=metadata_args + ) - def save_to_file(self, unified_eval_log: EvaluationLog, output_filedir: str, output_filepath: str) -> bool: + def save_to_file( + self, + unified_eval_log: EvaluationLog, + output_filedir: str, + output_filepath: str, + ) -> bool: try: - json_str = unified_eval_log.model_dump_json(indent=4, exclude_none=True) + json_str = unified_eval_log.model_dump_json( + indent=4, exclude_none=True + ) unified_eval_log_dir = Path(f'{self.output_dir}/{output_filedir}') unified_eval_log_dir.mkdir(parents=True, exist_ok=True) @@ -63,15 +105,18 @@ def save_to_file(self, unified_eval_log: EvaluationLog, output_filedir: str, out with open(unified_eval_path, 'w') as json_file: json_file.write(json_str) - print(f'Unified eval log was successfully saved to {output_filepath} file.') + print( + f'Unified eval log was successfully saved to {output_filepath} file.' + ) except Exception as e: - print(f"Problem with saving unified eval log to file: {e}") + print(f'Problem with saving unified eval log to file: {e}') raise e + def save_evaluation_log( unified_output: EvaluationLog, helm_converter: HELMEvalLogConverter, - file_uuid: str + file_uuid: str, ) -> bool: try: model_developer, model_name = unified_output.model_info.id.split('/') @@ -80,7 +125,9 @@ def save_evaluation_log( helm_converter.save_to_file(unified_output, filedir, filename) return True except Exception as e: - print(f'Failed to save eval log {unified_output.evaluation_id} to file.\n{str(e)}') + print( + f'Failed to save eval log {unified_output.evaluation_id} to file.\n{str(e)}' + ) return False @@ -88,17 +135,18 @@ def save_evaluation_log( args = parse_args() helm_converter = HELMEvalLogConverter( - log_path=args.log_path, - output_dir=args.output_dir + log_path=args.log_path, output_dir=args.output_dir ) - + file_uuid = str(uuid.uuid4()) metadata_args = { 'source_organization_name': args.source_organization_name, 'source_organization_url': args.source_organization_url, 'source_organization_logo_url': args.source_organization_logo_url, - 'evaluator_relationship': EvaluatorRelationship(args.evaluator_relationship), + 'evaluator_relationship': EvaluatorRelationship( + args.evaluator_relationship + ), 'file_uuid': file_uuid, 'parent_eval_output_dir': args.output_dir, 'eval_library_name': args.eval_library_name, @@ -108,18 +156,12 @@ def save_evaluation_log( unified_output = helm_converter.convert_to_unified_schema(metadata_args) if unified_output and isinstance(unified_output, EvaluationLog): - save_evaluation_log( - unified_output, - helm_converter, - file_uuid - ) + save_evaluation_log(unified_output, helm_converter, file_uuid) elif unified_output and isinstance(unified_output, List): for single_unified_output in unified_output: save_evaluation_log( - single_unified_output, - helm_converter, - file_uuid + single_unified_output, helm_converter, file_uuid ) else: - print("Missing unified schema result!") + print('Missing unified schema result!') diff --git a/every_eval_ever/converters/helm/adapter.py b/every_eval_ever/converters/helm/adapter.py index 25f641dff..4367462b6 100644 --- a/every_eval_ever/converters/helm/adapter.py +++ b/every_eval_ever/converters/helm/adapter.py @@ -1,65 +1,73 @@ +import datetime import json import os -import datetime -from typing import Any, Dict, List, Tuple from pathlib import Path +from typing import Any, Dict, List, Tuple _HELM_IMPORT_ERROR: Exception | None = None try: from dacite import from_dict - from helm.benchmark.metrics.metric import PerInstanceStats from helm.benchmark.adaptation.scenario_state import ( AdapterSpec, RequestState, ScenarioState, ) + from helm.benchmark.config_registry import ( + register_builtin_configs_from_helm_package, + ) + from helm.benchmark.metrics.metric import PerInstanceStats from helm.benchmark.metrics.statistic import Stat - from helm.benchmark.config_registry import register_builtin_configs_from_helm_package from helm.benchmark.model_deployment_registry import get_model_deployment from helm.benchmark.run_spec import RunSpec from helm.common.codec import from_json -except Exception as ex: # pragma: no cover - exercised only when optional deps missing +except ( + Exception +) as ex: # pragma: no cover - exercised only when optional deps missing _HELM_IMPORT_ERROR = ex from_dict = None # type: ignore[assignment] - PerInstanceStats = AdapterSpec = RequestState = ScenarioState = Stat = RunSpec = Any # type: ignore[assignment] - get_model_deployment = register_builtin_configs_from_helm_package = from_json = None # type: ignore[assignment] + PerInstanceStats = AdapterSpec = RequestState = ScenarioState = Stat = ( + RunSpec + ) = Any # type: ignore[assignment] + get_model_deployment = register_builtin_configs_from_helm_package = ( + from_json + ) = None # type: ignore[assignment] +from every_eval_ever.converters import SCHEMA_VERSION +from every_eval_ever.converters.common.adapter import ( + AdapterMetadata, + BaseEvaluationAdapter, + SupportedLibrary, +) +from every_eval_ever.converters.common.utils import sha256_file +from every_eval_ever.converters.helm.instance_level_adapter import ( + HELMInstanceLevelDataAdapter, +) +from every_eval_ever.converters.helm.utils import extract_reasoning from every_eval_ever.eval_types import ( DetailedEvaluationResults, EvalLibrary, EvaluationLog, EvaluationResult, + Format, + GenerationArgs, + GenerationConfig, + HashAlgorithm, MetricConfig, ModelInfo, - ScoreType, ScoreDetails, + ScoreType, + SourceDataHf, SourceMetadata, SourceType, - SourceDataHf, - GenerationConfig, - GenerationArgs, - Format, - HashAlgorithm, - Uncertainty + Uncertainty, ) - -from every_eval_ever.instance_level_types import ( - InstanceLevelEvaluationLog -) - -from every_eval_ever.converters.common.adapter import AdapterMetadata, BaseEvaluationAdapter, SupportedLibrary -from every_eval_ever.converters.common.utils import sha256_file -from every_eval_ever.converters.helm.utils import extract_reasoning -from every_eval_ever.converters.helm.instance_level_adapter import ( - HELMInstanceLevelDataAdapter -) -from every_eval_ever.converters import SCHEMA_VERSION +from every_eval_ever.instance_level_types import InstanceLevelEvaluationLog def _require_helm_dependencies() -> None: if _HELM_IMPORT_ERROR is not None: raise ImportError( - "HELM converter dependencies are missing. " + 'HELM converter dependencies are missing. ' "Install with: pip install 'every_eval_ever[helm]'" ) from _HELM_IMPORT_ERROR @@ -73,19 +81,25 @@ class HELMAdapter(BaseEvaluationAdapter): Adapter for HELM outputs that dynamically extracts all metrics and consolidates instance-level logs into a single JSONL file. """ + SCENARIO_STATE_FILE = 'scenario_state.json' RUN_SPEC_FILE = 'run_spec.json' SCENARIO_FILE = 'scenario.json' STATS_FILE = 'stats.json' PER_INSTANCE_STATS_FILE = 'per_instance_stats.json' - REQUIRED_LOG_FILES = [SCENARIO_STATE_FILE, RUN_SPEC_FILE, SCENARIO_FILE, PER_INSTANCE_STATS_FILE] + REQUIRED_LOG_FILES = [ + SCENARIO_STATE_FILE, + RUN_SPEC_FILE, + SCENARIO_FILE, + PER_INSTANCE_STATS_FILE, + ] @property def metadata(self) -> AdapterMetadata: return AdapterMetadata( - name="HELMAdapter", - version="0.0.1", - description="HELM adapter with dynamic metrics and unified JSONL instance logging" + name='HELMAdapter', + version='0.0.1', + description='HELM adapter with dynamic metrics and unified JSONL instance logging', ) @property @@ -95,54 +109,63 @@ def supported_library(self) -> SupportedLibrary: def _directory_contains_required_files(self, dir_path): if os.path.isdir(dir_path): files = os.listdir(dir_path) - return all(required_file in files for required_file in self.REQUIRED_LOG_FILES) - + return all( + required_file in files + for required_file in self.REQUIRED_LOG_FILES + ) + return False - + def _extract_model_info(self, model_deployment_name: str) -> ModelInfo: """Extracts model metadata from the HELM deployment registry.""" deployment = get_model_deployment(model_deployment_name) - client_args = getattr(deployment.client_spec, "args", None) + client_args = getattr(deployment.client_spec, 'args', None) - if "huggingface" in deployment.name or not client_args: - model_id = deployment.model_name + if 'huggingface' in deployment.name or not client_args: + model_id = deployment.model_name else: - model_id = client_args.get("pretrained_model_name_or_path", deployment.model_name) + model_id = client_args.get( + 'pretrained_model_name_or_path', deployment.model_name + ) return ModelInfo( name=deployment.model_name, id=model_id, - developer=deployment.model_name.split("/", 1)[0], - inference_platform=deployment.name.split("/", 1)[0] + developer=deployment.model_name.split('/', 1)[0], + inference_platform=deployment.name.split('/', 1)[0], ) - + def _load_file_if_exists(self, dir_path, file_name) -> Any: path = Path(f'{dir_path}/{file_name}') if path.exists(): return self._load_file(path) - + return None def _load_evaluation_run_logfiles(self, dir_path) -> Dict: - scenario_state_dict = self._load_file_if_exists(dir_path, self.SCENARIO_STATE_FILE) + scenario_state_dict = self._load_file_if_exists( + dir_path, self.SCENARIO_STATE_FILE + ) run_spec_dict = self._load_file_if_exists(dir_path, self.RUN_SPEC_FILE) scenario_dict = self._load_file_if_exists(dir_path, self.SCENARIO_FILE) stats = self._load_file_if_exists(dir_path, self.STATS_FILE) - - with open(f'{dir_path}/{self.PER_INSTANCE_STATS_FILE}', "r") as f: + + with open(f'{dir_path}/{self.PER_INSTANCE_STATS_FILE}', 'r') as f: per_instance_stats = from_json(f.read(), List[PerInstanceStats]) - + return { - 'per_instance_stats': per_instance_stats, - 'run_spec_dict': run_spec_dict, - 'scenario_dict': scenario_dict, - 'scenario_state_dict': scenario_state_dict, - 'stats': stats - } - - def transform_from_directory(self, dir_path: str, output_path: str, metadata_args: Dict[str, Any]): + 'per_instance_stats': per_instance_stats, + 'run_spec_dict': run_spec_dict, + 'scenario_dict': scenario_dict, + 'scenario_state_dict': scenario_state_dict, + 'stats': stats, + } + + def transform_from_directory( + self, dir_path: str, output_path: str, metadata_args: Dict[str, Any] + ): """ - Transforms HELM results into one aggregate EvaluationLog and one + Transforms HELM results into one aggregate EvaluationLog and one instance-level JSONL file containing all samples. """ # all_instance_logs: List[InstanceLevelEvaluationLog] = [] @@ -154,7 +177,9 @@ def transform_from_directory(self, dir_path: str, output_path: str, metadata_arg aggregate_logs.append(agg) else: for entry in os.scandir(dir_path): - if entry.is_dir() and self._directory_contains_required_files(entry.path): + if entry.is_dir() and self._directory_contains_required_files( + entry.path + ): data = self._load_evaluation_run_logfiles(entry.path) agg = self._transform_single(data, metadata_args) aggregate_logs.append(agg) @@ -163,26 +188,31 @@ def transform_from_directory(self, dir_path: str, output_path: str, metadata_arg # with open(output_path, 'w', encoding='utf-8') as f: # for log in all_instance_logs: # f.write(json.dumps(log.model_dump(), ensure_ascii=False) + '\n') - - return aggregate_logs + return aggregate_logs def _extract_generation_args( - self, - adapter_spec: AdapterSpec, - request_state: RequestState + self, adapter_spec: AdapterSpec, request_state: RequestState ) -> GenerationArgs: """ Extracts generation arguments from HELM objects. - + Args: adapter_spec: The global adapter specification from run_spec.json. request: The specific request object from scenario_state.json (optional). """ - temperature = request_state.request.temperature or getattr(adapter_spec, 'temperature', None) - max_tokens = request_state.request.max_tokens or getattr(adapter_spec, 'max_tokens', None) - top_p = request_state.request.top_p or getattr(adapter_spec, 'top_p', None) - top_k = request_state.request.top_k_per_token or getattr(adapter_spec, 'top_k_per_token', None) + temperature = request_state.request.temperature or getattr( + adapter_spec, 'temperature', None + ) + max_tokens = request_state.request.max_tokens or getattr( + adapter_spec, 'max_tokens', None + ) + top_p = request_state.request.top_p or getattr( + adapter_spec, 'top_p', None + ) + top_k = request_state.request.top_k_per_token or getattr( + adapter_spec, 'top_k_per_token', None + ) is_reasoning = extract_reasoning(request_state) is not None @@ -191,23 +221,25 @@ def _extract_generation_args( top_p=top_p, top_k=top_k, max_tokens=max_tokens, - reasoning=is_reasoning + reasoning=is_reasoning, ) - - def _extract_evaluation_time(self, request_states: List[RequestState]) -> str | None: + def _extract_evaluation_time( + self, request_states: List[RequestState] + ) -> str | None: request_datetimes = [ state.result.request_datetime for state in request_states if state.result and state.result.request_datetime ] return str(min(request_datetimes)) if request_datetimes else None - - def _extract_dataset_name(self, run_spec_name: str, scenario_name: str | None) -> str: + def _extract_dataset_name( + self, run_spec_name: str, scenario_name: str | None + ) -> str: if scenario_name: return scenario_name - + if 'dataset' in run_spec_name: eval_metadata = run_spec_name.split(':', 1) if len(eval_metadata) > 1: @@ -219,7 +251,6 @@ def _extract_dataset_name(self, run_spec_name: str, scenario_name: str | None) - return run_spec_name.split(':')[0] - def _extract_metric_names(self, run_spec: RunSpec) -> List[str]: metric_names = [] for metric_spec in run_spec.metric_specs: @@ -232,41 +263,49 @@ def _extract_metric_names(self, run_spec: RunSpec) -> List[str]: return metric_names def _transform_single( - self, - raw_data: Dict, - metadata_args: Dict[str, Any] + self, raw_data: Dict, metadata_args: Dict[str, Any] ) -> Tuple[EvaluationLog, List[InstanceLevelEvaluationLog]]: run_spec = from_dict(data_class=RunSpec, data=raw_data['run_spec_dict']) - scenario_state = from_dict(data_class=ScenarioState, data=raw_data['scenario_state_dict']) + scenario_state = from_dict( + data_class=ScenarioState, data=raw_data['scenario_state_dict'] + ) scenario_dict = raw_data['scenario_dict'] - stats_raw = [from_dict(data_class=Stat, data=s) for s in (raw_data.get('stats') or [])] + stats_raw = [ + from_dict(data_class=Stat, data=s) + for s in (raw_data.get('stats') or []) + ] per_instance_stats_list = raw_data['per_instance_stats'] or [] - + adapter_spec = run_spec.adapter_spec request_states = scenario_state.request_states - - retrieved_timestamp=str(int(datetime.datetime.now().timestamp())) - evaluation_timestamp = self._extract_evaluation_time(request_states) or retrieved_timestamp - + + retrieved_timestamp = str(int(datetime.datetime.now().timestamp())) + evaluation_timestamp = ( + self._extract_evaluation_time(request_states) or retrieved_timestamp + ) + model_info = self._extract_model_info(adapter_spec.model_deployment) dataset_name = self._extract_dataset_name( - run_spec.name, - scenario_dict.get('name') if scenario_dict else None + run_spec.name, scenario_dict.get('name') if scenario_dict else None ) - - source_data = SourceDataHf( # TODO check if always available HF dataset + + source_data = SourceDataHf( # TODO check if always available HF dataset dataset_name=dataset_name, - source_type="hf_dataset", - samples_number=len(set(state.instance.id for state in request_states)), + source_type='hf_dataset', + samples_number=len( + set(state.instance.id for state in request_states) + ), sample_ids=[str(state.instance.id) for state in request_states], additional_details={ - "scenario_name": str(run_spec.scenario_spec.class_name), - "scenario_args": json.dumps(run_spec.scenario_spec.args) if run_spec.scenario_spec.args else "" - } + 'scenario_name': str(run_spec.scenario_spec.class_name), + 'scenario_args': json.dumps(run_spec.scenario_spec.args) + if run_spec.scenario_spec.args + else '', + }, ) - evaluation_id = f"{source_data.dataset_name}/{model_info.id.replace('/', '_')}/{evaluation_timestamp}" + evaluation_id = f'{source_data.dataset_name}/{model_info.id.replace("/", "_")}/{evaluation_timestamp}' metric_names = self._extract_metric_names(run_spec) @@ -275,13 +314,17 @@ def _transform_single( for metric_name in set(metric_names): metric_config = MetricConfig( evaluation_description=metric_name, - lower_is_better=False, # TODO schema.json check + lower_is_better=False, # TODO schema.json check score_type=ScoreType.continuous, min_score=0, - max_score=1 + max_score=1, ) - - matching_stats = [s for s in stats_raw if s.name.name == metric_name and not s.name.perturbation] + + matching_stats = [ + s + for s in stats_raw + if s.name.name == metric_name and not s.name.perturbation + ] for stat in matching_stats: evaluation_name = ( @@ -297,45 +340,66 @@ def _transform_single( evaluation_timestamp=evaluation_timestamp, metric_config=metric_config, score_details=ScoreDetails( - score=stat.mean or (stat.sum / stat.count if stat.count else 0.0), + score=stat.mean + or (stat.sum / stat.count if stat.count else 0.0), uncertainty=Uncertainty( standard_deviation=stat.stddev, - num_samples=adapter_spec.max_eval_instances or len(request_states) + num_samples=adapter_spec.max_eval_instances + or len(request_states), ), details={ - "count": str(stat.count), - "split": str(stat.name.split) if stat.name.split else "", - "perturbation": str(stat.name.perturbation) if stat.name.perturbation else "" - } + 'count': str(stat.count), + 'split': str(stat.name.split) + if stat.name.split + else '', + 'perturbation': str(stat.name.perturbation) + if stat.name.perturbation + else '', + }, ), generation_config=GenerationConfig( - generation_args=self._extract_generation_args(adapter_spec=adapter_spec, request_state=request_states[0]), + generation_args=self._extract_generation_args( + adapter_spec=adapter_spec, + request_state=request_states[0], + ), additional_details={ - "stop_sequences": json.dumps(request_states[0].request.stop_sequences) if request_states[0].request.stop_sequences else "[]", - "presence_penalty": str(request_states[0].request.presence_penalty), - "frequency_penalty": str(request_states[0].request.frequency_penalty), - "num_completions": str(request_states[0].request.num_completions) - } - ) + 'stop_sequences': json.dumps( + request_states[0].request.stop_sequences + ) + if request_states[0].request.stop_sequences + else '[]', + 'presence_penalty': str( + request_states[0].request.presence_penalty + ), + 'frequency_penalty': str( + request_states[0].request.frequency_penalty + ), + 'num_completions': str( + request_states[0].request.num_completions + ), + }, + ), ) ) if request_states: parent_eval_output_dir = metadata_args.get('parent_eval_output_dir') - detailed_results_id = f"{metadata_args.get('file_uuid')}_samples" + detailed_results_id = f'{metadata_args.get("file_uuid")}_samples' model_dev, model_name = model_info.id.split('/', 1) evaluation_dir = f'{parent_eval_output_dir}/{source_data.dataset_name}/{model_dev}/{model_name}' - instance_level_log_path, instance_level_rows_number = HELMInstanceLevelDataAdapter( - detailed_results_id, - Format.jsonl.value, - HashAlgorithm.sha256.value, - evaluation_dir - ).convert_instance_level_logs( - dataset_name, - model_info.id, - request_states, - per_instance_stats_list + instance_level_log_path, instance_level_rows_number = ( + HELMInstanceLevelDataAdapter( + detailed_results_id, + Format.jsonl.value, + HashAlgorithm.sha256.value, + evaluation_dir, + ).convert_instance_level_logs( + dataset_name, + model_info.id, + request_states, + per_instance_stats_list, + ) ) detailed_evaluation_results = DetailedEvaluationResults( @@ -343,7 +407,7 @@ def _transform_single( file_path=instance_level_log_path, hash_algorithm=HashAlgorithm.sha256, checksum=sha256_file(instance_level_log_path), - total_rows=instance_level_rows_number + total_rows=instance_level_rows_number, ) else: detailed_evaluation_results = None @@ -356,21 +420,32 @@ def _transform_single( source_metadata=SourceMetadata( source_name='HELM', source_type=SourceType.evaluation_run, - source_organization_name=metadata_args.get('source_organization_name') or 'Stanford CRFM', - source_organization_url=metadata_args.get('source_organization_url'), - source_organization_logo_url=metadata_args.get('source_organization_logo_url'), - evaluator_relationship=metadata_args.get('evaluator_relationship') or 'third_party', + source_organization_name=metadata_args.get( + 'source_organization_name' + ) + or 'Stanford CRFM', + source_organization_url=metadata_args.get( + 'source_organization_url' + ), + source_organization_logo_url=metadata_args.get( + 'source_organization_logo_url' + ), + evaluator_relationship=metadata_args.get( + 'evaluator_relationship' + ) + or 'third_party', ), eval_library=EvalLibrary( - name=metadata_args.get("eval_library_name", "helm"), - version=metadata_args.get("eval_library_version", "unknown"), + name=metadata_args.get('eval_library_name', 'helm'), + version=metadata_args.get('eval_library_version', 'unknown'), ), model_info=model_info, evaluation_results=evaluation_results, - detailed_evaluation_results=detailed_evaluation_results + detailed_evaluation_results=detailed_evaluation_results, ) return eval_log + def __init__(self, strict_validation: bool = True): _require_helm_dependencies() super().__init__(strict_validation) diff --git a/every_eval_ever/converters/helm/instance_level_adapter.py b/every_eval_ever/converters/helm/instance_level_adapter.py index e1c94bd36..cb2c07193 100644 --- a/every_eval_ever/converters/helm/instance_level_adapter.py +++ b/every_eval_ever/converters/helm/instance_level_adapter.py @@ -1,12 +1,13 @@ import json - from pathlib import Path from typing import Any, List, Tuple _HELM_IMPORT_ERROR: Exception | None = None try: from helm.benchmark.adaptation.scenario_state import RequestState -except Exception as ex: # pragma: no cover - exercised only when optional deps missing +except ( + Exception +) as ex: # pragma: no cover - exercised only when optional deps missing _HELM_IMPORT_ERROR = ex RequestState = Any # type: ignore[assignment] @@ -14,33 +15,33 @@ def _require_helm_dependencies() -> None: if _HELM_IMPORT_ERROR is not None: raise ImportError( - "HELM converter dependencies are missing. " + 'HELM converter dependencies are missing. ' "Install with: pip install 'every_eval_ever[helm]'" ) from _HELM_IMPORT_ERROR + +from every_eval_ever.converters import SCHEMA_VERSION +from every_eval_ever.converters.common.utils import sha256_string +from every_eval_ever.converters.helm.utils import extract_all_reasonings from every_eval_ever.instance_level_types import ( AnswerAttributionItem, Evaluation, Input, InstanceLevelEvaluationLog, InteractionType, - Performance, Output, + Performance, TokenUsage, ) -from every_eval_ever.converters import SCHEMA_VERSION -from every_eval_ever.converters.common.utils import sha256_string -from every_eval_ever.converters.helm.utils import extract_all_reasonings - class HELMInstanceLevelDataAdapter: def __init__( - self, - evaulation_id: str, - format: str, - hash_algorithm: str, - evaluation_dir: str + self, + evaulation_id: str, + format: str, + hash_algorithm: str, + evaluation_dir: str, ): _require_helm_dependencies() self.evaluation_id = evaulation_id @@ -49,38 +50,47 @@ def __init__( self.evaluation_dir = evaluation_dir self.path = f'{evaluation_dir}/{evaulation_id}.{format}' - def _save_json( - self, - items: List[InstanceLevelEvaluationLog] - ): + def _save_json(self, items: List[InstanceLevelEvaluationLog]): eval_dir_path = Path(self.evaluation_dir) eval_dir_path.mkdir(parents=True, exist_ok=True) path = Path(self.path) - with path.open("w", encoding="utf-8") as f: + with path.open('w', encoding='utf-8') as f: for item in items: json_line = json.dumps( - item.model_dump(mode="json"), - ensure_ascii=False + item.model_dump(mode='json'), ensure_ascii=False ) - f.write(json_line + "\n") - - print(f'Instance-level eval log was successfully saved to {self.path} path.') + f.write(json_line + '\n') + + print( + f'Instance-level eval log was successfully saved to {self.path} path.' + ) def convert_instance_level_logs( - self, + self, evaluation_name: str, model_id: str, request_states: List[RequestState], - per_instance_stats_list: List + per_instance_stats_list: List, ) -> Tuple[str, int]: instance_level_logs: List[InstanceLevelEvaluationLog] = [] for state in request_states: - inst_stats = next((s for s in per_instance_stats_list if s.instance_id == state.instance.id), None) - - correct_refs = [r.output.text for r in state.instance.references if "correct" in r.tags] + inst_stats = next( + ( + s + for s in per_instance_stats_list + if s.instance_id == state.instance.id + ), + None, + ) + + correct_refs = [ + r.output.text + for r in state.instance.references + if 'correct' in r.tags + ] completions = ( - [c.text for c in state.result.completions] + [c.text for c in state.result.completions] if state.result and state.result.completions else [] ) @@ -91,65 +101,110 @@ def convert_instance_level_logs( is_correct = False score = 0.0 if inst_stats: - em_stat = next((s for s in inst_stats.stats if s.name.name == "exact_match"), None) + em_stat = next( + ( + s + for s in inst_stats.stats + if s.name.name == 'exact_match' + ), + None, + ) if em_stat: score = em_stat.mean is_correct = em_stat.mean > 0 - else: # TODO check for more specific tasks - correct_completions = sum(1 for c in completions if c.strip() in correct_refs) + else: # TODO check for more specific tasks + correct_completions = sum( + 1 for c in completions if c.strip() in correct_refs + ) score = correct_completions / len(completions) is_correct = score > 0 - - + token_usage = None if inst_stats: - p_tokens = next((s.sum for s in inst_stats.stats if s.name.name == "num_prompt_tokens"), 0) - c_tokens = next((s.sum for s in inst_stats.stats if s.name.name == "num_completion_tokens"), 0) - o_tokens = next((s.sum for s in inst_stats.stats if s.name.name == "num_output_tokens"), 0) + p_tokens = next( + ( + s.sum + for s in inst_stats.stats + if s.name.name == 'num_prompt_tokens' + ), + 0, + ) + c_tokens = next( + ( + s.sum + for s in inst_stats.stats + if s.name.name == 'num_completion_tokens' + ), + 0, + ) + o_tokens = next( + ( + s.sum + for s in inst_stats.stats + if s.name.name == 'num_output_tokens' + ), + 0, + ) cot_tokens = int(c_tokens) - int(o_tokens) - + token_usage = TokenUsage( input_tokens=int(p_tokens), output_tokens=int(o_tokens), reasoning_tokens=cot_tokens if cot_tokens else None, - total_tokens=int(p_tokens + c_tokens) + total_tokens=int(p_tokens + c_tokens), ) - instance_level_logs.append(InstanceLevelEvaluationLog( - schema_version=SCHEMA_VERSION, - evaluation_id=self.evaluation_id, - model_id=model_id, - evaluation_name=evaluation_name, - sample_id=str(state.instance.id), - sample_hash=sha256_string(state.request.prompt + correct_refs[0]), # TODO use all references - interaction_type=InteractionType.single_turn, - input=Input( - raw=state.request.prompt, - reference=correct_refs if correct_refs else [], - choices=( - list(state.output_mapping.values()) - if state.output_mapping - else [ref.output.text for ref in state.instance.references] - ) - ), - output=Output( - raw=completions, - reasoning_trace=reasoning_traces - ), - answer_attribution=[AnswerAttributionItem( - turn_idx=0, - source="output.raw", - extracted_value=state.result.completions[0].text.strip() if state.result and state.result.completions else "", - extraction_method="exact_match", - is_terminal=True - )], - evaluation=Evaluation(score=float(score), is_correct=is_correct), - token_usage=token_usage, - performance=Performance( - generation_time_ms=state.result.request_time * 1000 if state.result.request_time else None + instance_level_logs.append( + InstanceLevelEvaluationLog( + schema_version=SCHEMA_VERSION, + evaluation_id=self.evaluation_id, + model_id=model_id, + evaluation_name=evaluation_name, + sample_id=str(state.instance.id), + sample_hash=sha256_string( + state.request.prompt + correct_refs[0] + ), # TODO use all references + interaction_type=InteractionType.single_turn, + input=Input( + raw=state.request.prompt, + reference=correct_refs if correct_refs else [], + choices=( + list(state.output_mapping.values()) + if state.output_mapping + else [ + ref.output.text + for ref in state.instance.references + ] + ), + ), + output=Output( + raw=completions, reasoning_trace=reasoning_traces + ), + answer_attribution=[ + AnswerAttributionItem( + turn_idx=0, + source='output.raw', + extracted_value=state.result.completions[ + 0 + ].text.strip() + if state.result and state.result.completions + else '', + extraction_method='exact_match', + is_terminal=True, + ) + ], + evaluation=Evaluation( + score=float(score), is_correct=is_correct + ), + token_usage=token_usage, + performance=Performance( + generation_time_ms=state.result.request_time * 1000 + if state.result.request_time + else None + ), ) - )) + ) self._save_json(instance_level_logs) return self.path, len(instance_level_logs) diff --git a/every_eval_ever/converters/helm/utils.py b/every_eval_ever/converters/helm/utils.py index f2846e2a4..9e15f1ff0 100644 --- a/every_eval_ever/converters/helm/utils.py +++ b/every_eval_ever/converters/helm/utils.py @@ -2,29 +2,29 @@ try: from helm.benchmark.adaptation.scenario_state import RequestState -except Exception: # pragma: no cover - exercised only when optional deps missing +except ( + Exception +): # pragma: no cover - exercised only when optional deps missing RequestState = Any # type: ignore[assignment] + def extract_reasoning(request_state: RequestState) -> str | None: if request_state.result and request_state.result.completions: return getattr( - getattr( - request_state.result.completions[0], - "thinking", - None - ), - "text", - None + getattr(request_state.result.completions[0], 'thinking', None), + 'text', + None, ) - + return None + def extract_all_reasonings(request_state: RequestState) -> List[str] | None: if not (request_state.result and request_state.result.completions): return None return [ - getattr(getattr(c, "thinking", None), "text", None) + getattr(getattr(c, 'thinking', None), 'text', None) for c in request_state.result.completions - if getattr(c, "thinking", None) is not None + if getattr(c, 'thinking', None) is not None ] diff --git a/every_eval_ever/converters/inspect/__main__.py b/every_eval_ever/converters/inspect/__main__.py index 3dd076698..e6e0968c4 100644 --- a/every_eval_ever/converters/inspect/__main__.py +++ b/every_eval_ever/converters/inspect/__main__.py @@ -1,38 +1,68 @@ from __future__ import annotations -from argparse import ArgumentParser + import json import logging import uuid +from argparse import ArgumentParser from enum import Enum from pathlib import Path from typing import Any, Dict, List, Tuple, Union try: from inspect_ai.log import list_eval_logs + from every_eval_ever.converters.inspect.adapter import InspectAIAdapter except ImportError as exc: raise SystemExit( "The 'inspect-ai' package is required to use the Inspect AI converter.\n" - "Install it with: uv sync --extra inspect" + 'Install it with: uv sync --extra inspect' ) from exc -from every_eval_ever.eval_types import EvaluatorRelationship, EvaluationLog +from every_eval_ever.eval_types import EvaluationLog, EvaluatorRelationship from every_eval_ever.instance_level_types import InstanceLevelEvaluationLog logger = logging.getLogger(__name__) + def parse_args(): parser = ArgumentParser() - parser.add_argument('--log_path', type=str, default='tests/data/inspect/data.json', help='Inspect evalaution log file with extension eval or json.') + parser.add_argument( + '--log_path', + type=str, + default='tests/data/inspect/data.json', + help='Inspect evalaution log file with extension eval or json.', + ) parser.add_argument('--output_dir', type=str, default='data') - parser.add_argument('--source_organization_name', type=str, default='unknown', help='Orgnization which pushed evaluation to the every-eval-ever.') - parser.add_argument('--evaluator_relationship', type=str, default='third_party', help='Relationship of evaluation author to the model', choices=['first_party', 'third_party', 'collaborative', 'other']) + parser.add_argument( + '--source_organization_name', + type=str, + default='unknown', + help='Orgnization which pushed evaluation to the every-eval-ever.', + ) + parser.add_argument( + '--evaluator_relationship', + type=str, + default='third_party', + help='Relationship of evaluation author to the model', + choices=['first_party', 'third_party', 'collaborative', 'other'], + ) parser.add_argument('--source_organization_url', type=str, default=None) - parser.add_argument('--source_organization_logo_url', type=str, default=None) - parser.add_argument('--eval_library_name', type=str, default='inspect_ai', help='Name of the evaluation library (e.g. inspect_ai, lm_eval, helm)') - parser.add_argument('--eval_library_version', type=str, default='unknown', help='Version of the evaluation library. It should be extracted in the adapter if available in the evaluation log.') - + parser.add_argument( + '--source_organization_logo_url', type=str, default=None + ) + parser.add_argument( + '--eval_library_name', + type=str, + default='inspect_ai', + help='Name of the evaluation library (e.g. inspect_ai, lm_eval, helm)', + ) + parser.add_argument( + '--eval_library_version', + type=str, + default='unknown', + help='Version of the evaluation library. It should be extracted in the adapter if available in the evaluation log.', + ) args = parser.parse_args() return args @@ -44,43 +74,48 @@ def default(self, obj): return obj.value return super().default(obj) + class InspectEvalLogConverter: - def __init__(self, log_path: str | Path, output_dir: str = 'unified_schema/inspect_ai'): - ''' + def __init__( + self, + log_path: str | Path, + output_dir: str = 'unified_schema/inspect_ai', + ): + """ InspectAI generates log file for an evaluation. - ''' + """ self.log_path = Path(log_path) self.is_log_path_directory = self.log_path.is_dir() - + self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) def convert_to_unified_schema( - self, + self, metadata_args: Dict[str, Any] = None, ) -> Union[ Tuple[EvaluationLog, InstanceLevelEvaluationLog], - List[Tuple[EvaluationLog, InstanceLevelEvaluationLog]] + List[Tuple[EvaluationLog, InstanceLevelEvaluationLog]], ]: if self.is_log_path_directory: return InspectAIAdapter().transform_from_directory( - self.log_path, - metadata_args=metadata_args + self.log_path, metadata_args=metadata_args ) else: return InspectAIAdapter().transform_from_file( - self.log_path, - metadata_args=metadata_args + self.log_path, metadata_args=metadata_args ) def save_to_file( - self, - unified_eval_log: EvaluationLog, - output_filedir: str, - output_filepath: str + self, + unified_eval_log: EvaluationLog, + output_filedir: str, + output_filepath: str, ) -> bool: try: - json_str = unified_eval_log.model_dump_json(indent=4, exclude_none=True) + json_str = unified_eval_log.model_dump_json( + indent=4, exclude_none=True + ) unified_eval_log_dir = Path(f'{self.output_dir}/{output_filedir}') unified_eval_log_dir.mkdir(parents=True, exist_ok=True) @@ -90,17 +125,20 @@ def save_to_file( json_file.write(json_str) logger.info( - "Unified eval log was successfully saved to %s path.", + 'Unified eval log was successfully saved to %s path.', unified_eval_path, ) except Exception as e: - logger.exception("Problem with saving unified eval log to file: %s", e) + logger.exception( + 'Problem with saving unified eval log to file: %s', e + ) raise e + def save_evaluation_log( unified_output: EvaluationLog, inspect_converter: InspectEvalLogConverter, - file_uuid: str + file_uuid: str, ) -> bool: try: model_developer, model_name = unified_output.model_info.id.split('/') @@ -110,17 +148,18 @@ def save_evaluation_log( return True except Exception as e: logger.error( - "Failed to save eval log %s to file. %s", + 'Failed to save eval log %s to file. %s', unified_output.evaluation_id, str(e), ) return False + def extract_file_uuid_from_output(unified_output: EvaluationLog) -> str | None: detailed = unified_output.detailed_evaluation_results if detailed and detailed.file_path: stem = Path(detailed.file_path).stem - suffix = "_samples" + suffix = '_samples' if stem.endswith(suffix): return stem[: -len(suffix)] return None @@ -131,40 +170,49 @@ def extract_file_uuid_from_output(unified_output: EvaluationLog) -> str | None: args = parse_args() inspect_converter = InspectEvalLogConverter( - log_path=args.log_path, - output_dir=args.output_dir + log_path=args.log_path, output_dir=args.output_dir ) base_metadata_args = { 'source_organization_name': args.source_organization_name, 'source_organization_url': args.source_organization_url, 'source_organization_logo_url': args.source_organization_logo_url, - 'evaluator_relationship': EvaluatorRelationship(args.evaluator_relationship), + 'evaluator_relationship': EvaluatorRelationship( + args.evaluator_relationship + ), 'parent_eval_output_dir': args.output_dir, 'eval_library_name': args.eval_library_name, 'eval_library_version': args.eval_library_version, } if inspect_converter.is_log_path_directory: - log_paths: List[Path] = list_eval_logs(inspect_converter.log_path.absolute().as_posix()) + log_paths: List[Path] = list_eval_logs( + inspect_converter.log_path.absolute().as_posix() + ) if not log_paths: - logger.warning("Missing evaluations logs to convert!") + logger.warning('Missing evaluations logs to convert!') else: file_uuids = [str(uuid.uuid4()) for _ in log_paths] metadata_args = { **base_metadata_args, - "file_uuids": file_uuids, + 'file_uuids': file_uuids, } - unified_output = inspect_converter.convert_to_unified_schema(metadata_args) + unified_output = inspect_converter.convert_to_unified_schema( + metadata_args + ) if unified_output and isinstance(unified_output, List): for idx, single_unified_output in enumerate(unified_output): - file_uuid = file_uuids[idx] if idx < len(file_uuids) else None + file_uuid = ( + file_uuids[idx] if idx < len(file_uuids) else None + ) if not file_uuid: - file_uuid = extract_file_uuid_from_output(single_unified_output) + file_uuid = extract_file_uuid_from_output( + single_unified_output + ) if not file_uuid: file_uuid = str(uuid.uuid4()) logger.warning( - "Missing UUID for output %s; generated %s for aggregate save.", + 'Missing UUID for output %s; generated %s for aggregate save.', single_unified_output.evaluation_id, file_uuid, ) @@ -174,7 +222,7 @@ def extract_file_uuid_from_output(unified_output: EvaluationLog) -> str | None: file_uuid, ) else: - logger.warning("Missing unified schema result!") + logger.warning('Missing unified schema result!') else: file_uuid = str(uuid.uuid4()) metadata_args = { @@ -182,8 +230,10 @@ def extract_file_uuid_from_output(unified_output: EvaluationLog) -> str | None: 'file_uuid': file_uuid, } - unified_output = inspect_converter.convert_to_unified_schema(metadata_args) - + unified_output = inspect_converter.convert_to_unified_schema( + metadata_args + ) + if unified_output: save_evaluation_log( unified_output, @@ -191,4 +241,4 @@ def extract_file_uuid_from_output(unified_output: EvaluationLog) -> str | None: file_uuid, ) else: - logger.warning("Missing unified schema result!") + logger.warning('Missing unified schema result!') diff --git a/every_eval_ever/converters/inspect/adapter.py b/every_eval_ever/converters/inspect/adapter.py index e586eecf5..6ba1ce18a 100644 --- a/every_eval_ever/converters/inspect/adapter.py +++ b/every_eval_ever/converters/inspect/adapter.py @@ -1,8 +1,7 @@ import json +import logging import os import uuid -import logging - from pathlib import Path from typing import Any, Dict, List, Tuple, Union from urllib.parse import urlparse @@ -17,34 +16,57 @@ EvalSample, EvalSampleSummary, EvalScore, - EvalStats, EvalSpec, + EvalStats, list_eval_logs, read_eval_log, read_eval_log_sample, read_eval_log_sample_summaries, ) from inspect_ai.log import EvalPlan as InspectEvalPlan -except Exception as ex: # pragma: no cover - exercised only when optional deps missing +except ( + Exception +) as ex: # pragma: no cover - exercised only when optional deps missing _INSPECT_IMPORT_ERROR = ex - EvalDataset = EvalLog = EvalMetric = EvalResults = EvalSample = EvalSampleSummary = EvalScore = EvalStats = EvalSpec = Any # type: ignore[assignment] + EvalDataset = EvalLog = EvalMetric = EvalResults = EvalSample = ( + EvalSampleSummary + ) = EvalScore = EvalStats = EvalSpec = Any # type: ignore[assignment] InspectEvalPlan = Any # type: ignore[assignment] def _require_inspect_dependencies() -> None: if _INSPECT_IMPORT_ERROR is not None: raise ImportError( - "Inspect converter dependencies are missing. " + 'Inspect converter dependencies are missing. ' "Install with: pip install 'every_eval_ever[inspect]'" ) from _INSPECT_IMPORT_ERROR + +from every_eval_ever.converters import SCHEMA_VERSION +from every_eval_ever.converters.common.adapter import ( + AdapterMetadata, + BaseEvaluationAdapter, + SupportedLibrary, +) +from every_eval_ever.converters.common.error import AdapterError +from every_eval_ever.converters.common.utils import ( + convert_timestamp_to_unix_format, + get_current_unix_timestamp, + sha256_file, +) +from every_eval_ever.converters.inspect.instance_level_adapter import ( + InspectInstanceLevelDataAdapter, +) +from every_eval_ever.converters.inspect.utils import ( + extract_model_info_from_model_path, +) from every_eval_ever.eval_types import ( AgenticEvalConfig, AvailableTool, DetailedEvaluationResults, + EvalLibrary, EvalLimits, EvalPlan, - EvalLibrary, EvaluationLog, EvaluationResult, EvaluatorRelationship, @@ -53,39 +75,19 @@ def _require_inspect_dependencies() -> None: GenerationConfig, HashAlgorithm, JudgeConfig, + LlmScoring, MetricConfig, ModelInfo, - LlmScoring, Sandbox, - ScoreType, ScoreDetails, + ScoreType, SourceDataHf, SourceMetadata, SourceType, StandardError, - Uncertainty -) - -from every_eval_ever.converters.common.adapter import ( - AdapterMetadata, - BaseEvaluationAdapter, - SupportedLibrary + Uncertainty, ) -from every_eval_ever.converters.common.error import AdapterError -from every_eval_ever.converters.common.utils import ( - convert_timestamp_to_unix_format, - get_current_unix_timestamp -) -from every_eval_ever.converters.inspect.instance_level_adapter import ( - InspectInstanceLevelDataAdapter -) -from every_eval_ever.converters.common.utils import sha256_file -from every_eval_ever.converters.inspect.utils import ( - extract_model_info_from_model_path -) -from every_eval_ever.converters import SCHEMA_VERSION - logger = logging.getLogger(__name__) @@ -101,27 +103,24 @@ def __init__(self, strict_validation: bool = True): @property def metadata(self) -> AdapterMetadata: return AdapterMetadata( - name="InspectAdapter", - version="0.0.1", - description="Adapter for transforming Inspect evaluation outputs to unified schema format" - ) + name='InspectAdapter', + version='0.0.1', + description='Adapter for transforming Inspect evaluation outputs to unified schema format', + ) @property def supported_library(self) -> SupportedLibrary: return SupportedLibrary.INSPECT_AI def _extract_uncertainty( - self, - stderr_value: float, - stddev_value: float, - num_samples: int + self, stderr_value: float, stddev_value: float, num_samples: int ) -> Uncertainty: return Uncertainty( - standard_error=StandardError( - value=stderr_value - ) if stderr_value else None, + standard_error=StandardError(value=stderr_value) + if stderr_value + else None, standard_deviation=stddev_value, - num_samples=num_samples + num_samples=num_samples, ) def _build_evaluation_result( @@ -135,27 +134,25 @@ def _build_evaluation_result( generation_config: Dict[str, Any], stderr_value: float | None = None, stddev_value: float | None = None, - num_samples: int = 0 + num_samples: int = 0, ) -> EvaluationResult: return EvaluationResult( - evaluation_name=f"{evaluation_task_name} - {scorer_name}", + evaluation_name=f'{evaluation_task_name} - {scorer_name}', source_data=source_data, evaluation_timestamp=evaluation_timestamp, metric_config=MetricConfig( evaluation_description=metric_info.name, - lower_is_better=False, # no metadata available + lower_is_better=False, # no metadata available score_type=ScoreType.continuous, min_score=0, max_score=1, - llm_scoring=llm_grader + llm_scoring=llm_grader, ), score_details=ScoreDetails( score=metric_info.value, uncertainty=self._extract_uncertainty( - stderr_value, - stddev_value, - num_samples - ) + stderr_value, stddev_value, num_samples + ), ), generation_config=generation_config, ) @@ -167,40 +164,52 @@ def _extract_evaluation_results( source_data: SourceDataHf, generation_config: Dict[str, Any], num_samples: int, - timestamp: str + timestamp: str, ) -> List[EvaluationResult]: results: List[EvaluationResult] = [] for scorer in scores: llm_grader = None - if scorer.params and scorer.params.get("grader_model"): + if scorer.params and scorer.params.get('grader_model'): llm_grader = LlmScoring( judges=[ JudgeConfig( model_info=extract_model_info_from_model_path( - self._safe_get(scorer.params.get("grader_model"), "model") + self._safe_get( + scorer.params.get('grader_model'), 'model' + ) ) ) ], - input_prompt=self._safe_get(scorer.params, "grader_template") + input_prompt=self._safe_get( + scorer.params, 'grader_template' + ), ) - + stderr_value = next( - (m.value for m in scorer.metrics.values() if m.name == "stderr"), + ( + m.value + for m in scorer.metrics.values() + if m.name == 'stderr' + ), None, ) stddev_value = next( - (m.value for m in scorer.metrics.values() if m.name in {"std", "stddev"}), + ( + m.value + for m in scorer.metrics.values() + if m.name in {'std', 'stddev'} + ), None, ) for _, metric_info in scorer.metrics.items(): - if metric_info.name == "stderr": + if metric_info.name == 'stderr': continue scorer_name = scorer.name or scorer.scorer - + results.append( self._build_evaluation_result( evaluation_task_name=evaluation_task_name, @@ -212,37 +221,37 @@ def _extract_evaluation_results( generation_config=generation_config, stderr_value=stderr_value, stddev_value=stddev_value, - num_samples=num_samples + num_samples=num_samples, ) ) return results - + def _extract_source_data( - self, - dataset: EvalDataset, - task_name: str + self, dataset: EvalDataset, task_name: str ) -> SourceDataHf: dataset_name = ( dataset.name.split('/')[-1] - if dataset.name + if dataset.name else task_name.split('/')[-1] ) - return SourceDataHf( # TODO add hf_split + return SourceDataHf( # TODO add hf_split source_type='hf_dataset', dataset_name=dataset_name, hf_repo=dataset.location, samples_number=dataset.samples, - sample_ids=[str(sid) for sid in dataset.sample_ids] if dataset.sample_ids is not None else None, - additional_details={"shuffled": str(dataset.shuffled)} + sample_ids=[str(sid) for sid in dataset.sample_ids] + if dataset.sample_ids is not None + else None, + additional_details={'shuffled': str(dataset.shuffled)}, ) def _safe_get(self, obj: Any, field: str): cur = obj - + if cur is None: return None - + if isinstance(cur, dict): cur = cur.get(field) else: @@ -251,24 +260,24 @@ def _safe_get(self, obj: Any, field: str): return cur def _extract_available_tools( - self, - eval_plan: InspectEvalPlan + self, eval_plan: InspectEvalPlan ) -> List[AvailableTool]: """Extracts and flattens tools from the evaluation plan steps.""" - + tools_in_plan_steps = [ - step.params.get("tools", []) - for step in eval_plan.steps - if step.solver == "use_tools" + step.params.get('tools', []) + for step in eval_plan.steps + if step.solver == 'use_tools' ] - + return [ AvailableTool( name=self._safe_get(tool, 'name'), description=self._safe_get(tool, 'description'), parameters=( {str(k): json.dumps(v) for k, v in raw_params.items()} - if (raw_params := self._safe_get(tool, 'params')) and isinstance(raw_params, dict) + if (raw_params := self._safe_get(tool, 'params')) + and isinstance(raw_params, dict) else None ), ) @@ -276,28 +285,24 @@ def _extract_available_tools( if isinstance(tool_list, list) and tool_list for tool in tool_list[0] ] - - def _extract_prompt_template( - self, - plan: InspectEvalPlan - ) -> str | None: + + def _extract_prompt_template(self, plan: InspectEvalPlan) -> str | None: for step in plan.steps: - if step.solver == "prompt_template": - return self._safe_get(step.params, "template") - + if step.solver == 'prompt_template': + return self._safe_get(step.params, 'template') + return None def _extract_generation_config( - self, - spec: EvalSpec, - inspect_plan: InspectEvalPlan + self, spec: EvalSpec, inspect_plan: InspectEvalPlan ) -> GenerationConfig: eval_config = spec.model_generate_config eval_generation_config = { - gen_config: json.dumps(value) - for gen_config, value in vars(eval_config).items() if value is not None + gen_config: json.dumps(value) + for gen_config, value in vars(eval_config).items() + if value is not None } - eval_sandbox = spec.task_args.get("sandbox", None) + eval_sandbox = spec.task_args.get('sandbox', None) if eval_sandbox and not isinstance(eval_sandbox, list): eval_sandbox = [eval_sandbox] sandbox_type, sandbox_config = ((eval_sandbox or []) + [None, None])[:2] @@ -305,23 +310,34 @@ def _extract_generation_config( eval_plan = EvalPlan( name=inspect_plan.name, steps=[ - json.dumps(step.model_dump() if hasattr(step, 'model_dump') else vars(step)) + json.dumps( + step.model_dump() + if hasattr(step, 'model_dump') + else vars(step) + ) for step in inspect_plan.steps ], - config={str(k): json.dumps(v) for k, v in inspect_plan.config.model_dump().items() if v is not None}, + config={ + str(k): json.dumps(v) + for k, v in inspect_plan.config.model_dump().items() + if v is not None + }, ) eval_limits = EvalLimits( time_limit=spec.config.time_limit, message_limit=spec.config.message_limit, - token_limit=spec.config.token_limit + token_limit=spec.config.token_limit, ) - max_attempts = spec.task_args.get("max_attempts") or eval_config.max_retries # TODO not sure if max_attempts == max_retries in this case + max_attempts = ( + spec.task_args.get('max_attempts') or eval_config.max_retries + ) # TODO not sure if max_attempts == max_retries in this case reasoning = ( - True - if eval_config.reasoning_effort and eval_config.reasoning_effort.lower() != 'none' + True + if eval_config.reasoning_effort + and eval_config.reasoning_effort.lower() != 'none' else False ) @@ -341,35 +357,25 @@ def _extract_generation_config( ), eval_plan=eval_plan, eval_limits=eval_limits, - sandbox=Sandbox( - type=sandbox_type, - config=sandbox_config - ), + sandbox=Sandbox(type=sandbox_type, config=sandbox_config), max_attempts=max_attempts, ) additional_details = eval_generation_config - + return GenerationConfig( generation_args=generation_args, - additional_details=additional_details or None + additional_details=additional_details or None, ) - - def _extract_library_version( - self, - packages: Dict[str, str] - ) -> str: + + def _extract_library_version(self, packages: Dict[str, str]) -> str: parts = [ - f"{name}:{version}" - for name, version in packages.items() - if version + f'{name}:{version}' for name, version in packages.items() if version ] - return ",".join(parts) + return ','.join(parts) def transform_from_directory( - self, - dir_path: Union[str, Path], - metadata_args: Dict[str, Any] = None + self, dir_path: Union[str, Path], metadata_args: Dict[str, Any] = None ) -> List[EvaluationLog]: metadata_args = metadata_args or {} @@ -377,10 +383,12 @@ def transform_from_directory( dir_path = Path(dir_path) if not dir_path.exists(): - raise FileNotFoundError(f"Directory path {dir_path} does not exist!") - + raise FileNotFoundError( + f'Directory path {dir_path} does not exist!' + ) + log_paths: List[Path] = list_eval_logs(dir_path.absolute().as_posix()) - file_uuids = metadata_args.get("file_uuids") + file_uuids = metadata_args.get('file_uuids') try: transformed_logs: List[EvaluationLog] = [] for idx, log_path in enumerate(log_paths): @@ -389,7 +397,9 @@ def transform_from_directory( file_uuid = None if isinstance(file_uuids, list) and idx < len(file_uuids): file_uuid = file_uuids[idx] - per_log_metadata_args["file_uuid"] = file_uuid or str(uuid.uuid4()) + per_log_metadata_args['file_uuid'] = file_uuid or str( + uuid.uuid4() + ) transformed_logs.append( self.transform_from_file( urlparse(log_path.name).path, @@ -398,37 +408,40 @@ def transform_from_directory( ) return transformed_logs except Exception as e: - raise AdapterError(f"Failed to load file from directory {dir_path}: {str(e)} for InspectAIAdapter") + raise AdapterError( + f'Failed to load file from directory {dir_path}: {str(e)} for InspectAIAdapter' + ) def transform_from_file( - self, - file_path: Union[str, Path], + self, + file_path: Union[str, Path], metadata_args: Dict[str, Any] = None, - header_only: bool = False - ) -> Union[ - EvaluationLog, - List[EvaluationLog] - ]: + header_only: bool = False, + ) -> Union[EvaluationLog, List[EvaluationLog]]: metadata_args = metadata_args or {} if not os.path.exists(file_path): raise FileNotFoundError(f'File path {file_path} does not exists!') - + try: - file_path = Path(file_path) if isinstance(file_path, str) else file_path - eval_data: Tuple[EvalLog, List[EvalSampleSummary], EvalSample | None] = ( - self._load_file(file_path, header_only=header_only) + file_path = ( + Path(file_path) if isinstance(file_path, str) else file_path ) + eval_data: Tuple[ + EvalLog, List[EvalSampleSummary], EvalSample | None + ] = self._load_file(file_path, header_only=header_only) return self.transform(eval_data, metadata_args) except AdapterError as e: raise e except Exception as e: - raise AdapterError(f"Failed to load file {file_path}: {str(e)} for InspectAIAdapter") + raise AdapterError( + f'Failed to load file {file_path}: {str(e)} for InspectAIAdapter' + ) def _transform_single( - self, - raw_data: Tuple[EvalLog, List[EvalSampleSummary], EvalSample | None], - metadata_args: Dict[str, Any] + self, + raw_data: Tuple[EvalLog, List[EvalSampleSummary], EvalSample | None], + metadata_args: Dict[str, Any], ) -> EvaluationLog: metadata_args = metadata_args or {} @@ -437,7 +450,9 @@ def _transform_single( eval_stats: EvalStats = raw_eval_log.stats evaluation_timestamp = eval_stats.started_at or eval_spec.created - evaluation_unix_timestamp = convert_timestamp_to_unix_format(evaluation_timestamp) + evaluation_unix_timestamp = convert_timestamp_to_unix_format( + evaluation_timestamp + ) retrieved_unix_timestamp = get_current_unix_timestamp() if not evaluation_unix_timestamp: @@ -445,15 +460,18 @@ def _transform_single( library_version = self._extract_library_version(eval_spec.packages) eval_library = EvalLibrary( - name=metadata_args.get("eval_library_name", "inspect_ai"), - version=library_version or metadata_args.get("eval_library_version", "unknown"), + name=metadata_args.get('eval_library_name', 'inspect_ai'), + version=library_version + or metadata_args.get('eval_library_version', 'unknown'), ) evaluator_relationship = metadata_args.get( - "evaluator_relationship", EvaluatorRelationship.third_party + 'evaluator_relationship', EvaluatorRelationship.third_party ) if isinstance(evaluator_relationship, str): - evaluator_relationship = EvaluatorRelationship(evaluator_relationship) + evaluator_relationship = EvaluatorRelationship( + evaluator_relationship + ) source_metadata = SourceMetadata( source_name='inspect_ai', @@ -461,8 +479,12 @@ def _transform_single( source_organization_name=metadata_args.get( 'source_organization_name', 'unknown' ), - source_organization_url=metadata_args.get('source_organization_url'), - source_organization_logo_url=metadata_args.get('source_organization_logo_url'), + source_organization_url=metadata_args.get( + 'source_organization_url' + ), + source_organization_logo_url=metadata_args.get( + 'source_organization_logo_url' + ), evaluator_relationship=evaluator_relationship, ) @@ -473,24 +495,28 @@ def _transform_single( model_path = eval_spec.model num_samples = len(raw_eval_log.samples) if raw_eval_log.samples else 0 - single_sample = raw_eval_log.samples[0] if raw_eval_log.samples else single_sample + single_sample = ( + raw_eval_log.samples[0] if raw_eval_log.samples else single_sample + ) if single_sample: detailed_model_name = single_sample.output.model - if "/" in model_path: - prefix, rest = model_path.split("/", 1) + if '/' in model_path: + prefix, rest = model_path.split('/', 1) if rest != detailed_model_name: - model_path = f"{prefix}/{detailed_model_name}" + model_path = f'{prefix}/{detailed_model_name}' else: - model_path = f"{prefix}/{rest}" + model_path = f'{prefix}/{rest}' else: model_path = detailed_model_name model_info: ModelInfo = extract_model_info_from_model_path(model_path) - - generation_config = self._extract_generation_config(eval_spec, raw_eval_log.plan) + + generation_config = self._extract_generation_config( + eval_spec, raw_eval_log.plan + ) results: EvalResults | None = raw_eval_log.results @@ -503,38 +529,43 @@ def _transform_single( source_data, generation_config, num_samples, - evaluation_unix_timestamp + evaluation_unix_timestamp, ) if results and results.scores else [] ) - evaluation_id = f'{source_data.dataset_name}/{model_path.replace('/', '_')}/{evaluation_unix_timestamp}' + evaluation_id = f'{source_data.dataset_name}/{model_path.replace("/", "_")}/{evaluation_unix_timestamp}' - parent_eval_output_dir = metadata_args.get("parent_eval_output_dir", "data") + parent_eval_output_dir = metadata_args.get( + 'parent_eval_output_dir', 'data' + ) if raw_eval_log.samples and parent_eval_output_dir: - file_uuid = metadata_args.get("file_uuid") + file_uuid = metadata_args.get('file_uuid') if not file_uuid: file_uuid = str(uuid.uuid4()) - metadata_args["file_uuid"] = file_uuid + metadata_args['file_uuid'] = file_uuid logging.warning( f"Missing metadata_args['file_uuid']; generated one for instance-level log: {file_uuid}. " - "Save unified aggregate log with the same uuid." + 'Save unified aggregate log with the same uuid.' ) - if "/" in model_info.id: - model_dev, model_name = model_info.id.split("/", 1) + if '/' in model_info.id: + model_dev, model_name = model_info.id.split('/', 1) else: - model_dev, model_name = "unknown", model_info.id - evaluation_dir = ( - f"{parent_eval_output_dir}/{source_data.dataset_name}/{model_dev}/{model_name}" - ) - detailed_results_id = f"{file_uuid}_samples" - - instance_level_log_path, instance_level_rows_number = InspectInstanceLevelDataAdapter( - detailed_results_id, Format.jsonl.value, HashAlgorithm.sha256.value, evaluation_dir - ).convert_instance_level_logs( - evaluation_task_name, model_info.id, raw_eval_log.samples + model_dev, model_name = 'unknown', model_info.id + evaluation_dir = f'{parent_eval_output_dir}/{source_data.dataset_name}/{model_dev}/{model_name}' + detailed_results_id = f'{file_uuid}_samples' + + instance_level_log_path, instance_level_rows_number = ( + InspectInstanceLevelDataAdapter( + detailed_results_id, + Format.jsonl.value, + HashAlgorithm.sha256.value, + evaluation_dir, + ).convert_instance_level_logs( + evaluation_task_name, model_info.id, raw_eval_log.samples + ) ) detailed_evaluation_results = DetailedEvaluationResults( @@ -542,7 +573,7 @@ def _transform_single( file_path=instance_level_log_path, hash_algorithm=HashAlgorithm.sha256.value, checksum=sha256_file(instance_level_log_path), - total_rows=instance_level_rows_number + total_rows=instance_level_rows_number, ) else: detailed_evaluation_results = None @@ -556,9 +587,9 @@ def _transform_single( eval_library=eval_library, model_info=model_info, evaluation_results=evaluation_results, - detailed_evaluation_results=detailed_evaluation_results + detailed_evaluation_results=detailed_evaluation_results, ) - + def _load_file( self, file_path, header_only=False ) -> Tuple[EvalLog, List[EvalSampleSummary], EvalSample | None]: @@ -567,9 +598,7 @@ def _load_file( summaries = read_eval_log_sample_summaries(file_path) first_sample = ( read_eval_log_sample( - file_path, - summaries[0].id, - summaries[0].epoch + file_path, summaries[0].id, summaries[0].epoch ) if summaries else None diff --git a/every_eval_ever/converters/inspect/instance_level_adapter.py b/every_eval_ever/converters/inspect/instance_level_adapter.py index 57893be7f..a5c0ef3ad 100644 --- a/every_eval_ever/converters/inspect/instance_level_adapter.py +++ b/every_eval_ever/converters/inspect/instance_level_adapter.py @@ -4,6 +4,7 @@ _INSPECT_IMPORT_ERROR: Exception | None = None try: + from inspect_ai.log import EvalSample from inspect_ai.model import ( ChatMessage, ChatMessageAssistant, @@ -11,38 +12,47 @@ ChatMessageUser, ModelUsage, ) - from inspect_ai.log import EvalSample -except Exception as ex: # pragma: no cover - exercised only when optional deps missing +except ( + Exception +) as ex: # pragma: no cover - exercised only when optional deps missing _INSPECT_IMPORT_ERROR = ex - ChatMessage = ChatMessageAssistant = ChatMessageTool = ChatMessageUser = ModelUsage = EvalSample = Any # type: ignore[assignment] + ChatMessage = ChatMessageAssistant = ChatMessageTool = ChatMessageUser = ( + ModelUsage + ) = EvalSample = Any # type: ignore[assignment] def _require_inspect_dependencies() -> None: if _INSPECT_IMPORT_ERROR is not None: raise ImportError( - "Inspect converter dependencies are missing. " + 'Inspect converter dependencies are missing. ' "Install with: pip install 'every_eval_ever[inspect]'" ) from _INSPECT_IMPORT_ERROR + +from every_eval_ever.converters import SCHEMA_VERSION +from every_eval_ever.converters.common.utils import sha256_string from every_eval_ever.instance_level_types import ( AnswerAttributionItem, Evaluation, Input, InstanceLevelEvaluationLog, - Message, InteractionType, - Performance, + Message, Output, + Performance, TokenUsage, - ToolCall + ToolCall, ) -from every_eval_ever.converters import SCHEMA_VERSION -from every_eval_ever.converters.common.utils import sha256_string - class InspectInstanceLevelDataAdapter: - def __init__(self, evaulation_id: str, format: str, hash_algorithm: str, evaluation_dir: str): + def __init__( + self, + evaulation_id: str, + format: str, + hash_algorithm: str, + evaluation_dir: str, + ): _require_inspect_dependencies() self.evaluation_id = evaulation_id self.format = format @@ -57,44 +67,44 @@ def _serialize_input(self, raw_input) -> str: for msg in raw_input: if not isinstance(msg, ChatMessageUser): continue - content = getattr(msg, "content", "") + content = getattr(msg, 'content', '') if isinstance(content, list): - content = " ".join( - block.text if hasattr(block, "text") else str(block) + content = ' '.join( + block.text if hasattr(block, 'text') else str(block) for block in content ) parts.append(content) - return "\n".join(parts) + return '\n'.join(parts) def _parse_content_with_reasoning( - self, - content: List[Any] + self, content: List[Any] ) -> Tuple[str, str]: response = None reasoning_trace = None for part in content: - if part.type and part.type == "reasoning": - reasoning_trace = part.reasoning # or part.summary - elif part.type and part.type == "text": + if part.type and part.type == 'reasoning': + reasoning_trace = part.reasoning # or part.summary + elif part.type and part.type == 'text': response = part.text return response, reasoning_trace - def _get_token_usage(self, usage: ModelUsage | None): - return TokenUsage( - input_tokens=usage.input_tokens, - output_tokens=usage.output_tokens, - total_tokens=usage.total_tokens, - input_tokens_cache_write=usage.input_tokens_cache_write, - input_tokens_cache_read=usage.input_tokens_cache_read, - reasoning_tokens=usage.reasoning_tokens, - ) if usage else None + return ( + TokenUsage( + input_tokens=usage.input_tokens, + output_tokens=usage.output_tokens, + total_tokens=usage.total_tokens, + input_tokens_cache_write=usage.input_tokens_cache_write, + input_tokens_cache_read=usage.input_tokens_cache_read, + reasoning_tokens=usage.reasoning_tokens, + ) + if usage + else None + ) def _handle_chat_messages( - self, - turn_idx: int, - message: ChatMessage + self, turn_idx: int, message: ChatMessage ) -> Message: role = message.role content = message.content @@ -110,13 +120,24 @@ def _handle_chat_messages( ToolCall( id=tool_call.id, name=tool_call.function, - arguments={str(k): json.dumps(v) for k, v in tool_call.arguments.items()} if tool_call.arguments else None + arguments={ + str(k): json.dumps(v) + for k, v in tool_call.arguments.items() + } + if tool_call.arguments + else None, ) for tool_call in message.tool_calls or [] ] - if isinstance(message, ChatMessageUser) or isinstance(message, ChatMessageTool): - tool_call_id = [message.tool_call_id] if isinstance(message.tool_call_id, str) else message.tool_call_id + if isinstance(message, ChatMessageUser) or isinstance( + message, ChatMessageTool + ): + tool_call_id = ( + [message.tool_call_id] + if isinstance(message.tool_call_id, str) + else message.tool_call_id + ) return Message( turn_idx=turn_idx, @@ -124,40 +145,37 @@ def _handle_chat_messages( content=content, reasoning_trace=reasoning, tool_calls=tool_calls, - tool_call_id=tool_call_id + tool_call_id=tool_call_id, ) - def _save_json( - self, - items: list[InstanceLevelEvaluationLog] - ): + def _save_json(self, items: list[InstanceLevelEvaluationLog]): eval_dir_path = Path(self.evaluation_dir) eval_dir_path.mkdir(parents=True, exist_ok=True) path = Path(self.path) - with path.open("w", encoding="utf-8") as f: + with path.open('w', encoding='utf-8') as f: for item in items: json_line = json.dumps( - item.model_dump(mode="json"), - ensure_ascii=False + item.model_dump(mode='json'), ensure_ascii=False ) - f.write(json_line + "\n") + f.write(json_line + '\n') - print(f'Instance-level eval log was successfully saved to {self.path} path.') + print( + f'Instance-level eval log was successfully saved to {self.path} path.' + ) def convert_instance_level_logs( - self, - evaluation_name: str, - model_id: str, - samples: List[EvalSample] + self, evaluation_name: str, model_id: str, samples: List[EvalSample] ) -> Tuple[str, int]: instance_level_logs: List[InstanceLevelEvaluationLog] = [] for sample in samples: sample_input = Input( raw=self._serialize_input(sample.input), - reference=[sample.target] if isinstance(sample.target, str) else list(sample.target), - choices=sample.choices + reference=[sample.target] + if isinstance(sample.target, str) + else list(sample.target), + choices=sample.choices, ) reasoning_trace = None @@ -165,7 +183,9 @@ def convert_instance_level_logs( content = message.content if isinstance(content, list): - response, reasoning_trace = self._parse_content_with_reasoning(content) + response, reasoning_trace = self._parse_content_with_reasoning( + content + ) else: response = content @@ -182,12 +202,12 @@ def convert_instance_level_logs( for msg_idx, msg in enumerate(sample.messages) ] - counted_assistant_roles = sum([ - msg.role.lower() == 'assistant' for msg in processed_messages - ]) - counted_tool_roles = sum([ - msg.role.lower() == 'tool' for msg in processed_messages - ]) + counted_assistant_roles = sum( + [msg.role.lower() == 'assistant' for msg in processed_messages] + ) + counted_tool_roles = sum( + [msg.role.lower() == 'tool' for msg in processed_messages] + ) if counted_tool_roles: interaction_type = InteractionType.agentic @@ -196,11 +216,14 @@ def convert_instance_level_logs( else: interaction_type = InteractionType.single_turn - if interaction_type == InteractionType.single_turn: sample_output = Output( - raw=[response] if isinstance(response, str) else list(response), - reasoning_trace=[reasoning_trace] if isinstance(reasoning_trace, str) else reasoning_trace + raw=[response] + if isinstance(response, str) + else list(response), + reasoning_trace=[reasoning_trace] + if isinstance(reasoning_trace, str) + else reasoning_trace, ) messages = None else: @@ -214,7 +237,9 @@ def convert_instance_level_logs( tool_calls_count=sum( len(msg.tool_calls) if msg.tool_calls else 0 for msg in messages - ) if messages else 0 + ) + if messages + else 0, ) answer_attribution: List[AnswerAttributionItem] = [] @@ -223,8 +248,10 @@ def convert_instance_level_logs( if sample.total_time and sample.working_time: performance = Performance( - latency_ms=int((sample.total_time - sample.working_time) * 1000), - generation_time_ms=int(sample.working_time * 1000) + latency_ms=int( + (sample.total_time - sample.working_time) * 1000 + ), + generation_time_ms=int(sample.working_time * 1000), ) else: performance = None @@ -235,7 +262,9 @@ def convert_instance_level_logs( model_id=model_id, evaluation_name=evaluation_name, sample_id=str(sample.id), - sample_hash=sha256_string(sample_input.raw + ''.join(sample_input.reference)), + sample_hash=sha256_string( + sample_input.raw + ''.join(sample_input.reference) + ), interaction_type=interaction_type, input=sample_input, output=sample_output, @@ -244,11 +273,15 @@ def convert_instance_level_logs( evaluation=evaluation, token_usage=token_usage, performance=performance, - error=f'{sample.error.message}\n{sample.error.traceback}' if sample.error else None, + error=f'{sample.error.message}\n{sample.error.traceback}' + if sample.error + else None, metadata={ - 'stop_reason': str(sample.output.stop_reason) if sample.output.stop_reason else '', - 'epoch': str(sample.epoch) - } + 'stop_reason': str(sample.output.stop_reason) + if sample.output.stop_reason + else '', + 'epoch': str(sample.epoch), + }, ) instance_level_logs.append(instance_level_log) diff --git a/every_eval_ever/converters/inspect/utils.py b/every_eval_ever/converters/inspect/utils.py index b6808ffca..45ce34d74 100644 --- a/every_eval_ever/converters/inspect/utils.py +++ b/every_eval_ever/converters/inspect/utils.py @@ -1,27 +1,25 @@ -import hashlib import re - from pathlib import Path +from typing import Dict, List, Type + from pydantic import BaseModel -from typing import Dict, Type, List -from every_eval_ever.eval_types import ( - InferenceEngine, - ModelInfo -) from every_eval_ever.converters.common.utils import get_model_organization_info +from every_eval_ever.eval_types import InferenceEngine, ModelInfo class ModelPathHandler: """Base class for all model path parsing strategies.""" + def __init__(self, model_path: str): self.model_path = model_path self.parts = model_path.split('/') - + def handle(self) -> ModelInfo: """Must be implemented by subclasses to return the parsed ModelInfo.""" raise NotImplementedError + def normalize_claude_model_name(model_str: str) -> str: """ Normalizes any Claude model identifier to the canonical format: @@ -30,29 +28,35 @@ def normalize_claude_model_name(model_str: str) -> str: base_match = re.search(r'(claude-\d+-\d+-[a-z]+)', model_str) if not base_match: # Return the original string if the base model name is not found - return model_str + return model_str base = base_match.group(1) # Use '00000000' as a default date if not found date_match = re.search(r'@?(\d{8})', model_str) - date = date_match.group(1) if date_match else "00000000" + date = date_match.group(1) if date_match else '00000000' + + return f'{base}@{date}' - return f"{base}@{date}" class ClosedApiHandler(ModelPathHandler): """Handles paths for closed API providers like OpenAI, Anthropic, Google, etc.""" + def handle(self) -> ModelInfo: developer = self.parts[0] inference_platform = self.parts[0] model_id = self.model_path - + # Special handling for Anthropic models running on cloud platforms (Vertex/Bedrock) - if self.model_path.startswith('anthropic/vertex') or self.model_path.startswith('anthropic/bedrock'): + if self.model_path.startswith( + 'anthropic/vertex' + ) or self.model_path.startswith('anthropic/bedrock'): if len(self.parts) >= 3: # e.g., anthropic/vertex/claude-3-5-sonnet-v2@20241022 developer = self.parts[0] inference_platform = self.parts[1] - model_id = f'{developer}/{normalize_claude_model_name(self.parts[2])}' + model_id = ( + f'{developer}/{normalize_claude_model_name(self.parts[2])}' + ) # General handling for Azure or Vertex (non-Anthropic) elif 'azure' in self.model_path or 'vertex' in self.model_path: @@ -66,11 +70,11 @@ def handle(self) -> ModelInfo: name=self.model_path, id=model_id, developer=developer, - inference_platform=inference_platform + inference_platform=inference_platform, ) -class BedrockParser: +class BedrockParser: @staticmethod def parse(model_path: str) -> ModelInfo: # Remove any variant suffix after a colon (e.g. ':2') @@ -86,65 +90,87 @@ def parse(model_path: str) -> ModelInfo: return ModelInfo( name=model_path, - id=f"{developer}/{model}" if developer else model, + id=f'{developer}/{model}' if developer else model, developer=developer, - inference_platform='bedrock' + inference_platform='bedrock', ) -class AzureAiParser: +class AzureAiParser: @staticmethod def parse(model_name: str) -> ModelInfo: parts = model_name.split('/') inference_platform = parts[0] base_model_name = parts[-1] - + inferred_org_name = get_model_organization_info(base_model_name) - developer = inferred_org_name if inferred_org_name and inferred_org_name != 'not_found' else inference_platform - + developer = ( + inferred_org_name + if inferred_org_name and inferred_org_name != 'not_found' + else inference_platform + ) + return ModelInfo( name=model_name, - id=f'{developer}/{base_model_name}', # Corrected 'id' logic + id=f'{developer}/{base_model_name}', # Corrected 'id' logic developer=developer, - inference_platform=inference_platform + inference_platform=inference_platform, ) + class CloudApiHandler(ModelPathHandler): """Handles paths for cloud API providers like AWS Bedrock and Azure AI.""" + def handle(self) -> ModelInfo: if self.model_path.startswith('bedrock'): - return BedrockParser.parse(self.model_path) - + return BedrockParser.parse(self.model_path) + elif self.model_path.startswith('azureai'): return AzureAiParser.parse(self.model_path) - - return ModelInfo(name=self.model_path, id=self.model_path, developer='unknown', inference_platform='cloud_api') + + return ModelInfo( + name=self.model_path, + id=self.model_path, + developer='unknown', + inference_platform='cloud_api', + ) + class HostedOpenHandler(ModelPathHandler): """Handles paths for hosted open models (Together, Groq, Fireworks, etc.).""" # Model name to developer map for Sambanova SAMBANOVA_DEV_MAP = { - 'llama': 'meta-llama', 'gpt': 'openai', 'qwen': 'qwen', - 'swallow': 'tokyotech-llm', 'allam': 'humain-ai', - 'mistral': 'mistral', 'deepseek': 'deepseek-ai' + 'llama': 'meta-llama', + 'gpt': 'openai', + 'qwen': 'qwen', + 'swallow': 'tokyotech-llm', + 'allam': 'humain-ai', + 'mistral': 'mistral', + 'deepseek': 'deepseek-ai', } def handle(self) -> ModelInfo: inference_platform = self.parts[0] model_id = self.model_path developer = 'unknown' - + path_lower = self.model_path.lower() # Group 1: Platform/Developer/Model format - platforms = ['together', 'cf', 'openrouter', 'openai-api', 'hf-inference-providers'] + platforms = [ + 'together', + 'cf', + 'openrouter', + 'openai-api', + 'hf-inference-providers', + ] if any(path_lower.startswith(p) for p in platforms): if len(self.parts) >= 3: developer = self.parts[1] model_id = f'{developer}/{self.parts[2]}' else: - developer = inference_platform # Fallback + developer = inference_platform # Fallback # Group 2: Groq specific logic elif path_lower.startswith('groq'): @@ -153,7 +179,7 @@ def handle(self) -> ModelInfo: is_llama = 'llama' in model_name.lower() developer = 'meta-llama' if is_llama else inference_platform model_id = f'{developer}/{model_name}' - + # Group 3: Sambanova specific logic elif path_lower.startswith('sambanova'): if len(self.parts) >= 2: @@ -164,18 +190,22 @@ def handle(self) -> ModelInfo: developer = dev break model_id = f'{developer}/{self.parts[-1]}' - + # Group 4: Fireworks specific logic elif path_lower.startswith('fireworks'): # e.g. fireworks/accounts/fireworks/models/deepseek-r1-0528 model_name = self.parts[-1] # Assuming get_model_organization_info returns an object with a 'organization' key inferred_org_name = get_model_organization_info(model_name) - developer = inferred_org_name if inferred_org_name != 'not_found' else 'unknown' + developer = ( + inferred_org_name + if inferred_org_name != 'not_found' + else 'unknown' + ) model_id = f'{developer}/{model_name}' - + if developer == 'unknown': - if len(self.parts) >= 2: + if len(self.parts) >= 2: developer = self.parts[1] model_id = f'{developer}/{self.parts[-1]}' @@ -183,15 +213,17 @@ def handle(self) -> ModelInfo: name=self.model_path, id=model_id, developer=developer, - inference_platform=inference_platform + inference_platform=inference_platform, ) + class InferenceEngineHandler(ModelPathHandler): """Handles paths for inference engines (vLLM, Ollama, SGLang, etc.).""" + def handle(self) -> ModelInfo: inference_engine = self.parts[0] model_id = self.model_path - developer = 'unknown' # Default value + developer = 'unknown' # Default value # Group 1: /Engine/Developer/Model format (e.g., vllm/meta-llama/Llama-2-7b-chat) if any(self.model_path.startswith(e) for e in ['vllm', 'sglang', 'hf']): @@ -200,23 +232,27 @@ def handle(self) -> ModelInfo: model_id = f'{self.parts[1]}/{self.parts[2]}' else: developer = inference_engine - + # Group 2: Ollama and Llama-cpp-python format (e.g., ollama/llama2:7b) - elif any(self.model_path.startswith(e) for e in ['ollama', 'llama-cpp-python']): + elif any( + self.model_path.startswith(e) + for e in ['ollama', 'llama-cpp-python'] + ): if len(self.parts) >= 2: developer = self.parts[0] model_name = self.parts[1].replace(':', '-') model_id = f'{self.parts[0]}/{model_name}' - + return ModelInfo( name=self.model_path, id=model_id, developer=developer, inference_engine=InferenceEngine( - name=inference_engine # TODO add version if possible - ) + name=inference_engine # TODO add version if possible + ), ) + # Mapping the provider/engine prefix to the specific Handler class # This is where the extension point is for *new* provider categories. MODEL_HANDLER_MAP: Dict[str, Type[ModelPathHandler]] = { @@ -225,14 +261,12 @@ def handle(self) -> ModelInfo: 'anthropic': ClosedApiHandler, 'google': ClosedApiHandler, 'grok': ClosedApiHandler, - 'mistral': ClosedApiHandler, + 'mistral': ClosedApiHandler, 'deepseek': ClosedApiHandler, 'perplexity': ClosedApiHandler, - # Cloud API Providers 'bedrock': CloudApiHandler, 'azure-ai': CloudApiHandler, - # Hosted Open Providers 'groq': HostedOpenHandler, 'together': HostedOpenHandler, @@ -242,23 +276,23 @@ def handle(self) -> ModelInfo: 'sambanova': HostedOpenHandler, 'openrouter': HostedOpenHandler, 'openai-api': HostedOpenHandler, - # Inference Engines 'hf': InferenceEngineHandler, 'vllm': InferenceEngineHandler, 'ollama': InferenceEngineHandler, 'llamacpp': InferenceEngineHandler, - 'sglang': InferenceEngineHandler + 'sglang': InferenceEngineHandler, } PROVIDER_PREFIXES: List[str] = list(MODEL_HANDLER_MAP.keys()) + def extract_model_info_from_model_path(model_path: str) -> ModelInfo: """ Infers the ModelInfo by dispatching the model_path to the appropriate handler. - + This function is now simple and focuses only on dispatching. - To add a new provider/engine, you only need to update the MODEL_HANDLER_MAP + To add a new provider/engine, you only need to update the MODEL_HANDLER_MAP and create a corresponding Handler class. """ @@ -270,17 +304,20 @@ def extract_model_info_from_model_path(model_path: str) -> ModelInfo: handler = handler_class(model_path) return handler.handle() except Exception as e: - print(f"Handler failed for {model_path}: {e}. Fallback into unknown model developer.") + print( + f'Handler failed for {model_path}: {e}. Fallback into unknown model developer.' + ) pass # Fallback return ModelInfo( - name=model_path, - id=model_path, - developer='unknown', - inference_platform='unknown' + name=model_path, + id=model_path, + developer='unknown', + inference_platform='unknown', ) + def save_to_file(path: str, obj: BaseModel) -> bool: json_str = obj.model_dump_json(indent=4, exclude_none=True) @@ -288,4 +325,4 @@ def save_to_file(path: str, obj: BaseModel) -> bool: obj_path.mkdir(parents=True, exist_ok=True) with open(obj_path, 'w') as json_file: - json_file.write(json_str) \ No newline at end of file + json_file.write(json_str) diff --git a/every_eval_ever/converters/lm_eval/__main__.py b/every_eval_ever/converters/lm_eval/__main__.py index ca738633a..397e6b7b3 100644 --- a/every_eval_ever/converters/lm_eval/__main__.py +++ b/every_eval_ever/converters/lm_eval/__main__.py @@ -13,75 +13,75 @@ def main(): parser = argparse.ArgumentParser( - description="Convert lm-evaluation-harness output to every_eval_ever format" + description='Convert lm-evaluation-harness output to every_eval_ever format' ) parser.add_argument( - "--log_path", + '--log_path', type=str, required=True, - help="Path to results JSON file or directory containing results files", + help='Path to results JSON file or directory containing results files', ) parser.add_argument( - "--output_dir", + '--output_dir', type=str, - default="data", - help="Output directory for converted files", + default='data', + help='Output directory for converted files', ) parser.add_argument( - "--source_organization_name", + '--source_organization_name', type=str, - default="", - help="Name of the organization that ran the evaluation", + default='', + help='Name of the organization that ran the evaluation', ) parser.add_argument( - "--evaluator_relationship", + '--evaluator_relationship', type=str, - default="first_party", - choices=["first_party", "third_party", "collaborative", "other"], - help="Relationship of the evaluator to the model", + default='first_party', + choices=['first_party', 'third_party', 'collaborative', 'other'], + help='Relationship of the evaluator to the model', ) parser.add_argument( - "--source_organization_url", + '--source_organization_url', type=str, default=None, - help="URL of the source organization", + help='URL of the source organization', ) parser.add_argument( - "--source_organization_logo_url", + '--source_organization_logo_url', type=str, default=None, - help="Logo of the source organization", + help='Logo of the source organization', ) parser.add_argument( - "--include_samples", - action="store_true", - help="Include instance-level sample data (requires --log_samples in original eval)", + '--include_samples', + action='store_true', + help='Include instance-level sample data (requires --log_samples in original eval)', ) parser.add_argument( - "--inference_engine", + '--inference_engine', type=str, default=None, help="Override inference engine name (e.g. 'vllm', 'transformers'). " - "Auto-detected from model type when possible.", + 'Auto-detected from model type when possible.', ) parser.add_argument( - "--inference_engine_version", + '--inference_engine_version', type=str, default=None, help="Inference engine version (e.g. '0.6.0'). " - "Not available from lm-eval logs, so must be provided manually.", + 'Not available from lm-eval logs, so must be provided manually.', ) parser.add_argument( - "--eval_library_name", + '--eval_library_name', type=str, - default="lm_eval", - help="Name of the evaluation library (e.g. inspect_ai, lm_eval, helm)", + default='lm_eval', + help='Name of the evaluation library (e.g. inspect_ai, lm_eval, helm)', ) parser.add_argument( - "--eval_library_version", + '--eval_library_version', type=str, - default="unknown", - help="Version of the evaluation library. It should be extracted in the adapter if available in the evaluation log.", + default='unknown', + help='Version of the evaluation library. It should be extracted in the adapter if available in the evaluation log.', ) args = parser.parse_args() @@ -90,16 +90,18 @@ def main(): output_dir = Path(args.output_dir) metadata_args = { - "source_organization_name": args.source_organization_name, - "evaluator_relationship": args.evaluator_relationship, - "source_organization_url": args.source_organization_url, - "eval_library_name": args.eval_library_name, - "eval_library_version": args.eval_library_version, + 'source_organization_name': args.source_organization_name, + 'evaluator_relationship': args.evaluator_relationship, + 'source_organization_url': args.source_organization_url, + 'eval_library_name': args.eval_library_name, + 'eval_library_version': args.eval_library_version, } if args.inference_engine: - metadata_args["inference_engine"] = args.inference_engine + metadata_args['inference_engine'] = args.inference_engine if args.inference_engine_version: - metadata_args["inference_engine_version"] = args.inference_engine_version + metadata_args['inference_engine_version'] = ( + args.inference_engine_version + ) log_path = Path(args.log_path) @@ -108,23 +110,23 @@ def main(): elif log_path.is_dir(): logs = adapter.transform_from_directory(log_path, metadata_args) else: - print(f"Error: {log_path} is not a file or directory", file=sys.stderr) + print(f'Error: {log_path} is not a file or directory', file=sys.stderr) sys.exit(1) for log in logs: # Organize as: output_dir/{evaluation_name}/{developer}/{model_name}/{uuid}.json # Use the first evaluation result's name (before any /filter suffix) as the task name if log.evaluation_results: - eval_name = log.evaluation_results[0].evaluation_name.split("/")[0] + eval_name = log.evaluation_results[0].evaluation_name.split('/')[0] else: - eval_name = "unknown" + eval_name = 'unknown' - model_parts = log.model_info.id.split("/") + model_parts = log.model_info.id.split('/') if len(model_parts) >= 2: developer = model_parts[0] - model_name = "/".join(model_parts[1:]) + model_name = '/'.join(model_parts[1:]) else: - developer = "unknown" + developer = 'unknown' model_name = log.model_info.id out_path = output_dir / eval_name / developer / model_name @@ -135,8 +137,8 @@ def main(): # Save instance-level samples if requested, using the same UUID if args.include_samples: meta = adapter.get_eval_metadata(log.evaluation_id) - parent_dir = meta.get("parent_dir") - task_name = meta.get("task_name") + parent_dir = meta.get('parent_dir') + task_name = meta.get('task_name') if parent_dir and task_name: samples_file = find_samples_file(Path(parent_dir), task_name) if samples_file: @@ -151,15 +153,17 @@ def main(): ) log.detailed_evaluation_results = detailed - out_file = out_path / f"{eval_uuid}.json" + out_file = out_path / f'{eval_uuid}.json' - with open(out_file, "w") as f: - json.dump(log.model_dump(mode="json", exclude_none=True), f, indent=2) + with open(out_file, 'w') as f: + json.dump( + log.model_dump(mode='json', exclude_none=True), f, indent=2 + ) - print(f" {out_file}") + print(f' {out_file}') - print(f"\nConverted {len(logs)} evaluation log(s).") + print(f'\nConverted {len(logs)} evaluation log(s).') -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/every_eval_ever/converters/lm_eval/adapter.py b/every_eval_ever/converters/lm_eval/adapter.py index 5d2b00b1a..2dc253cc5 100644 --- a/every_eval_ever/converters/lm_eval/adapter.py +++ b/every_eval_ever/converters/lm_eval/adapter.py @@ -32,10 +32,10 @@ ) from .utils import ( - parse_model_args, + KNOWN_METRIC_BOUNDS, MODEL_TYPE_TO_INFERENCE_ENGINE, MODEL_TYPE_TO_INFERENCE_PLATFORM, - KNOWN_METRIC_BOUNDS, + parse_model_args, ) @@ -55,10 +55,10 @@ def get_eval_metadata(self, evaluation_id: str) -> Dict[str, Any]: @property def metadata(self) -> AdapterMetadata: return AdapterMetadata( - name="lm-eval-adapter", - version="0.1.0", - supported_library_versions=["0.4.*"], - description="Converts lm-evaluation-harness output to every_eval_ever format", + name='lm-eval-adapter', + version='0.1.0', + supported_library_versions=['0.4.*'], + description='Converts lm-evaluation-harness output to every_eval_ever format', ) @property @@ -66,47 +66,53 @@ def supported_library(self) -> SupportedLibrary: return SupportedLibrary.LM_EVAL def _extract_model_info( - self, raw_data: Dict[str, Any], metadata_args: Optional[Dict[str, Any]] = None + self, + raw_data: Dict[str, Any], + metadata_args: Optional[Dict[str, Any]] = None, ) -> ModelInfo: """Extract model information from lm-eval results.""" metadata_args = metadata_args or {} - config = raw_data.get("config", {}) - model_type = config.get("model", "") - model_args_str = config.get("model_args", "") + config = raw_data.get('config', {}) + model_type = config.get('model', '') + model_args_str = config.get('model_args', '') if isinstance(model_args_str, dict): model_args = model_args_str else: model_args = parse_model_args(model_args_str) - model_name = raw_data.get("model_name", "") - pretrained = model_args.get("pretrained", model_name) + model_name = raw_data.get('model_name', '') + pretrained = model_args.get('pretrained', model_name) developer = None - if "/" in pretrained: - developer = pretrained.split("/")[0] + if '/' in pretrained: + developer = pretrained.split('/')[0] inference_platform = MODEL_TYPE_TO_INFERENCE_PLATFORM.get(model_type) # Determine inference engine name: CLI override > auto-detection from model type - engine_name = metadata_args.get("inference_engine") or MODEL_TYPE_TO_INFERENCE_ENGINE.get(model_type) - engine_version = metadata_args.get("inference_engine_version") + engine_name = metadata_args.get( + 'inference_engine' + ) or MODEL_TYPE_TO_INFERENCE_ENGINE.get(model_type) + engine_version = metadata_args.get('inference_engine_version') inference_engine = None if engine_name: - inference_engine = InferenceEngine(name=engine_name, version=engine_version) + inference_engine = InferenceEngine( + name=engine_name, version=engine_version + ) additional = {} - if config.get("model_num_parameters"): - additional["num_parameters"] = str(config["model_num_parameters"]) - if config.get("model_dtype"): - additional["dtype"] = str(config["model_dtype"]) - if config.get("model_revision"): - additional["revision"] = str(config["model_revision"]) - if config.get("model_sha"): - additional["sha"] = str(config["model_sha"]) + if config.get('model_num_parameters'): + additional['num_parameters'] = str(config['model_num_parameters']) + if config.get('model_dtype'): + additional['dtype'] = str(config['model_dtype']) + if config.get('model_revision'): + additional['revision'] = str(config['model_revision']) + if config.get('model_sha'): + additional['sha'] = str(config['model_sha']) if model_args_str: - additional["model_args"] = str(model_args_str) + additional['model_args'] = str(model_args_str) return ModelInfo( name=pretrained, @@ -119,72 +125,77 @@ def _extract_model_info( def _get_tasks(self, raw_data: Dict[str, Any]) -> List[str]: """Get task names that have actual metric results (leaf tasks and groups).""" - results = raw_data.get("results", {}) + results = raw_data.get('results', {}) tasks = [] for task_name, task_results in results.items(): # Skip group placeholder entries (only have alias and " " keys) - non_alias_keys = [k for k in task_results if k != "alias"] - if non_alias_keys == [" "]: + non_alias_keys = [k for k in task_results if k != 'alias'] + if non_alias_keys == [' ']: continue # Skip if no numeric metric values has_metric = any( isinstance(v, (int, float)) for k, v in task_results.items() - if k not in ("alias", "samples", "name", "sample_len", "sample_count") - and "_stderr," not in k + if k + not in ( + 'alias', + 'samples', + 'name', + 'sample_len', + 'sample_count', + ) + and '_stderr,' not in k ) if not has_metric: continue tasks.append(task_name) return tasks - def _build_source_data( - self, task_config: Dict[str, Any], task_name: str - ): + def _build_source_data(self, task_config: Dict[str, Any], task_name: str): """Build source_data from task config.""" - dataset_path = task_config.get("dataset_path", "") - dataset_name = task_config.get("task", task_name) + dataset_path = task_config.get('dataset_path', '') + dataset_name = task_config.get('task', task_name) if ( dataset_path - and "/" in str(dataset_path) - and not str(dataset_path).startswith("/") + and '/' in str(dataset_path) + and not str(dataset_path).startswith('/') ): return SourceDataHf( dataset_name=dataset_name, - source_type="hf_dataset", + source_type='hf_dataset', hf_repo=dataset_path, hf_split=( - task_config.get("test_split") - or task_config.get("validation_split") + task_config.get('test_split') + or task_config.get('validation_split') ), ) return SourceDataPrivate( dataset_name=dataset_name, - source_type="other", + source_type='other', ) def _build_generation_config( self, task_config: Dict[str, Any] ) -> Optional[GenerationConfig]: """Build generation config from task config.""" - gen_kwargs = task_config.get("generation_kwargs", {}) + gen_kwargs = task_config.get('generation_kwargs', {}) if not gen_kwargs: return None args = GenerationArgs( - temperature=gen_kwargs.get("temperature"), - top_p=gen_kwargs.get("top_p"), - top_k=gen_kwargs.get("top_k"), - max_tokens=gen_kwargs.get("max_gen_toks"), + temperature=gen_kwargs.get('temperature'), + top_p=gen_kwargs.get('top_p'), + top_k=gen_kwargs.get('top_k'), + max_tokens=gen_kwargs.get('max_gen_toks'), ) additional = {} for k, v in gen_kwargs.items(): - if k not in ("temperature", "top_p", "top_k", "max_gen_toks"): + if k not in ('temperature', 'top_p', 'top_k', 'max_gen_toks'): additional[k] = json.dumps(v) if not isinstance(v, str) else v - if task_config.get("num_fewshot") is not None: - additional["num_fewshot"] = str(task_config["num_fewshot"]) + if task_config.get('num_fewshot') is not None: + additional['num_fewshot'] = str(task_config['num_fewshot']) return GenerationConfig( generation_args=args, @@ -195,33 +206,41 @@ def _build_evaluation_results( self, raw_data: Dict[str, Any], task_name: str ) -> List[EvaluationResult]: """Build EvaluationResult list for a single task.""" - task_results = raw_data["results"][task_name] - task_config = raw_data.get("configs", {}).get(task_name, {}) - higher_is_better = raw_data.get("higher_is_better", {}).get(task_name, {}) - n_samples = raw_data.get("n-samples", {}).get(task_name, {}) + task_results = raw_data['results'][task_name] + task_config = raw_data.get('configs', {}).get(task_name, {}) + higher_is_better = raw_data.get('higher_is_better', {}).get( + task_name, {} + ) + n_samples = raw_data.get('n-samples', {}).get(task_name, {}) source_data = self._build_source_data(task_config, task_name) gen_config = self._build_generation_config(task_config) - eval_timestamp = raw_data.get("date") + eval_timestamp = raw_data.get('date') if eval_timestamp is not None: eval_timestamp = str(int(eval_timestamp)) results = [] for key, value in task_results.items(): - if key in ("alias", "samples", "name", "sample_len", "sample_count"): + if key in ( + 'alias', + 'samples', + 'name', + 'sample_len', + 'sample_count', + ): continue - if "_stderr," in key: + if '_stderr,' in key: continue if not isinstance(value, (int, float)): continue - if "," in key: - metric_name, filter_name = key.split(",", 1) + if ',' in key: + metric_name, filter_name = key.split(',', 1) else: metric_name = key - filter_name = "none" + filter_name = 'none' - stderr_key = f"{metric_name}_stderr,{filter_name}" + stderr_key = f'{metric_name}_stderr,{filter_name}' stderr_val = task_results.get(stderr_key) is_higher_better = higher_is_better.get(metric_name, True) @@ -231,8 +250,8 @@ def _build_evaluation_results( max_score = bounds[1] if bounds else None description = metric_name - if filter_name != "none": - description = f"{metric_name} (filter: {filter_name})" + if filter_name != 'none': + description = f'{metric_name} (filter: {filter_name})' metric_config = MetricConfig( evaluation_description=description, @@ -244,14 +263,14 @@ def _build_evaluation_results( uncertainty = None num_samples = ( - n_samples.get("effective") - or task_results.get("samples") - or task_results.get("sample_len") + n_samples.get('effective') + or task_results.get('samples') + or task_results.get('sample_len') ) if stderr_val is not None or num_samples: uncertainty = Uncertainty( standard_error=( - StandardError(value=stderr_val, method="bootstrap") + StandardError(value=stderr_val, method='bootstrap') if stderr_val is not None else None ), @@ -259,8 +278,8 @@ def _build_evaluation_results( ) eval_name = task_name - if filter_name != "none": - eval_name = f"{task_name}/{filter_name}" + if filter_name != 'none': + eval_name = f'{task_name}/{filter_name}' results.append( EvaluationResult( @@ -285,45 +304,48 @@ def _transform_single( Expects metadata_args to contain 'task_name' specifying which task. """ - task_name = metadata_args["task_name"] + task_name = metadata_args['task_name'] model_info = self._extract_model_info(raw_data, metadata_args) retrieved_timestamp = get_current_unix_timestamp() - eval_timestamp = raw_data.get("date") + eval_timestamp = raw_data.get('date') if eval_timestamp is not None: eval_timestamp = str(int(eval_timestamp)) - evaluation_id = f"{task_name}/{model_info.id}/{retrieved_timestamp}" + evaluation_id = f'{task_name}/{model_info.id}/{retrieved_timestamp}' evaluation_results = self._build_evaluation_results(raw_data, task_name) evaluator_rel_str = metadata_args.get( - "evaluator_relationship", "first_party" + 'evaluator_relationship', 'first_party' ) evaluator_relationship = EvaluatorRelationship(evaluator_rel_str) - library_version = str(raw_data.get("lm_eval_version", "")) + library_version = str(raw_data.get('lm_eval_version', '')) eval_library = EvalLibrary( - name=metadata_args.get("eval_library_name", "lm_eval"), - version=library_version or metadata_args.get("eval_library_version", "unknown"), + name=metadata_args.get('eval_library_name', 'lm_eval'), + version=library_version + or metadata_args.get('eval_library_version', 'unknown'), ) source_metadata = SourceMetadata( - source_name="lm-evaluation-harness", + source_name='lm-evaluation-harness', source_type=SourceType.evaluation_run, source_organization_name=metadata_args.get( - "source_organization_name", "" + 'source_organization_name', '' + ), + source_organization_url=metadata_args.get( + 'source_organization_url' ), - source_organization_url=metadata_args.get("source_organization_url"), source_organization_logo_url=metadata_args.get( - "source_organization_logo_url" + 'source_organization_logo_url' ), evaluator_relationship=evaluator_relationship, ) # Store metadata so callers can find sample files after transform self._eval_metadata[evaluation_id] = { - "parent_dir": metadata_args.get("parent_eval_output_dir"), - "task_name": task_name, + 'parent_dir': metadata_args.get('parent_eval_output_dir'), + 'task_name': task_name, } return EvaluationLog( @@ -349,15 +371,15 @@ def transform_from_file( tasks = self._get_tasks(raw_data) # Pass the parent directory so instance-level adapter can find samples files - if "parent_eval_output_dir" not in metadata_args: + if 'parent_eval_output_dir' not in metadata_args: metadata_args = { **metadata_args, - "parent_eval_output_dir": str(file_path.parent), + 'parent_eval_output_dir': str(file_path.parent), } results = [] for task_name in tasks: - task_metadata = {**metadata_args, "task_name": task_name} + task_metadata = {**metadata_args, 'task_name': task_name} log = self._transform_single(raw_data, task_metadata) results.append(log) @@ -371,7 +393,7 @@ def transform_from_directory( Searches for results_*.json files recursively. """ dir_path = Path(dir_path) - results_files = sorted(dir_path.glob("**/results_*.json")) + results_files = sorted(dir_path.glob('**/results_*.json')) all_logs = [] for results_file in results_files: diff --git a/every_eval_ever/converters/lm_eval/instance_level_adapter.py b/every_eval_ever/converters/lm_eval/instance_level_adapter.py index 1fb052be4..1cc393862 100644 --- a/every_eval_ever/converters/lm_eval/instance_level_adapter.py +++ b/every_eval_ever/converters/lm_eval/instance_level_adapter.py @@ -6,7 +6,11 @@ from typing import Any, Dict, List, Optional, Union from every_eval_ever.converters import SCHEMA_VERSION -from every_eval_ever.eval_types import DetailedEvaluationResults, Format, HashAlgorithm +from every_eval_ever.eval_types import ( + DetailedEvaluationResults, + Format, + HashAlgorithm, +) from every_eval_ever.instance_level_types import ( AnswerAttributionItem, Evaluation, @@ -61,21 +65,24 @@ def transform_and_save( if output_dir is None: return None - logs = self.transform_samples(samples_path, evaluation_id, model_id, task_name) + logs = self.transform_samples( + samples_path, evaluation_id, model_id, task_name + ) if not logs: return None output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) if file_uuid: - out_file = output_dir / f"{file_uuid}_samples.jsonl" + out_file = output_dir / f'{file_uuid}_samples.jsonl' else: - out_file = output_dir / f"samples_{task_name}.jsonl" + out_file = output_dir / f'samples_{task_name}.jsonl' - with open(out_file, "w") as f: + with open(out_file, 'w') as f: for log in logs: f.write( - json.dumps(log.model_dump(mode="json"), ensure_ascii=False) + "\n" + json.dumps(log.model_dump(mode='json'), ensure_ascii=False) + + '\n' ) file_hash = hashlib.sha256(out_file.read_bytes()).hexdigest() @@ -97,19 +104,19 @@ def _transform_sample( ) -> InstanceLevelEvaluationLog: """Transform a single lm-eval sample into an instance-level log.""" # Extract prompt from arguments - arguments = sample.get("arguments", {}) - prompt = "" + arguments = sample.get('arguments', {}) + prompt = '' if arguments: - first_arg = arguments.get("gen_args_0", {}) - prompt = first_arg.get("arg_0", "") + first_arg = arguments.get('gen_args_0', {}) + prompt = first_arg.get('arg_0', '') - target = str(sample.get("target", "")) + target = str(sample.get('target', '')) # Extract model output raw_output = self._extract_output(sample) # Determine correctness from metric values - metrics = sample.get("metrics", []) + metrics = sample.get('metrics', []) score = None is_correct = None for metric_name in metrics: @@ -125,26 +132,28 @@ def _transform_sample( is_correct = False # Build sample hash from input + reference for cross-model comparison - hash_input = json.dumps({"raw": prompt, "reference": target}, sort_keys=True) + hash_input = json.dumps( + {'raw': prompt, 'reference': target}, sort_keys=True + ) sample_hash = hashlib.sha256(hash_input.encode()).hexdigest() # Build evaluation_name: include filter if not "none" - filter_name = sample.get("filter", "none") + filter_name = sample.get('filter', 'none') eval_name = task_name - if filter_name != "none": - eval_name = f"{task_name}/{filter_name}" + if filter_name != 'none': + eval_name = f'{task_name}/{filter_name}' # Build answer attribution # For lm-eval, the answer is always extracted from the model's single-turn output. # The extraction_method depends on the filter applied. - extraction_method = "none" - if filter_name != "none": + extraction_method = 'none' + if filter_name != 'none': extraction_method = filter_name answer_attribution = [ AnswerAttributionItem( turn_idx=0, - source="output.raw", + source='output.raw', extracted_value=raw_output, extraction_method=extraction_method, is_terminal=True, @@ -156,7 +165,7 @@ def _transform_sample( evaluation_id=evaluation_id, model_id=model_id, evaluation_name=eval_name, - sample_id=str(sample.get("doc_id", 0)), + sample_id=str(sample.get('doc_id', 0)), sample_hash=sample_hash, interaction_type=InteractionType.single_turn, input=Input( @@ -171,32 +180,32 @@ def _transform_sample( is_correct=is_correct, ), metadata={ - "doc_hash": str(sample.get("doc_hash", "")), - "prompt_hash": str(sample.get("prompt_hash", "")), - "target_hash": str(sample.get("target_hash", "")), - "filter": str(filter_name), - "lm_eval_metrics": json.dumps({ - m: sample.get(m) for m in metrics if m in sample - }), + 'doc_hash': str(sample.get('doc_hash', '')), + 'prompt_hash': str(sample.get('prompt_hash', '')), + 'target_hash': str(sample.get('target_hash', '')), + 'filter': str(filter_name), + 'lm_eval_metrics': json.dumps( + {m: sample.get(m) for m in metrics if m in sample} + ), }, ) def _is_multiple_choice(self, sample: dict[str, Any]) -> bool: """Check if a sample is multiple-choice by inspecting the arguments structure.""" - arguments = sample.get("arguments", {}) - return len(arguments) > 1 and "gen_args_1" in arguments + arguments = sample.get('arguments', {}) + return len(arguments) > 1 and 'gen_args_1' in arguments def _extract_output(self, sample: dict[str, Any]) -> str: """Extract the model's output from a sample.""" - filtered_resps = sample.get("filtered_resps", []) - resps = sample.get("resps", []) + filtered_resps = sample.get('filtered_resps', []) + resps = sample.get('resps', []) if self._is_multiple_choice(sample): # For multiple-choice, find the selected choice index from filtered_resps. # Each entry is [log_prob, is_greedy]; the model picks the highest log_prob. source = filtered_resps if filtered_resps else resps if not source: - return "" + return '' try: log_probs = [] for resp in source: @@ -204,7 +213,7 @@ def _extract_output(self, sample: dict[str, Any]) -> str: val = resp[0] if isinstance(resp[0], list) else resp log_probs.append(float(val[0])) else: - log_probs.append(float("-inf")) + log_probs.append(float('-inf')) selected_idx = log_probs.index(max(log_probs)) # Return the choice text from arguments if available choices = self._extract_choices(sample) @@ -217,24 +226,24 @@ def _extract_output(self, sample: dict[str, Any]) -> str: # For generation tasks, use the first response source = filtered_resps if filtered_resps else resps if not source: - return "" + return '' first = source[0] if isinstance(first, list): - return str(first[0]) if first else "" + return str(first[0]) if first else '' return str(first) def _extract_choices(self, sample: dict[str, Any]) -> list[str] | None: """Extract multiple choice options from arguments structure.""" - arguments = sample.get("arguments", {}) + arguments = sample.get('arguments', {}) if not self._is_multiple_choice(sample): return None # Collect arg_1 (continuation text) from each gen_args_N in order choices = [] idx = 0 - while f"gen_args_{idx}" in arguments: - arg = arguments[f"gen_args_{idx}"] - if "arg_1" in arg: - choices.append(str(arg["arg_1"]).strip()) + while f'gen_args_{idx}' in arguments: + arg = arguments[f'gen_args_{idx}'] + if 'arg_1' in arg: + choices.append(str(arg['arg_1']).strip()) idx += 1 return choices if choices else None diff --git a/every_eval_ever/converters/lm_eval/utils.py b/every_eval_ever/converters/lm_eval/utils.py index a2c544bd6..92fad5d27 100644 --- a/every_eval_ever/converters/lm_eval/utils.py +++ b/every_eval_ever/converters/lm_eval/utils.py @@ -12,14 +12,14 @@ def parse_model_args(model_args: str | None) -> Dict[str, str]: if not model_args or not isinstance(model_args, str): return {} result = {} - for part in model_args.split(","): - if "=" in part: - key, value = part.split("=", 1) + for part in model_args.split(','): + if '=' in part: + key, value = part.split('=', 1) result[key.strip()] = value.strip() elif result: # Continuation of previous value that contained a comma last_key = list(result.keys())[-1] - result[last_key] += "," + part + result[last_key] += ',' + part return result @@ -28,12 +28,12 @@ def find_samples_file(output_dir: Path, task_name: str) -> Optional[Path]: lm-eval writes samples as: samples__.jsonl """ - pattern = f"samples_{task_name}_*.jsonl" + pattern = f'samples_{task_name}_*.jsonl' matches = sorted(output_dir.glob(pattern)) if matches: return matches[-1] # Most recent # Also check subdirectories (lm-eval nests under model_name_sanitized/) - matches = sorted(output_dir.glob(f"**/{pattern}")) + matches = sorted(output_dir.glob(f'**/{pattern}')) if matches: return matches[-1] return None @@ -41,36 +41,36 @@ def find_samples_file(output_dir: Path, task_name: str) -> Optional[Path]: # Maps lm-eval config.model values to every_eval_ever inference_platform MODEL_TYPE_TO_INFERENCE_PLATFORM = { - "openai-completions": "openai", - "openai-chat-completions": "openai", - "anthropic": "anthropic", - "anthropic-chat": "anthropic", - "together": "together", + 'openai-completions': 'openai', + 'openai-chat-completions': 'openai', + 'anthropic': 'anthropic', + 'anthropic-chat': 'anthropic', + 'together': 'together', } # Maps lm-eval config.model values to inference engine names MODEL_TYPE_TO_INFERENCE_ENGINE = { - "hf": "transformers", - "vllm": "vllm", - "gguf": "llama.cpp", + 'hf': 'transformers', + 'vllm': 'vllm', + 'gguf': 'llama.cpp', } # Known metric bounds: metric_name -> (min_score, max_score) # max_score of None means unbounded KNOWN_METRIC_BOUNDS = { - "acc": (0.0, 1.0), - "acc_norm": (0.0, 1.0), - "exact_match": (0.0, 1.0), - "f1": (0.0, 1.0), - "em": (0.0, 1.0), - "mc1": (0.0, 1.0), - "mc2": (0.0, 1.0), - "mcc": (-1.0, 1.0), - "bleu": (0.0, 100.0), - "rouge1": (0.0, 1.0), - "rouge2": (0.0, 1.0), - "rougeL": (0.0, 1.0), - "rougeLsum": (0.0, 1.0), - "ter": (0.0, None), - "brier_score": (0.0, 1.0), + 'acc': (0.0, 1.0), + 'acc_norm': (0.0, 1.0), + 'exact_match': (0.0, 1.0), + 'f1': (0.0, 1.0), + 'em': (0.0, 1.0), + 'mc1': (0.0, 1.0), + 'mc2': (0.0, 1.0), + 'mcc': (-1.0, 1.0), + 'bleu': (0.0, 100.0), + 'rouge1': (0.0, 1.0), + 'rouge2': (0.0, 1.0), + 'rougeL': (0.0, 1.0), + 'rougeLsum': (0.0, 1.0), + 'ter': (0.0, None), + 'brier_score': (0.0, 1.0), } diff --git a/every_eval_ever/eval_types.py b/every_eval_ever/eval_types.py index e8b4a15b3..40035403b 100644 --- a/every_eval_ever/eval_types.py +++ b/every_eval_ever/eval_types.py @@ -3,54 +3,65 @@ # timestamp: 2026-03-19T20:30:15+00:00 from __future__ import annotations + from enum import Enum -from pydantic import BaseModel, ConfigDict, Field, confloat, conint, model_validator, Discriminator from typing import Annotated, Literal +from pydantic import ( + BaseModel, + ConfigDict, + Discriminator, + Field, + confloat, + conint, + model_validator, +) + class SourceType(Enum): - documentation = "documentation" - evaluation_run = "evaluation_run" + documentation = 'documentation' + evaluation_run = 'evaluation_run' class EvaluatorRelationship(Enum): - first_party = "first_party" - third_party = "third_party" - collaborative = "collaborative" - other = "other" + first_party = 'first_party' + third_party = 'third_party' + collaborative = 'collaborative' + other = 'other' class SourceMetadata(BaseModel): source_name: str | None = Field( None, - description="Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation).", + description='Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation).', ) source_type: SourceType = Field( ..., - description="Whether the data comes from a direct evaluation run or from documentation", + description='Whether the data comes from a direct evaluation run or from documentation', ) source_organization_name: str = Field( - ..., description="Name of the organization that provides the data" + ..., description='Name of the organization that provides the data' ) source_organization_url: str | None = Field( - None, description="URL for the organization that provides the data" + None, description='URL for the organization that provides the data' ) source_organization_logo_url: str | None = Field( - None, description="URL for the Logo for the organization that provides the data" + None, + description='URL for the Logo for the organization that provides the data', ) evaluator_relationship: EvaluatorRelationship = Field( - ..., description="Relationship between the evaluator and the model" + ..., description='Relationship between the evaluator and the model' ) additional_details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) class EvalLibrary(BaseModel): name: str = Field( ..., - description="Name of the evaluation library (e.g. inspect_ai, lm_eval, helm)", + description='Name of the evaluation library (e.g. inspect_ai, lm_eval, helm)', ) version: str = Field( ..., @@ -58,25 +69,25 @@ class EvalLibrary(BaseModel): ) additional_details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) class ScoreType(Enum): - binary = "binary" - continuous = "continuous" - levels = "levels" + binary = 'binary' + continuous = 'continuous' + levels = 'levels' class AggregationMethod(Enum): - majority_vote = "majority_vote" - average = "average" - weighted_average = "weighted_average" - median = "median" + majority_vote = 'majority_vote' + average = 'average' + weighted_average = 'weighted_average' + median = 'median' class StandardError(BaseModel): - value: float = Field(..., description="The standard error value") + value: float = Field(..., description='The standard error value') method: str | None = Field( None, description="How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')", @@ -84,64 +95,72 @@ class StandardError(BaseModel): class ConfidenceInterval(BaseModel): - lower: float = Field(..., description="Lower bound of the confidence interval") - upper: float = Field(..., description="Upper bound of the confidence interval") + lower: float = Field( + ..., description='Lower bound of the confidence interval' + ) + upper: float = Field( + ..., description='Upper bound of the confidence interval' + ) confidence_level: confloat(ge=0.0, le=1.0) | None = Field( - None, description="Confidence level (e.g. 0.95 for a 95% confidence interval)" + None, + description='Confidence level (e.g. 0.95 for a 95% confidence interval)', ) method: str | None = Field( - None, description="How the confidence interval was computed" + None, description='How the confidence interval was computed' ) class Uncertainty(BaseModel): standard_error: StandardError | None = Field( None, - description="Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))", + description='Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))', ) confidence_interval: ConfidenceInterval | None = Field( None, - description="Lower and upper bounds for the metric at a given confidence level.", + description='Lower and upper bounds for the metric at a given confidence level.', ) standard_deviation: float | None = Field( - None, description="Standard deviation of the per-sample scores" + None, description='Standard deviation of the per-sample scores' ) num_samples: int | None = Field( - None, description="Number of samples used to compute the uncertainty estimates" + None, + description='Number of samples used to compute the uncertainty estimates', ) num_bootstrap_samples: int | None = Field( None, - description="Number of bootstrap resamples used, if bootstrap method was applied", + description='Number of bootstrap resamples used, if bootstrap method was applied', ) class ScoreDetails(BaseModel): - score: float = Field(..., description="The score for the evaluation") + score: float = Field(..., description='The score for the evaluation') details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) uncertainty: Uncertainty | None = Field( - None, description="Quantification of uncertainty around the reported score" + None, + description='Quantification of uncertainty around the reported score', ) class AvailableTool(BaseModel): - name: str | None = Field(None, description="e.g. bash, calculator, ...") + name: str | None = Field(None, description='e.g. bash, calculator, ...') description: str | None = None parameters: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) class AgenticEvalConfig(BaseModel): available_tools: list[AvailableTool] | None = Field( - None, description="List of all available tools with their configurations" + None, + description='List of all available tools with their configurations', ) additional_details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) @@ -149,174 +168,190 @@ class EvalPlan(BaseModel): name: str | None = None steps: list[str] | None = Field( None, - description="Array of evaluation plan steps (each step is a JSON-serialized string)", + description='Array of evaluation plan steps (each step is a JSON-serialized string)', ) config: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) class EvalLimits(BaseModel): - time_limit: int | None = Field(None, description="Time limit for evaluation.") - message_limit: int | None = Field(None, description="Message limit for evaluation.") - token_limit: int | None = Field(None, description="Token limit for evaluation.") + time_limit: int | None = Field( + None, description='Time limit for evaluation.' + ) + message_limit: int | None = Field( + None, description='Message limit for evaluation.' + ) + token_limit: int | None = Field( + None, description='Token limit for evaluation.' + ) class Sandbox(BaseModel): - type: str | None = Field(None, description="Type of sandbox e.g. docker") + type: str | None = Field(None, description='Type of sandbox e.g. docker') config: str | None = Field( None, - description="Config file name/path e.g. compose.yaml. TODO or full config? Not sure based on the Inspect docs", + description='Config file name/path e.g. compose.yaml. TODO or full config? Not sure based on the Inspect docs', ) class GenerationArgs(BaseModel): model_config = ConfigDict( - extra="forbid", + extra='forbid', ) - temperature: float | None = Field(None, description="Sampling temperature") - top_p: float | None = Field(None, description="Nucleus sampling parameter") - top_k: float | None = Field(None, description="Top-k sampling parameter") + temperature: float | None = Field(None, description='Sampling temperature') + top_p: float | None = Field(None, description='Nucleus sampling parameter') + top_k: float | None = Field(None, description='Top-k sampling parameter') max_tokens: conint(ge=1) | None = Field( - None, description="Maximum number of tokens to generate" + None, description='Maximum number of tokens to generate' ) execution_command: str | None = Field( - None, description="Command used to run the model to generate results" + None, description='Command used to run the model to generate results' ) reasoning: bool | None = Field( None, - description="Whether reasoning orchain-of-thought was used to generate results", + description='Whether reasoning orchain-of-thought was used to generate results', ) prompt_template: str | None = Field( None, - description="Input prompt template for task (should contain agentic info if needed).", + description='Input prompt template for task (should contain agentic info if needed).', ) agentic_eval_config: AgenticEvalConfig | None = Field( - None, description="General configuration for agentic evaluations." + None, description='General configuration for agentic evaluations.' ) eval_plan: EvalPlan | None = Field( None, - description="Plan (solvers) used in evaluation. Solvers are crucial parts of Inspect evaluations which can serve a wide variety of purposes like providing system prompts, prompt engineering, model generation or multi-turn dialog.", + description='Plan (solvers) used in evaluation. Solvers are crucial parts of Inspect evaluations which can serve a wide variety of purposes like providing system prompts, prompt engineering, model generation or multi-turn dialog.', ) eval_limits: EvalLimits | None = Field( None, - description="Listed evaluation limits like time limit, message limit, token limit.", + description='Listed evaluation limits like time limit, message limit, token limit.', ) sandbox: Sandbox | None = None max_attempts: int | None = Field( - 1, description="Maximum number of submission attempts (default 1)." + 1, description='Maximum number of submission attempts (default 1).' ) incorrect_attempt_feedback: str | None = Field( - None, description="Feedback from the model after incorrect attempt." + None, description='Feedback from the model after incorrect attempt.' ) class GenerationConfig(BaseModel): generation_args: GenerationArgs | None = Field( None, - description="Parameters used to generate results - properties may vary by model type", + description='Parameters used to generate results - properties may vary by model type', ) additional_details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) class Format(Enum): - jsonl = "jsonl" - json = "json" + jsonl = 'jsonl' + json = 'json' class HashAlgorithm(Enum): - sha256 = "sha256" - md5 = "md5" + sha256 = 'sha256' + md5 = 'md5' class DetailedEvaluationResults(BaseModel): format: Format | None = Field( - None, description="Format of the detailed evaluation results" + None, description='Format of the detailed evaluation results' ) file_path: str | None = Field( - None, description="Path to the detailed evaluation results file" + None, description='Path to the detailed evaluation results file' ) hash_algorithm: HashAlgorithm | None = Field( None, - description="Hash algorithm used for checksum and sample_hash in instance-level data", + description='Hash algorithm used for checksum and sample_hash in instance-level data', ) - checksum: str | None = Field(None, description="Checksum value of the file") + checksum: str | None = Field(None, description='Checksum value of the file') total_rows: int | None = Field( - None, description="Total number of rows in the detailed evaluation results file" + None, + description='Total number of rows in the detailed evaluation results file', ) additional_details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) class InferenceEngine(BaseModel): - name: str | None = Field(None, description="Name of the inference engine") - version: str | None = Field(None, description="Version of the inference engine") + name: str | None = Field(None, description='Name of the inference engine') + version: str | None = Field( + None, description='Version of the inference engine' + ) class ModelInfo(BaseModel): - name: str = Field(..., description="Model name provided by evaluation source") + name: str = Field( + ..., description='Model name provided by evaluation source' + ) id: str = Field( ..., - description="Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)", + description='Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)', ) developer: str | None = Field( - None, description="Name of organization that provides the model (e.g. 'OpenAI')" + None, + description="Name of organization that provides the model (e.g. 'OpenAI')", ) inference_platform: str | None = Field( None, - description="Name of inference platform which provides an access to models by API to run the evaluations or provides models weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)", + description='Name of inference platform which provides an access to models by API to run the evaluations or provides models weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)', ) inference_engine: InferenceEngine | None = Field( None, - description="Name of inference engine which provides an access to optimized models to use them for local evaluations (e.g. vLLM, Ollama).", + description='Name of inference engine which provides an access to optimized models to use them for local evaluations (e.g. vLLM, Ollama).', ) additional_details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) class SourceDataUrl(BaseModel): - dataset_name: str = Field(..., description="Name of the source dataset") - source_type: Literal["url"] + dataset_name: str = Field(..., description='Name of the source dataset') + source_type: Literal['url'] url: list[str] = Field( - ..., description="URL(s) for the source of the evaluation data", min_length=1 + ..., + description='URL(s) for the source of the evaluation data', + min_length=1, ) additional_details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) class SourceDataHf(BaseModel): - dataset_name: str = Field(..., description="Name of the source dataset") - source_type: Literal["hf_dataset"] - hf_repo: str | None = Field(None, description="HuggingFace repository identifier") - hf_split: str | None = Field(None, description="One of train, val or test.") + dataset_name: str = Field(..., description='Name of the source dataset') + source_type: Literal['hf_dataset'] + hf_repo: str | None = Field( + None, description='HuggingFace repository identifier' + ) + hf_split: str | None = Field(None, description='One of train, val or test.') samples_number: int | None = Field( - None, description="Number of samples in the dataset" + None, description='Number of samples in the dataset' ) sample_ids: list[str] | None = Field( - None, description="Array of sample ids used for evaluation" + None, description='Array of sample ids used for evaluation' ) additional_details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) class SourceDataPrivate(BaseModel): - dataset_name: str = Field(..., description="Name of the source dataset") - source_type: Literal["other"] + dataset_name: str = Field(..., description='Name of the source dataset') + source_type: Literal['other'] additional_details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) @@ -324,89 +359,99 @@ class JudgeConfig(BaseModel): model_info: ModelInfo temperature: float | None = None weight: float | None = Field( - None, description="Weight of this judge's score in aggregation (used in jury)" + None, + description="Weight of this judge's score in aggregation (used in jury)", ) additional_details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) class LlmScoring(BaseModel): judges: list[JudgeConfig] = Field( ..., - description="LLM judge(s) - single item for judge, multiple for jury", + description='LLM judge(s) - single item for judge, multiple for jury', min_length=1, ) - input_prompt: str = Field(..., description="Prompt template used for judging") + input_prompt: str = Field( + ..., description='Prompt template used for judging' + ) aggregation_method: AggregationMethod | None = Field( - None, description="How to aggregate scores when multiple judges" + None, description='How to aggregate scores when multiple judges' ) expert_baseline: float | None = Field( - None, description="Expert/human baseline score for comparison" + None, description='Expert/human baseline score for comparison' ) additional_details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) class MetricConfig(BaseModel): evaluation_description: str | None = Field( - None, description="Description of the evaluation" + None, description='Description of the evaluation' ) metric_id: str | None = Field( None, - description="Stable metric identifier for joining/deduping/querying. Use a canonical global id when applicable (e.g. accuracy, f1_macro, auroc, rmse, pass_at_k). For benchmark/leaderboard-specific metrics, use a namespaced id (e.g. rewardbench.overall, lmarena.elo).", + description='Stable metric identifier for joining/deduping/querying. Use a canonical global id when applicable (e.g. accuracy, f1_macro, auroc, rmse, pass_at_k). For benchmark/leaderboard-specific metrics, use a namespaced id (e.g. rewardbench.overall, lmarena.elo).', ) metric_name: str | None = Field( None, - description="Display name for the metric (e.g. Accuracy, F1-macro, pass@1).", + description='Display name for the metric (e.g. Accuracy, F1-macro, pass@1).', ) metric_kind: str | None = Field( None, - description="Normalized metric family/type used for safe aggregation (e.g. accuracy, f1, auroc, rmse, mae, pass_rate, elo).", + description='Normalized metric family/type used for safe aggregation (e.g. accuracy, f1, auroc, rmse, mae, pass_rate, elo).', ) metric_unit: str | None = Field( None, - description="Unit of the metric values (e.g. proportion, percent, points, ms, tokens).", + description='Unit of the metric values (e.g. proportion, percent, points, ms, tokens).', ) metric_parameters: dict[str, str | float | bool | None] | None = Field( - None, description='Metric-specific parameters (e.g. {"k": 1} for pass@k).' + None, + description='Metric-specific parameters (e.g. {"k": 1} for pass@k).', + ) + lower_is_better: bool = Field( + ..., description='Whether a lower score is better' + ) + score_type: ScoreType | None = Field(None, description='Type of score') + level_names: list[str] | None = Field( + None, description='Names of the score levels' ) - lower_is_better: bool = Field(..., description="Whether a lower score is better") - score_type: ScoreType | None = Field(None, description="Type of score") - level_names: list[str] | None = Field(None, description="Names of the score levels") level_metadata: list[str] | None = Field( - None, description="Additional Description for each Score Level" + None, description='Additional Description for each Score Level' ) has_unknown_level: bool | None = Field( None, - description="Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown", + description='Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown', ) min_score: float | None = Field( - None, description="Minimum possible score for continuous metric" + None, description='Minimum possible score for continuous metric' ) max_score: float | None = Field( - None, description="Maximum possible score for continuous metric" + None, description='Maximum possible score for continuous metric' ) llm_scoring: LlmScoring | None = Field( - None, description="Configuration when LLM is used as scorer/judge" + None, description='Configuration when LLM is used as scorer/judge' ) additional_details: dict[str, str] | None = Field( None, - description="Additional parameters (key-value pairs, all values must be strings)", + description='Additional parameters (key-value pairs, all values must be strings)', ) # --- validators (added by post_codegen.py) --- - @model_validator(mode="after") + @model_validator(mode='after') def validate_score_type_requirements(self): if self.score_type == ScoreType.levels: if self.level_names is None: raise ValueError("score_type 'levels' requires level_names") if self.has_unknown_level is None: - raise ValueError("score_type 'levels' requires has_unknown_level") + raise ValueError( + "score_type 'levels' requires has_unknown_level" + ) elif self.score_type == ScoreType.continuous: if self.min_score is None: raise ValueError("score_type 'continuous' requires min_score") @@ -414,55 +459,62 @@ def validate_score_type_requirements(self): raise ValueError("score_type 'continuous' requires max_score") return self + class EvaluationResult(BaseModel): evaluation_result_id: str | None = Field( None, - description="Stable identifier for this metric result inside an evaluation run. Recommended deterministic join key for instance-level records.", + description='Stable identifier for this metric result inside an evaluation run. Recommended deterministic join key for instance-level records.', ) - evaluation_name: str = Field(..., description="Name of the evaluation") - source_data: Annotated[SourceDataUrl | SourceDataHf | SourceDataPrivate, Discriminator("source_type")] = Field( + evaluation_name: str = Field(..., description='Name of the evaluation') + source_data: Annotated[ + SourceDataUrl | SourceDataHf | SourceDataPrivate, + Discriminator('source_type'), + ] = Field( ..., - description="Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.", + description='Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.', ) evaluation_timestamp: str | None = Field( - None, description="Timestamp for when the evaluations were run" + None, description='Timestamp for when the evaluations were run' + ) + metric_config: MetricConfig = Field( + ..., description='Details about the metric' ) - metric_config: MetricConfig = Field(..., description="Details about the metric") score_details: ScoreDetails = Field( - ..., description="The score for the evaluation and related details" + ..., description='The score for the evaluation and related details' ) generation_config: GenerationConfig | None = None class EvaluationLog(BaseModel): model_config = ConfigDict( - extra="forbid", + extra='forbid', ) schema_version: str = Field( - ..., description="Version of the schema used for this evaluation data" + ..., description='Version of the schema used for this evaluation data' ) evaluation_id: str = Field( ..., - description="Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format", + description='Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format', ) evaluation_timestamp: str | None = Field( - None, description="Timestamp for when the evaluation was run" + None, description='Timestamp for when the evaluation was run' ) retrieved_timestamp: str = Field( ..., - description="Timestamp for when this record was created - using Unix Epoch time format", + description='Timestamp for when this record was created - using Unix Epoch time format', ) source_metadata: SourceMetadata = Field( - ..., description="Metadata about the source of the leaderboard data" + ..., description='Metadata about the source of the leaderboard data' ) eval_library: EvalLibrary = Field( - ..., description="Evaluation library/framework used to run the evaluation" + ..., + description='Evaluation library/framework used to run the evaluation', ) model_info: ModelInfo evaluation_results: list[EvaluationResult] = Field( - ..., description="Array of evaluation results" + ..., description='Array of evaluation results' ) detailed_evaluation_results: DetailedEvaluationResults | None = Field( None, - description="Reference to the evaluation results for all individual samples in the evaluation", + description='Reference to the evaluation results for all individual samples in the evaluation', ) diff --git a/every_eval_ever/helpers/__init__.py b/every_eval_ever/helpers/__init__.py index 9056cbe37..84dcc0e8f 100644 --- a/every_eval_ever/helpers/__init__.py +++ b/every_eval_ever/helpers/__init__.py @@ -1,34 +1,34 @@ """Shared utilities for evaluation data adapters.""" from .developer import get_developer, get_model_id -from .fetch import fetch_json, fetch_csv, FetchError -from .io import save_evaluation_log, generate_output_path, sanitize_filename +from .fetch import FetchError, fetch_csv, fetch_json +from .io import generate_output_path, sanitize_filename, save_evaluation_log from .schema import ( SCHEMA_VERSION, - make_metric_config, + make_evaluation_log, make_evaluation_result, - make_source_metadata, + make_metric_config, make_model_info, - make_evaluation_log, + make_source_metadata, ) __all__ = [ # developer.py - "get_developer", - "get_model_id", + 'get_developer', + 'get_model_id', # fetch.py - "fetch_json", - "fetch_csv", - "FetchError", + 'fetch_json', + 'fetch_csv', + 'FetchError', # io.py - "save_evaluation_log", - "generate_output_path", - "sanitize_filename", + 'save_evaluation_log', + 'generate_output_path', + 'sanitize_filename', # schema.py - "SCHEMA_VERSION", - "make_metric_config", - "make_evaluation_result", - "make_source_metadata", - "make_model_info", - "make_evaluation_log", + 'SCHEMA_VERSION', + 'make_metric_config', + 'make_evaluation_result', + 'make_source_metadata', + 'make_model_info', + 'make_evaluation_log', ] diff --git a/every_eval_ever/helpers/developer.py b/every_eval_ever/helpers/developer.py index b54a3b251..861a583a6 100644 --- a/every_eval_ever/helpers/developer.py +++ b/every_eval_ever/helpers/developer.py @@ -4,82 +4,73 @@ DEVELOPER_PATTERNS = { # OpenAI models - "gpt": "openai", - "text-davinci": "openai", - "text-curie": "openai", - "text-babbage": "openai", - "text-ada": "openai", - "davinci": "openai", - "curie": "openai", - "babbage": "openai", - "ada": "openai", - "o1": "openai", - "o3": "openai", - "o4": "openai", - + 'gpt': 'openai', + 'text-davinci': 'openai', + 'text-curie': 'openai', + 'text-babbage': 'openai', + 'text-ada': 'openai', + 'davinci': 'openai', + 'curie': 'openai', + 'babbage': 'openai', + 'ada': 'openai', + 'o1': 'openai', + 'o3': 'openai', + 'o4': 'openai', # Anthropic models - "claude": "anthropic", - + 'claude': 'anthropic', # Google models - "gemini": "google", - "gemma": "google", - "palm": "google", - "t5": "google", - "ul2": "google", - "text-bison": "google", - "text-unicorn": "google", - + 'gemini': 'google', + 'gemma': 'google', + 'palm': 'google', + 't5': 'google', + 'ul2': 'google', + 'text-bison': 'google', + 'text-unicorn': 'google', # Meta models - "llama": "meta", - "opt": "meta", - + 'llama': 'meta', + 'opt': 'meta', # Mistral models - "mistral": "mistralai", - "mixtral": "mistralai", - + 'mistral': 'mistralai', + 'mixtral': 'mistralai', # Alibaba models - "qwen": "alibaba", - + 'qwen': 'alibaba', # Microsoft models - "phi": "microsoft", - "tnlg": "microsoft", - + 'phi': 'microsoft', + 'tnlg': 'microsoft', # AI21 models - "j1": "ai21", - "j2": "ai21", - "jamba": "ai21", - "jurassic": "ai21", - + 'j1': 'ai21', + 'j2': 'ai21', + 'jamba': 'ai21', + 'jurassic': 'ai21', # Cohere models - "command": "cohere", - "cohere": "cohere", - "aya": "cohere", - "granite": "ibm", - + 'command': 'cohere', + 'cohere': 'cohere', + 'aya': 'cohere', + 'granite': 'ibm', # Other providers - "falcon": "tiiuae", - "bloom": "bigscience", - "t0pp": "bigscience", - "pythia": "eleutherai", - "gpt-j": "eleutherai", - "gpt-neox": "eleutherai", - "luminous": "aleph-alpha", - "mpt": "mosaicml", - "redpajama": "together", - "vicuna": "lmsys", - "alpaca": "stanford", - "palmyra": "writer", - "instructpalmyra": "writer", - "yalm": "yandex", - "glm": "zhipu-ai", - "deepseek": "deepseek", - "yi": "01-ai", - "solar": "upstage", - "arctic": "snowflake", - "dbrx": "databricks", - "olmo": "allenai", - "nova": "amazon", - "grok": "xai", + 'falcon': 'tiiuae', + 'bloom': 'bigscience', + 't0pp': 'bigscience', + 'pythia': 'eleutherai', + 'gpt-j': 'eleutherai', + 'gpt-neox': 'eleutherai', + 'luminous': 'aleph-alpha', + 'mpt': 'mosaicml', + 'redpajama': 'together', + 'vicuna': 'lmsys', + 'alpaca': 'stanford', + 'palmyra': 'writer', + 'instructpalmyra': 'writer', + 'yalm': 'yandex', + 'glm': 'zhipu-ai', + 'deepseek': 'deepseek', + 'yi': '01-ai', + 'solar': 'upstage', + 'arctic': 'snowflake', + 'dbrx': 'databricks', + 'olmo': 'allenai', + 'nova': 'amazon', + 'grok': 'xai', } @@ -108,19 +99,19 @@ def get_developer(model_name: str) -> str: "unknown" """ if not model_name: - return "unknown" + return 'unknown' # If already has org prefix (e.g., "meta-llama/Llama-3-8B"), use it - if "/" in model_name: - return model_name.split("/")[0] + if '/' in model_name: + return model_name.split('/')[0] # Pattern match against known model families lower_name = model_name.lower() for pattern, developer in DEVELOPER_PATTERNS.items(): - if lower_name.startswith(pattern) or f"-{pattern}" in lower_name: + if lower_name.startswith(pattern) or f'-{pattern}' in lower_name: return developer - return "unknown" + return 'unknown' def get_model_id(model_name: str, developer: Optional[str] = None) -> str: @@ -140,8 +131,8 @@ def get_model_id(model_name: str, developer: Optional[str] = None) -> str: >>> get_model_id("openai/gpt-4") "openai/gpt-4" """ - if "/" in model_name: + if '/' in model_name: return model_name dev = developer or get_developer(model_name) - return f"{dev}/{model_name}" + return f'{dev}/{model_name}' diff --git a/every_eval_ever/helpers/fetch.py b/every_eval_ever/helpers/fetch.py index cbaa5a8a0..ef48de570 100644 --- a/every_eval_ever/helpers/fetch.py +++ b/every_eval_ever/helpers/fetch.py @@ -6,13 +6,15 @@ import requests - DEFAULT_TIMEOUT = 60 # seconds + class FetchError(Exception): """Raised when fetching data from a remote source fails.""" + pass + def fetch_json( url: str, timeout: int = DEFAULT_TIMEOUT, @@ -37,9 +39,9 @@ def fetch_json( response.raise_for_status() return response.json() except requests.exceptions.RequestException as e: - raise FetchError(f"Failed to fetch {url}: {e}") from e + raise FetchError(f'Failed to fetch {url}: {e}') from e except ValueError as e: - raise FetchError(f"Failed to parse JSON from {url}: {e}") from e + raise FetchError(f'Failed to parse JSON from {url}: {e}') from e def fetch_csv( @@ -62,11 +64,13 @@ def fetch_csv( FetchError: If the request fails or returns non-200 status """ try: - response = requests.get(url, timeout=timeout, headers=headers, allow_redirects=True) + response = requests.get( + url, timeout=timeout, headers=headers, allow_redirects=True + ) response.raise_for_status() reader = csv.DictReader(io.StringIO(response.text)) return list(reader) except requests.exceptions.RequestException as e: - raise FetchError(f"Failed to fetch {url}: {e}") from e + raise FetchError(f'Failed to fetch {url}: {e}') from e except csv.Error as e: - raise FetchError(f"Failed to parse CSV from {url}: {e}") from e + raise FetchError(f'Failed to parse CSV from {url}: {e}') from e diff --git a/every_eval_ever/helpers/io.py b/every_eval_ever/helpers/io.py index 2427229b8..88ac19730 100644 --- a/every_eval_ever/helpers/io.py +++ b/every_eval_ever/helpers/io.py @@ -21,7 +21,7 @@ def sanitize_filename(name: str) -> str: Sanitized string safe for filesystem use """ # Replace characters invalid on Windows/Unix filesystems - return re.sub(r'[<>:"/\\|?*]', "_", name) + return re.sub(r'[<>:"/\\|?*]', '_', name) def generate_output_path( @@ -75,7 +75,7 @@ def save_evaluation_log( dir_path = generate_output_path(base_dir, developer, model_name) dir_path.mkdir(parents=True, exist_ok=True) - filename = f"{uuid.uuid4()}.json" + filename = f'{uuid.uuid4()}.json' filepath = dir_path / filename json_str = eval_log.model_dump_json(indent=2, exclude_none=True) diff --git a/every_eval_ever/helpers/schema.py b/every_eval_ever/helpers/schema.py index fa228292c..12b7eb190 100644 --- a/every_eval_ever/helpers/schema.py +++ b/every_eval_ever/helpers/schema.py @@ -7,9 +7,9 @@ def _load_schema_version() -> str: - schema_path = Path(__file__).parent.parent.parent / "eval.schema.json" + schema_path = Path(__file__).parent.parent.parent / 'eval.schema.json' with schema_path.open() as f: - return json.load(f)["version"] + return json.load(f)['version'] SCHEMA_VERSION = _load_schema_version() @@ -121,7 +121,7 @@ def make_evaluation_result( def make_source_metadata( source_name: str, organization_name: str, - source_type: str = "documentation", + source_type: str = 'documentation', evaluator_relationship: EvaluatorRelationship = EvaluatorRelationship.third_party, organization_url: Optional[str] = None, additional_details: Optional[Dict[str, str]] = None, @@ -153,7 +153,7 @@ def make_source_metadata( def make_model_info( model_name: str, developer: Optional[str] = None, - inference_platform: str = "unknown", + inference_platform: str = 'unknown', additional_details: Optional[Dict[str, Any]] = None, ) -> ModelInfo: """ @@ -188,11 +188,11 @@ def make_evaluation_log( evaluation_results: List[EvaluationResult], source_data: List[str], organization_name: str, - source_type: str = "documentation", + source_type: str = 'documentation', evaluator_relationship: EvaluatorRelationship = EvaluatorRelationship.third_party, organization_url: Optional[str] = None, developer: Optional[str] = None, - inference_platform: str = "unknown", + inference_platform: str = 'unknown', model_additional_details: Optional[Dict[str, Any]] = None, retrieved_timestamp: Optional[str] = None, ) -> EvaluationLog: @@ -223,8 +223,8 @@ def make_evaluation_log( model_id = get_model_id(model_name, dev) # Build evaluation_id: source_name/model_id_sanitized/timestamp - sanitized_model_id = model_id.replace("/", "_") - evaluation_id = f"{source_name}/{sanitized_model_id}/{timestamp}" + sanitized_model_id = model_id.replace('/', '_') + evaluation_id = f'{source_name}/{sanitized_model_id}/{timestamp}' return EvaluationLog( schema_version=SCHEMA_VERSION, diff --git a/every_eval_ever/instance_level_types.py b/every_eval_ever/instance_level_types.py index e43b5b12f..ff78d3420 100644 --- a/every_eval_ever/instance_level_types.py +++ b/every_eval_ever/instance_level_types.py @@ -3,216 +3,239 @@ # timestamp: 2026-03-19T20:30:15+00:00 from __future__ import annotations + from enum import Enum -from pydantic import BaseModel, ConfigDict, Field, confloat, conint, model_validator from typing import Any +from pydantic import ( + BaseModel, + ConfigDict, + Field, + confloat, + conint, + model_validator, +) + class InteractionType(Enum): - single_turn = "single_turn" - multi_turn = "multi_turn" - agentic = "agentic" + single_turn = 'single_turn' + multi_turn = 'multi_turn' + agentic = 'agentic' class Input(BaseModel): - raw: str = Field(..., description="The raw input as defined in the eval") + raw: str = Field(..., description='The raw input as defined in the eval') formatted: str | None = Field( None, - description="Includes chat template, CoT and all relevant modifications - basically the un-tokenized version of what the model sees", + description='Includes chat template, CoT and all relevant modifications - basically the un-tokenized version of what the model sees', ) reference: list[str] = Field( - ..., description="Ground truths or reference answers for comparison/scoring" + ..., + description='Ground truths or reference answers for comparison/scoring', ) choices: list[str] | None = Field( - None, description="Optional list of choices for multiple-choice questions" + None, + description='Optional list of choices for multiple-choice questions', ) class Output(BaseModel): - raw: list[str] = Field(..., description="Complete model responses") + raw: list[str] = Field(..., description='Complete model responses') reasoning_trace: list[str] | None = Field( None, - description="Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)", + description='Reasoning traces of the model if applicable (e.g. chain-of-thought tokens)', ) class ToolCall(BaseModel): - id: str = Field(..., description="Unique identifier for the tool call") - name: str = Field(..., description="Name of tool/function") + id: str = Field(..., description='Unique identifier for the tool call') + name: str = Field(..., description='Name of tool/function') arguments: dict[str, Any] | None = Field( - None, description="Arguments used to call the tool (all values must be strings)" + None, + description='Arguments used to call the tool (all values must be strings)', ) class Message(BaseModel): turn_idx: conint(ge=0) = Field( ..., - description="Index starting from 0 indicating the position in the conversation", + description='Index starting from 0 indicating the position in the conversation', ) role: str = Field( - ..., description="Role of the speaker (e.g. user, assistant, system, tool)" + ..., + description='Role of the speaker (e.g. user, assistant, system, tool)', ) content: str | None = Field( None, - description="The actual raw text for that particular turn (can be null if empty)", + description='The actual raw text for that particular turn (can be null if empty)', ) reasoning_trace: str | None = Field( - None, description="Reasoning trace for that particular turn if applicable" + None, + description='Reasoning trace for that particular turn if applicable', ) tool_calls: list[ToolCall] | None = Field( - None, description="List of tool invocations for this turn, if applicable" + None, + description='List of tool invocations for this turn, if applicable', ) tool_call_id: list[str] | None = Field( None, - description="Reference to the tool call ID(s) this message has the content payload for.", + description='Reference to the tool call ID(s) this message has the content payload for.', ) class AnswerAttributionItem(BaseModel): turn_idx: conint(ge=0) = Field( - ..., description="Turn index in messages. 0 for single_turn" + ..., description='Turn index in messages. 0 for single_turn' ) source: str = Field( ..., description="Source of the extracted value (e.g. 'output.raw' or 'messages[turn_idx].content')", ) - extracted_value: str = Field(..., description="Value that was extracted") + extracted_value: str = Field(..., description='Value that was extracted') extraction_method: str = Field( ..., - description="Method used to extract the value (e.g. regex, exact_match, llm_judge, custom)", + description='Method used to extract the value (e.g. regex, exact_match, llm_judge, custom)', ) is_terminal: bool = Field( ..., - description="Whether this is the final answer (false if intermediate outputs are used to build up to a final answer)", + description='Whether this is the final answer (false if intermediate outputs are used to build up to a final answer)', ) class Evaluation(BaseModel): - score: float = Field(..., description="Instance-level score") - is_correct: bool = Field(..., description="Whether the final answer is correct") + score: float = Field(..., description='Instance-level score') + is_correct: bool = Field( + ..., description='Whether the final answer is correct' + ) num_turns: conint(ge=1) | None = Field( - None, description="Number of turns in the interaction" + None, description='Number of turns in the interaction' ) tool_calls_count: conint(ge=0) | None = Field( - None, description="Count of tool calls across all turns in messages" + None, description='Count of tool calls across all turns in messages' ) class TokenUsage(BaseModel): - input_tokens: conint(ge=0) = Field(..., description="Total input tokens used") - output_tokens: conint(ge=0) = Field(..., description="Total output tokens used") - total_tokens: conint(ge=0) = Field(..., description="Total tokens used") + input_tokens: conint(ge=0) = Field( + ..., description='Total input tokens used' + ) + output_tokens: conint(ge=0) = Field( + ..., description='Total output tokens used' + ) + total_tokens: conint(ge=0) = Field(..., description='Total tokens used') input_tokens_cache_write: conint(ge=0) | None = Field( - None, description="Number of tokens written to the cache" + None, description='Number of tokens written to the cache' ) input_tokens_cache_read: conint(ge=0) | None = Field( - None, description="Number of tokens retrieved from the cache" + None, description='Number of tokens retrieved from the cache' ) reasoning_tokens: conint(ge=0) | None = Field( - None, description="Number of tokens used for reasoning" + None, description='Number of tokens used for reasoning' ) class Performance(BaseModel): latency_ms: confloat(ge=0.0) | None = Field( - None, description="Total latency in milliseconds" + None, description='Total latency in milliseconds' ) time_to_first_token_ms: confloat(ge=0.0) | None = Field( - None, description="Time to first token in milliseconds" + None, description='Time to first token in milliseconds' ) generation_time_ms: confloat(ge=0.0) | None = Field( - None, description="Time for generation in milliseconds" + None, description='Time for generation in milliseconds' ) additional_details: dict[str, Any] | None = Field( None, - description="Additional performance metrics (key-value pairs, all values must be strings)", + description='Additional performance metrics (key-value pairs, all values must be strings)', ) class InstanceLevelEvaluationLog(BaseModel): model_config = ConfigDict( - extra="forbid", + extra='forbid', ) schema_version: str = Field( - ..., description="Version of the schema used for this instance data" + ..., description='Version of the schema used for this instance data' ) evaluation_id: str = Field( ..., - description="Foreign key linking to the aggregate evaluation JSON. Must match the evaluation_id in the aggregate file.", + description='Foreign key linking to the aggregate evaluation JSON. Must match the evaluation_id in the aggregate file.', ) model_id: str = Field( ..., - description="Identifier of the model in HuggingFace format (e.g. meta-llama/Llama-3.2-1B-Instruct)", + description='Identifier of the model in HuggingFace format (e.g. meta-llama/Llama-3.2-1B-Instruct)', ) evaluation_name: str = Field( ..., - description="The specific eval name, ideally unique (e.g. GSM8K, mmlu_physics). Primarily for display/filtering when evaluation_result_id is unavailable. If an evaluation run reports multiple aggregate results/metrics for the same sample, emit multiple instance records (one per aggregate result) rather than trying to attach a single instance record to many aggregate results.", + description='The specific eval name, ideally unique (e.g. GSM8K, mmlu_physics). Primarily for display/filtering when evaluation_result_id is unavailable. If an evaluation run reports multiple aggregate results/metrics for the same sample, emit multiple instance records (one per aggregate result) rather than trying to attach a single instance record to many aggregate results.', ) evaluation_result_id: str | None = Field( None, - description="Preferred deterministic foreign key to aggregate evaluation_results[].evaluation_result_id. This is intended to point to exactly one aggregate evaluation result; if a single underlying interaction/sample contributes to multiple aggregate results, emit multiple instance records with different evaluation_result_id values.", + description='Preferred deterministic foreign key to aggregate evaluation_results[].evaluation_result_id. This is intended to point to exactly one aggregate evaluation result; if a single underlying interaction/sample contributes to multiple aggregate results, emit multiple instance records with different evaluation_result_id values.', ) sample_id: str = Field( ..., - description="Question/sample identifier from the original dataset (e.g. gsm8k_0001)", + description='Question/sample identifier from the original dataset (e.g. gsm8k_0001)', ) sample_hash: str | None = Field( None, - description="Hash of (input.raw + input.reference) to ensure comparison is between the same sample across models, in case sample_id is not consistent", + description='Hash of (input.raw + input.reference) to ensure comparison is between the same sample across models, in case sample_id is not consistent', ) interaction_type: InteractionType = Field( ..., - description="Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents", + description='Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents', + ) + input: Input = Field( + ..., description='Input data for the evaluation sample' ) - input: Input = Field(..., description="Input data for the evaluation sample") output: Output | None = Field( None, - description="Output data - only used for single_turn interactions, null for multi_turn/agentic", + description='Output data - only used for single_turn interactions, null for multi_turn/agentic', ) messages: list[Message] | None = Field( None, - description="Full message transcript - used for multi_turn and agentic, null for single_turn. Contains all system, user, assistant, and tool messages in order.", + description='Full message transcript - used for multi_turn and agentic, null for single_turn. Contains all system, user, assistant, and tool messages in order.', ) answer_attribution: list[AnswerAttributionItem] = Field( ..., - description="Information about how the answer was extracted from the model output", + description='Information about how the answer was extracted from the model output', ) evaluation: Evaluation = Field( - ..., description="Evaluation results and scoring data" + ..., description='Evaluation results and scoring data' ) token_usage: TokenUsage | None = Field( - None, description="Token usage for the model completion" + None, description='Token usage for the model completion' ) performance: Performance | None = Field( - None, description="Performance and latency metrics" + None, description='Performance and latency metrics' ) error: str | None = Field( None, - description="Information about any error that occurred (e.g. timeout, refusal, API error)", + description='Information about any error that occurred (e.g. timeout, refusal, API error)', ) metadata: dict[str, Any] | None = Field( None, - description="Optional metadata about the sample (e.g. subject, difficulty, tags)", + description='Optional metadata about the sample (e.g. subject, difficulty, tags)', ) # --- validators (added by post_codegen.py) --- - @model_validator(mode="after") + @model_validator(mode='after') def validate_interaction_type_consistency(self): if self.interaction_type == InteractionType.single_turn: if self.output is None: - raise ValueError("single_turn interaction_type requires output") + raise ValueError('single_turn interaction_type requires output') if self.messages is not None: raise ValueError( - "single_turn interaction_type must not have messages" + 'single_turn interaction_type must not have messages' ) else: if self.messages is None: raise ValueError( - f"{self.interaction_type.value} interaction_type requires messages" + f'{self.interaction_type.value} interaction_type requires messages' ) if self.output is not None: raise ValueError( - f"{self.interaction_type.value} interaction_type must not have output" + f'{self.interaction_type.value} interaction_type must not have output' ) return self diff --git a/every_eval_ever/schema.py b/every_eval_ever/schema.py index 677bb8a0a..9400cd65d 100644 --- a/every_eval_ever/schema.py +++ b/every_eval_ever/schema.py @@ -8,12 +8,12 @@ from typing import Any -def schema_text(name: str = "eval.schema.json") -> str: +def schema_text(name: str = 'eval.schema.json') -> str: with schema_path(name) as path: - return path.read_text(encoding="utf-8") + return path.read_text(encoding='utf-8') -def schema_json(name: str = "eval.schema.json") -> dict[str, Any]: +def schema_json(name: str = 'eval.schema.json') -> dict[str, Any]: return json.loads(schema_text(name)) @@ -25,8 +25,8 @@ class _SchemaPathContext: resource as a temporary file for the lifetime of the context. """ - def __init__(self, name: str = "eval.schema.json") -> None: - resource = resources.files("every_eval_ever.schemas").joinpath(name) + def __init__(self, name: str = 'eval.schema.json') -> None: + resource = resources.files('every_eval_ever.schemas').joinpath(name) self._context = resources.as_file(resource) def __enter__(self) -> Path: @@ -37,5 +37,5 @@ def __exit__(self, exc_type, exc_value, traceback) -> None: return None -def schema_path(name: str = "eval.schema.json") -> _SchemaPathContext: +def schema_path(name: str = 'eval.schema.json') -> _SchemaPathContext: return _SchemaPathContext(name) diff --git a/every_eval_ever/validate.py b/every_eval_ever/validate.py index ac80ea80d..46f4d0a3b 100644 --- a/every_eval_ever/validate.py +++ b/every_eval_ever/validate.py @@ -38,7 +38,7 @@ class ValidationReport: file_path: Path valid: bool errors: list[dict] = field(default_factory=list) - file_type: str = "" # "aggregate" or "instance" + file_type: str = '' # "aggregate" or "instance" line_count: int = 0 # for JSONL files @@ -47,13 +47,13 @@ def _format_loc(loc: tuple) -> str: parts = [] for part in loc: if isinstance(part, int): - parts.append(f"[{part}]") + parts.append(f'[{part}]') else: if parts: - parts.append(f" -> {part}") + parts.append(f' -> {part}') else: parts.append(str(part)) - return "".join(parts) if parts else "(root)" + return ''.join(parts) if parts else '(root)' def _pydantic_errors_to_dicts(exc: ValidationError) -> list[dict]: @@ -62,10 +62,10 @@ def _pydantic_errors_to_dicts(exc: ValidationError) -> list[dict]: for err in exc.errors(): errors.append( { - "loc": _format_loc(err["loc"]), - "msg": err["msg"], - "type": err["type"], - "input": err.get("input"), + 'loc': _format_loc(err['loc']), + 'msg': err['msg'], + 'type': err['type'], + 'input': err.get('input'), } ) return errors @@ -73,13 +73,17 @@ def _pydantic_errors_to_dicts(exc: ValidationError) -> list[dict]: def validate_aggregate(file_path: Path) -> ValidationReport: """Validate a .json file as an EvaluationLog.""" - report = ValidationReport(file_path=file_path, valid=True, file_type="aggregate") + report = ValidationReport( + file_path=file_path, valid=True, file_type='aggregate' + ) try: - raw = file_path.read_text(encoding="utf-8") + raw = file_path.read_text(encoding='utf-8') except OSError as e: report.valid = False - report.errors.append({"loc": "(file)", "msg": str(e), "type": "io_error"}) + report.errors.append( + {'loc': '(file)', 'msg': str(e), 'type': 'io_error'} + ) return report try: @@ -88,9 +92,9 @@ def validate_aggregate(file_path: Path) -> ValidationReport: report.valid = False report.errors.append( { - "loc": f"line {e.lineno}, col {e.colno}", - "msg": e.msg, - "type": "json_parse_error", + 'loc': f'line {e.lineno}, col {e.colno}', + 'msg': e.msg, + 'type': 'json_parse_error', } ) return report @@ -111,9 +115,9 @@ def _validate_instance_line(line: str, line_num: int) -> list[dict]: except json.JSONDecodeError as e: return [ { - "loc": f"line {line_num}, col {e.colno}", - "msg": e.msg, - "type": "json_parse_error", + 'loc': f'line {line_num}, col {e.colno}', + 'msg': e.msg, + 'type': 'json_parse_error', } ] @@ -122,7 +126,7 @@ def _validate_instance_line(line: str, line_num: int) -> list[dict]: except ValidationError as e: errors = _pydantic_errors_to_dicts(e) for err in errors: - err["loc"] = f"line {line_num} -> {err['loc']}" + err['loc'] = f'line {line_num} -> {err["loc"]}' return errors return [] @@ -132,13 +136,17 @@ def validate_instance_file( file_path: Path, max_errors: int = DEFAULT_MAX_ERRORS ) -> ValidationReport: """Validate a .jsonl file as InstanceLevelEvaluationLog (line-by-line).""" - report = ValidationReport(file_path=file_path, valid=True, file_type="instance") + report = ValidationReport( + file_path=file_path, valid=True, file_type='instance' + ) try: - f = file_path.open(encoding="utf-8") + f = file_path.open(encoding='utf-8') except OSError as e: report.valid = False - report.errors.append({"loc": "(file)", "msg": str(e), "type": "io_error"}) + report.errors.append( + {'loc': '(file)', 'msg': str(e), 'type': 'io_error'} + ) return report with f: @@ -158,9 +166,9 @@ def validate_instance_file( if remaining <= 0: report.errors.append( { - "loc": "(truncated)", - "msg": f"Error limit reached ({max_errors}). Use --max-errors to increase.", - "type": "truncated", + 'loc': '(truncated)', + 'msg': f'Error limit reached ({max_errors}). Use --max-errors to increase.', + 'type': 'truncated', } ) break @@ -173,9 +181,9 @@ def validate_instance_file( if len(report.errors) >= max_errors: report.errors.append( { - "loc": "(truncated)", - "msg": f"Error limit reached ({max_errors}). Use --max-errors to increase.", - "type": "truncated", + 'loc': '(truncated)', + 'msg': f'Error limit reached ({max_errors}). Use --max-errors to increase.', + 'type': 'truncated', } ) break @@ -187,17 +195,19 @@ def validate_file( file_path: Path, max_errors: int = DEFAULT_MAX_ERRORS ) -> ValidationReport: """Dispatch validation by file extension.""" - if file_path.suffix == ".json": + if file_path.suffix == '.json': return validate_aggregate(file_path) - elif file_path.suffix == ".jsonl": + elif file_path.suffix == '.jsonl': return validate_instance_file(file_path, max_errors) else: - report = ValidationReport(file_path=file_path, valid=False, file_type="unsupported") + report = ValidationReport( + file_path=file_path, valid=False, file_type='unsupported' + ) report.errors.append( { - "loc": "(file)", - "msg": f"Unsupported file extension '{file_path.suffix}'. Expected .json or .jsonl", - "type": "unsupported_extension", + 'loc': '(file)', + 'msg': f"Unsupported file extension '{file_path.suffix}'. Expected .json or .jsonl", + 'type': 'unsupported_extension', } ) return report @@ -211,7 +221,7 @@ def expand_paths(paths: list[str]) -> list[Path]: if path.is_file(): result.append(path) elif path.is_dir(): - for ext in ("*.json", "*.jsonl"): + for ext in ('*.json', '*.jsonl'): result.extend(sorted(path.rglob(ext))) else: result.append(path) # let validate_file report the error @@ -222,7 +232,7 @@ def _truncate(value: object, max_len: int = 80) -> str: """Truncate a repr for display.""" s = repr(value) if len(s) > max_len: - return s[: max_len - 3] + "..." + return s[: max_len - 3] + '...' return s @@ -234,59 +244,73 @@ def _truncate(value: object, max_len: int = 80) -> str: def render_report_rich(report: ValidationReport, console: Console) -> None: """Render a single report as a rich panel.""" if report.valid: - label = Text(" PASS ", style="bold white on green") - kind = "Aggregate (EvaluationLog)" if report.file_type == "aggregate" else f"Instance (InstanceLevelEvaluationLog, {report.line_count} lines)" - header = Text.assemble(label, " ", (kind, "dim")) + label = Text(' PASS ', style='bold white on green') + kind = ( + 'Aggregate (EvaluationLog)' + if report.file_type == 'aggregate' + else f'Instance (InstanceLevelEvaluationLog, {report.line_count} lines)' + ) + header = Text.assemble(label, ' ', (kind, 'dim')) console.print( Panel( header, - title=f"[blue underline]{report.file_path}[/]", - title_align="left", - border_style="green", + title=f'[blue underline]{report.file_path}[/]', + title_align='left', + border_style='green', ) ) else: - label = Text(" FAIL ", style="bold white on red") - kind = "Aggregate (EvaluationLog)" if report.file_type == "aggregate" else "Instance (InstanceLevelEvaluationLog)" - header_line = Text.assemble(label, " ", (kind, "dim")) + label = Text(' FAIL ', style='bold white on red') + kind = ( + 'Aggregate (EvaluationLog)' + if report.file_type == 'aggregate' + else 'Instance (InstanceLevelEvaluationLog)' + ) + header_line = Text.assemble(label, ' ', (kind, 'dim')) - lines = [header_line, Text("")] + lines = [header_line, Text('')] for i, err in enumerate(report.errors, 1): - loc_text = Text(f" {i}. {err['loc']}", style="cyan") - msg_text = Text(f" {err['msg']}", style="default") + loc_text = Text(f' {i}. {err["loc"]}', style='cyan') + msg_text = Text(f' {err["msg"]}', style='default') lines.append(loc_text) lines.append(msg_text) - if "input" in err and err["input"] is not None: - got_text = Text(f" Got: {_truncate(err['input'])}", style="dim") + if 'input' in err and err['input'] is not None: + got_text = Text( + f' Got: {_truncate(err["input"])}', style='dim' + ) lines.append(got_text) - lines.append(Text("")) + lines.append(Text('')) - body = Text("\n").join(lines) + body = Text('\n').join(lines) console.print( Panel( body, - title=f"[blue underline]{report.file_path}[/]", - title_align="left", - border_style="red", + title=f'[blue underline]{report.file_path}[/]', + title_align='left', + border_style='red', ) ) -def render_summary_rich(reports: list[ValidationReport], console: Console) -> None: +def render_summary_rich( + reports: list[ValidationReport], console: Console +) -> None: """Render a summary panel.""" passed = sum(1 for r in reports if r.valid) failed = len(reports) - passed total_errors = sum(len(r.errors) for r in reports) if failed == 0: - style = "bold green" - msg = f"All {passed} file(s) passed validation" + style = 'bold green' + msg = f'All {passed} file(s) passed validation' else: - style = "bold red" - msg = f"{failed} file(s) failed, {passed} passed ({total_errors} total errors)" + style = 'bold red' + msg = f'{failed} file(s) failed, {passed} passed ({total_errors} total errors)' console.print() - console.print(Panel(Text(msg, style=style), title="Summary", border_style="dim")) + console.print( + Panel(Text(msg, style=style), title='Summary', border_style='dim') + ) def render_report_json(reports: list[ValidationReport]) -> str: @@ -295,11 +319,11 @@ def render_report_json(reports: list[ValidationReport]) -> str: for r in reports: output.append( { - "file": str(r.file_path), - "valid": r.valid, - "file_type": r.file_type, - "line_count": r.line_count, - "errors": r.errors, + 'file': str(r.file_path), + 'valid': r.valid, + 'file_type': r.file_type, + 'line_count': r.line_count, + 'errors': r.errors, } ) return json.dumps(output, indent=2, default=str) @@ -310,8 +334,10 @@ def render_report_github(reports: list[ValidationReport]) -> str: lines = [] for r in reports: for err in r.errors: - lines.append(f"::error file={r.file_path}::{err['loc']}: {err['msg']}") - return "\n".join(lines) + lines.append( + f'::error file={r.file_path}::{err["loc"]}: {err["msg"]}' + ) + return '\n'.join(lines) # --------------------------------------------------------------------------- @@ -321,46 +347,48 @@ def render_report_github(reports: list[ValidationReport]) -> str: def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser( - prog="eee-validate", - description="Validate EEE schema files using Pydantic models", + prog='eee-validate', + description='Validate EEE schema files using Pydantic models', ) parser.add_argument( - "paths", - nargs="+", - help="File or directory paths to validate (.json for aggregate, .jsonl for instance-level)", + 'paths', + nargs='+', + help='File or directory paths to validate (.json for aggregate, .jsonl for instance-level)', ) parser.add_argument( - "--max-errors", + '--max-errors', type=int, default=DEFAULT_MAX_ERRORS, - help=f"Maximum errors per JSONL file (default: {DEFAULT_MAX_ERRORS})", + help=f'Maximum errors per JSONL file (default: {DEFAULT_MAX_ERRORS})', ) parser.add_argument( - "--format", - choices=["rich", "json", "github"], - default="rich", - dest="output_format", - help="Output format (default: rich)", + '--format', + choices=['rich', 'json', 'github'], + default='rich', + dest='output_format', + help='Output format (default: rich)', ) args = parser.parse_args(argv) file_paths = expand_paths(args.paths) if not file_paths: - print("No files found to validate.", file=sys.stderr) + print('No files found to validate.', file=sys.stderr) return 1 - reports = [validate_file(fp, max_errors=args.max_errors) for fp in file_paths] + reports = [ + validate_file(fp, max_errors=args.max_errors) for fp in file_paths + ] - if args.output_format == "rich": + if args.output_format == 'rich': console = Console() console.print() for report in reports: render_report_rich(report, console) render_summary_rich(reports, console) console.print() - elif args.output_format == "json": + elif args.output_format == 'json': print(render_report_json(reports)) - elif args.output_format == "github": + elif args.output_format == 'github': output = render_report_github(reports) if output: print(output) @@ -368,5 +396,5 @@ def main(argv: list[str] | None = None) -> int: return 1 if any(not r.valid for r in reports) else 0 -if __name__ == "__main__": +if __name__ == '__main__': raise SystemExit(main()) diff --git a/post_codegen.py b/post_codegen.py index 103653be3..c355536e6 100644 --- a/post_codegen.py +++ b/post_codegen.py @@ -19,10 +19,10 @@ PATCHES = [ { - "file": "every_eval_ever/instance_level_types.py", - "import_add": "model_validator", - "class_name": "InstanceLevelEvaluationLog", - "validator": ''' + 'file': 'every_eval_ever/instance_level_types.py', + 'import_add': 'model_validator', + 'class_name': 'InstanceLevelEvaluationLog', + 'validator': """ # --- validators (added by post_codegen.py) --- @model_validator(mode="after") @@ -44,13 +44,13 @@ def validate_interaction_type_consistency(self): f"{self.interaction_type.value} interaction_type must not have output" ) return self -''', +""", }, { - "file": "every_eval_ever/eval_types.py", - "import_add": "model_validator", - "class_name": "MetricConfig", - "validator": ''' + 'file': 'every_eval_ever/eval_types.py', + 'import_add': 'model_validator', + 'class_name': 'MetricConfig', + 'validator': """ # --- validators (added by post_codegen.py) --- @model_validator(mode="after") @@ -66,7 +66,7 @@ def validate_score_type_requirements(self): if self.max_score is None: raise ValueError("score_type 'continuous' requires max_score") return self -''', +""", }, ] @@ -75,10 +75,10 @@ def validate_score_type_requirements(self): # --------------------------------------------------------------------------- DISCRIMINATOR_PATCH = { - "file": "every_eval_ever/eval_types.py", - "target_line": " source_data: SourceDataUrl | SourceDataHf | SourceDataPrivate = Field(", - "replacement": ' source_data: Annotated[SourceDataUrl | SourceDataHf | SourceDataPrivate, Discriminator("source_type")] = Field(', - "imports": ["Annotated", "Discriminator"], + 'file': 'every_eval_ever/eval_types.py', + 'target_line': ' source_data: SourceDataUrl | SourceDataHf | SourceDataPrivate = Field(', + 'replacement': ' source_data: Annotated[SourceDataUrl | SourceDataHf | SourceDataPrivate, Discriminator("source_type")] = Field(', + 'imports': ['Annotated', 'Discriminator'], } @@ -89,104 +89,110 @@ def add_import(content: str, symbol: str) -> str: def replacer(m): existing = m.group(1) - return f"from pydantic import {existing}, {symbol}" + return f'from pydantic import {existing}, {symbol}' - return re.sub(r"from pydantic import (.+)", replacer, content, count=1) + return re.sub(r'from pydantic import (.+)', replacer, content, count=1) -def append_to_last_class_field(content: str, class_name: str, validator_code: str) -> str: +def append_to_last_class_field( + content: str, class_name: str, validator_code: str +) -> str: """Append validator code after the last field of a class, before the next class or EOF.""" # Find the class definition - class_pattern = rf"^class {class_name}\(.*?\):" + class_pattern = rf'^class {class_name}\(.*?\):' class_match = re.search(class_pattern, content, re.MULTILINE) if not class_match: - raise ValueError(f"Class {class_name} not found") + raise ValueError(f'Class {class_name} not found') class_start = class_match.start() # Find the next class definition or EOF after this class - next_class = re.search(r"^\nclass ", content[class_start + 1:], re.MULTILINE) + next_class = re.search( + r'^\nclass ', content[class_start + 1 :], re.MULTILINE + ) if next_class: insert_pos = class_start + 1 + next_class.start() else: insert_pos = len(content) # Insert validator before the next class (or at EOF), replacing trailing whitespace - before = content[:insert_pos].rstrip("\n") + before = content[:insert_pos].rstrip('\n') after = content[insert_pos:] - return before + "\n" + validator_code + after + return before + '\n' + validator_code + after def patch_file(patch: dict) -> None: - path = Path(__file__).parent / patch["file"] + path = Path(__file__).parent / patch['file'] content = path.read_text() # Check if already patched - if "post_codegen.py" in content: - print(f" {patch['file']}: already patched, skipping") + if 'post_codegen.py' in content: + print(f' {patch["file"]}: already patched, skipping') return - content = add_import(content, patch["import_add"]) - content = append_to_last_class_field(content, patch["class_name"], patch["validator"]) + content = add_import(content, patch['import_add']) + content = append_to_last_class_field( + content, patch['class_name'], patch['validator'] + ) path.write_text(content) - print(f" {patch['file']}: patched {patch['class_name']}") + print(f' {patch["file"]}: patched {patch["class_name"]}') def apply_discriminator_patch(patch: dict) -> None: """Add Discriminator annotation to a union field for better error messages.""" - path = Path(__file__).parent / patch["file"] + path = Path(__file__).parent / patch['file'] content = path.read_text() # Check if the specific replacement has already been applied - if patch["replacement"] in content: - print(f" {patch['file']}: discriminator already patched, skipping") + if patch['replacement'] in content: + print(f' {patch["file"]}: discriminator already patched, skipping') return # Add imports - for symbol in patch["imports"]: - if symbol == "Annotated": - if "from typing import" in content: - if "Annotated" not in content: + for symbol in patch['imports']: + if symbol == 'Annotated': + if 'from typing import' in content: + if 'Annotated' not in content: content = content.replace( - "from typing import ", - "from typing import Annotated, ", + 'from typing import ', + 'from typing import Annotated, ', ) else: # Add typing import after pydantic import content = content.replace( - "from pydantic import ", - "from typing import Annotated\nfrom pydantic import ", + 'from pydantic import ', + 'from typing import Annotated\nfrom pydantic import ', ) - elif symbol == "Discriminator": - content = add_import(content, "Discriminator") + elif symbol == 'Discriminator': + content = add_import(content, 'Discriminator') # Replace the target line - target_line = patch["target_line"] + target_line = patch['target_line'] occurrences = content.count(target_line) if occurrences == 0: raise ValueError( - f"Target line for discriminator patch not found in {patch['file']}" + f'Target line for discriminator patch not found in {patch["file"]}' ) if occurrences > 1: print( - f" {patch['file']}: warning: multiple ({occurrences}) occurrences of " - "target line found; patching all occurrences" + f' {patch["file"]}: warning: multiple ({occurrences}) occurrences of ' + 'target line found; patching all occurrences' ) - content = content.replace(target_line, patch["replacement"]) + content = content.replace(target_line, patch['replacement']) path.write_text(content) - print(f" {patch['file']}: patched source_data with Discriminator") + print(f' {patch["file"]}: patched source_data with Discriminator') def main(): - print("Applying post-codegen patches...") + print('Applying post-codegen patches...') for patch in PATCHES: patch_file(patch) apply_discriminator_patch(DISCRIMINATOR_PATCH) - print("Done.") + print('Done.') -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/pyproject.toml b/pyproject.toml index 8ee714086..b2dfc7df3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,3 +45,27 @@ dev = [ "pytest>=9.0.2", "ruff>=0.12.2", ] + +[tool.ruff] +line-length = 80 +target-version = "py312" + +[tool.ruff.lint] +# Enable Flake8 (E, F) and isort (I) rules. +select = ["E", "F", "I"] +# Ignore specific rules, for example, E501 (line too long) as it's handled by the formatter. +ignore = [ + "E501", # line too long + "E402", # Module level import not at top of file +] + +[tool.ruff.format] +quote-style = "single" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" +docstring-code-format = false + +[tool.ty.rules] +unused-ignore-comment = "ignore" +unused-type-ignore-comment = "ignore" diff --git a/tests/test_check_duplicate_entries.py b/tests/test_check_duplicate_entries.py index 450349e7b..ddee07495 100644 --- a/tests/test_check_duplicate_entries.py +++ b/tests/test_check_duplicate_entries.py @@ -5,23 +5,29 @@ from every_eval_ever import check_duplicate_entries as check_module -DATA_ROOT = Path(__file__).resolve().parents[1] / "data" +DATA_ROOT = Path(__file__).resolve().parents[1] / 'data' SAMPLE_FILES = [ - Path(__file__).resolve().parent / "data" / "98ea850e-7019-4728-a558-8b1819ec47c2.json", - Path(__file__).resolve().parent / "data" / "98ea850e-7019-4728-a558-8b1819ec47c2.json" + Path(__file__).resolve().parent + / 'data' + / '98ea850e-7019-4728-a558-8b1819ec47c2.json', + Path(__file__).resolve().parent + / 'data' + / '98ea850e-7019-4728-a558-8b1819ec47c2.json', ] -@pytest.fixture(scope="module") +@pytest.fixture(scope='module') def sample_payloads(): missing = [path for path in SAMPLE_FILES if not path.exists()] if missing: - pytest.skip(f"Sample data file missing: {missing[0]}") - return [json.loads(path.read_text(encoding="utf-8")) for path in SAMPLE_FILES] + pytest.skip(f'Sample data file missing: {missing[0]}') + return [ + json.loads(path.read_text(encoding='utf-8')) for path in SAMPLE_FILES + ] def write_json(path: Path, payload: dict) -> None: - path.write_text(json.dumps(payload), encoding="utf-8") + path.write_text(json.dumps(payload), encoding='utf-8') def clone_payload(payload: dict) -> dict: @@ -30,10 +36,12 @@ def clone_payload(payload: dict) -> dict: def simulate_rescrape(payload: dict) -> dict: cloned = clone_payload(payload) - cloned["evaluation_id"] = "simulated-duplicate" - cloned["retrieved_timestamp"] = "1234567890.0" - if isinstance(cloned.get("evaluation_results"), list): - cloned["evaluation_results"] = list(reversed(cloned["evaluation_results"])) + cloned['evaluation_id'] = 'simulated-duplicate' + cloned['retrieved_timestamp'] = '1234567890.0' + if isinstance(cloned.get('evaluation_results'), list): + cloned['evaluation_results'] = list( + reversed(cloned['evaluation_results']) + ) return cloned @@ -41,35 +49,38 @@ def test_normalized_hash_ignores_keys_and_list_order(sample_payloads): payload_a = clone_payload(sample_payloads[0]) payload_b = simulate_rescrape(sample_payloads[0]) - assert check_module.normalized_hash(payload_a) == check_module.normalized_hash( - payload_b - ) + assert check_module.normalized_hash( + payload_a + ) == check_module.normalized_hash(payload_b) def test_normalized_hash_detects_real_changes(sample_payloads): payload_a = clone_payload(sample_payloads[0]) payload_c = clone_payload(sample_payloads[0]) - payload_c["evaluation_id"] = "eval-c" - payload_c["retrieved_timestamp"] = "2024-01-03" - if isinstance(payload_c.get("evaluation_results"), list) and payload_c["evaluation_results"]: - payload_c["evaluation_results"][0]["score_details"]["score"] = ( - payload_c["evaluation_results"][0]["score_details"]["score"] + 0.001 + payload_c['evaluation_id'] = 'eval-c' + payload_c['retrieved_timestamp'] = '2024-01-03' + if ( + isinstance(payload_c.get('evaluation_results'), list) + and payload_c['evaluation_results'] + ): + payload_c['evaluation_results'][0]['score_details']['score'] = ( + payload_c['evaluation_results'][0]['score_details']['score'] + 0.001 ) - assert check_module.normalized_hash(payload_a) != check_module.normalized_hash( - payload_c - ) + assert check_module.normalized_hash( + payload_a + ) != check_module.normalized_hash(payload_c) def test_expand_paths_returns_json_files(tmp_path): - top = tmp_path / "top.json" - nested_dir = tmp_path / "nested" + top = tmp_path / 'top.json' + nested_dir = tmp_path / 'nested' nested_dir.mkdir() - nested = nested_dir / "nested.json" - ignored = nested_dir / "note.txt" - top.write_text("{}", encoding="utf-8") - nested.write_text("{}", encoding="utf-8") - ignored.write_text("nope", encoding="utf-8") + nested = nested_dir / 'nested.json' + ignored = nested_dir / 'note.txt' + top.write_text('{}', encoding='utf-8') + nested.write_text('{}', encoding='utf-8') + ignored.write_text('nope', encoding='utf-8') expanded = check_module.expand_paths([str(tmp_path)]) assert set(expanded) == {str(top), str(nested)} @@ -77,21 +88,19 @@ def test_expand_paths_returns_json_files(tmp_path): expanded_file = check_module.expand_paths([str(top)]) assert expanded_file == [str(top)] - missing = tmp_path / "missing.json" - with pytest.raises(Exception, match="Could not find file or directory"): + missing = tmp_path / 'missing.json' + with pytest.raises(Exception, match='Could not find file or directory'): check_module.expand_paths([str(missing)]) -def test_main_reports_duplicates( - sample_payloads, tmp_path, capsys -): +def test_main_reports_duplicates(sample_payloads, tmp_path, capsys): payload = sample_payloads[0] - file_a = tmp_path / "a.json" - file_b = tmp_path / "b.json" + file_a = tmp_path / 'a.json' + file_b = tmp_path / 'b.json' write_json(file_a, payload) write_json(file_b, simulate_rescrape(payload)) assert check_module.main([str(file_a), str(file_b)]) == 1 captured = capsys.readouterr().out - assert "Found duplicate entries" in captured + assert 'Found duplicate entries' in captured diff --git a/tests/test_helm_adapter.py b/tests/test_helm_adapter.py index 4074515a8..c4a3f46f2 100644 --- a/tests/test_helm_adapter.py +++ b/tests/test_helm_adapter.py @@ -1,27 +1,36 @@ import pytest -pytest.importorskip("helm", reason="crfm-helm not installed; install with: uv sync --extra helm") -from pathlib import Path +pytest.importorskip( + 'helm', reason='crfm-helm not installed; install with: uv sync --extra helm' +) + import tempfile +from pathlib import Path from every_eval_ever.converters.helm.adapter import HELMAdapter from every_eval_ever.eval_types import ( EvaluationLog, EvaluatorRelationship, SourceDataHf, - SourceMetadata + SourceMetadata, ) def _load_eval(adapter, filepath, metadata_args): eval_dirpath = Path(filepath) - + with tempfile.TemporaryDirectory() as tmpdir: - converted_eval = adapter.transform_from_directory(eval_dirpath, output_path=str(Path(tmpdir) / 'helm_output'), metadata_args=metadata_args) + converted_eval = adapter.transform_from_directory( + eval_dirpath, + output_path=str(Path(tmpdir) / 'helm_output'), + metadata_args=metadata_args, + ) converted_eval = converted_eval[0] assert isinstance(converted_eval, EvaluationLog) - assert isinstance(converted_eval.evaluation_results[0].source_data, SourceDataHf) + assert isinstance( + converted_eval.evaluation_results[0].source_data, SourceDataHf + ) assert isinstance(converted_eval.source_metadata, SourceMetadata) assert converted_eval.source_metadata.source_name == 'HELM' @@ -29,6 +38,7 @@ def _load_eval(adapter, filepath, metadata_args): return converted_eval + def test_mmlu_eval(): adapter = HELMAdapter() metadata_args = { @@ -36,14 +46,22 @@ def test_mmlu_eval(): 'evaluator_relationship': EvaluatorRelationship.first_party, } - converted_eval = _load_eval(adapter, 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', metadata_args) + converted_eval = _load_eval( + adapter, + 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', + metadata_args, + ) assert converted_eval.evaluation_timestamp is not None assert converted_eval.retrieved_timestamp is not None - - assert converted_eval.evaluation_results[0].source_data.dataset_name == 'mmlu' + + assert ( + converted_eval.evaluation_results[0].source_data.dataset_name == 'mmlu' + ) assert converted_eval.evaluation_results[0].source_data.hf_repo is None - assert len(converted_eval.evaluation_results[0].source_data.sample_ids) == 10 + assert ( + len(converted_eval.evaluation_results[0].source_data.sample_ids) == 10 + ) assert converted_eval.model_info.name == 'openai/gpt2' assert converted_eval.model_info.id == 'openai/gpt2' @@ -60,6 +78,7 @@ def test_mmlu_eval(): assert converted_eval.detailed_evaluation_results.format is not None assert converted_eval.detailed_evaluation_results.total_rows == 10 + def test_hellswag_eval(): adapter = HELMAdapter() metadata_args = { @@ -67,14 +86,23 @@ def test_hellswag_eval(): 'evaluator_relationship': EvaluatorRelationship.first_party, } - converted_eval = _load_eval(adapter, 'tests/data/helm/commonsense:dataset=hellaswag,method=multiple_choice_joint,model=eleutherai_pythia-1b-v0', metadata_args) + converted_eval = _load_eval( + adapter, + 'tests/data/helm/commonsense:dataset=hellaswag,method=multiple_choice_joint,model=eleutherai_pythia-1b-v0', + metadata_args, + ) assert converted_eval.evaluation_timestamp is not None assert converted_eval.retrieved_timestamp is not None - - assert converted_eval.evaluation_results[0].source_data.dataset_name == 'hellaswag' + + assert ( + converted_eval.evaluation_results[0].source_data.dataset_name + == 'hellaswag' + ) assert converted_eval.evaluation_results[0].source_data.hf_repo is None - assert len(converted_eval.evaluation_results[0].source_data.sample_ids) == 10 + assert ( + len(converted_eval.evaluation_results[0].source_data.sample_ids) == 10 + ) assert converted_eval.model_info.name == 'eleutherai/pythia-1b-v0' assert converted_eval.model_info.id == 'eleutherai/pythia-1b-v0' @@ -91,6 +119,7 @@ def test_hellswag_eval(): assert converted_eval.detailed_evaluation_results.format is not None assert converted_eval.detailed_evaluation_results.total_rows == 10 + def test_narrativeqa_eval(): adapter = HELMAdapter() metadata_args = { @@ -98,12 +127,17 @@ def test_narrativeqa_eval(): 'evaluator_relationship': EvaluatorRelationship.first_party, } - converted_eval = _load_eval(adapter, 'tests/data/helm/narrative_qa:model=openai_gpt2', metadata_args) + converted_eval = _load_eval( + adapter, 'tests/data/helm/narrative_qa:model=openai_gpt2', metadata_args + ) assert converted_eval.evaluation_timestamp is not None assert converted_eval.retrieved_timestamp is not None - - assert converted_eval.evaluation_results[0].source_data.dataset_name == 'narrativeqa' + + assert ( + converted_eval.evaluation_results[0].source_data.dataset_name + == 'narrativeqa' + ) assert converted_eval.evaluation_results[0].source_data.hf_repo is None assert len(converted_eval.evaluation_results[0].source_data.sample_ids) == 5 diff --git a/tests/test_helm_instance_level_adapter.py b/tests/test_helm_instance_level_adapter.py index dc40075e8..4ee46c6c2 100644 --- a/tests/test_helm_instance_level_adapter.py +++ b/tests/test_helm_instance_level_adapter.py @@ -1,5 +1,8 @@ import pytest -pytest.importorskip("helm", reason="crfm-helm not installed; install with: uv sync --extra helm") + +pytest.importorskip( + 'helm', reason='crfm-helm not installed; install with: uv sync --extra helm' +) import json import tempfile @@ -7,26 +10,35 @@ from every_eval_ever.converters.helm.adapter import HELMAdapter from every_eval_ever.eval_types import EvaluatorRelationship -from every_eval_ever.instance_level_types import InstanceLevelEvaluationLog, InteractionType +from every_eval_ever.instance_level_types import ( + InstanceLevelEvaluationLog, + InteractionType, +) def _load_instance_level_data(adapter, filepath, metadata_args): eval_dirpath = Path(filepath) converted_eval_list = adapter.transform_from_directory( eval_dirpath, - output_path=str(Path(metadata_args['parent_eval_output_dir']) / 'helm_output'), - metadata_args=metadata_args + output_path=str( + Path(metadata_args['parent_eval_output_dir']) / 'helm_output' + ), + metadata_args=metadata_args, ) converted_eval = converted_eval_list[0] - instance_level_path = Path(converted_eval.detailed_evaluation_results.file_path) + instance_level_path = Path( + converted_eval.detailed_evaluation_results.file_path + ) instance_logs = [] with instance_level_path.open('r', encoding='utf-8') as f: for line in f: if line.strip(): data = json.loads(line) - instance_logs.append(InstanceLevelEvaluationLog.model_validate(data)) + instance_logs.append( + InstanceLevelEvaluationLog.model_validate(data) + ) return converted_eval, instance_logs @@ -39,13 +51,13 @@ def test_mmlu_instance_level(): 'source_organization_name': 'TestOrg', 'evaluator_relationship': EvaluatorRelationship.first_party, 'parent_eval_output_dir': tmpdir, - 'file_uuid': 'test_mmlu' + 'file_uuid': 'test_mmlu', } converted_eval, instance_logs = _load_instance_level_data( adapter, 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', - metadata_args + metadata_args, ) assert len(instance_logs) == 10 @@ -88,13 +100,13 @@ def test_hellaswag_instance_level(): 'source_organization_name': 'TestOrg', 'evaluator_relationship': EvaluatorRelationship.first_party, 'parent_eval_output_dir': tmpdir, - 'file_uuid': 'test_hellaswag' + 'file_uuid': 'test_hellaswag', } converted_eval, instance_logs = _load_instance_level_data( adapter, 'tests/data/helm/commonsense:dataset=hellaswag,method=multiple_choice_joint,model=eleutherai_pythia-1b-v0', - metadata_args + metadata_args, ) assert len(instance_logs) == 10 @@ -124,13 +136,13 @@ def test_narrativeqa_instance_level(): 'source_organization_name': 'TestOrg', 'evaluator_relationship': EvaluatorRelationship.first_party, 'parent_eval_output_dir': tmpdir, - 'file_uuid': 'test_narrativeqa' + 'file_uuid': 'test_narrativeqa', } converted_eval, instance_logs = _load_instance_level_data( adapter, 'tests/data/helm/narrative_qa:model=openai_gpt2', - metadata_args + metadata_args, ) assert len(instance_logs) == 5 @@ -141,7 +153,10 @@ def test_narrativeqa_instance_level(): assert log.evaluation_name == 'narrativeqa' assert log.interaction_type == InteractionType.single_turn - assert log.input.reference == ['The school Mascot', 'the schools mascot'] + assert log.input.reference == [ + 'The school Mascot', + 'the schools mascot', + ] assert log.output.raw == [' Olive.'] assert log.messages is None diff --git a/tests/test_inspect_adapter.py b/tests/test_inspect_adapter.py index f7c0204b7..da94d2732 100644 --- a/tests/test_inspect_adapter.py +++ b/tests/test_inspect_adapter.py @@ -1,31 +1,41 @@ import pytest -pytest.importorskip("inspect_ai", reason="inspect-ai not installed; install with: uv sync --extra inspect") + +pytest.importorskip( + 'inspect_ai', + reason='inspect-ai not installed; install with: uv sync --extra inspect', +) import contextlib -from pathlib import Path import tempfile +from pathlib import Path from every_eval_ever.converters.inspect.adapter import InspectAIAdapter -from every_eval_ever.converters.inspect.utils import extract_model_info_from_model_path +from every_eval_ever.converters.inspect.utils import ( + extract_model_info_from_model_path, +) from every_eval_ever.eval_types import ( EvaluationLog, EvaluatorRelationship, SourceDataHf, - SourceMetadata + SourceMetadata, ) def _load_eval(adapter, filepath, metadata_args): eval_path = Path(filepath) metadata_args = dict(metadata_args) - metadata_args.setdefault("file_uuid", "test-file-uuid") - + metadata_args.setdefault('file_uuid', 'test-file-uuid') + with tempfile.TemporaryDirectory() as tmpdir: metadata_args['parent_eval_output_dir'] = tmpdir - converted_eval = adapter.transform_from_file(eval_path, metadata_args=metadata_args) - + converted_eval = adapter.transform_from_file( + eval_path, metadata_args=metadata_args + ) + assert isinstance(converted_eval, EvaluationLog) - assert isinstance(converted_eval.evaluation_results[0].source_data, SourceDataHf) + assert isinstance( + converted_eval.evaluation_results[0].source_data, SourceDataHf + ) assert isinstance(converted_eval.source_metadata, SourceMetadata) assert converted_eval.source_metadata.source_name == 'inspect_ai' @@ -34,11 +44,13 @@ def _load_eval(adapter, filepath, metadata_args): return converted_eval -def _extract_file_uuid_from_detailed_results(converted_eval: EvaluationLog) -> str: +def _extract_file_uuid_from_detailed_results( + converted_eval: EvaluationLog, +) -> str: assert converted_eval.detailed_evaluation_results is not None stem = Path(converted_eval.detailed_evaluation_results.file_path).stem - assert stem.endswith("_samples") - return stem[: -len("_samples")] + assert stem.endswith('_samples') + return stem[: -len('_samples')] def test_pubmedqa_eval(): @@ -48,13 +60,23 @@ def test_pubmedqa_eval(): 'evaluator_relationship': EvaluatorRelationship.first_party, } - converted_eval = _load_eval(adapter, 'tests/data/inspect/data_pubmedqa_gpt4o_mini.json', metadata_args) + converted_eval = _load_eval( + adapter, + 'tests/data/inspect/data_pubmedqa_gpt4o_mini.json', + metadata_args, + ) assert converted_eval.evaluation_timestamp == '1751553870.0' assert converted_eval.retrieved_timestamp is not None - - assert converted_eval.evaluation_results[0].source_data.dataset_name == 'pubmed_qa' - assert converted_eval.evaluation_results[0].source_data.hf_repo == 'bigbio/pubmed_qa' + + assert ( + converted_eval.evaluation_results[0].source_data.dataset_name + == 'pubmed_qa' + ) + assert ( + converted_eval.evaluation_results[0].source_data.hf_repo + == 'bigbio/pubmed_qa' + ) assert len(converted_eval.evaluation_results[0].source_data.sample_ids) == 2 assert converted_eval.model_info.name == 'openai/gpt-4o-mini-2024-07-18' @@ -77,7 +99,7 @@ def test_transform_without_metadata_args_uses_defaults(tmp_path, caplog): adapter = InspectAIAdapter() eval_file = ( Path(__file__).resolve().parent - / "data/inspect/data_pubmedqa_gpt4o_mini.json" + / 'data/inspect/data_pubmedqa_gpt4o_mini.json' ) with contextlib.chdir(tmp_path): converted_eval = adapter.transform_from_file( @@ -94,18 +116,21 @@ def test_transform_without_metadata_args_uses_defaults(tmp_path, caplog): ) assert converted_eval.detailed_evaluation_results is not None assert converted_eval.detailed_evaluation_results.total_rows == 2 - assert _extract_file_uuid_from_detailed_results(converted_eval) != "none" + assert _extract_file_uuid_from_detailed_results(converted_eval) != 'none' def test_transform_directory_assigns_unique_file_uuid_per_log(): adapter = InspectAIAdapter() - fixture_dir = Path(__file__).resolve().parent / "data/inspect" + fixture_dir = Path(__file__).resolve().parent / 'data/inspect' - with tempfile.TemporaryDirectory() as tmp_logs_dir, tempfile.TemporaryDirectory() as tmp_out_dir: + with ( + tempfile.TemporaryDirectory() as tmp_logs_dir, + tempfile.TemporaryDirectory() as tmp_out_dir, + ): tmp_logs_path = Path(tmp_logs_dir) fixture_targets = { - "data_pubmedqa_gpt4o_mini.json": "2026-02-01T11-00-00+00-00_pubmedqa_test1.json", - "data_arc_qwen.json": "2026-02-01T11-05-00+00-00_arc_test2.json", + 'data_pubmedqa_gpt4o_mini.json': '2026-02-01T11-00-00+00-00_pubmedqa_test1.json', + 'data_arc_qwen.json': '2026-02-01T11-05-00+00-00_arc_test2.json', } for source_name, target_name in fixture_targets.items(): source = fixture_dir / source_name @@ -115,30 +140,35 @@ def test_transform_directory_assigns_unique_file_uuid_per_log(): converted_logs = adapter.transform_from_directory( tmp_logs_path, metadata_args={ - "source_organization_name": "TestOrg", - "evaluator_relationship": EvaluatorRelationship.first_party, - "parent_eval_output_dir": tmp_out_dir, - "file_uuid": "shared-uuid", + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmp_out_dir, + 'file_uuid': 'shared-uuid', }, ) assert len(converted_logs) == 2 - uuids = {_extract_file_uuid_from_detailed_results(log) for log in converted_logs} - assert "shared-uuid" not in uuids + uuids = { + _extract_file_uuid_from_detailed_results(log) for log in converted_logs + } + assert 'shared-uuid' not in uuids assert len(uuids) == 2 def test_transform_directory_uses_file_uuids_metadata_when_provided(): adapter = InspectAIAdapter() - fixture_dir = Path(__file__).resolve().parent / "data/inspect" - expected_uuids = ["explicit-uuid-1", "explicit-uuid-2"] + fixture_dir = Path(__file__).resolve().parent / 'data/inspect' + expected_uuids = ['explicit-uuid-1', 'explicit-uuid-2'] - with tempfile.TemporaryDirectory() as tmp_logs_dir, tempfile.TemporaryDirectory() as tmp_out_dir: + with ( + tempfile.TemporaryDirectory() as tmp_logs_dir, + tempfile.TemporaryDirectory() as tmp_out_dir, + ): tmp_logs_path = Path(tmp_logs_dir) fixture_targets = { - "data_pubmedqa_gpt4o_mini.json": "2026-02-01T11-00-00+00-00_pubmedqa_test1.json", - "data_arc_qwen.json": "2026-02-01T11-05-00+00-00_arc_test2.json", + 'data_pubmedqa_gpt4o_mini.json': '2026-02-01T11-00-00+00-00_pubmedqa_test1.json', + 'data_arc_qwen.json': '2026-02-01T11-05-00+00-00_arc_test2.json', } for source_name, target_name in fixture_targets.items(): source = fixture_dir / source_name @@ -148,15 +178,17 @@ def test_transform_directory_uses_file_uuids_metadata_when_provided(): converted_logs = adapter.transform_from_directory( tmp_logs_path, metadata_args={ - "source_organization_name": "TestOrg", - "evaluator_relationship": EvaluatorRelationship.first_party, - "parent_eval_output_dir": tmp_out_dir, - "file_uuids": expected_uuids, + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, + 'parent_eval_output_dir': tmp_out_dir, + 'file_uuids': expected_uuids, }, ) assert len(converted_logs) == 2 - uuids = {_extract_file_uuid_from_detailed_results(log) for log in converted_logs} + uuids = { + _extract_file_uuid_from_detailed_results(log) for log in converted_logs + } assert uuids == set(expected_uuids) @@ -167,16 +199,26 @@ def test_arc_sonnet_eval(): 'source_organization_name': 'TestOrg', 'evaluator_relationship': EvaluatorRelationship.first_party, } - converted_eval = _load_eval(adapter, 'tests/data/inspect/data_arc_sonnet.json', metadata_args) + converted_eval = _load_eval( + adapter, 'tests/data/inspect/data_arc_sonnet.json', metadata_args + ) assert converted_eval.evaluation_timestamp == '1761000045.0' assert converted_eval.retrieved_timestamp is not None - assert converted_eval.evaluation_results[0].source_data.dataset_name == 'ai2_arc' - assert converted_eval.evaluation_results[0].source_data.hf_repo == 'allenai/ai2_arc' + assert ( + converted_eval.evaluation_results[0].source_data.dataset_name + == 'ai2_arc' + ) + assert ( + converted_eval.evaluation_results[0].source_data.hf_repo + == 'allenai/ai2_arc' + ) assert len(converted_eval.evaluation_results[0].source_data.sample_ids) == 5 - assert converted_eval.model_info.name == 'anthropic/claude-sonnet-4-20250514' + assert ( + converted_eval.model_info.name == 'anthropic/claude-sonnet-4-20250514' + ) assert converted_eval.model_info.id == 'anthropic/claude-sonnet-4-20250514' assert converted_eval.model_info.developer == 'anthropic' assert converted_eval.model_info.inference_platform == 'anthropic' @@ -199,13 +241,21 @@ def test_arc_qwen_eval(): 'evaluator_relationship': EvaluatorRelationship.first_party, } - converted_eval = _load_eval(adapter, 'tests/data/inspect/data_arc_qwen.json', metadata_args) + converted_eval = _load_eval( + adapter, 'tests/data/inspect/data_arc_qwen.json', metadata_args + ) assert converted_eval.evaluation_timestamp == '1761001924.0' assert converted_eval.retrieved_timestamp is not None - assert converted_eval.evaluation_results[0].source_data.dataset_name == 'ai2_arc' - assert converted_eval.evaluation_results[0].source_data.hf_repo == 'allenai/ai2_arc' + assert ( + converted_eval.evaluation_results[0].source_data.dataset_name + == 'ai2_arc' + ) + assert ( + converted_eval.evaluation_results[0].source_data.hf_repo + == 'allenai/ai2_arc' + ) assert len(converted_eval.evaluation_results[0].source_data.sample_ids) == 3 assert converted_eval.model_info.name == 'ollama/qwen2.5:0.5b' @@ -231,12 +281,18 @@ def test_gaia_eval(): 'evaluator_relationship': EvaluatorRelationship.first_party, } - converted_eval = _load_eval(adapter, 'tests/data/inspect/2026-02-07T11-26-57+00-00_gaia_4V8zHbbRKpU5Yv2BMoBcjE.json', metadata_args) + converted_eval = _load_eval( + adapter, + 'tests/data/inspect/2026-02-07T11-26-57+00-00_gaia_4V8zHbbRKpU5Yv2BMoBcjE.json', + metadata_args, + ) assert converted_eval.evaluation_timestamp is not None assert converted_eval.retrieved_timestamp is not None - - assert converted_eval.evaluation_results[0].source_data.dataset_name == 'GAIA' + + assert ( + converted_eval.evaluation_results[0].source_data.dataset_name == 'GAIA' + ) assert converted_eval.evaluation_results[0].source_data.hf_repo is not None assert len(converted_eval.evaluation_results[0].source_data.sample_ids) > 0 @@ -264,38 +320,43 @@ def test_humaneval_eval(): 'evaluator_relationship': EvaluatorRelationship.first_party, } - converted_eval = _load_eval(adapter, 'tests/data/inspect/2026-02-24T11-23-20+00-00_humaneval_ENiBTeoXr2dbbNcDtpbVvq.json', metadata_args) + converted_eval = _load_eval( + adapter, + 'tests/data/inspect/2026-02-24T11-23-20+00-00_humaneval_ENiBTeoXr2dbbNcDtpbVvq.json', + metadata_args, + ) assert converted_eval.detailed_evaluation_results is not None + def test_convert_model_path_to_standarized_model_ids(): model_path_to_standarized_id_map = { - "openai/gpt-4o-mini": "openai/gpt-4o-mini", - "openai/azure/gpt-4o-mini": "openai/gpt-4o-mini", - "anthropic/claude-sonnet-4-0": "anthropic/claude-sonnet-4-0", - "anthropic/bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0": "anthropic/claude-3-5-sonnet@20241022", - "anthropic/vertex/claude-3-5-sonnet-v2@20241022": "anthropic/claude-3-5-sonnet@20241022", - "google/gemini-2.5-pro": "google/gemini-2.5-pro", - "google/vertex/gemini-2.0-flash": "google/gemini-2.0-flash", - "mistral/mistral-large-latest": "mistral/mistral-large-latest", - "mistral/azure/Mistral-Large-2411": "mistral/Mistral-Large-2411", - "openai-api/deepseek/deepseek-reasoner": "deepseek/deepseek-reasoner", - "bedrock/meta.llama2-70b-chat-v1": "meta/llama2-70b-chat", - "azureai/Llama-3.3-70B-Instruct": "azureai/Llama-3.3-70B-Instruct", - "together/meta-llama/Meta-Llama-3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "groq/llama-3.1-70b-versatile": "meta-llama/llama-3.1-70b-versatile", - "fireworks/accounts/fireworks/models/deepseek-r1-0528": "deepseek-ai/deepseek-r1-0528", - "sambanova/DeepSeek-V1-0324": "deepseek-ai/DeepSeek-V1-0324", - "cf/meta/llama-3.1-70b-instruct": "meta/llama-3.1-70b-instruct", - "perplexity/sonar": "perplexity/sonar", - "hf/openai-community/gpt2": "openai-community/gpt2", - "vllm/openai-community/gpt2": "openai-community/gpt2", - "vllm/meta-llama/Meta-Llama-3-8B-Instruct": "meta-llama/Meta-Llama-3-8B-Instruct", - "sglang/meta-llama/Meta-Llama-3-8B-Instruct": "meta-llama/Meta-Llama-3-8B-Instruct", - "ollama/llama3.1": "ollama/llama3.1", - "llama-cpp-python/llama3": "llama-cpp-python/llama3", - "openrouter/gryphe/mythomax-l2-13b": "gryphe/mythomax-l2-13b", - "hf-inference-providers/openai/gpt-oss-120b": "openai/gpt-oss-120b", - "hf-inference-providers/openai/gpt-oss-120b:cerebras": "openai/gpt-oss-120b:cerebras", + 'openai/gpt-4o-mini': 'openai/gpt-4o-mini', + 'openai/azure/gpt-4o-mini': 'openai/gpt-4o-mini', + 'anthropic/claude-sonnet-4-0': 'anthropic/claude-sonnet-4-0', + 'anthropic/bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0': 'anthropic/claude-3-5-sonnet@20241022', + 'anthropic/vertex/claude-3-5-sonnet-v2@20241022': 'anthropic/claude-3-5-sonnet@20241022', + 'google/gemini-2.5-pro': 'google/gemini-2.5-pro', + 'google/vertex/gemini-2.0-flash': 'google/gemini-2.0-flash', + 'mistral/mistral-large-latest': 'mistral/mistral-large-latest', + 'mistral/azure/Mistral-Large-2411': 'mistral/Mistral-Large-2411', + 'openai-api/deepseek/deepseek-reasoner': 'deepseek/deepseek-reasoner', + 'bedrock/meta.llama2-70b-chat-v1': 'meta/llama2-70b-chat', + 'azureai/Llama-3.3-70B-Instruct': 'azureai/Llama-3.3-70B-Instruct', + 'together/meta-llama/Meta-Llama-3.1-70B-Instruct': 'meta-llama/Meta-Llama-3.1-70B-Instruct', + 'groq/llama-3.1-70b-versatile': 'meta-llama/llama-3.1-70b-versatile', + 'fireworks/accounts/fireworks/models/deepseek-r1-0528': 'deepseek-ai/deepseek-r1-0528', + 'sambanova/DeepSeek-V1-0324': 'deepseek-ai/DeepSeek-V1-0324', + 'cf/meta/llama-3.1-70b-instruct': 'meta/llama-3.1-70b-instruct', + 'perplexity/sonar': 'perplexity/sonar', + 'hf/openai-community/gpt2': 'openai-community/gpt2', + 'vllm/openai-community/gpt2': 'openai-community/gpt2', + 'vllm/meta-llama/Meta-Llama-3-8B-Instruct': 'meta-llama/Meta-Llama-3-8B-Instruct', + 'sglang/meta-llama/Meta-Llama-3-8B-Instruct': 'meta-llama/Meta-Llama-3-8B-Instruct', + 'ollama/llama3.1': 'ollama/llama3.1', + 'llama-cpp-python/llama3': 'llama-cpp-python/llama3', + 'openrouter/gryphe/mythomax-l2-13b': 'gryphe/mythomax-l2-13b', + 'hf-inference-providers/openai/gpt-oss-120b': 'openai/gpt-oss-120b', + 'hf-inference-providers/openai/gpt-oss-120b:cerebras': 'openai/gpt-oss-120b:cerebras', } for model_path, model_id in model_path_to_standarized_id_map.items(): diff --git a/tests/test_inspect_instance_level_adapter.py b/tests/test_inspect_instance_level_adapter.py index c867b50fd..120c672b5 100644 --- a/tests/test_inspect_instance_level_adapter.py +++ b/tests/test_inspect_instance_level_adapter.py @@ -1,105 +1,115 @@ import pytest -pytest.importorskip("inspect_ai", reason="inspect-ai not installed; install with: uv sync --extra inspect") + +pytest.importorskip( + 'inspect_ai', + reason='inspect-ai not installed; install with: uv sync --extra inspect', +) import json import tempfile from pathlib import Path +from inspect_ai.model import ChatMessageAssistant, ChatMessageUser, ContentText + from every_eval_ever.converters.inspect.adapter import InspectAIAdapter -from every_eval_ever.eval_types import EvaluatorRelationship -from every_eval_ever.instance_level_types import InstanceLevelEvaluationLog, InteractionType from every_eval_ever.converters.inspect.instance_level_adapter import ( InspectInstanceLevelDataAdapter, ) -from inspect_ai.model import ChatMessageAssistant, ChatMessageUser, ContentText +from every_eval_ever.eval_types import EvaluatorRelationship +from every_eval_ever.instance_level_types import ( + InstanceLevelEvaluationLog, + InteractionType, +) def _load_instance_level_data(adapter, filepath, metadata_args): eval_filepath = Path(filepath) - + with tempfile.TemporaryDirectory() as tmpdir: metadata_args['parent_eval_output_dir'] = tmpdir converted_eval = adapter.transform_from_file( - eval_filepath, - metadata_args=metadata_args + eval_filepath, metadata_args=metadata_args + ) + + instance_level_path = Path( + converted_eval.detailed_evaluation_results.file_path ) - - instance_level_path = Path(converted_eval.detailed_evaluation_results.file_path) instance_logs = [] with instance_level_path.open('r', encoding='utf-8') as f: for line in f: if line.strip(): data = json.loads(line) - instance_logs.append(InstanceLevelEvaluationLog.model_validate(data)) + instance_logs.append( + InstanceLevelEvaluationLog.model_validate(data) + ) return converted_eval, instance_logs def test_pubmedqa_instance_level(): adapter = InspectAIAdapter() - + with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', 'evaluator_relationship': EvaluatorRelationship.first_party, 'parent_eval_output_dir': tmpdir, - 'file_uuid': 'test_pubmedqa' + 'file_uuid': 'test_pubmedqa', } converted_eval, instance_logs = _load_instance_level_data( adapter, 'tests/data/inspect/data_pubmedqa_gpt4o_mini.json', - metadata_args + metadata_args, ) - + assert len(instance_logs) == 2 log = instance_logs[0] - + assert log.schema_version == '0.2.1' assert log.model_id == 'openai/gpt-4o-mini-2024-07-18' assert log.interaction_type == InteractionType.single_turn - + assert log.input.raw.startswith('Context') - + assert log.output.raw == ['A'] assert log.messages is None - + assert log.evaluation.score == 1.0 assert log.evaluation.is_correct is True + def test_arc_sonnet_instance_level(): adapter = InspectAIAdapter() - + with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', 'evaluator_relationship': EvaluatorRelationship.first_party, 'parent_eval_output_dir': tmpdir, - 'file_uuid': 'test_arc_sonnet' + 'file_uuid': 'test_arc_sonnet', } converted_eval, instance_logs = _load_instance_level_data( - adapter, - 'tests/data/inspect/data_arc_sonnet.json', - metadata_args + adapter, 'tests/data/inspect/data_arc_sonnet.json', metadata_args ) - + assert len(instance_logs) == 5 log = instance_logs[0] - + assert log.schema_version == '0.2.1' assert log.model_id == 'anthropic/claude-sonnet-4-20250514' assert log.interaction_type == InteractionType.single_turn - + assert len(log.input.choices) == 4 assert 'Sunlight is the source of energy' in log.input.choices[0] - + assert log.output.raw == ['A'] assert log.messages is None - + assert log.evaluation.score == 1.0 assert log.evaluation.is_correct is True - + assert log.token_usage.input_tokens > 0 assert log.token_usage.output_tokens > 0 assert log.token_usage.total_tokens > 0 @@ -107,94 +117,106 @@ def test_arc_sonnet_instance_level(): def test_arc_qwen_instance_level(): adapter = InspectAIAdapter() - + with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', 'evaluator_relationship': EvaluatorRelationship.first_party, 'parent_eval_output_dir': tmpdir, - 'file_uuid': 'test_arc_qwen' + 'file_uuid': 'test_arc_qwen', } converted_eval, instance_logs = _load_instance_level_data( - adapter, - 'tests/data/inspect/data_arc_qwen.json', - metadata_args + adapter, 'tests/data/inspect/data_arc_qwen.json', metadata_args ) - + assert len(instance_logs) == 3 log = instance_logs[0] - + assert log.schema_version == '0.2.1' assert log.model_id == 'ollama/qwen2.5-0.5b' assert log.interaction_type == InteractionType.single_turn - - assert log.input.choices == ['Sunlight is the source of energy for nearly all ecosystems.', 'Most ecosystems are found on land instead of in water.', 'Carbon dioxide is more available than other gases.', 'The producers in all ecosystems are plants.'] - + + assert log.input.choices == [ + 'Sunlight is the source of energy for nearly all ecosystems.', + 'Most ecosystems are found on land instead of in water.', + 'Carbon dioxide is more available than other gases.', + 'The producers in all ecosystems are plants.', + ] + assert log.evaluation.score == 1.0 - + assert log.performance.latency_ms > 0 def test_gaia_instance_level(): adapter = InspectAIAdapter() - + with tempfile.TemporaryDirectory() as tmpdir: metadata_args = { 'source_organization_name': 'TestOrg', 'evaluator_relationship': EvaluatorRelationship.first_party, 'parent_eval_output_dir': tmpdir, - 'file_uuid': 'test_gaia' + 'file_uuid': 'test_gaia', } converted_eval, instance_logs = _load_instance_level_data( adapter, 'tests/data/inspect/2026-02-07T11-26-57+00-00_gaia_4V8zHbbRKpU5Yv2BMoBcjE.json', - metadata_args + metadata_args, ) - + assert len(instance_logs) > 0 log = instance_logs[0] - + assert log.schema_version == '0.2.1' assert log.model_id == 'openai/gpt-4.1-mini-2025-04-14' - + assert log.interaction_type == InteractionType.agentic - + assert log.input.raw is not None or log.input.choices is not None - + assert log.output is None assert log.messages is not None assert any([i.role for i in log.messages if i.role == 'tool']) - + assert len(log.messages) > 2 assert log.messages[0].turn_idx == 0 assert log.messages[0].role == 'system' assert log.messages[1].role == 'user' assert log.evaluation.score >= 0.0 - + assert log.token_usage is not None assert log.token_usage.input_tokens >= 0 assert log.token_usage.output_tokens >= 0 + def test_serialize_input_skips_non_user_messages(): - adapter = InspectInstanceLevelDataAdapter("test_id", "jsonl", "sha256", "/tmp") + adapter = InspectInstanceLevelDataAdapter( + 'test_id', 'jsonl', 'sha256', '/tmp' + ) - user_msg = ChatMessageUser(content="user question") - assistant_msg = ChatMessageAssistant(content="assistant answer") + user_msg = ChatMessageUser(content='user question') + assistant_msg = ChatMessageAssistant(content='assistant answer') result = adapter._serialize_input([assistant_msg, user_msg]) - assert result == "user question" - assert "assistant answer" not in result + assert result == 'user question' + assert 'assistant answer' not in result + def test_serialize_input_concatenates_list_content(): - adapter = InspectInstanceLevelDataAdapter("test_id", "jsonl", "sha256", "/tmp") + adapter = InspectInstanceLevelDataAdapter( + 'test_id', 'jsonl', 'sha256', '/tmp' + ) - msg_str = ChatMessageUser(content="plain string content") - assert adapter._serialize_input([msg_str]) == "plain string content" + msg_str = ChatMessageUser(content='plain string content') + assert adapter._serialize_input([msg_str]) == 'plain string content' - block1 = ContentText(text="Context: some context.") - block2 = ContentText(text="Question: what is X?") + block1 = ContentText(text='Context: some context.') + block2 = ContentText(text='Question: what is X?') msg_list = ChatMessageUser(content=[block1, block2]) - assert adapter._serialize_input([msg_list]) == "Context: some context. Question: what is X?" + assert ( + adapter._serialize_input([msg_list]) + == 'Context: some context. Question: what is X?' + ) diff --git a/tests/test_lm_eval_adapter.py b/tests/test_lm_eval_adapter.py index 113442586..a44e39967 100644 --- a/tests/test_lm_eval_adapter.py +++ b/tests/test_lm_eval_adapter.py @@ -2,23 +2,30 @@ from pathlib import Path from every_eval_ever.converters.lm_eval.adapter import LMEvalAdapter -from every_eval_ever.converters.lm_eval.instance_level_adapter import LMEvalInstanceLevelAdapter -from every_eval_ever.converters.lm_eval.utils import parse_model_args, find_samples_file +from every_eval_ever.converters.lm_eval.instance_level_adapter import ( + LMEvalInstanceLevelAdapter, +) +from every_eval_ever.converters.lm_eval.utils import ( + find_samples_file, + parse_model_args, +) from every_eval_ever.eval_types import ( EvaluationLog, EvaluatorRelationship, SourceDataHf, ) -DATA_DIR = Path("tests/data/lm_eval") -RESULTS_FILE = DATA_DIR / "results_2026-01-21T03-44-18.458309.json" -SAMPLES_FILE = DATA_DIR / "samples_math_perturbed_full_2026-01-21T03-44-18.458309.jsonl" +DATA_DIR = Path('tests/data/lm_eval') +RESULTS_FILE = DATA_DIR / 'results_2026-01-21T03-44-18.458309.json' +SAMPLES_FILE = ( + DATA_DIR / 'samples_math_perturbed_full_2026-01-21T03-44-18.458309.jsonl' +) def _make_metadata_args(**overrides): args = { - "source_organization_name": "TestOrg", - "evaluator_relationship": EvaluatorRelationship.first_party, + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': EvaluatorRelationship.first_party, } args.update(overrides) return args @@ -26,36 +33,44 @@ def _make_metadata_args(**overrides): # ── Utility tests ────────────────────────────────────────────────────── + def test_parse_model_args_basic(): - result = parse_model_args("pretrained=EleutherAI/pythia-160m,dtype=float16") - assert result == {"pretrained": "EleutherAI/pythia-160m", "dtype": "float16"} + result = parse_model_args('pretrained=EleutherAI/pythia-160m,dtype=float16') + assert result == { + 'pretrained': 'EleutherAI/pythia-160m', + 'dtype': 'float16', + } def test_parse_model_args_empty(): - assert parse_model_args("") == {} + assert parse_model_args('') == {} assert parse_model_args(None) == {} def test_parse_model_args_complex(): result = parse_model_args( - "pretrained=RylanSchaeffer/mem_Qwen3-93M_minerva_math_rep_0_sbst_1.0000_epch_1_ot_1,trust_remote_code=True" + 'pretrained=RylanSchaeffer/mem_Qwen3-93M_minerva_math_rep_0_sbst_1.0000_epch_1_ot_1,trust_remote_code=True' + ) + assert ( + result['pretrained'] + == 'RylanSchaeffer/mem_Qwen3-93M_minerva_math_rep_0_sbst_1.0000_epch_1_ot_1' ) - assert result["pretrained"] == "RylanSchaeffer/mem_Qwen3-93M_minerva_math_rep_0_sbst_1.0000_epch_1_ot_1" - assert result["trust_remote_code"] == "True" + assert result['trust_remote_code'] == 'True' def test_find_samples_file(): - found = find_samples_file(DATA_DIR, "math_perturbed_full") + found = find_samples_file(DATA_DIR, 'math_perturbed_full') assert found is not None - assert found.name.startswith("samples_math_perturbed_full") + assert found.name.startswith('samples_math_perturbed_full') def test_find_samples_file_missing(): - assert find_samples_file(DATA_DIR, "nonexistent_task") is None + assert find_samples_file(DATA_DIR, 'nonexistent_task') is None # ── Adapter: transform_from_file ─────────────────────────────────────── + def test_transform_from_file_returns_two_tasks(): adapter = LMEvalAdapter() logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) @@ -69,12 +84,15 @@ def test_transform_from_file_model_info(): logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) model = logs[0].model_info - assert model.name == "RylanSchaeffer/mem_Qwen3-93M_minerva_math_rep_0_sbst_1.0000_epch_1_ot_1" + assert ( + model.name + == 'RylanSchaeffer/mem_Qwen3-93M_minerva_math_rep_0_sbst_1.0000_epch_1_ot_1' + ) assert model.id == model.name - assert model.developer == "RylanSchaeffer" - assert model.inference_engine.name == "transformers" - assert model.additional_details["num_parameters"] == "93069280" - assert model.additional_details["dtype"] == "torch.bfloat16" + assert model.developer == 'RylanSchaeffer' + assert model.inference_engine.name == 'transformers' + assert model.additional_details['num_parameters'] == '93069280' + assert model.additional_details['dtype'] == 'torch.bfloat16' def test_transform_from_file_source_metadata(): @@ -82,9 +100,9 @@ def test_transform_from_file_source_metadata(): logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) src = logs[0].source_metadata - assert src.source_name == "lm-evaluation-harness" - assert src.source_type.value == "evaluation_run" - assert src.source_organization_name == "TestOrg" + assert src.source_name == 'lm-evaluation-harness' + assert src.source_type.value == 'evaluation_run' + assert src.source_organization_name == 'TestOrg' def test_transform_from_file_source_data(): @@ -96,8 +114,8 @@ def test_transform_from_file_source_data(): assert isinstance(log.evaluation_results[0].source_data, SourceDataHf) perturbed = logs[0].evaluation_results[0].source_data - assert perturbed.hf_repo == "stellaathena/math_perturbed_5000" - assert perturbed.hf_split == "test" + assert perturbed.hf_repo == 'stellaathena/math_perturbed_5000' + assert perturbed.hf_split == 'test' def test_transform_from_file_evaluation_results(): @@ -108,7 +126,10 @@ def test_transform_from_file_evaluation_results(): perturbed_results = logs[0].evaluation_results assert len(perturbed_results) == 1 assert perturbed_results[0].score_details.score == 0.0 - assert perturbed_results[0].metric_config.evaluation_description == "exact_match" + assert ( + perturbed_results[0].metric_config.evaluation_description + == 'exact_match' + ) assert perturbed_results[0].metric_config.lower_is_better is False assert perturbed_results[0].metric_config.min_score == 0.0 assert perturbed_results[0].metric_config.max_score == 1.0 @@ -125,7 +146,7 @@ def test_transform_from_file_uncertainty(): uncertainty = logs[1].evaluation_results[0].score_details.uncertainty assert uncertainty is not None assert uncertainty.standard_error.value == 0.0002828144211304471 - assert uncertainty.standard_error.method == "bootstrap" + assert uncertainty.standard_error.method == 'bootstrap' assert uncertainty.num_samples == 5000 @@ -137,78 +158,87 @@ def test_transform_from_file_generation_config(): assert gen is not None assert gen.generation_args.temperature == 0.0 assert gen.generation_args.max_tokens == 512 - assert gen.additional_details["num_fewshot"] == "0" + assert gen.additional_details['num_fewshot'] == '0' def test_transform_from_file_eval_timestamp(): adapter = LMEvalAdapter() logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) - assert logs[0].evaluation_timestamp == "1768964383" + assert logs[0].evaluation_timestamp == '1768964383' # ── Adapter: transform_from_directory ────────────────────────────────── + def test_transform_from_directory(): adapter = LMEvalAdapter() logs = adapter.transform_from_directory(DATA_DIR, _make_metadata_args()) assert len(logs) == 2 - task_names = {r.evaluation_name for log in logs for r in log.evaluation_results} - assert "math_perturbed_full" in task_names - assert "math_rephrased_full" in task_names + task_names = { + r.evaluation_name for log in logs for r in log.evaluation_results + } + assert 'math_perturbed_full' in task_names + assert 'math_rephrased_full' in task_names # ── Adapter: group placeholder filtering ─────────────────────────────── + def test_get_tasks_skips_group_placeholders(): adapter = LMEvalAdapter() raw = { - "results": { - "group_task": {"alias": "group_task", " ": ""}, - "real_task": {"alias": "real_task", "acc,none": 0.5}, + 'results': { + 'group_task': {'alias': 'group_task', ' ': ''}, + 'real_task': {'alias': 'real_task', 'acc,none': 0.5}, } } tasks = adapter._get_tasks(raw) - assert tasks == ["real_task"] + assert tasks == ['real_task'] # ── Adapter: inference engine override ───────────────────────────────── + def test_inference_engine_override(): adapter = LMEvalAdapter() - metadata = _make_metadata_args(inference_engine="vllm", inference_engine_version="0.6.0") + metadata = _make_metadata_args( + inference_engine='vllm', inference_engine_version='0.6.0' + ) logs = adapter.transform_from_file(RESULTS_FILE, metadata) - assert logs[0].model_info.inference_engine.name == "vllm" - assert logs[0].model_info.inference_engine.version == "0.6.0" + assert logs[0].model_info.inference_engine.name == 'vllm' + assert logs[0].model_info.inference_engine.version == '0.6.0' # ── Adapter: eval_metadata tracking ─────────────────────────────────── + def test_eval_metadata_stored_after_transform(): adapter = LMEvalAdapter() logs = adapter.transform_from_file(RESULTS_FILE, _make_metadata_args()) for log in logs: meta = adapter.get_eval_metadata(log.evaluation_id) - assert "task_name" in meta - assert "parent_dir" in meta + assert 'task_name' in meta + assert 'parent_dir' in meta # ── Instance-level adapter ───────────────────────────────────────────── + def test_instance_level_transform_samples(): inst_adapter = LMEvalInstanceLevelAdapter() logs = inst_adapter.transform_samples( SAMPLES_FILE, - evaluation_id="test/eval/123", - model_id="test-model", - task_name="math_perturbed_full", + evaluation_id='test/eval/123', + model_id='test-model', + task_name='math_perturbed_full', ) assert len(logs) == 10 first = logs[0] - assert first.sample_id == "0" - assert first.evaluation_name == "math_perturbed_full" - assert first.model_id == "test-model" - assert first.input.reference == ["3"] + assert first.sample_id == '0' + assert first.evaluation_name == 'math_perturbed_full' + assert first.model_id == 'test-model' + assert first.input.reference == ['3'] assert first.evaluation.score == 0.0 assert first.evaluation.is_correct is False assert first.input.choices is None # generation task, not MC @@ -220,27 +250,27 @@ def test_instance_level_transform_and_save(): with tempfile.TemporaryDirectory() as tmpdir: result = inst_adapter.transform_and_save( SAMPLES_FILE, - evaluation_id="test/eval/123", - model_id="test-model", - task_name="math_perturbed_full", + evaluation_id='test/eval/123', + model_id='test-model', + task_name='math_perturbed_full', output_dir=tmpdir, - file_uuid="abc123", + file_uuid='abc123', ) assert result is not None assert result.total_rows == 10 - assert result.format.value == "jsonl" + assert result.format.value == 'jsonl' assert result.checksum # non-empty sha256 assert Path(result.file_path).exists() - assert "abc123_samples.jsonl" in result.file_path + assert 'abc123_samples.jsonl' in result.file_path def test_instance_level_transform_and_save_no_output_dir(): inst_adapter = LMEvalInstanceLevelAdapter() result = inst_adapter.transform_and_save( SAMPLES_FILE, - evaluation_id="test/eval/123", - model_id="test-model", - task_name="math_perturbed_full", + evaluation_id='test/eval/123', + model_id='test-model', + task_name='math_perturbed_full', output_dir=None, ) assert result is None diff --git a/tests/test_validate.py b/tests/test_validate.py index 7a0ef7abc..edb2d5b6a 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -3,19 +3,15 @@ from __future__ import annotations import json -import textwrap from pathlib import Path -import pytest - from every_eval_ever.validate import ( - ValidationReport, expand_paths, + render_report_github, + render_report_json, validate_aggregate, validate_file, validate_instance_file, - render_report_json, - render_report_github, ) # --------------------------------------------------------------------------- @@ -23,82 +19,82 @@ # --------------------------------------------------------------------------- VALID_AGGREGATE: dict = { - "schema_version": "0.2.2", - "evaluation_id": "test/model/123", - "retrieved_timestamp": "1234567890", - "source_metadata": { - "source_type": "evaluation_run", - "source_organization_name": "TestOrg", - "evaluator_relationship": "first_party", + 'schema_version': '0.2.2', + 'evaluation_id': 'test/model/123', + 'retrieved_timestamp': '1234567890', + 'source_metadata': { + 'source_type': 'evaluation_run', + 'source_organization_name': 'TestOrg', + 'evaluator_relationship': 'first_party', }, - "eval_library": {"name": "inspect_ai", "version": "0.3.0"}, - "model_info": {"name": "test-model", "id": "org/test-model"}, - "evaluation_results": [ + 'eval_library': {'name': 'inspect_ai', 'version': '0.3.0'}, + 'model_info': {'name': 'test-model', 'id': 'org/test-model'}, + 'evaluation_results': [ { - "evaluation_name": "test_eval", - "source_data": { - "dataset_name": "test_ds", - "source_type": "hf_dataset", - "hf_repo": "org/test-ds", + 'evaluation_name': 'test_eval', + 'source_data': { + 'dataset_name': 'test_ds', + 'source_type': 'hf_dataset', + 'hf_repo': 'org/test-ds', }, - "metric_config": { - "lower_is_better": False, - "score_type": "binary", + 'metric_config': { + 'lower_is_better': False, + 'score_type': 'binary', }, - "score_details": {"score": 0.95}, + 'score_details': {'score': 0.95}, } ], } VALID_SINGLE_TURN: dict = { - "schema_version": "instance_level_eval_0.2.2", - "evaluation_id": "test/model/123", - "model_id": "org/test-model", - "evaluation_name": "test_eval", - "sample_id": "sample_001", - "interaction_type": "single_turn", - "input": {"raw": "What is 2+2?", "reference": ["4"]}, - "output": {"raw": ["4"]}, - "answer_attribution": [ + 'schema_version': 'instance_level_eval_0.2.2', + 'evaluation_id': 'test/model/123', + 'model_id': 'org/test-model', + 'evaluation_name': 'test_eval', + 'sample_id': 'sample_001', + 'interaction_type': 'single_turn', + 'input': {'raw': 'What is 2+2?', 'reference': ['4']}, + 'output': {'raw': ['4']}, + 'answer_attribution': [ { - "turn_idx": 0, - "source": "output.raw", - "extracted_value": "4", - "extraction_method": "exact_match", - "is_terminal": True, + 'turn_idx': 0, + 'source': 'output.raw', + 'extracted_value': '4', + 'extraction_method': 'exact_match', + 'is_terminal': True, } ], - "evaluation": {"score": 1.0, "is_correct": True}, + 'evaluation': {'score': 1.0, 'is_correct': True}, } VALID_MULTI_TURN: dict = { - "schema_version": "instance_level_eval_0.2.2", - "evaluation_id": "test/model/123", - "model_id": "org/test-model", - "evaluation_name": "test_eval", - "sample_id": "sample_002", - "interaction_type": "multi_turn", - "input": {"raw": "Solve this problem", "reference": ["42"]}, - "messages": [ - {"turn_idx": 0, "role": "user", "content": "Solve this problem"}, - {"turn_idx": 1, "role": "assistant", "content": "The answer is 42"}, + 'schema_version': 'instance_level_eval_0.2.2', + 'evaluation_id': 'test/model/123', + 'model_id': 'org/test-model', + 'evaluation_name': 'test_eval', + 'sample_id': 'sample_002', + 'interaction_type': 'multi_turn', + 'input': {'raw': 'Solve this problem', 'reference': ['42']}, + 'messages': [ + {'turn_idx': 0, 'role': 'user', 'content': 'Solve this problem'}, + {'turn_idx': 1, 'role': 'assistant', 'content': 'The answer is 42'}, ], - "answer_attribution": [ + 'answer_attribution': [ { - "turn_idx": 1, - "source": "messages[1].content", - "extracted_value": "42", - "extraction_method": "regex", - "is_terminal": True, + 'turn_idx': 1, + 'source': 'messages[1].content', + 'extracted_value': '42', + 'extraction_method': 'regex', + 'is_terminal': True, } ], - "evaluation": {"score": 1.0, "is_correct": True}, + 'evaluation': {'score': 1.0, 'is_correct': True}, } def _write_json(tmp_path: Path, name: str, data: dict) -> Path: p = tmp_path / name - p.write_text(json.dumps(data), encoding="utf-8") + p.write_text(json.dumps(data), encoding='utf-8') return p @@ -110,7 +106,7 @@ def _write_jsonl(tmp_path: Path, name: str, lines: list[dict | str]) -> Path: text_lines.append(item) else: text_lines.append(json.dumps(item)) - p.write_text("\n".join(text_lines) + "\n", encoding="utf-8") + p.write_text('\n'.join(text_lines) + '\n', encoding='utf-8') return p @@ -121,96 +117,98 @@ def _write_jsonl(tmp_path: Path, name: str, lines: list[dict | str]) -> Path: class TestAggregateValidation: def test_valid_json_passes(self, tmp_path: Path): - fp = _write_json(tmp_path, "valid.json", VALID_AGGREGATE) + fp = _write_json(tmp_path, 'valid.json', VALID_AGGREGATE) report = validate_aggregate(fp) assert report.valid is True assert report.errors == [] - assert report.file_type == "aggregate" + assert report.file_type == 'aggregate' def test_missing_required_field(self, tmp_path: Path): data = {**VALID_AGGREGATE} - del data["evaluation_id"] - fp = _write_json(tmp_path, "missing.json", data) + del data['evaluation_id'] + fp = _write_json(tmp_path, 'missing.json', data) report = validate_aggregate(fp) assert report.valid is False - assert any("evaluation_id" in e["loc"] for e in report.errors) + assert any('evaluation_id' in e['loc'] for e in report.errors) def test_extra_field_on_evaluation_log_fails(self, tmp_path: Path): - data = {**VALID_AGGREGATE, "unexpected_field": "oops"} - fp = _write_json(tmp_path, "extra.json", data) + data = {**VALID_AGGREGATE, 'unexpected_field': 'oops'} + fp = _write_json(tmp_path, 'extra.json', data) report = validate_aggregate(fp) assert report.valid is False - assert any("unexpected_field" in e["loc"] for e in report.errors) + assert any('unexpected_field' in e['loc'] for e in report.errors) def test_extra_field_on_generation_args_fails(self, tmp_path: Path): data = json.loads(json.dumps(VALID_AGGREGATE)) - data["evaluation_results"][0]["generation_config"] = { - "generation_args": {"temperature": 0.7, "unknown_param": "bad"} + data['evaluation_results'][0]['generation_config'] = { + 'generation_args': {'temperature': 0.7, 'unknown_param': 'bad'} } - fp = _write_json(tmp_path, "extra_gen.json", data) + fp = _write_json(tmp_path, 'extra_gen.json', data) report = validate_aggregate(fp) assert report.valid is False - assert any("unknown_param" in e["loc"] for e in report.errors) + assert any('unknown_param' in e['loc'] for e in report.errors) def test_score_type_levels_without_level_names_fails(self, tmp_path: Path): data = json.loads(json.dumps(VALID_AGGREGATE)) - data["evaluation_results"][0]["metric_config"] = { - "lower_is_better": False, - "score_type": "levels", + data['evaluation_results'][0]['metric_config'] = { + 'lower_is_better': False, + 'score_type': 'levels', # missing level_names and has_unknown_level } - fp = _write_json(tmp_path, "levels.json", data) + fp = _write_json(tmp_path, 'levels.json', data) report = validate_aggregate(fp) assert report.valid is False - assert any("level_names" in e["msg"] for e in report.errors) + assert any('level_names' in e['msg'] for e in report.errors) - def test_score_type_continuous_without_min_score_fails(self, tmp_path: Path): + def test_score_type_continuous_without_min_score_fails( + self, tmp_path: Path + ): data = json.loads(json.dumps(VALID_AGGREGATE)) - data["evaluation_results"][0]["metric_config"] = { - "lower_is_better": False, - "score_type": "continuous", + data['evaluation_results'][0]['metric_config'] = { + 'lower_is_better': False, + 'score_type': 'continuous', # missing min_score and max_score } - fp = _write_json(tmp_path, "continuous.json", data) + fp = _write_json(tmp_path, 'continuous.json', data) report = validate_aggregate(fp) assert report.valid is False - assert any("min_score" in e["msg"] for e in report.errors) + assert any('min_score' in e['msg'] for e in report.errors) def test_source_data_discriminated_error(self, tmp_path: Path): data = json.loads(json.dumps(VALID_AGGREGATE)) - data["evaluation_results"][0]["source_data"] = { - "dataset_name": "test", - "source_type": "hf_dataset", + data['evaluation_results'][0]['source_data'] = { + 'dataset_name': 'test', + 'source_type': 'hf_dataset', # valid hf_dataset, should pass } - fp = _write_json(tmp_path, "disc.json", data) + fp = _write_json(tmp_path, 'disc.json', data) report = validate_aggregate(fp) assert report.valid is True def test_source_data_wrong_source_type_fails(self, tmp_path: Path): data = json.loads(json.dumps(VALID_AGGREGATE)) - data["evaluation_results"][0]["source_data"] = { - "dataset_name": "test", - "source_type": "invalid_type", + data['evaluation_results'][0]['source_data'] = { + 'dataset_name': 'test', + 'source_type': 'invalid_type', } - fp = _write_json(tmp_path, "bad_source.json", data) + fp = _write_json(tmp_path, 'bad_source.json', data) report = validate_aggregate(fp) assert report.valid is False def test_additional_details_non_string_values_fail(self, tmp_path: Path): data = json.loads(json.dumps(VALID_AGGREGATE)) - data["model_info"]["additional_details"] = {"params_billions": 8.357} - fp = _write_json(tmp_path, "nonstr.json", data) + data['model_info']['additional_details'] = {'params_billions': 8.357} + fp = _write_json(tmp_path, 'nonstr.json', data) report = validate_aggregate(fp) assert report.valid is False - assert any("string" in e["msg"] for e in report.errors) + assert any('string' in e['msg'] for e in report.errors) def test_json_parse_error(self, tmp_path: Path): - fp = tmp_path / "bad.json" - fp.write_text("{invalid json}", encoding="utf-8") + fp = tmp_path / 'bad.json' + fp.write_text('{invalid json}', encoding='utf-8') report = validate_aggregate(fp) assert report.valid is False - assert report.errors[0]["type"] == "json_parse_error" + assert report.errors[0]['type'] == 'json_parse_error' # =================================================================== @@ -220,64 +218,73 @@ def test_json_parse_error(self, tmp_path: Path): class TestInstanceLevelValidation: def test_valid_single_turn_passes(self, tmp_path: Path): - fp = _write_jsonl(tmp_path, "valid.jsonl", [VALID_SINGLE_TURN]) + fp = _write_jsonl(tmp_path, 'valid.jsonl', [VALID_SINGLE_TURN]) report = validate_instance_file(fp) assert report.valid is True assert report.line_count == 1 def test_valid_multi_turn_passes(self, tmp_path: Path): - fp = _write_jsonl(tmp_path, "multi.jsonl", [VALID_MULTI_TURN]) + fp = _write_jsonl(tmp_path, 'multi.jsonl', [VALID_MULTI_TURN]) report = validate_instance_file(fp) assert report.valid is True def test_single_turn_with_messages_fails(self, tmp_path: Path): data = json.loads(json.dumps(VALID_SINGLE_TURN)) - data["messages"] = [ - {"turn_idx": 0, "role": "user", "content": "hi"}, + data['messages'] = [ + {'turn_idx': 0, 'role': 'user', 'content': 'hi'}, ] - fp = _write_jsonl(tmp_path, "bad_st.jsonl", [data]) + fp = _write_jsonl(tmp_path, 'bad_st.jsonl', [data]) report = validate_instance_file(fp) assert report.valid is False - assert any("must not have messages" in e["msg"] for e in report.errors) + assert any('must not have messages' in e['msg'] for e in report.errors) def test_multi_turn_without_messages_fails(self, tmp_path: Path): data = json.loads(json.dumps(VALID_MULTI_TURN)) - del data["messages"] - fp = _write_jsonl(tmp_path, "no_msgs.jsonl", [data]) + del data['messages'] + fp = _write_jsonl(tmp_path, 'no_msgs.jsonl', [data]) report = validate_instance_file(fp) assert report.valid is False - assert any("requires messages" in e["msg"] for e in report.errors) + assert any('requires messages' in e['msg'] for e in report.errors) - def test_invalid_line_in_middle_reports_correct_line_number(self, tmp_path: Path): + def test_invalid_line_in_middle_reports_correct_line_number( + self, tmp_path: Path + ): bad_line = {**VALID_SINGLE_TURN} - del bad_line["evaluation_id"] + del bad_line['evaluation_id'] fp = _write_jsonl( - tmp_path, "mid.jsonl", [VALID_SINGLE_TURN, bad_line, VALID_SINGLE_TURN] + tmp_path, + 'mid.jsonl', + [VALID_SINGLE_TURN, bad_line, VALID_SINGLE_TURN], ) report = validate_instance_file(fp) assert report.valid is False - assert any("line 2" in e["loc"] for e in report.errors) + assert any('line 2' in e['loc'] for e in report.errors) def test_json_parse_error_reports_line_number(self, tmp_path: Path): fp = _write_jsonl( - tmp_path, "parse.jsonl", [VALID_SINGLE_TURN, "{bad json}"] + tmp_path, 'parse.jsonl', [VALID_SINGLE_TURN, '{bad json}'] ) report = validate_instance_file(fp) assert report.valid is False - assert report.errors[0]["type"] == "json_parse_error" - assert "line 2" in report.errors[0]["loc"] + assert report.errors[0]['type'] == 'json_parse_error' + assert 'line 2' in report.errors[0]['loc'] def test_empty_jsonl_passes(self, tmp_path: Path): - fp = tmp_path / "empty.jsonl" - fp.write_text("", encoding="utf-8") + fp = tmp_path / 'empty.jsonl' + fp.write_text('', encoding='utf-8') report = validate_instance_file(fp) assert report.valid is True assert report.line_count == 0 def test_blank_lines_skipped(self, tmp_path: Path): - lines = [json.dumps(VALID_SINGLE_TURN), "", " ", json.dumps(VALID_SINGLE_TURN)] - fp = tmp_path / "blanks.jsonl" - fp.write_text("\n".join(lines) + "\n", encoding="utf-8") + lines = [ + json.dumps(VALID_SINGLE_TURN), + '', + ' ', + json.dumps(VALID_SINGLE_TURN), + ] + fp = tmp_path / 'blanks.jsonl' + fp.write_text('\n'.join(lines) + '\n', encoding='utf-8') report = validate_instance_file(fp) assert report.valid is True assert report.line_count == 2 @@ -290,82 +297,82 @@ def test_blank_lines_skipped(self, tmp_path: Path): class TestFileDispatch: def test_json_dispatches_to_aggregate(self, tmp_path: Path): - fp = _write_json(tmp_path, "test.json", VALID_AGGREGATE) + fp = _write_json(tmp_path, 'test.json', VALID_AGGREGATE) report = validate_file(fp) - assert report.file_type == "aggregate" + assert report.file_type == 'aggregate' def test_jsonl_dispatches_to_instance(self, tmp_path: Path): - fp = _write_jsonl(tmp_path, "test.jsonl", [VALID_SINGLE_TURN]) + fp = _write_jsonl(tmp_path, 'test.jsonl', [VALID_SINGLE_TURN]) report = validate_file(fp) - assert report.file_type == "instance" + assert report.file_type == 'instance' def test_unsupported_extension(self, tmp_path: Path): - fp = tmp_path / "test.csv" - fp.write_text("a,b,c", encoding="utf-8") + fp = tmp_path / 'test.csv' + fp.write_text('a,b,c', encoding='utf-8') report = validate_file(fp) assert report.valid is False - assert report.errors[0]["type"] == "unsupported_extension" + assert report.errors[0]['type'] == 'unsupported_extension' def test_directory_expansion(self, tmp_path: Path): - sub = tmp_path / "sub" + sub = tmp_path / 'sub' sub.mkdir() - _write_json(sub, "a.json", VALID_AGGREGATE) - _write_jsonl(sub, "b.jsonl", [VALID_SINGLE_TURN]) - (sub / "c.txt").write_text("ignored") + _write_json(sub, 'a.json', VALID_AGGREGATE) + _write_jsonl(sub, 'b.jsonl', [VALID_SINGLE_TURN]) + (sub / 'c.txt').write_text('ignored') paths = expand_paths([str(sub)]) extensions = {p.suffix for p in paths} - assert ".json" in extensions - assert ".jsonl" in extensions - assert ".txt" not in extensions + assert '.json' in extensions + assert '.jsonl' in extensions + assert '.txt' not in extensions class TestMaxErrors: def test_max_errors_caps_output(self, tmp_path: Path): bad_line = {**VALID_SINGLE_TURN} - del bad_line["evaluation_id"] + del bad_line['evaluation_id'] lines = [bad_line] * 100 - fp = _write_jsonl(tmp_path, "many.jsonl", lines) + fp = _write_jsonl(tmp_path, 'many.jsonl', lines) report = validate_instance_file(fp, max_errors=5) assert report.valid is False # Should have at most 5 real errors + 1 truncation message assert len(report.errors) <= 6 - assert any(e["type"] == "truncated" for e in report.errors) + assert any(e['type'] == 'truncated' for e in report.errors) class TestOutputFormats: def test_json_output_is_valid_json(self, tmp_path: Path): - fp = _write_json(tmp_path, "test.json", VALID_AGGREGATE) + fp = _write_json(tmp_path, 'test.json', VALID_AGGREGATE) report = validate_file(fp) output = render_report_json([report]) parsed = json.loads(output) assert isinstance(parsed, list) assert len(parsed) == 1 - assert parsed[0]["valid"] is True + assert parsed[0]['valid'] is True def test_github_output_format(self, tmp_path: Path): data = {**VALID_AGGREGATE} - del data["evaluation_id"] - fp = _write_json(tmp_path, "fail.json", data) + del data['evaluation_id'] + fp = _write_json(tmp_path, 'fail.json', data) report = validate_file(fp) output = render_report_github([report]) - assert output.startswith("::error file=") + assert output.startswith('::error file=') def test_github_output_empty_on_pass(self, tmp_path: Path): - fp = _write_json(tmp_path, "pass.json", VALID_AGGREGATE) + fp = _write_json(tmp_path, 'pass.json', VALID_AGGREGATE) report = validate_file(fp) output = render_report_github([report]) - assert output == "" + assert output == '' class TestExitCode: def test_exit_code_0_on_pass(self, tmp_path: Path): - fp = _write_json(tmp_path, "pass.json", VALID_AGGREGATE) + fp = _write_json(tmp_path, 'pass.json', VALID_AGGREGATE) report = validate_file(fp) assert report.valid is True def test_exit_code_1_on_failure(self, tmp_path: Path): data = {**VALID_AGGREGATE} - del data["evaluation_id"] - fp = _write_json(tmp_path, "fail.json", data) + del data['evaluation_id'] + fp = _write_json(tmp_path, 'fail.json', data) report = validate_file(fp) assert report.valid is False diff --git a/utils/helm/adapter.py b/utils/helm/adapter.py index 014c8449f..745cd8ee0 100644 --- a/utils/helm/adapter.py +++ b/utils/helm/adapter.py @@ -29,9 +29,8 @@ ModelInfo, ScoreDetails, ScoreType, - SourceDataUrl + SourceDataUrl, ) - from every_eval_ever.helpers import ( fetch_json, get_developer, @@ -45,44 +44,44 @@ def parse_args(): """Parse CLI arguments.""" parser = ArgumentParser() parser.add_argument( - "--leaderboard_name", + '--leaderboard_name', type=str, - default="HELM_Capabilities", + default='HELM_Capabilities', choices=[ - "HELM_Capabilities", - "HELM_Lite", - "HELM_Classic", - "HELM_Instruct", - "HELM_MMLU", + 'HELM_Capabilities', + 'HELM_Lite', + 'HELM_Classic', + 'HELM_Instruct', + 'HELM_MMLU', ], ) parser.add_argument( - "--source_data_url", + '--source_data_url', type=str, default=( - "https://storage.googleapis.com/crfm-helm-public/" - "capabilities/benchmark_output/releases/v1.12.0/" - "groups/core_scenarios.json" + 'https://storage.googleapis.com/crfm-helm-public/' + 'capabilities/benchmark_output/releases/v1.12.0/' + 'groups/core_scenarios.json' ), ) parser.add_argument( - "--eval_library_name", + '--eval_library_name', type=str, - default="helm", - help="Name of the evaluation library (e.g. helm, lm_eval, inspect_ai)", + default='helm', + help='Name of the evaluation library (e.g. helm, lm_eval, inspect_ai)', ) parser.add_argument( - "--eval_library_version", + '--eval_library_version', type=str, - default="unknown", - help="Version of the evaluation library", + default='unknown', + help='Version of the evaluation library', ) return parser.parse_args() def clean_model_name(model_name: str) -> str: """Remove parentheses from model name.""" - return model_name.replace("(", "").replace(")", "") + return model_name.replace('(', '').replace(')', '') def extract_generation_config(run_specs: List[str]) -> Dict[str, Any]: @@ -90,12 +89,12 @@ def extract_generation_config(run_specs: List[str]) -> Dict[str, Any]: generation_config: Dict[str, Any] = defaultdict(list) for run_spec in run_specs: - _, args_str = run_spec.split(":", 1) - args = args_str.split(",") + _, args_str = run_spec.split(':', 1) + args = args_str.split(',') for arg in args: - key, value = arg.split("=") - if key == "model": + key, value = arg.split('=') + if key == 'model': continue generation_config[key].append(value) @@ -109,46 +108,49 @@ def extract_generation_config(run_specs: List[str]) -> Dict[str, Any]: return dict(generation_config) -def extract_model_info_from_row(row: List[Dict[str, Any]], model_name: str) -> Tuple[ModelInfo, str]: +def extract_model_info_from_row( + row: List[Dict[str, Any]], model_name: str +) -> Tuple[ModelInfo, str]: """Extract model metadata from leaderboard row.""" run_spec_names = next( - (cell["run_spec_names"] for cell in row if "run_spec_names" in cell), + (cell['run_spec_names'] for cell in row if 'run_spec_names' in cell), None, ) - if "(" in model_name and ")" in model_name: + if '(' in model_name and ')' in model_name: model_name = clean_model_name(model_name) if not run_spec_names: developer = get_developer(model_name) - if developer == "unknown": - model_id = model_name.replace(" ", "-") + if developer == 'unknown': + model_id = model_name.replace(' ', '-') else: - model_id = f"{developer}/{model_name.replace(' ', '-')}" + model_id = f'{developer}/{model_name.replace(" ", "-")}' else: spec = run_spec_names[0] - args = spec.split(":", 1)[1].split(",") - + args = spec.split(':', 1)[1].split(',') + model_details = next( - (arg.split("=", 1)[1] for arg in args if arg.startswith("model=")), - "", + (arg.split('=', 1)[1] for arg in args if arg.startswith('model=')), + '', ) - developer = model_details.split("_")[0] - model_id = model_details.replace("_", "/") + developer = model_details.split('_')[0] + model_id = model_details.replace('_', '/') - if developer == "unknown": + if developer == 'unknown': developer = get_developer(model_name) model_info = make_model_info( model_name=model_name, developer=developer, - inference_platform="unknown", + inference_platform='unknown', ) model_info.id = model_id return model_info + def find_column_ranges(tab_rows: List[List[Dict[str, Any]]]): """Determine min/max values for each metric column.""" num_columns = len(tab_rows[0]) - 1 @@ -157,7 +159,7 @@ def find_column_ranges(tab_rows: List[List[Dict[str, Any]]]): for row in tab_rows: for idx, cell in enumerate(row[1:], start=0): - value = cell.get("value", 0) + value = cell.get('value', 0) if value is not None: mins[idx] = min(mins[idx], value) maxs[idx] = max(maxs[idx], value) @@ -168,8 +170,8 @@ def find_column_ranges(tab_rows: List[List[Dict[str, Any]]]): def convert( leaderboard_name: str, leaderboard_data: List[Dict[str, Any]], - eval_library_name: str = "helm", - eval_library_version: str = "unknown", + eval_library_name: str = 'helm', + eval_library_version: str = 'unknown', ): """Convert HELM leaderboard data into unified evaluation logs.""" retrieved_timestamp = str(time.time()) @@ -179,14 +181,14 @@ def convert( model_results: Dict[str, Dict[str, EvaluationResult]] = defaultdict(dict) for tab in leaderboard_data: - tab_name = tab.get("title") - headers = tab.get("header") - rows = tab.get("rows") + tab_name = tab.get('title') + headers = tab.get('header') + rows = tab.get('rows') mins, maxs = find_column_ranges(rows) for row in rows: - model_name = row[0].get("value") + model_name = row[0].get('value') if model_name not in model_infos: model_info = extract_model_info_from_row(row, model_name) @@ -194,17 +196,17 @@ def convert( model_ids[model_name] = model_info.id for col_idx, (header, cell) in enumerate(zip(headers[1:], row[1:])): - full_eval_name = header.get("value") + full_eval_name = header.get('value') short_name = ( full_eval_name.split()[0] - if "-" in full_eval_name + if '-' in full_eval_name else full_eval_name ) is_new_metric = ( - tab_name.lower() == "accuracy" + tab_name.lower() == 'accuracy' or short_name not in model_results[model_name] - or "instruct" in leaderboard_name.lower() + or 'instruct' in leaderboard_name.lower() ) if full_eval_name.lower().startswith('mean'): @@ -218,32 +220,42 @@ def convert( if metric_name: evaluation_description = f'{metric_name} on {dataset_name}' else: - evaluation_description = header.get("description", "") + evaluation_description = header.get('description', '') if is_new_metric: metric_config = MetricConfig( evaluation_description=evaluation_description, - lower_is_better=header.get("lower_is_better", False), + lower_is_better=header.get('lower_is_better', False), min_score=( - 0.0 if mins[col_idx] >= 0 else math.floor(mins[col_idx]) + 0.0 + if mins[col_idx] >= 0 + else math.floor(mins[col_idx]) ), max_score=( - 1.0 if maxs[col_idx] <= 1 else math.ceil(maxs[col_idx]) + 1.0 + if maxs[col_idx] <= 1 + else math.ceil(maxs[col_idx]) ), score_type=ScoreType.continuous, ) - source_dataset_name = leaderboard_name if leaderboard_name.lower() == 'helm_mmlu' else dataset_name + source_dataset_name = ( + leaderboard_name + if leaderboard_name.lower() == 'helm_mmlu' + else dataset_name + ) source_data = SourceDataUrl( dataset_name=source_dataset_name, source_type='url', - url=[args.source_data_url] + url=[args.source_data_url], ) generation_config = ( - extract_generation_config(cell.get("run_spec_names", [])) - if cell.get("run_spec_names") + extract_generation_config( + cell.get('run_spec_names', []) + ) + if cell.get('run_spec_names') else {} ) @@ -253,18 +265,18 @@ def convert( metric_config=metric_config, score_details=ScoreDetails( score=( - round(cell.get("value"), 3) - if cell.get("value") is not None + round(cell.get('value'), 3) + if cell.get('value') is not None else -1 ), details={ - "description": str(cell.get("description", "")), - "tab": str(tab_name), + 'description': str(cell.get('description', '')), + 'tab': str(tab_name), }, ), generation_config=GenerationConfig( additional_details=generation_config - ) + ), ) else: # Add extra score details under the same metric @@ -272,35 +284,37 @@ def convert( detail_key = ( full_eval_name if full_eval_name != existing.evaluation_name - else f"{full_eval_name} - {tab_name}" + else f'{full_eval_name} - {tab_name}' ) if existing.score_details.details is None: existing.score_details.details = {} - existing.score_details.details[detail_key] = json.dumps({ - "description": str(cell.get("description", "")), - "tab": tab_name, - "score": str(cell.get("value", "")), - }) - + existing.score_details.details[detail_key] = json.dumps( + { + 'description': str(cell.get('description', '')), + 'tab': tab_name, + 'score': str(cell.get('value', '')), + } + ) + # Save evaluation logs for model_name, results_by_metric in model_results.items(): model_info = model_infos[model_name] model_id = model_ids[model_name] evaluation_id = ( - f"{leaderboard_name}/" - f"{model_id.replace('/', '_')}/" - f"{retrieved_timestamp}" + f'{leaderboard_name}/' + f'{model_id.replace("/", "_")}/' + f'{retrieved_timestamp}' ) eval_log = EvaluationLog( - schema_version="0.2.1", + schema_version='0.2.1', evaluation_id=evaluation_id, retrieved_timestamp=retrieved_timestamp, source_metadata=make_source_metadata( source_name=leaderboard_name, - organization_name="crfm", + organization_name='crfm', evaluator_relationship=EvaluatorRelationship.third_party, ), eval_library=EvalLibrary( @@ -312,31 +326,31 @@ def convert( ) # Determine output path - if model_info.developer == "unknown": + if model_info.developer == 'unknown': developer = model_id model = model_id else: - if "/" in model_id: - developer, model = model_id.split("/", 1) + if '/' in model_id: + developer, model = model_id.split('/', 1) else: developer = model_info.developer model = model_id filepath = save_evaluation_log( eval_log, - f"data/{leaderboard_name}", + f'data/{leaderboard_name}', developer, model, ) - print(f"Saved: {filepath}") + print(f'Saved: {filepath}') -if __name__ == "__main__": +if __name__ == '__main__': args = parse_args() leaderboard_name = args.leaderboard_name.lower() - print(f"Fetching {leaderboard_name} data from {args.source_data_url}") + print(f'Fetching {leaderboard_name} data from {args.source_data_url}') leaderboard_data = fetch_json(args.source_data_url) convert( @@ -346,4 +360,4 @@ def convert( eval_library_version=args.eval_library_version, ) - print("Done!") + print('Done!') diff --git a/utils/hfopenllm_v2/adapter.py b/utils/hfopenllm_v2/adapter.py index e9f7d89d7..d11bcb3e1 100644 --- a/utils/hfopenllm_v2/adapter.py +++ b/utils/hfopenllm_v2/adapter.py @@ -21,73 +21,70 @@ ScoreType, SourceDataHf, ) - from every_eval_ever.helpers import ( SCHEMA_VERSION, fetch_json, - get_developer, make_model_info, make_source_metadata, save_evaluation_log, ) - # Source URL -SOURCE_URL = "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted" -OUTPUT_DIR = "data/hfopenllm_v2" +SOURCE_URL = 'https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted' +OUTPUT_DIR = 'data/hfopenllm_v2' # Evaluation name mapping from API keys to display names EVALUATION_MAPPING = { - "ifeval": "IFEval", - "bbh": "BBH", - "math": "MATH Level 5", - "gpqa": "GPQA", - "musr": "MUSR", - "mmlu_pro": "MMLU-PRO", + 'ifeval': 'IFEval', + 'bbh': 'BBH', + 'math': 'MATH Level 5', + 'gpqa': 'GPQA', + 'musr': 'MUSR', + 'mmlu_pro': 'MMLU-PRO', } # Evaluation descriptions EVALUATION_DESCRIPTIONS = { - "IFEval": "Accuracy on IFEval", - "BBH": "Accuracy on BBH", - "MATH Level 5": "Exact Match on MATH Level 5", - "GPQA": "Accuracy on GPQA", - "MUSR": "Accuracy on MUSR", - "MMLU-PRO": "Accuracy on MMLU-PRO", + 'IFEval': 'Accuracy on IFEval', + 'BBH': 'Accuracy on BBH', + 'MATH Level 5': 'Exact Match on MATH Level 5', + 'GPQA': 'Accuracy on GPQA', + 'MUSR': 'Accuracy on MUSR', + 'MMLU-PRO': 'Accuracy on MMLU-PRO', } # Source data mapping: eval_key -> SourceDataHf SOURCE_DATA_MAPPING = { - "ifeval": SourceDataHf( - dataset_name="IFEval", - source_type="hf_dataset", - hf_repo="google/IFEval", + 'ifeval': SourceDataHf( + dataset_name='IFEval', + source_type='hf_dataset', + hf_repo='google/IFEval', ), - "bbh": SourceDataHf( - dataset_name="BBH", - source_type="hf_dataset", - hf_repo="SaylorTwift/bbh", + 'bbh': SourceDataHf( + dataset_name='BBH', + source_type='hf_dataset', + hf_repo='SaylorTwift/bbh', ), - "math": SourceDataHf( - dataset_name="MATH Level 5", - source_type="hf_dataset", - hf_repo="DigitalLearningGmbH/MATH-lighteval", + 'math': SourceDataHf( + dataset_name='MATH Level 5', + source_type='hf_dataset', + hf_repo='DigitalLearningGmbH/MATH-lighteval', ), - "gpqa": SourceDataHf( - dataset_name="GPQA", - source_type="hf_dataset", - hf_repo="Idavidrein/gpqa", + 'gpqa': SourceDataHf( + dataset_name='GPQA', + source_type='hf_dataset', + hf_repo='Idavidrein/gpqa', ), - "musr": SourceDataHf( - dataset_name="MUSR", - source_type="hf_dataset", - hf_repo="TAUR-Lab/MuSR", + 'musr': SourceDataHf( + dataset_name='MUSR', + source_type='hf_dataset', + hf_repo='TAUR-Lab/MuSR', ), - "mmlu_pro": SourceDataHf( - dataset_name="MMLU-PRO", - source_type="hf_dataset", - hf_repo="TIGER-Lab/MMLU-Pro", + 'mmlu_pro': SourceDataHf( + dataset_name='MMLU-PRO', + source_type='hf_dataset', + hf_repo='TIGER-Lab/MMLU-Pro', ), } @@ -96,17 +93,19 @@ def convert_model( model_data: Dict[str, Any], retrieved_timestamp: str ) -> EvaluationLog: """Convert a single model's data to EvaluationLog format.""" - model_id = model_data["model"]["name"] - if "/" not in model_id: + model_id = model_data['model']['name'] + if '/' not in model_id: raise ValueError(f"Expected 'org/model' format, got: {model_id}") - developer, model_name = model_id.split("/", 1) + developer, model_name = model_id.split('/', 1) # Build evaluation results eval_results: List[EvaluationResult] = [] - for eval_key, eval_data in model_data.get("evaluations", {}).items(): - display_name = eval_data.get("name", EVALUATION_MAPPING.get(eval_key, eval_key)) + for eval_key, eval_data in model_data.get('evaluations', {}).items(): + display_name = eval_data.get( + 'name', EVALUATION_MAPPING.get(eval_key, eval_key) + ) description = EVALUATION_DESCRIPTIONS.get( - display_name, f"Accuracy on {display_name}" + display_name, f'Accuracy on {display_name}' ) source_data = SOURCE_DATA_MAPPING.get(eval_key) if source_data is None: @@ -126,47 +125,51 @@ def convert_model( max_score=1.0, ), score_details=ScoreDetails( - score=round(eval_data.get("value", 0.0), 4), + score=round(eval_data.get('value', 0.0), 4), ), ) ) # Build additional details additional_details = {} - if "precision" in model_data["model"]: - additional_details["precision"] = str(model_data["model"]["precision"]) - if "architecture" in model_data["model"]: - additional_details["architecture"] = str(model_data["model"]["architecture"]) - if "params_billions" in model_data.get("metadata", {}): - additional_details["params_billions"] = str( - model_data["metadata"]["params_billions"] + if 'precision' in model_data['model']: + additional_details['precision'] = str(model_data['model']['precision']) + if 'architecture' in model_data['model']: + additional_details['architecture'] = str( + model_data['model']['architecture'] + ) + if 'params_billions' in model_data.get('metadata', {}): + additional_details['params_billions'] = str( + model_data['metadata']['params_billions'] ) # Build model info model_info = make_model_info( model_name=model_name, developer=developer, - inference_platform="unknown", + inference_platform='unknown', additional_details=additional_details if additional_details else None, ) # Build evaluation ID - evaluation_id = f"hfopenllm_v2/{developer}_{model_name}/{retrieved_timestamp}" + evaluation_id = ( + f'hfopenllm_v2/{developer}_{model_name}/{retrieved_timestamp}' + ) return EvaluationLog( schema_version=SCHEMA_VERSION, evaluation_id=evaluation_id, retrieved_timestamp=retrieved_timestamp, source_metadata=make_source_metadata( - source_name="HF Open LLM v2", - organization_name="Hugging Face", + source_name='HF Open LLM v2', + organization_name='Hugging Face', evaluator_relationship=EvaluatorRelationship.third_party, ), eval_library=EvalLibrary( - name="lm-evaluation-harness", - version="0.4.0", + name='lm-evaluation-harness', + version='0.4.0', additional_details={ - "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" + 'fork': 'https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess' }, ), model_info=model_info, @@ -174,37 +177,43 @@ def convert_model( ) -def process_models(models_data: List[Dict[str, Any]], output_dir: str = OUTPUT_DIR): +def process_models( + models_data: List[Dict[str, Any]], output_dir: str = OUTPUT_DIR +): """Process a list of model evaluation dicts and save them.""" retrieved_timestamp = str(time.time()) count = 0 for model_data in models_data: try: - model_id = model_data["model"]["name"] - if "/" not in model_id: - raise ValueError(f"Expected 'org/model' format, got: {model_id}") - developer, model = model_id.split("/", 1) + model_id = model_data['model']['name'] + if '/' not in model_id: + raise ValueError( + f"Expected 'org/model' format, got: {model_id}" + ) + developer, model = model_id.split('/', 1) # Convert to EvaluationLog eval_log = convert_model(model_data, retrieved_timestamp) # Save - filepath = save_evaluation_log(eval_log, output_dir, developer, model) - print(f"Saved: {filepath}") + filepath = save_evaluation_log( + eval_log, output_dir, developer, model + ) + print(f'Saved: {filepath}') count += 1 except Exception as e: - model_name = model_data.get("model", {}).get("name", "unknown") - print(f"Error processing {model_name}: {e}") + model_name = model_data.get('model', {}).get('name', 'unknown') + print(f'Error processing {model_name}: {e}') return count -if __name__ == "__main__": - print(f"Fetching data from {SOURCE_URL}...") +if __name__ == '__main__': + print(f'Fetching data from {SOURCE_URL}...') all_models = fetch_json(SOURCE_URL) - print(f"Processing {len(all_models)} models...") + print(f'Processing {len(all_models)} models...') count = process_models(all_models) - print(f"Done! Processed {count} models.") + print(f'Done! Processed {count} models.') diff --git a/utils/livecodebenchpro/adapter.py b/utils/livecodebenchpro/adapter.py index 0c6031443..e400ae5e0 100644 --- a/utils/livecodebenchpro/adapter.py +++ b/utils/livecodebenchpro/adapter.py @@ -11,78 +11,84 @@ import json from pathlib import Path -BASE_URL = "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty" -DATA_DIR = Path(__file__).parent.parent.parent / "data" / "livecodebenchpro" +BASE_URL = 'https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty' +DATA_DIR = Path(__file__).parent.parent.parent / 'data' / 'livecodebenchpro' # Map evaluation_name -> difficulty for URL matching DIFFICULTY_FOR_EVAL = { - "Hard Problems": "hard", - "Medium Problems": "medium", - "Easy Problems": "easy", + 'Hard Problems': 'hard', + 'Medium Problems': 'medium', + 'Easy Problems': 'easy', } def make_source_data(difficulty: str) -> dict: """Build a SourceDataUrl dict for a given difficulty.""" return { - "dataset_name": f"{difficulty.capitalize()} Problems", - "source_type": "url", - "url": [f"{BASE_URL}?difficulty={difficulty}&benchmark_mode=live"], + 'dataset_name': f'{difficulty.capitalize()} Problems', + 'source_type': 'url', + 'url': [f'{BASE_URL}?difficulty={difficulty}&benchmark_mode=live'], } def migrate_file(filepath: Path) -> None: """Migrate a single JSON file from 0.1.0 to 0.2.0.""" - with open(filepath, "r") as f: + with open(filepath, 'r') as f: data = json.load(f) - if data.get("schema_version") == "0.2.0": - print(f"Skipping (already 0.2.0): {filepath}") + if data.get('schema_version') == '0.2.0': + print(f'Skipping (already 0.2.0): {filepath}') return - if data.get("schema_version") != "0.1.0": - raise ValueError(f"{filepath}: expected schema_version 0.1.0, got {data.get('schema_version')}") + if data.get('schema_version') != '0.1.0': + raise ValueError( + f'{filepath}: expected schema_version 0.1.0, got {data.get("schema_version")}' + ) # Remove top-level source_data - if "source_data" not in data: - raise ValueError(f"{filepath}: missing top-level source_data") - del data["source_data"] + if 'source_data' not in data: + raise ValueError(f'{filepath}: missing top-level source_data') + del data['source_data'] # Add source_data to each evaluation_result - for result in data["evaluation_results"]: - eval_name = result.get("evaluation_name") + for result in data['evaluation_results']: + eval_name = result.get('evaluation_name') if not eval_name: - raise ValueError(f"{filepath}: evaluation_result missing evaluation_name") + raise ValueError( + f'{filepath}: evaluation_result missing evaluation_name' + ) difficulty = DIFFICULTY_FOR_EVAL.get(eval_name) if not difficulty: - raise ValueError(f"{filepath}: unknown evaluation_name '{eval_name}'") + raise ValueError( + f"{filepath}: unknown evaluation_name '{eval_name}'" + ) - result["source_data"] = make_source_data(difficulty) + result['source_data'] = make_source_data(difficulty) - data["schema_version"] = "0.2.0" + data['schema_version'] = '0.2.0' - with open(filepath, "w") as f: + with open(filepath, 'w') as f: json.dump(data, f, indent=2) - f.write("\n") + f.write('\n') def main(): if not DATA_DIR.exists(): - raise FileNotFoundError(f"Data directory not found: {DATA_DIR}") + raise FileNotFoundError(f'Data directory not found: {DATA_DIR}') - files = list(DATA_DIR.rglob("*.json")) + files = list(DATA_DIR.rglob('*.json')) if not files: - raise FileNotFoundError(f"No JSON files found in {DATA_DIR}") + raise FileNotFoundError(f'No JSON files found in {DATA_DIR}') - print(f"Migrating {len(files)} files in {DATA_DIR}...") + print(f'Migrating {len(files)} files in {DATA_DIR}...') for filepath in files: migrate_file(filepath) - print(f"Migrated: {filepath}") + print(f'Migrated: {filepath}') - print(f"\nDone! Migrated {len(files)} files to schema 0.2.0.") + print(f'\nDone! Migrated {len(files)} files to schema 0.2.0.') -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/utils/rewardbench/adapter.py b/utils/rewardbench/adapter.py index 3e4ac001f..0784ed239 100644 --- a/utils/rewardbench/adapter.py +++ b/utils/rewardbench/adapter.py @@ -28,7 +28,6 @@ SourceDataHf, SourceMetadata, ) - from every_eval_ever.helpers import ( fetch_csv, fetch_json, @@ -38,64 +37,64 @@ ) # Schema version -SCHEMA_VERSION = "0.2.1" +SCHEMA_VERSION = '0.2.1' # Data source URLs -REWARDBENCH_V1_CSV = "https://huggingface.co/spaces/allenai/reward-bench/resolve/main/leaderboard/final-rbv1-data.csv" -REWARDBENCH_V2_TREE_API = "https://huggingface.co/api/datasets/allenai/reward-bench-2-results/tree/main/eval-set" -REWARDBENCH_V2_FILE_BASE = "https://huggingface.co/datasets/allenai/reward-bench-2-results/resolve/main/eval-set" +REWARDBENCH_V1_CSV = 'https://huggingface.co/spaces/allenai/reward-bench/resolve/main/leaderboard/final-rbv1-data.csv' +REWARDBENCH_V2_TREE_API = 'https://huggingface.co/api/datasets/allenai/reward-bench-2-results/tree/main/eval-set' +REWARDBENCH_V2_FILE_BASE = 'https://huggingface.co/datasets/allenai/reward-bench-2-results/resolve/main/eval-set' -OUTPUT_DIR = Path("data/reward-bench") +OUTPUT_DIR = Path('data/reward-bench') # RewardBench v1 source data (shared across all v1 evaluation results) V1_SOURCE_DATA = SourceDataHf( - dataset_name="RewardBench", - source_type="hf_dataset", - hf_repo="allenai/reward-bench", + dataset_name='RewardBench', + source_type='hf_dataset', + hf_repo='allenai/reward-bench', ) # RewardBench v2 source data (shared across all v2 evaluation results) V2_SOURCE_DATA = SourceDataHf( - dataset_name="RewardBench 2", - source_type="hf_dataset", - hf_repo="allenai/reward-bench-2-results", + dataset_name='RewardBench 2', + source_type='hf_dataset', + hf_repo='allenai/reward-bench-2-results', ) # Source metadata (shared) V1_SOURCE_METADATA = SourceMetadata( - source_name="RewardBench", - source_type="documentation", - source_organization_name="Allen Institute for AI", - source_organization_url="https://allenai.org", + source_name='RewardBench', + source_type='documentation', + source_organization_name='Allen Institute for AI', + source_organization_url='https://allenai.org', evaluator_relationship=EvaluatorRelationship.third_party, ) V2_SOURCE_METADATA = SourceMetadata( - source_name="RewardBench 2", - source_type="documentation", - source_organization_name="Allen Institute for AI", - source_organization_url="https://allenai.org", + source_name='RewardBench 2', + source_type='documentation', + source_organization_name='Allen Institute for AI', + source_organization_url='https://allenai.org', evaluator_relationship=EvaluatorRelationship.third_party, ) # RewardBench v1 metrics with descriptions V1_METRICS = { - "Score": "Overall RewardBench Score", - "Chat": "Chat accuracy - includes easy chat subsets", - "Chat Hard": "Chat Hard accuracy - includes hard chat subsets", - "Safety": "Safety accuracy - includes safety subsets", - "Reasoning": "Reasoning accuracy - includes code and math subsets", - "Prior Sets (0.5 weight)": "Prior Sets score (weighted 0.5) - includes test sets", + 'Score': 'Overall RewardBench Score', + 'Chat': 'Chat accuracy - includes easy chat subsets', + 'Chat Hard': 'Chat Hard accuracy - includes hard chat subsets', + 'Safety': 'Safety accuracy - includes safety subsets', + 'Reasoning': 'Reasoning accuracy - includes code and math subsets', + 'Prior Sets (0.5 weight)': 'Prior Sets score (weighted 0.5) - includes test sets', } # RewardBench v2 metrics with descriptions V2_METRICS = [ - ("Factuality", "Factuality score - measures factual accuracy"), - ("Precise IF", "Precise Instruction Following score"), - ("Math", "Math score - measures mathematical reasoning"), - ("Safety", "Safety score - measures safety awareness"), - ("Focus", "Focus score - measures response focus"), - ("Ties", "Ties score - ability to identify tie cases"), + ('Factuality', 'Factuality score - measures factual accuracy'), + ('Precise IF', 'Precise Instruction Following score'), + ('Math', 'Math score - measures mathematical reasoning'), + ('Safety', 'Safety score - measures safety awareness'), + ('Focus', 'Focus score - measures response focus'), + ('Ties', 'Ties score - ability to identify tie cases'), ] @@ -137,10 +136,12 @@ def _make_model_info( def _save_eval_log(eval_log: EvaluationLog, developer: str, model: str) -> Path: """Save an evaluation log to the standard directory structure.""" - dir_path = OUTPUT_DIR / sanitize_filename(developer) / sanitize_filename(model) + dir_path = ( + OUTPUT_DIR / sanitize_filename(developer) / sanitize_filename(model) + ) dir_path.mkdir(parents=True, exist_ok=True) - filepath = dir_path / f"{uuid.uuid4()}.json" + filepath = dir_path / f'{uuid.uuid4()}.json' json_str = eval_log.model_dump_json(indent=2, exclude_none=True) filepath.write_text(json_str) return filepath @@ -148,13 +149,13 @@ def _save_eval_log(eval_log: EvaluationLog, developer: str, model: str) -> Path: def extract_model_name_from_html(html_string: str) -> str: """Extract the model name from an HTML anchor tag.""" - pattern = r">([^<]+)<" + pattern = r'>([^<]+)<' match = re.search(pattern, html_string) if match: name = match.group(1).strip() - name = re.sub(r"\s*[\*⚠️]+$", "", name).strip() + name = re.sub(r'\s*[\*⚠️]+$', '', name).strip() return name - return re.sub(r"\s*[\*⚠️]+$", "", html_string).strip() + return re.sub(r'\s*[\*⚠️]+$', '', html_string).strip() def parse_score(value: str) -> Optional[float]: @@ -173,25 +174,25 @@ def parse_score(value: str) -> Optional[float]: def fetch_rewardbench_v1(retrieved_timestamp: str) -> int: """Fetch and process RewardBench v1 results from the CSV file.""" - print("Fetching RewardBench v1 CSV...") + print('Fetching RewardBench v1 CSV...') rows = fetch_csv(REWARDBENCH_V1_CSV) count = 0 for row in rows: # Extract model name from HTML link - model_html = row.get("Model", "") + model_html = row.get('Model', '') model_name = extract_model_name_from_html(model_html) - if not model_name or model_name == "random": + if not model_name or model_name == 'random': continue - model_type = row.get("Model Type", "") + model_type = row.get('Model Type', '') developer = get_developer(model_name) # Create evaluation results for each metric eval_results: List[EvaluationResult] = [] for metric_name, description in V1_METRICS.items(): - score = parse_score(row.get(metric_name, "")) + score = parse_score(row.get(metric_name, '')) if score is not None: eval_results.append( _make_eval_result( @@ -209,29 +210,31 @@ def fetch_rewardbench_v1(retrieved_timestamp: str) -> int: model_info = _make_model_info( model_name=model_name, developer=developer, - additional_details={"model_type": model_type} if model_type else None, + additional_details={'model_type': model_type} + if model_type + else None, ) # Build evaluation log - evaluation_id = f"reward-bench/{model_info.id.replace('/', '_')}/{retrieved_timestamp}" + evaluation_id = f'reward-bench/{model_info.id.replace("/", "_")}/{retrieved_timestamp}' eval_log = EvaluationLog( schema_version=SCHEMA_VERSION, evaluation_id=evaluation_id, retrieved_timestamp=retrieved_timestamp, source_metadata=V1_SOURCE_METADATA, - eval_library=EvalLibrary(name="unknown", version="unknown"), + eval_library=EvalLibrary(name='unknown', version='unknown'), model_info=model_info, evaluation_results=eval_results, ) # Parse model path for saving - if "/" in model_info.id: - dev, model = model_info.id.split("/", 1) + if '/' in model_info.id: + dev, model = model_info.id.split('/', 1) else: - dev, model = "unknown", model_info.id + dev, model = 'unknown', model_info.id filepath = _save_eval_log(eval_log, dev, model) - print(f"Saved: {filepath}") + print(f'Saved: {filepath}') count += 1 return count @@ -239,42 +242,44 @@ def fetch_rewardbench_v1(retrieved_timestamp: str) -> int: def fetch_rewardbench_v2(retrieved_timestamp: str) -> int: """Fetch and process RewardBench v2 results from the HuggingFace dataset.""" - print("Fetching RewardBench v2 model list...") + print('Fetching RewardBench v2 model list...') orgs = fetch_json(REWARDBENCH_V2_TREE_API) count = 0 for org_item in orgs: - if org_item["type"] != "directory": + if org_item['type'] != 'directory': continue - org_path = org_item["path"] - org_name = org_path.split("/")[-1] - print(f" Processing organization: {org_name}") + org_path = org_item['path'] + org_name = org_path.split('/')[-1] + print(f' Processing organization: {org_name}') # Get models for this org - org_tree_url = f"https://huggingface.co/api/datasets/allenai/reward-bench-2-results/tree/main/{org_path}" + org_tree_url = f'https://huggingface.co/api/datasets/allenai/reward-bench-2-results/tree/main/{org_path}' try: model_files = fetch_json(org_tree_url) except Exception as e: - print(f" Error fetching org tree: {e}") + print(f' Error fetching org tree: {e}') continue for model_file in model_files: - if model_file["type"] != "file" or not model_file["path"].endswith(".json"): + if model_file['type'] != 'file' or not model_file['path'].endswith( + '.json' + ): continue - model_path = model_file["path"] - model_url = f"{REWARDBENCH_V2_FILE_BASE}/{'/'.join(model_path.split('/')[1:])}" + model_path = model_file['path'] + model_url = f'{REWARDBENCH_V2_FILE_BASE}/{"/".join(model_path.split("/")[1:])}' try: model_data = fetch_json(model_url) except Exception as e: - print(f" Error fetching {model_path}: {e}") + print(f' Error fetching {model_path}: {e}') continue - model_name = model_data.get("model", "unknown") - model_type = model_data.get("model_type", "") + model_name = model_data.get('model', 'unknown') + model_type = model_data.get('model_type', '') developer = get_developer(model_name) # Build evaluation results @@ -282,7 +287,10 @@ def fetch_rewardbench_v2(retrieved_timestamp: str) -> int: scores_for_average = [] for metric_name, description in V2_METRICS: - if metric_name in model_data and model_data[metric_name] is not None: + if ( + metric_name in model_data + and model_data[metric_name] is not None + ): try: score = float(model_data[metric_name]) scores_for_average.append(score) @@ -306,9 +314,9 @@ def fetch_rewardbench_v2(retrieved_timestamp: str) -> int: eval_results.insert( 0, _make_eval_result( - name="Score", + name='Score', score=mean_score, - description="Overall RewardBench 2 Score (mean of all metrics)", + description='Overall RewardBench 2 Score (mean of all metrics)', source_data=V2_SOURCE_DATA, ), ) @@ -317,29 +325,31 @@ def fetch_rewardbench_v2(retrieved_timestamp: str) -> int: model_info = _make_model_info( model_name=model_name, developer=developer, - additional_details={"model_type": model_type} if model_type else None, + additional_details={'model_type': model_type} + if model_type + else None, ) # Build evaluation log - evaluation_id = f"reward-bench-2/{model_info.id.replace('/', '_')}/{retrieved_timestamp}" + evaluation_id = f'reward-bench-2/{model_info.id.replace("/", "_")}/{retrieved_timestamp}' eval_log = EvaluationLog( schema_version=SCHEMA_VERSION, evaluation_id=evaluation_id, retrieved_timestamp=retrieved_timestamp, source_metadata=V2_SOURCE_METADATA, - eval_library=EvalLibrary(name="unknown", version="unknown"), + eval_library=EvalLibrary(name='unknown', version='unknown'), model_info=model_info, evaluation_results=eval_results, ) # Parse model path for saving - if "/" in model_info.id: - dev, model = model_info.id.split("/", 1) + if '/' in model_info.id: + dev, model = model_info.id.split('/', 1) else: - dev, model = "unknown", model_info.id + dev, model = 'unknown', model_info.id filepath = _save_eval_log(eval_log, dev, model) - print(f" Saved: {filepath}") + print(f' Saved: {filepath}') count += 1 return count @@ -349,34 +359,36 @@ def main(): """Main function to fetch and process RewardBench results.""" retrieved_timestamp = str(time.time()) - print("=" * 60) - print("Fetching RewardBench v1 results...") - print("=" * 60) + print('=' * 60) + print('Fetching RewardBench v1 results...') + print('=' * 60) try: v1_count = fetch_rewardbench_v1(retrieved_timestamp) - print(f"\nProcessed {v1_count} models from RewardBench v1") + print(f'\nProcessed {v1_count} models from RewardBench v1') except Exception as e: - print(f"Error processing RewardBench v1: {e}") + print(f'Error processing RewardBench v1: {e}') import traceback + traceback.print_exc() - print("\n" + "=" * 60) - print("Fetching RewardBench v2 results...") - print("=" * 60) + print('\n' + '=' * 60) + print('Fetching RewardBench v2 results...') + print('=' * 60) try: v2_count = fetch_rewardbench_v2(retrieved_timestamp) - print(f"\nProcessed {v2_count} models from RewardBench v2") + print(f'\nProcessed {v2_count} models from RewardBench v2') except Exception as e: - print(f"Error processing RewardBench v2: {e}") + print(f'Error processing RewardBench v2: {e}') import traceback + traceback.print_exc() - print("\n" + "=" * 60) - print("Done!") - print("=" * 60) + print('\n' + '=' * 60) + print('Done!') + print('=' * 60) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/utils/rewardbench/migrate_to_v020.py b/utils/rewardbench/migrate_to_v020.py index 0346d7c69..656e8d846 100644 --- a/utils/rewardbench/migrate_to_v020.py +++ b/utils/rewardbench/migrate_to_v020.py @@ -20,19 +20,18 @@ import json from pathlib import Path - -DATA_DIR = Path("data/reward-bench") +DATA_DIR = Path('data/reward-bench') V1_SOURCE_DATA = { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench", + 'dataset_name': 'RewardBench', + 'source_type': 'hf_dataset', + 'hf_repo': 'allenai/reward-bench', } V2_SOURCE_DATA = { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results", + 'dataset_name': 'RewardBench 2', + 'source_type': 'hf_dataset', + 'hf_repo': 'allenai/reward-bench-2-results', } @@ -42,40 +41,40 @@ def migrate_file(filepath: Path) -> bool: Returns True if the file was modified, False if it was already up to date. """ - with open(filepath, "r") as f: + with open(filepath, 'r') as f: data = json.load(f) # Skip files that are already v0.2.0 - if data.get("schema_version") == "0.2.0": + if data.get('schema_version') == '0.2.0': return False # Determine source_data based on evaluation_id - evaluation_id = data.get("evaluation_id", "") - if evaluation_id.startswith("reward-bench-2/"): + evaluation_id = data.get('evaluation_id', '') + if evaluation_id.startswith('reward-bench-2/'): source_data = V2_SOURCE_DATA else: source_data = V1_SOURCE_DATA # 1. Update schema_version - data["schema_version"] = "0.2.0" + data['schema_version'] = '0.2.0' # 2. Remove top-level source_data - data.pop("source_data", None) + data.pop('source_data', None) # 3. Add source_data to each evaluation result - for result in data.get("evaluation_results", []): - if "source_data" not in result: - result["source_data"] = source_data + for result in data.get('evaluation_results', []): + if 'source_data' not in result: + result['source_data'] = source_data # 4. Clean up model_info: remove inference_platform if "unknown" - model_info = data.get("model_info", {}) - if model_info.get("inference_platform") == "unknown": - del model_info["inference_platform"] + model_info = data.get('model_info', {}) + if model_info.get('inference_platform') == 'unknown': + del model_info['inference_platform'] # Write back - with open(filepath, "w") as f: + with open(filepath, 'w') as f: json.dump(data, f, indent=2) - f.write("\n") + f.write('\n') return True @@ -83,11 +82,11 @@ def migrate_file(filepath: Path) -> bool: def main(): """Migrate all reward-bench JSON files to v0.2.0.""" if not DATA_DIR.exists(): - print(f"Error: {DATA_DIR} does not exist") + print(f'Error: {DATA_DIR} does not exist') return - json_files = sorted(DATA_DIR.rglob("*.json")) - print(f"Found {len(json_files)} JSON files in {DATA_DIR}") + json_files = sorted(DATA_DIR.rglob('*.json')) + print(f'Found {len(json_files)} JSON files in {DATA_DIR}') migrated = 0 skipped = 0 @@ -100,14 +99,14 @@ def main(): else: skipped += 1 except Exception as e: - print(f" Error migrating {filepath}: {e}") + print(f' Error migrating {filepath}: {e}') errors += 1 - print(f"\nMigration complete:") - print(f" Migrated: {migrated}") - print(f" Skipped (already v0.2.0): {skipped}") - print(f" Errors: {errors}") + print('\nMigration complete:') + print(f' Migrated: {migrated}') + print(f' Skipped (already v0.2.0): {skipped}') + print(f' Errors: {errors}') -if __name__ == "__main__": +if __name__ == '__main__': main()