diff --git a/every_eval_ever/helpers/developer.py b/every_eval_ever/helpers/developer.py index 861a583a6..d27164e16 100644 --- a/every_eval_ever/helpers/developer.py +++ b/every_eval_ever/helpers/developer.py @@ -32,6 +32,7 @@ # Mistral models 'mistral': 'mistralai', 'mixtral': 'mistralai', + 'devstral': 'mistralai', # Alibaba models 'qwen': 'alibaba', # Microsoft models @@ -71,6 +72,7 @@ 'olmo': 'allenai', 'nova': 'amazon', 'grok': 'xai', + 'kimi': 'moonshotai', } diff --git a/utils/swe_bench_verified/__init__.py b/utils/swe_bench_verified/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/utils/swe_bench_verified/adapter.py b/utils/swe_bench_verified/adapter.py new file mode 100644 index 000000000..588be3236 --- /dev/null +++ b/utils/swe_bench_verified/adapter.py @@ -0,0 +1,262 @@ +""" +Script to convert SWE-bench Verified leaderboard data to the EvalEval schema format. + +Data source: +- SWE-bench experiments repo: https://github.com/swe-bench/experiments + Cloned to a temporary directory at runtime; cleaned up on exit. + +Each subdirectory under evaluation/verified/ is a submission with: + - metadata.yaml: model/org info, tags + - results/results.json: resolved/no_generation/no_logs instance lists + +Score = len(resolved) / 500 (500 total SWE-bench Verified instances) + +Usage: + cd every_eval_ever + .venv/bin/python -m utils.swe_bench_verified.adapter +""" + +import json +import re +import subprocess +import tempfile +import time +from pathlib import Path + +from every_eval_ever.eval_types import ( + AgenticEvalConfig, + AvailableTool, + EvalLibrary, + EvaluationLog, + EvaluationResult, + EvaluatorRelationship, + GenerationArgs, + GenerationConfig, + MetricConfig, + ModelInfo, + ScoreDetails, + ScoreType, + SourceDataUrl, + SourceMetadata, +) +from every_eval_ever.helpers import SCHEMA_VERSION, get_developer, get_model_id, save_evaluation_log + +SWE_BENCH_REPO = "https://github.com/swe-bench/experiments" +SWE_BENCH_SUBDIR = "evaluation/verified" +OUTPUT_DIR = "data/swe-bench-verified-leaderboard" + + +def parse_date_from_dir(dir_name: str) -> str | None: + """Extract ISO date from directory name prefix like '20250225_sweagent_...'""" + m = re.match(r'^(\d{4})(\d{2})(\d{2})_', dir_name) + if m: + return f"{m.group(1)}-{m.group(2)}-{m.group(3)}" + return None + + +def normalize_org(org) -> str: + """Normalize org field which can be str, list, or None.""" + if isinstance(org, list): + return ", ".join(str(o) for o in org if o) + return str(org) if org else "" + + +def normalize_model_name(model) -> str: + """Normalize a raw model value to a clean model name string. + + Handles: + - HuggingFace URLs: https://huggingface.co/org/model → org/model + - Plain strings returned as-is + """ + if not model: + return "" + s = str(model) + if s.startswith("https://huggingface.co/"): + s = s[len("https://huggingface.co/"):] + return s + + +def get_primary_model(tags: dict, info: dict, dir_name: str) -> str: + """Extract the primary model name from tags, falling back to submission info.""" + raw = tags.get("model") + # tags.model can be a list or a plain string + if isinstance(raw, list): + models = raw + elif raw is not None: + models = [raw] + else: + models = [] + + if models: + return normalize_model_name(models[0]) + # Fallback: use submission name from info + return info.get("name", dir_name) + + +def convert_submission(submission_dir: Path, retrieved_timestamp: str, total_instances: int) -> EvaluationLog: + """Convert a single SWE-bench submission directory to an EvaluationLog.""" + dir_name = submission_dir.name + + try: + import yaml + except ImportError as e: + raise ImportError( + "pyyaml is required to run this adapter. Install it with: pip install pyyaml" + ) from e + + # Read metadata + with open(submission_dir / "metadata.yaml") as f: + metadata = yaml.safe_load(f) + + # Read results + with open(submission_dir / "results" / "results.json") as f: + results = json.load(f) + + tags = metadata.get("tags", {}) or {} + info = metadata.get("info", {}) or {} + + # Primary model: first element of tags.model (list or string), fallback to submission name + primary_model = get_primary_model(tags, info, dir_name) + + developer = get_developer(primary_model) + model_id = get_model_id(primary_model, developer) + + # Score: resolved / total_instances + resolved = results.get("resolved", []) + score = len(resolved) / total_instances + + # Build additional_details (all values must be strings) + additional_details: dict[str, str] = { + "submission_name": str(info.get("name", "")), + "agent_organization": normalize_org(tags.get("org", "")), + "open_source_model": str(tags.get("os_model", "")), + "open_source_system": str(tags.get("os_system", "")), + "verified": str(tags.get("checked", "")), + "attempts": str((tags.get("system") or {}).get("attempts", "")), + "submission_dir": dir_name, + } + site = info.get("site") + if site: + additional_details["site"] = str(site) + report = info.get("report") + if report: + additional_details["report"] = str(report) + + # Score details + score_details: dict[str, str] = { + "resolved_count": str(len(resolved)), + } + no_generation = results.get("no_generation", []) + if no_generation: + score_details["no_generation_count"] = str(len(no_generation)) + no_logs = results.get("no_logs", []) + if no_logs: + score_details["no_logs_count"] = str(len(no_logs)) + + # Sanitize identifier components for use in evaluation_id + sanitized_id = re.sub(r"[^a-zA-Z0-9_.-]", "_", model_id.replace("/", "_")) + submission_slug = re.sub(r"[^a-zA-Z0-9_.-]", "_", dir_name) + eval_id = ( + f"swe-bench-verified/{sanitized_id}/{submission_slug}/{retrieved_timestamp}" + ) + evaluation_timestamp = parse_date_from_dir(dir_name) + + eval_result = EvaluationResult( + evaluation_name="SWE-bench Verified", + source_data=SourceDataUrl( + dataset_name="SWE-bench Verified", + source_type="url", + url=["https://www.swebench.com"], + ), + evaluation_timestamp=evaluation_timestamp, + metric_config=MetricConfig( + evaluation_description=( + "Fraction of 500 verified GitHub issues resolved (0.0–1.0)" + ), + lower_is_better=False, + score_type=ScoreType.continuous, + min_score=0.0, + max_score=1.0, + ), + score_details=ScoreDetails( + score=score, + details=score_details, + ), + generation_config=GenerationConfig( + generation_args=GenerationArgs( + agentic_eval_config=AgenticEvalConfig( + available_tools=[AvailableTool(name="bash")], + ), + ), + ), + ) + + return EvaluationLog( + schema_version=SCHEMA_VERSION, + evaluation_id=eval_id, + retrieved_timestamp=retrieved_timestamp, + evaluation_timestamp=evaluation_timestamp, + source_metadata=SourceMetadata( + source_name="SWE-bench Verified Leaderboard", + source_type="documentation", + source_organization_name="SWE-bench", + source_organization_url="https://www.swebench.com", + evaluator_relationship=EvaluatorRelationship.third_party, + ), + eval_library=EvalLibrary(name="swe-bench", version="unknown"), + model_info=ModelInfo( + name=primary_model, + id=model_id, + developer=developer if developer != "unknown" else None, + additional_details=additional_details, + ), + evaluation_results=[eval_result], + ) + + +def main(): + try: + from datasets import load_dataset + except ImportError as e: + raise ImportError( + "datasets is required to run this adapter. Install it with: pip install datasets" + ) from e + + retrieved_timestamp = str(time.time()) + count = 0 + errors = 0 + + ds = load_dataset("SWE-bench/SWE-bench_Verified", split="test") + total_instances = len(ds) + print(f"Loaded {total_instances} instances from SWE-bench/SWE-bench_Verified\n") + + with tempfile.TemporaryDirectory() as tmpdir: + print(f"Cloning {SWE_BENCH_REPO} into {tmpdir} ...") + subprocess.run( + ["git", "clone", "--depth=1", SWE_BENCH_REPO, tmpdir], + check=True, + ) + + swe_bench_path = Path(tmpdir) / SWE_BENCH_SUBDIR + submissions = sorted(d for d in swe_bench_path.iterdir() if d.is_dir()) + print(f"Found {len(submissions)} submission directories\n") + + for submission_dir in submissions: + try: + eval_log = convert_submission(submission_dir, retrieved_timestamp, total_instances) + dev = eval_log.model_info.developer or "unknown" + # Use model name without developer prefix for the directory + model_name = eval_log.model_info.name.split("/")[-1] + filepath = save_evaluation_log(eval_log, OUTPUT_DIR, dev, model_name) + score = eval_log.evaluation_results[0].score_details.score + print(f" [{score:.1%}] {submission_dir.name} → {filepath}") + count += 1 + except Exception as e: + print(f" ERROR {submission_dir.name}: {e}") + errors += 1 + + print(f"\nGenerated {count} files, {errors} errors → {OUTPUT_DIR}/") + + +if __name__ == "__main__": + main()