From 29267b5c98d194a8b6cd3362af4621491faa1330 Mon Sep 17 00:00:00 2001 From: Jay Scambler Date: Tue, 31 Mar 2026 17:27:36 -0500 Subject: [PATCH 1/2] chore: migrate harness dataclasses to Pydantic BaseModel (AC-489, AC-481) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Converted 11 dataclasses across 6 harness files from @dataclass to Pydantic BaseModel. Eliminates ~155 lines of to_dict/from_dict boilerplate. Files converted: - pipeline/objective_guardrail.py (2 classes) - pipeline/advancement.py (2 classes) - pipeline/holdout.py (2 classes) - scoring/backends.py (TrialResult only — RatingUpdate stays @dataclass) - evaluation/self_play.py (2 classes) - evaluation/dimensional.py (2 classes) NOT converted (frozen dataclasses with explicit is_dataclass tests): - adapt/types.py, audit/types.py, cost/types.py - optimizer/pareto.py (mixed Candidate @dataclass + ActionableSideInfo) Also fixed 3 test files: positional → keyword construction. 9 files changed, 70 insertions, 227 deletions (-157 net) --- .../harness/evaluation/dimensional.py | 35 +++------ .../harness/evaluation/self_play.py | 39 +++------- .../harness/pipeline/advancement.py | 72 +++---------------- .../autocontext/harness/pipeline/holdout.py | 51 +++---------- .../harness/pipeline/objective_guardrail.py | 47 +++---------- .../autocontext/harness/scoring/backends.py | 21 ++---- autocontext/tests/test_dimensional_scoring.py | 4 +- autocontext/tests/test_pareto_optimizer.py | 6 +- autocontext/tests/test_self_play.py | 22 +++--- 9 files changed, 70 insertions(+), 227 deletions(-) diff --git a/autocontext/src/autocontext/harness/evaluation/dimensional.py b/autocontext/src/autocontext/harness/evaluation/dimensional.py index ef3ae45a..8ccd2325 100644 --- a/autocontext/src/autocontext/harness/evaluation/dimensional.py +++ b/autocontext/src/autocontext/harness/evaluation/dimensional.py @@ -14,12 +14,12 @@ from __future__ import annotations from collections.abc import Sequence -from dataclasses import dataclass, field from typing import Any +from pydantic import BaseModel, Field -@dataclass(slots=True) -class ScoringDimension: + +class ScoringDimension(BaseModel): """A named scoring dimension with weight.""" name: str @@ -27,28 +27,19 @@ class ScoringDimension: description: str = "" def to_dict(self) -> dict[str, Any]: - return { - "name": self.name, - "weight": self.weight, - "description": self.description, - } + return self.model_dump() @classmethod def from_dict(cls, data: dict[str, Any]) -> ScoringDimension: - return cls( - name=data["name"], - weight=data.get("weight", 1.0), - description=data.get("description", ""), - ) + return cls.model_validate(data) -@dataclass(slots=True) -class DimensionalScore: +class DimensionalScore(BaseModel): """Aggregate score plus per-dimension breakdown.""" aggregate: float dimensions: dict[str, float] - metadata: dict[str, Any] = field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) def weighted_aggregate(self, dimension_specs: Sequence[ScoringDimension]) -> float: """Compute weighted aggregate from dimension specs.""" @@ -62,19 +53,11 @@ def weighted_aggregate(self, dimension_specs: Sequence[ScoringDimension]) -> flo return round(weighted_sum / total_weight, 6) def to_dict(self) -> dict[str, Any]: - return { - "aggregate": self.aggregate, - "dimensions": self.dimensions, - "metadata": self.metadata, - } + return self.model_dump() @classmethod def from_dict(cls, data: dict[str, Any]) -> DimensionalScore: - return cls( - 
aggregate=data.get("aggregate", 0.0), - dimensions=data.get("dimensions", {}), - metadata=data.get("metadata", {}), - ) + return cls.model_validate(data) def normalize_dimension_specs( diff --git a/autocontext/src/autocontext/harness/evaluation/self_play.py b/autocontext/src/autocontext/harness/evaluation/self_play.py index e7355772..495529b8 100644 --- a/autocontext/src/autocontext/harness/evaluation/self_play.py +++ b/autocontext/src/autocontext/harness/evaluation/self_play.py @@ -14,42 +14,29 @@ import json from collections.abc import Sequence -from dataclasses import dataclass, field from typing import Any +from pydantic import BaseModel, Field -@dataclass(slots=True) -class SelfPlayOpponent: + +class SelfPlayOpponent(BaseModel): """A prior generation's strategy used as an opponent.""" strategy: dict[str, Any] generation: int elo: float score: float - metadata: dict[str, Any] = field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) def to_dict(self) -> dict[str, Any]: - return { - "strategy": self.strategy, - "generation": self.generation, - "elo": self.elo, - "score": self.score, - "metadata": self.metadata, - } + return self.model_dump() @classmethod def from_dict(cls, data: dict[str, Any]) -> SelfPlayOpponent: - return cls( - strategy=data.get("strategy", {}), - generation=data.get("generation", 0), - elo=data.get("elo", 1000.0), - score=data.get("score", 0.0), - metadata=data.get("metadata", {}), - ) + return cls.model_validate(data) -@dataclass(slots=True) -class SelfPlayConfig: +class SelfPlayConfig(BaseModel): """Configuration for self-play opponent pool.""" enabled: bool = False @@ -57,19 +44,11 @@ class SelfPlayConfig: weight: float = 0.5 # fraction of matches vs self-play opponents def to_dict(self) -> dict[str, Any]: - return { - "enabled": self.enabled, - "pool_size": self.pool_size, - "weight": self.weight, - } + return self.model_dump() @classmethod def from_dict(cls, data: dict[str, Any]) -> SelfPlayConfig: - return cls( - enabled=data.get("enabled", False), - pool_size=data.get("pool_size", 3), - weight=data.get("weight", 0.5), - ) + return cls.model_validate(data) class SelfPlayPool: diff --git a/autocontext/src/autocontext/harness/pipeline/advancement.py b/autocontext/src/autocontext/harness/pipeline/advancement.py index 93ebdae8..06b18649 100644 --- a/autocontext/src/autocontext/harness/pipeline/advancement.py +++ b/autocontext/src/autocontext/harness/pipeline/advancement.py @@ -14,17 +14,17 @@ from __future__ import annotations -from dataclasses import dataclass, field from typing import Any +from pydantic import BaseModel, Field + # Thresholds _ERROR_RATE_THRESHOLD = 0.2 _LOW_CONFIDENCE_THRESHOLD = 0.5 _HIGH_VARIANCE_THRESHOLD = 0.04 -@dataclass(slots=True) -class AdvancementMetrics: +class AdvancementMetrics(BaseModel): """Composite metrics input to gate decisions.""" best_score: float @@ -42,57 +42,21 @@ class AdvancementMetrics: generalization_gap: float | None = None cost_usd: float = 0.0 tokens_used: int = 0 - metadata: dict[str, Any] = field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) @property def delta(self) -> float: return round(self.best_score - self.previous_best, 6) def to_dict(self) -> dict[str, Any]: - return { - "best_score": self.best_score, - "mean_score": self.mean_score, - "previous_best": self.previous_best, - "score_variance": self.score_variance, - "sample_count": self.sample_count, - "error_rate": self.error_rate, - "crash_count": self.crash_count, - "confidence": 
self.confidence, - "sample_agreement": self.sample_agreement, - "search_proxy_score": self.search_proxy_score, - "resolved_truth_score": self.resolved_truth_score, - "previous_resolved_truth_score": self.previous_resolved_truth_score, - "generalization_gap": self.generalization_gap, - "cost_usd": self.cost_usd, - "tokens_used": self.tokens_used, - "delta": self.delta, - "metadata": self.metadata, - } + return self.model_dump() @classmethod def from_dict(cls, data: dict[str, Any]) -> AdvancementMetrics: - return cls( - best_score=data.get("best_score", 0.0), - mean_score=data.get("mean_score", 0.0), - previous_best=data.get("previous_best", 0.0), - score_variance=data.get("score_variance", 0.0), - sample_count=data.get("sample_count", 0), - error_rate=data.get("error_rate", 0.0), - crash_count=data.get("crash_count", 0), - confidence=data.get("confidence", 1.0), - sample_agreement=data.get("sample_agreement", 1.0), - search_proxy_score=data.get("search_proxy_score"), - resolved_truth_score=data.get("resolved_truth_score"), - previous_resolved_truth_score=data.get("previous_resolved_truth_score"), - generalization_gap=data.get("generalization_gap"), - cost_usd=data.get("cost_usd", 0.0), - tokens_used=data.get("tokens_used", 0), - metadata=data.get("metadata", {}), - ) + return cls.model_validate(data) -@dataclass(slots=True) -class AdvancementRationale: +class AdvancementRationale(BaseModel): """Operator-visible gate decision explanation.""" decision: str # advance, retry, rollback @@ -101,30 +65,14 @@ class AdvancementRationale: binding_checks: list[str] proxy_signals: list[str] risk_flags: list[str] - metadata: dict[str, Any] = field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) def to_dict(self) -> dict[str, Any]: - return { - "decision": self.decision, - "reason": self.reason, - "component_scores": self.component_scores, - "binding_checks": self.binding_checks, - "proxy_signals": self.proxy_signals, - "risk_flags": self.risk_flags, - "metadata": self.metadata, - } + return self.model_dump() @classmethod def from_dict(cls, data: dict[str, Any]) -> AdvancementRationale: - return cls( - decision=data.get("decision", "rollback"), - reason=data.get("reason", ""), - component_scores=data.get("component_scores", {}), - binding_checks=data.get("binding_checks", []), - proxy_signals=data.get("proxy_signals", []), - risk_flags=data.get("risk_flags", []), - metadata=data.get("metadata", {}), - ) + return cls.model_validate(data) def evaluate_advancement( diff --git a/autocontext/src/autocontext/harness/pipeline/holdout.py b/autocontext/src/autocontext/harness/pipeline/holdout.py index 586c4197..a57554d0 100644 --- a/autocontext/src/autocontext/harness/pipeline/holdout.py +++ b/autocontext/src/autocontext/harness/pipeline/holdout.py @@ -15,12 +15,12 @@ import statistics from collections.abc import Callable -from dataclasses import dataclass, field from typing import Any +from pydantic import BaseModel, Field -@dataclass(slots=True) -class HoldoutPolicy: + +class HoldoutPolicy(BaseModel): """Configurable holdout evaluation policy.""" holdout_seeds: int = 5 @@ -28,32 +28,17 @@ class HoldoutPolicy: max_generalization_gap: float = 0.2 seed_offset: int = 10000 enabled: bool = True - metadata: dict[str, Any] = field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) def to_dict(self) -> dict[str, Any]: - return { - "holdout_seeds": self.holdout_seeds, - "min_holdout_score": self.min_holdout_score, - "max_generalization_gap": 
self.max_generalization_gap, - "seed_offset": self.seed_offset, - "enabled": self.enabled, - "metadata": self.metadata, - } + return self.model_dump() @classmethod def from_dict(cls, data: dict[str, Any]) -> HoldoutPolicy: - return cls( - holdout_seeds=data.get("holdout_seeds", 5), - min_holdout_score=data.get("min_holdout_score", 0.5), - max_generalization_gap=data.get("max_generalization_gap", 0.2), - seed_offset=data.get("seed_offset", 10000), - enabled=data.get("enabled", True), - metadata=data.get("metadata", {}), - ) + return cls.model_validate(data) -@dataclass(slots=True) -class HoldoutResult: +class HoldoutResult(BaseModel): """Outcome of holdout evaluation.""" holdout_mean_score: float @@ -62,30 +47,14 @@ class HoldoutResult: generalization_gap: float passed: bool reason: str - metadata: dict[str, Any] = field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) def to_dict(self) -> dict[str, Any]: - return { - "holdout_mean_score": self.holdout_mean_score, - "holdout_scores": self.holdout_scores, - "in_sample_score": self.in_sample_score, - "generalization_gap": self.generalization_gap, - "passed": self.passed, - "reason": self.reason, - "metadata": self.metadata, - } + return self.model_dump() @classmethod def from_dict(cls, data: dict[str, Any]) -> HoldoutResult: - return cls( - holdout_mean_score=data.get("holdout_mean_score", 0.0), - holdout_scores=data.get("holdout_scores", []), - in_sample_score=data.get("in_sample_score", 0.0), - generalization_gap=data.get("generalization_gap", 0.0), - passed=data.get("passed", False), - reason=data.get("reason", ""), - metadata=data.get("metadata", {}), - ) + return cls.model_validate(data) def holdout_check( diff --git a/autocontext/src/autocontext/harness/pipeline/objective_guardrail.py b/autocontext/src/autocontext/harness/pipeline/objective_guardrail.py index 6f45e2a7..93391301 100644 --- a/autocontext/src/autocontext/harness/pipeline/objective_guardrail.py +++ b/autocontext/src/autocontext/harness/pipeline/objective_guardrail.py @@ -14,12 +14,12 @@ from __future__ import annotations -from dataclasses import dataclass, field from typing import Any +from pydantic import BaseModel, Field -@dataclass(slots=True) -class ObjectiveGuardrailPolicy: + +class ObjectiveGuardrailPolicy(BaseModel): """Configurable thresholds for objective verification guardrail.""" min_recall: float = 0.5 @@ -27,58 +27,31 @@ class ObjectiveGuardrailPolicy: max_false_positive_rate: float = 0.3 max_rubric_objective_gap: float = 0.2 enabled: bool = True - metadata: dict[str, Any] = field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) def to_dict(self) -> dict[str, Any]: - return { - "min_recall": self.min_recall, - "min_precision": self.min_precision, - "max_false_positive_rate": self.max_false_positive_rate, - "max_rubric_objective_gap": self.max_rubric_objective_gap, - "enabled": self.enabled, - "metadata": self.metadata, - } + return self.model_dump() @classmethod def from_dict(cls, data: dict[str, Any]) -> ObjectiveGuardrailPolicy: - return cls( - min_recall=data.get("min_recall", 0.5), - min_precision=data.get("min_precision", 0.5), - max_false_positive_rate=data.get("max_false_positive_rate", 0.3), - max_rubric_objective_gap=data.get("max_rubric_objective_gap", 0.2), - enabled=data.get("enabled", True), - metadata=data.get("metadata", {}), - ) + return cls.model_validate(data) -@dataclass(slots=True) -class GuardrailResult: +class GuardrailResult(BaseModel): 
"""Outcome of an objective guardrail check.""" passed: bool reason: str violations: list[str] metrics: dict[str, float] - metadata: dict[str, Any] = field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) def to_dict(self) -> dict[str, Any]: - return { - "passed": self.passed, - "reason": self.reason, - "violations": self.violations, - "metrics": self.metrics, - "metadata": self.metadata, - } + return self.model_dump() @classmethod def from_dict(cls, data: dict[str, Any]) -> GuardrailResult: - return cls( - passed=data.get("passed", False), - reason=data.get("reason", ""), - violations=data.get("violations", []), - metrics=data.get("metrics", {}), - metadata=data.get("metadata", {}), - ) + return cls.model_validate(data) def check_objective_guardrail( diff --git a/autocontext/src/autocontext/harness/scoring/backends.py b/autocontext/src/autocontext/harness/scoring/backends.py index 53047013..42cbb5dc 100644 --- a/autocontext/src/autocontext/harness/scoring/backends.py +++ b/autocontext/src/autocontext/harness/scoring/backends.py @@ -21,6 +21,8 @@ from dataclasses import dataclass, field from typing import Any +from pydantic import BaseModel, Field + _WIN_THRESHOLD = 0.55 _ELO_K = 32.0 _GLICKO_Q = math.log(10) / 400 @@ -30,37 +32,27 @@ def _normalize_score(score: float) -> float: return max(0.0, min(1.0, float(score))) -@dataclass(slots=True) -class TrialResult: +class TrialResult(BaseModel): """A single trial preserving the continuous score.""" score: float seed: int opponent_rating: float - metadata: dict[str, Any] = field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) def is_win(self, threshold: float = _WIN_THRESHOLD) -> bool: return self.score >= threshold def to_dict(self) -> dict[str, Any]: - return { - "score": self.score, - "seed": self.seed, - "opponent_rating": self.opponent_rating, - "metadata": self.metadata, - } + return self.model_dump() @classmethod def from_dict(cls, data: dict[str, Any]) -> TrialResult: - return cls( - score=data.get("score", 0.0), - seed=data.get("seed", 0), - opponent_rating=data.get("opponent_rating", 1000.0), - metadata=data.get("metadata", {}), - ) + return cls.model_validate(data) @dataclass(slots=True) +@dataclass class RatingUpdate: """Result of a scoring backend update.""" diff --git a/autocontext/tests/test_dimensional_scoring.py b/autocontext/tests/test_dimensional_scoring.py index 3ffdd9db..bcd29baa 100644 --- a/autocontext/tests/test_dimensional_scoring.py +++ b/autocontext/tests/test_dimensional_scoring.py @@ -61,8 +61,8 @@ def test_weighted_aggregate(self) -> None: ) dims = [ - ScoringDimension("a", 0.6), - ScoringDimension("b", 0.4), + ScoringDimension(name="a", weight=0.6), + ScoringDimension(name="b", weight=0.4), ] score = DimensionalScore( aggregate=0.0, diff --git a/autocontext/tests/test_pareto_optimizer.py b/autocontext/tests/test_pareto_optimizer.py index 5a650cd7..1f4f63b0 100644 --- a/autocontext/tests/test_pareto_optimizer.py +++ b/autocontext/tests/test_pareto_optimizer.py @@ -27,7 +27,7 @@ def test_construction(self) -> None: def test_roundtrip(self) -> None: from autocontext.harness.optimizer.pareto import ActionableSideInfo - asi = ActionableSideInfo("ex-2", "near_miss", "Almost correct but off by 1", "Fix loop bound") + asi = ActionableSideInfo(example_id="ex-2", outcome="near_miss", diagnosis="Almost correct but off by 1", suggested_fix="Fix loop bound") d = asi.to_dict() restored = ActionableSideInfo.from_dict(d) assert restored.diagnosis == "Almost correct but 
off by 1" @@ -203,8 +203,8 @@ def test_merge_combines_asi(self) -> None: merge_candidates, ) - a = Candidate("a", "art-a", {}, [ActionableSideInfo("e1", "fail", "diag1", "fix1")]) - b = Candidate("b", "art-b", {}, [ActionableSideInfo("e2", "fail", "diag2", "fix2")]) + a = Candidate("a", "art-a", {}, [ActionableSideInfo(example_id="e1", outcome="fail", diagnosis="diag1", suggested_fix="fix1")]) + b = Candidate("b", "art-b", {}, [ActionableSideInfo(example_id="e2", outcome="fail", diagnosis="diag2", suggested_fix="fix2")]) merged = merge_candidates(a, b) assert len(merged.asi) == 2 diff --git a/autocontext/tests/test_self_play.py b/autocontext/tests/test_self_play.py index 08da7a69..80cd9b19 100644 --- a/autocontext/tests/test_self_play.py +++ b/autocontext/tests/test_self_play.py @@ -85,8 +85,8 @@ def test_add_and_get(self) -> None: config = SelfPlayConfig(enabled=True, pool_size=3) pool = SelfPlayPool(config) - pool.add(SelfPlayOpponent({"a": 1}, generation=1, elo=1000, score=0.5)) - pool.add(SelfPlayOpponent({"a": 2}, generation=2, elo=1050, score=0.6)) + pool.add(SelfPlayOpponent(strategy={"a": 1}, generation=1, elo=1000, score=0.5)) + pool.add(SelfPlayOpponent(strategy={"a": 2}, generation=2, elo=1050, score=0.6)) opponents = pool.get_opponents() assert len(opponents) == 2 @@ -101,9 +101,9 @@ def test_pool_size_limit(self) -> None: config = SelfPlayConfig(enabled=True, pool_size=2) pool = SelfPlayPool(config) - pool.add(SelfPlayOpponent({"a": 1}, generation=1, elo=1000, score=0.5)) - pool.add(SelfPlayOpponent({"a": 2}, generation=2, elo=1050, score=0.6)) - pool.add(SelfPlayOpponent({"a": 3}, generation=3, elo=1100, score=0.7)) + pool.add(SelfPlayOpponent(strategy={"a": 1}, generation=1, elo=1000, score=0.5)) + pool.add(SelfPlayOpponent(strategy={"a": 2}, generation=2, elo=1050, score=0.6)) + pool.add(SelfPlayOpponent(strategy={"a": 3}, generation=3, elo=1100, score=0.7)) opponents = pool.get_opponents() assert len(opponents) == 2 @@ -120,7 +120,7 @@ def test_disabled_pool_returns_empty(self) -> None: config = SelfPlayConfig(enabled=False) pool = SelfPlayPool(config) - pool.add(SelfPlayOpponent({"a": 1}, generation=1, elo=1000, score=0.5)) + pool.add(SelfPlayOpponent(strategy={"a": 1}, generation=1, elo=1000, score=0.5)) assert pool.get_opponents() == [] @@ -164,8 +164,8 @@ def test_includes_self_play_opponents(self) -> None: baselines = [{"strategy": "baseline"}] config = SelfPlayConfig(enabled=True, pool_size=3, weight=0.5) pool = SelfPlayPool(config) - pool.add(SelfPlayOpponent({"a": 1}, generation=1, elo=1000, score=0.5)) - pool.add(SelfPlayOpponent({"a": 2}, generation=2, elo=1050, score=0.6)) + pool.add(SelfPlayOpponent(strategy={"a": 1}, generation=1, elo=1000, score=0.5)) + pool.add(SelfPlayOpponent(strategy={"a": 2}, generation=2, elo=1050, score=0.6)) result = build_opponent_pool(baselines, pool) # Should have baselines + self-play opponents @@ -181,7 +181,7 @@ def test_weight_shapes_live_schedule_when_trials_provided(self) -> None: baselines = [{"strategy": "baseline"}] pool = SelfPlayPool(SelfPlayConfig(enabled=True, pool_size=3, weight=0.25)) - pool.add(SelfPlayOpponent({"a": 1}, generation=1, elo=1000, score=0.5)) + pool.add(SelfPlayOpponent(strategy={"a": 1}, generation=1, elo=1000, score=0.5)) result = build_opponent_pool(baselines, pool, trials=4) @@ -199,7 +199,7 @@ def test_self_play_tagged(self) -> None: baselines = [{"strategy": "baseline"}] pool = SelfPlayPool(SelfPlayConfig(enabled=True)) - pool.add(SelfPlayOpponent({"a": 1}, generation=1, elo=1000, score=0.5)) + 
pool.add(SelfPlayOpponent(strategy={"a": 1}, generation=1, elo=1000, score=0.5)) result = build_opponent_pool(baselines, pool) self_play_entries = [e for e in result if e.get("source") == "self_play"] @@ -214,7 +214,7 @@ def test_empty_baselines_with_self_play(self) -> None: ) pool = SelfPlayPool(SelfPlayConfig(enabled=True)) - pool.add(SelfPlayOpponent({"a": 1}, generation=1, elo=1000, score=0.5)) + pool.add(SelfPlayOpponent(strategy={"a": 1}, generation=1, elo=1000, score=0.5)) result = build_opponent_pool([], pool) assert len(result) >= 1 From 7bda691fb4697e1a7c5f47d2a362d51ac0eb8bfc Mon Sep 17 00:00:00 2001 From: Jay Scambler Date: Tue, 31 Mar 2026 20:57:44 -0500 Subject: [PATCH 2/2] Preserve harness serialization contracts --- .../harness/pipeline/advancement.py | 4 ++- .../tests/test_advancement_contract.py | 1 + autocontext/tests/test_pareto_optimizer.py | 35 +++++++++++++++++-- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/autocontext/src/autocontext/harness/pipeline/advancement.py b/autocontext/src/autocontext/harness/pipeline/advancement.py index 06b18649..56dc24f8 100644 --- a/autocontext/src/autocontext/harness/pipeline/advancement.py +++ b/autocontext/src/autocontext/harness/pipeline/advancement.py @@ -49,7 +49,9 @@ def delta(self) -> float: return round(self.best_score - self.previous_best, 6) def to_dict(self) -> dict[str, Any]: - return self.model_dump() + data = self.model_dump() + data["delta"] = self.delta + return data @classmethod def from_dict(cls, data: dict[str, Any]) -> AdvancementMetrics: diff --git a/autocontext/tests/test_advancement_contract.py b/autocontext/tests/test_advancement_contract.py index 8809bc40..3a2e8463 100644 --- a/autocontext/tests/test_advancement_contract.py +++ b/autocontext/tests/test_advancement_contract.py @@ -53,6 +53,7 @@ def test_roundtrip(self) -> None: previous_resolved_truth_score=0.84, ) d = m.to_dict() + assert d["delta"] == 0.1 restored = AdvancementMetrics.from_dict(d) assert restored.best_score == 0.9 assert restored.confidence == 0.95 diff --git a/autocontext/tests/test_pareto_optimizer.py b/autocontext/tests/test_pareto_optimizer.py index 1f4f63b0..59b8e507 100644 --- a/autocontext/tests/test_pareto_optimizer.py +++ b/autocontext/tests/test_pareto_optimizer.py @@ -27,7 +27,12 @@ def test_construction(self) -> None: def test_roundtrip(self) -> None: from autocontext.harness.optimizer.pareto import ActionableSideInfo - asi = ActionableSideInfo(example_id="ex-2", outcome="near_miss", diagnosis="Almost correct but off by 1", suggested_fix="Fix loop bound") + asi = ActionableSideInfo( + example_id="ex-2", + outcome="near_miss", + diagnosis="Almost correct but off by 1", + suggested_fix="Fix loop bound", + ) d = asi.to_dict() restored = ActionableSideInfo.from_dict(d) assert restored.diagnosis == "Almost correct but off by 1" @@ -203,8 +208,32 @@ def test_merge_combines_asi(self) -> None: merge_candidates, ) - a = Candidate("a", "art-a", {}, [ActionableSideInfo(example_id="e1", outcome="fail", diagnosis="diag1", suggested_fix="fix1")]) - b = Candidate("b", "art-b", {}, [ActionableSideInfo(example_id="e2", outcome="fail", diagnosis="diag2", suggested_fix="fix2")]) + a = Candidate( + "a", + "art-a", + {}, + [ + ActionableSideInfo( + example_id="e1", + outcome="fail", + diagnosis="diag1", + suggested_fix="fix1", + ), + ], + ) + b = Candidate( + "b", + "art-b", + {}, + [ + ActionableSideInfo( + example_id="e2", + outcome="fail", + diagnosis="diag2", + suggested_fix="fix2", + ), + ], + ) merged = merge_candidates(a, b) 
assert len(merged.asi) == 2
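-- 
Reviewer note on the shared pattern (a minimal sketch, not code from this
patch): every converted class keeps its to_dict/from_dict surface but now
delegates to Pydantic v2's model_dump/model_validate. The class below,
Example, is hypothetical and exists only to illustrate the round trip;
because Pydantic v2 ignores unknown keys by default, derived keys appended
in to_dict() (such as AdvancementMetrics' computed "delta") pass through
from_dict() without breaking validation.

    from typing import Any

    from pydantic import BaseModel, Field


    class Example(BaseModel):
        """Hypothetical model illustrating the migration pattern."""

        score: float = 0.0
        metadata: dict[str, Any] = Field(default_factory=dict)

        def to_dict(self) -> dict[str, Any]:
            # Derived values can be appended here, the way
            # AdvancementMetrics appends its computed "delta".
            data = self.model_dump()
            data["doubled"] = self.score * 2  # hypothetical derived key
            return data

        @classmethod
        def from_dict(cls, data: dict[str, Any]) -> "Example":
            # Pydantic v2's default extra="ignore" drops unknown keys,
            # so the derived key above does not break the round trip.
            return cls.model_validate(data)


    # Round trip: construct, serialize, restore.
    restored = Example.from_dict(Example(score=0.9).to_dict())
    assert restored.score == 0.9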