Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 9 additions & 26 deletions autocontext/src/autocontext/harness/evaluation/dimensional.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,41 +14,32 @@
from __future__ import annotations

from collections.abc import Sequence
from dataclasses import dataclass, field
from typing import Any

from pydantic import BaseModel, Field

@dataclass(slots=True)
class ScoringDimension:

class ScoringDimension(BaseModel):
"""A named scoring dimension with weight."""

name: str
weight: float = 1.0
description: str = ""

def to_dict(self) -> dict[str, Any]:
return {
"name": self.name,
"weight": self.weight,
"description": self.description,
}
return self.model_dump()

@classmethod
def from_dict(cls, data: dict[str, Any]) -> ScoringDimension:
return cls(
name=data["name"],
weight=data.get("weight", 1.0),
description=data.get("description", ""),
)
return cls.model_validate(data)


@dataclass(slots=True)
class DimensionalScore:
class DimensionalScore(BaseModel):
"""Aggregate score plus per-dimension breakdown."""

aggregate: float
dimensions: dict[str, float]
metadata: dict[str, Any] = field(default_factory=dict)
metadata: dict[str, Any] = Field(default_factory=dict)

def weighted_aggregate(self, dimension_specs: Sequence[ScoringDimension]) -> float:
"""Compute weighted aggregate from dimension specs."""
Expand All @@ -62,19 +53,11 @@ def weighted_aggregate(self, dimension_specs: Sequence[ScoringDimension]) -> flo
return round(weighted_sum / total_weight, 6)

def to_dict(self) -> dict[str, Any]:
return {
"aggregate": self.aggregate,
"dimensions": self.dimensions,
"metadata": self.metadata,
}
return self.model_dump()

@classmethod
def from_dict(cls, data: dict[str, Any]) -> DimensionalScore:
return cls(
aggregate=data.get("aggregate", 0.0),
dimensions=data.get("dimensions", {}),
metadata=data.get("metadata", {}),
)
return cls.model_validate(data)


def normalize_dimension_specs(
Expand Down
39 changes: 9 additions & 30 deletions autocontext/src/autocontext/harness/evaluation/self_play.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,62 +14,41 @@

import json
from collections.abc import Sequence
from dataclasses import dataclass, field
from typing import Any

from pydantic import BaseModel, Field

@dataclass(slots=True)
class SelfPlayOpponent:

class SelfPlayOpponent(BaseModel):
"""A prior generation's strategy used as an opponent."""

strategy: dict[str, Any]
generation: int
elo: float
score: float
metadata: dict[str, Any] = field(default_factory=dict)
metadata: dict[str, Any] = Field(default_factory=dict)

def to_dict(self) -> dict[str, Any]:
return {
"strategy": self.strategy,
"generation": self.generation,
"elo": self.elo,
"score": self.score,
"metadata": self.metadata,
}
return self.model_dump()

@classmethod
def from_dict(cls, data: dict[str, Any]) -> SelfPlayOpponent:
return cls(
strategy=data.get("strategy", {}),
generation=data.get("generation", 0),
elo=data.get("elo", 1000.0),
score=data.get("score", 0.0),
metadata=data.get("metadata", {}),
)
return cls.model_validate(data)


@dataclass(slots=True)
class SelfPlayConfig:
class SelfPlayConfig(BaseModel):
"""Configuration for self-play opponent pool."""

enabled: bool = False
pool_size: int = 3
weight: float = 0.5 # fraction of matches vs self-play opponents

def to_dict(self) -> dict[str, Any]:
return {
"enabled": self.enabled,
"pool_size": self.pool_size,
"weight": self.weight,
}
return self.model_dump()

@classmethod
def from_dict(cls, data: dict[str, Any]) -> SelfPlayConfig:
return cls(
enabled=data.get("enabled", False),
pool_size=data.get("pool_size", 3),
weight=data.get("weight", 0.5),
)
return cls.model_validate(data)


class SelfPlayPool:
Expand Down
74 changes: 12 additions & 62 deletions autocontext/src/autocontext/harness/pipeline/advancement.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,17 @@

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any

from pydantic import BaseModel, Field

# Thresholds
_ERROR_RATE_THRESHOLD = 0.2
_LOW_CONFIDENCE_THRESHOLD = 0.5
_HIGH_VARIANCE_THRESHOLD = 0.04


@dataclass(slots=True)
class AdvancementMetrics:
class AdvancementMetrics(BaseModel):
"""Composite metrics input to gate decisions."""

best_score: float
Expand All @@ -42,57 +42,23 @@ class AdvancementMetrics:
generalization_gap: float | None = None
cost_usd: float = 0.0
tokens_used: int = 0
metadata: dict[str, Any] = field(default_factory=dict)
metadata: dict[str, Any] = Field(default_factory=dict)

@property
def delta(self) -> float:
return round(self.best_score - self.previous_best, 6)

def to_dict(self) -> dict[str, Any]:
return {
"best_score": self.best_score,
"mean_score": self.mean_score,
"previous_best": self.previous_best,
"score_variance": self.score_variance,
"sample_count": self.sample_count,
"error_rate": self.error_rate,
"crash_count": self.crash_count,
"confidence": self.confidence,
"sample_agreement": self.sample_agreement,
"search_proxy_score": self.search_proxy_score,
"resolved_truth_score": self.resolved_truth_score,
"previous_resolved_truth_score": self.previous_resolved_truth_score,
"generalization_gap": self.generalization_gap,
"cost_usd": self.cost_usd,
"tokens_used": self.tokens_used,
"delta": self.delta,
"metadata": self.metadata,
}
data = self.model_dump()
data["delta"] = self.delta
return data

@classmethod
def from_dict(cls, data: dict[str, Any]) -> AdvancementMetrics:
return cls(
best_score=data.get("best_score", 0.0),
mean_score=data.get("mean_score", 0.0),
previous_best=data.get("previous_best", 0.0),
score_variance=data.get("score_variance", 0.0),
sample_count=data.get("sample_count", 0),
error_rate=data.get("error_rate", 0.0),
crash_count=data.get("crash_count", 0),
confidence=data.get("confidence", 1.0),
sample_agreement=data.get("sample_agreement", 1.0),
search_proxy_score=data.get("search_proxy_score"),
resolved_truth_score=data.get("resolved_truth_score"),
previous_resolved_truth_score=data.get("previous_resolved_truth_score"),
generalization_gap=data.get("generalization_gap"),
cost_usd=data.get("cost_usd", 0.0),
tokens_used=data.get("tokens_used", 0),
metadata=data.get("metadata", {}),
)
return cls.model_validate(data)


@dataclass(slots=True)
class AdvancementRationale:
class AdvancementRationale(BaseModel):
"""Operator-visible gate decision explanation."""

decision: str # advance, retry, rollback
Expand All @@ -101,30 +67,14 @@ class AdvancementRationale:
binding_checks: list[str]
proxy_signals: list[str]
risk_flags: list[str]
metadata: dict[str, Any] = field(default_factory=dict)
metadata: dict[str, Any] = Field(default_factory=dict)

def to_dict(self) -> dict[str, Any]:
return {
"decision": self.decision,
"reason": self.reason,
"component_scores": self.component_scores,
"binding_checks": self.binding_checks,
"proxy_signals": self.proxy_signals,
"risk_flags": self.risk_flags,
"metadata": self.metadata,
}
return self.model_dump()

@classmethod
def from_dict(cls, data: dict[str, Any]) -> AdvancementRationale:
return cls(
decision=data.get("decision", "rollback"),
reason=data.get("reason", ""),
component_scores=data.get("component_scores", {}),
binding_checks=data.get("binding_checks", []),
proxy_signals=data.get("proxy_signals", []),
risk_flags=data.get("risk_flags", []),
metadata=data.get("metadata", {}),
)
return cls.model_validate(data)


def evaluate_advancement(
Expand Down
51 changes: 10 additions & 41 deletions autocontext/src/autocontext/harness/pipeline/holdout.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,45 +15,30 @@

import statistics
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import Any

from pydantic import BaseModel, Field

@dataclass(slots=True)
class HoldoutPolicy:

class HoldoutPolicy(BaseModel):
"""Configurable holdout evaluation policy."""

holdout_seeds: int = 5
min_holdout_score: float = 0.5
max_generalization_gap: float = 0.2
seed_offset: int = 10000
enabled: bool = True
metadata: dict[str, Any] = field(default_factory=dict)
metadata: dict[str, Any] = Field(default_factory=dict)

def to_dict(self) -> dict[str, Any]:
return {
"holdout_seeds": self.holdout_seeds,
"min_holdout_score": self.min_holdout_score,
"max_generalization_gap": self.max_generalization_gap,
"seed_offset": self.seed_offset,
"enabled": self.enabled,
"metadata": self.metadata,
}
return self.model_dump()

@classmethod
def from_dict(cls, data: dict[str, Any]) -> HoldoutPolicy:
return cls(
holdout_seeds=data.get("holdout_seeds", 5),
min_holdout_score=data.get("min_holdout_score", 0.5),
max_generalization_gap=data.get("max_generalization_gap", 0.2),
seed_offset=data.get("seed_offset", 10000),
enabled=data.get("enabled", True),
metadata=data.get("metadata", {}),
)
return cls.model_validate(data)


@dataclass(slots=True)
class HoldoutResult:
class HoldoutResult(BaseModel):
"""Outcome of holdout evaluation."""

holdout_mean_score: float
Expand All @@ -62,30 +47,14 @@ class HoldoutResult:
generalization_gap: float
passed: bool
reason: str
metadata: dict[str, Any] = field(default_factory=dict)
metadata: dict[str, Any] = Field(default_factory=dict)

def to_dict(self) -> dict[str, Any]:
return {
"holdout_mean_score": self.holdout_mean_score,
"holdout_scores": self.holdout_scores,
"in_sample_score": self.in_sample_score,
"generalization_gap": self.generalization_gap,
"passed": self.passed,
"reason": self.reason,
"metadata": self.metadata,
}
return self.model_dump()

@classmethod
def from_dict(cls, data: dict[str, Any]) -> HoldoutResult:
return cls(
holdout_mean_score=data.get("holdout_mean_score", 0.0),
holdout_scores=data.get("holdout_scores", []),
in_sample_score=data.get("in_sample_score", 0.0),
generalization_gap=data.get("generalization_gap", 0.0),
passed=data.get("passed", False),
reason=data.get("reason", ""),
metadata=data.get("metadata", {}),
)
return cls.model_validate(data)


def holdout_check(
Expand Down
Loading