Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 21 additions & 16 deletions evolution/skills/evolve_skill.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def evolve(

# ── 7. Validate evolved skill ───────────────────────────────────────
console.print(f"\n[bold]Validating evolved skill[/bold]")
evolved_constraints = validator.validate_all(evolved_body, "skill", baseline_text=skill["body"])
evolved_constraints = validator.validate_all(evolved_full, "skill", baseline_text=skill["raw"])
all_pass = True
for c in evolved_constraints:
icon = "✓" if c.passed else "✗"
Expand All @@ -204,27 +204,32 @@ def evolve(
console.print(f" Saved failed variant to {output_path}")
return

# ── 8. Evaluate on holdout set ──────────────────────────────────────
# ── 8. Evaluate on holdout set ──────────────────────────────────────
console.print(f"\n[bold]Evaluating on holdout set ({len(dataset.holdout)} examples)[/bold]")

holdout_examples = dataset.to_dspy_examples("holdout")

baseline_scores = []
evolved_scores = []
for ex in holdout_examples:
# Score baseline
with dspy.context(lm=lm):
baseline_pred = baseline_module(task_input=ex.task_input)
baseline_score = skill_fitness_metric(ex, baseline_pred)
baseline_scores.append(baseline_score)

evolved_pred = optimized_module(task_input=ex.task_input)
evolved_score = skill_fitness_metric(ex, evolved_pred)
evolved_scores.append(evolved_score)

avg_baseline = sum(baseline_scores) / max(1, len(baseline_scores))
avg_evolved = sum(evolved_scores) / max(1, len(evolved_scores))
improvement = avg_evolved - avg_baseline
try:
for ex in holdout_examples:
# Score baseline
with dspy.context(lm=lm):
baseline_pred = baseline_module(task_input=ex.task_input)
baseline_score = skill_fitness_metric(ex, baseline_pred)
baseline_scores.append(baseline_score)

evolved_pred = optimized_module(task_input=ex.task_input)
evolved_score = skill_fitness_metric(ex, evolved_pred)
evolved_scores.append(evolved_score)
except Exception as e:
console.print(f"[yellow]⚠ Holdout evaluation failed (DSPy adapter error): {e}[/yellow]")
console.print(" Constraints passed — skill is valid but holdout scores unavailable.")
avg_baseline = avg_evolved = improvement = 0.0
else:
avg_baseline = sum(baseline_scores) / max(1, len(baseline_scores))
avg_evolved = sum(evolved_scores) / max(1, len(evolved_scores))
improvement = avg_evolved - avg_baseline

# ── 9. Report results ───────────────────────────────────────────────
table = Table(title="Evolution Results")
Expand Down
7 changes: 7 additions & 0 deletions evolution/skills/skill_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ def load_skill(skill_path: Path) -> dict:
"description": str,
}
"""
# If skill_path is a directory, find SKILL.md inside it
if skill_path.is_dir():
skill_md = skill_path / "SKILL.md"
if not skill_md.exists():
raise FileNotFoundError(f"No SKILL.md found in {skill_path}")
skill_path = skill_md

raw = skill_path.read_text()

# Parse YAML frontmatter
Expand Down