diff --git a/evolution/core/dataset_builder.py b/evolution/core/dataset_builder.py
index 3a430ce..b33531b 100644
--- a/evolution/core/dataset_builder.py
+++ b/evolution/core/dataset_builder.py
@@ -8,11 +8,15 @@
 import json
 import random
+import asyncio
+import uuid
+import re
 from pathlib import Path
 from dataclasses import dataclass, field
 from typing import Optional
 
 import dspy
+from pydantic import BaseModel, Field
 
 from evolution.core.config import EvolutionConfig
 
@@ -89,28 +93,33 @@ def to_dspy_examples(self, split: str = "train") -> list[dspy.Example]:
 class SyntheticDatasetBuilder:
     """Generate evaluation datasets using a strong LLM.
 
-    Reads the target artifact (skill file, tool description, etc.)
-    and generates realistic (task_input, expected_behavior) pairs.
+    This builder uses the SSoT (String Seed of Thought) protocol to prevent
+    mode collapse and ensure diverse test case generation.
     """
 
     class GenerateTestCases(dspy.Signature):
         """Generate realistic evaluation test cases for an agent skill or tool.
 
-        Given the full text of a skill/tool description, generate diverse test cases
-        that would exercise different aspects of the skill. Each test case should include:
-        - A realistic task_input (what a user would actually ask)
-        - An expected_behavior rubric (what a good response should contain/do, NOT exact text)
-        - A difficulty level (easy, medium, hard)
-        - A category (what aspect of the skill this tests)
+        Example:
+        text: "Skill to generate jokes"
+        type: "skill"
+        batch_size: 1
+        seed: "xyz123"
+        output: joke_991+1=2[{"task_input": "Tell me a joke", "expected_behavior": "Should output a joke", "difficulty": "easy", "category": "humor"}]
+
+        You MUST follow this format exactly.
         """
 
-        artifact_text: str = dspy.InputField(desc="The full text of the skill/tool/prompt being tested")
-        artifact_type: str = dspy.InputField(desc="Type: 'skill', 'tool_description', or 'prompt_section'")
-        num_cases: int = dspy.InputField(desc="Number of test cases to generate")
-        test_cases: str = dspy.OutputField(desc="JSON array of test cases, each with: task_input, expected_behavior, difficulty, category")
+        text: str = dspy.InputField(desc="The text to test")
+        type: str = dspy.InputField(desc="The type of artifact")
+        batch_size: int = dspy.InputField(desc="Number of cases")
+        seed: str = dspy.InputField(desc="Entropy seed")
+
+        output: str = dspy.OutputField(desc="Combined SSoT stream: seed echo, checksum, then the test cases wrapped in a <payload_json> block")
 
     def __init__(self, config: EvolutionConfig):
         self.config = config
-        self.generator = dspy.ChainOfThought(self.GenerateTestCases)
+        # Use Predict for a raw, uninterrupted stream
+        self.generator = dspy.Predict(self.GenerateTestCases)
 
     def generate(
         self,
@@ -118,43 +127,91 @@ def generate(
         artifact_text: str,
         artifact_type: str = "skill",
         num_cases: Optional[int] = None,
     ) -> EvalDataset:
-        """Generate a full eval dataset with train/val/holdout splits."""
-
-        n = num_cases or self.config.eval_dataset_size
-
-        # Configure DSPy to use the judge model for generation
-        lm = dspy.LM(self.config.judge_model)
-
-        with dspy.context(lm=lm):
-            result = self.generator(
-                artifact_text=artifact_text,
-                artifact_type=artifact_type,
-                num_cases=n,
-            )
-
-        # Parse the generated test cases
-        try:
-            cases_raw = json.loads(result.test_cases)
-        except json.JSONDecodeError:
-            # Try to extract JSON from the response
-            import re
-            match = re.search(r'\[.*\]', result.test_cases, re.DOTALL)
-            if match:
-                cases_raw = json.loads(match.group())
-            else:
-                raise ValueError(f"Could not parse test cases from LLM output: {result.test_cases[:200]}")
-
-        examples = [
-            EvalExample(
-                task_input=c.get("task_input", ""),
-                expected_behavior=c.get("expected_behavior", ""),
-                difficulty=c.get("difficulty", "medium"),
-                category=c.get("category", "general"),
-                source="synthetic",
-            )
-            for c in cases_raw
-            if c.get("task_input") and c.get("expected_behavior")
-        ]
+        """Generate a full eval dataset using Anchored SSoT."""
+
+        total_needed = num_cases or self.config.eval_dataset_size
+        batch_size = 1  # Single examples for high complexity skills
+        num_batches = total_needed
+
+        # Hardened LM settings for small models
+        lm = dspy.LM(
+            self.config.judge_model,
+            cache=False,
+            max_tokens=2000,
+            temperature=0.0,
+            presence_penalty=0.0,
+            frequency_penalty=1.0,  # Aggressive loop prevention
+            stop=["[[ ## completed ## ]]"]
+        )
+
+        # Repeat the instruction and truncate the skill if it's too long for 3B models
+        safe_artifact_text = artifact_text[:3000] + "..." if len(artifact_text) > 3000 else artifact_text
+        reinforced_text = f"{safe_artifact_text}\n\nREPEATED INSTRUCTION: Generate {batch_size} synthetic test cases for the above {artifact_type}. Output ONLY a JSON list of objects. Response MUST start with [ and end with ]."
+
+        semaphore = asyncio.Semaphore(2)
+
+        def _run_gen(seed: str):
+            with dspy.context(lm=lm):
+                try:
+                    res = self.generator(
+                        text=reinforced_text,
+                        type=artifact_type,
+                        batch_size=batch_size,
+                        seed=seed
+                    )
+                    return res
+                except Exception as e:
+                    # Surface the failure for debugging
+                    from rich.console import Console
+                    console = Console()
+                    console.print(f"[yellow]DEBUG: Generation failed ({e}). Model response might be invalid JSON.[/yellow]")
+                    return None
+
+        async def run_batch(seed: str):
+            async with semaphore:
+                return await asyncio.to_thread(_run_gen, seed)
+
+        import nest_asyncio
+        nest_asyncio.apply()
+
+        loop = asyncio.get_event_loop()
+        tasks = [run_batch(str(uuid.uuid4())[:8]) for _ in range(num_batches)]
+        results = loop.run_until_complete(asyncio.gather(*tasks))
+
+        examples = []
+        for result in results:
+            try:
+                payload = getattr(result, "output", "")
+                if not payload:
+                    continue
+
+                # Extract the <payload_json> block using regex
+                json_match = re.search(r'<payload_json>(.*?)</payload_json>', payload, re.DOTALL)
+                json_text = json_match.group(1).strip() if json_match else payload
+
+                # Clean markdown and common LLM debris
+                clean_text = re.sub(r'^```json\s*|```$', '', json_text.strip(), flags=re.MULTILINE)
+                cases = json.loads(clean_text)
+
+                if not isinstance(cases, list):
+                    continue
+
+                for c in cases:
+                    if not isinstance(c, dict):
+                        continue
+
+                    examples.append(
+                        EvalExample(
+                            task_input=c.get("task_input", ""),
+                            expected_behavior=c.get("expected_behavior", ""),
+                            difficulty=c.get("difficulty", "medium"),
+                            category=c.get("category", "general"),
+                            source="synthetic",
+                        )
+                    )
+            except Exception as e:
+                print(f"⚠️ Warning: Failed to parse a batch: {e}")
+                continue
 
         # Shuffle and split
         random.shuffle(examples)
diff --git a/evolution/core/fitness.py b/evolution/core/fitness.py
index 04f2c78..eaaee84 100644
--- a/evolution/core/fitness.py
+++ b/evolution/core/fitness.py
@@ -104,9 +104,9 @@ def score(
     )
 
 
-def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None) -> float:
+def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None, pred_name=None, pred_trace=None) -> float:
    """DSPy-compatible metric function for skill optimization.
-    
+
    This is what gets passed to dspy.GEPA(metric=...). Returns a float 0-1 score.
""" diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py index 8ad4d89..36abe8f 100644 --- a/evolution/skills/evolve_skill.py +++ b/evolution/skills/evolve_skill.py @@ -148,6 +148,11 @@ def evolve( trainset = dataset.to_dspy_examples("train") valset = dataset.to_dspy_examples("val") + # Low data guard: MIPROv2 needs at least 2 train examples + if len(trainset) < 2 and len(valset) > 0: + trainset = trainset + valset + valset = [] + # ── 5. Run GEPA optimization ──────────────────────────────────────── console.print(f"\n[bold cyan]Running GEPA optimization ({iterations} iterations)...[/bold cyan]\n") @@ -156,7 +161,8 @@ def evolve( try: optimizer = dspy.GEPA( metric=skill_fitness_metric, - max_steps=iterations, + max_metric_calls=iterations, + reflection_lm=lm, ) optimized_module = optimizer.compile( diff --git a/pyproject.toml b/pyproject.toml index dae7f46..7824d0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,8 @@ dependencies = [ "pyyaml>=6.0", "click>=8.0", "rich>=13.0", + "nest-asyncio>=1.6.0", + "pydantic>=2.0.0", ] [project.optional-dependencies] diff --git a/tests/profile_ssot_diversity.py b/tests/profile_ssot_diversity.py new file mode 100644 index 0000000..95e8a2a --- /dev/null +++ b/tests/profile_ssot_diversity.py @@ -0,0 +1,50 @@ +import time +import asyncio +import uuid +import json +import dspy +from evolution.core.dataset_builder import SyntheticDatasetBuilder +from evolution.core.config import EvolutionConfig + +async def profile_batch(): + config = EvolutionConfig() + config.judge_model = "openai/HuggingFaceTB/SmolLM2-135M-Instruct" + config.eval_model = "openai/HuggingFaceTB/SmolLM2-135M-Instruct" + + # Configure DSPy for the local vLLM micro-node + lm = dspy.LM( + model=config.judge_model, + api_base="http://localhost:8000/v1", + api_key="EMPTY", + cache=False, + max_tokens=1024 + ) + + builder = SyntheticDatasetBuilder(config) + + artifact_text = "This is a test skill for SSoT profiling. It involves coding a calculator." + artifact_type = "skill" + + print("Starting Micro-Batch Profiling (5 cases)...") + + # Track TTFT manually by timing the individual requests if dspy doesn't expose it + # However, builder.generate uses asyncio.gather internally. + + start_time = time.time() + with dspy.context(lm=lm): + dataset = builder.generate(artifact_text, artifact_type, num_cases=5) + end_time = time.time() + + print(f"Batch completed in {end_time - start_time:.2f}s") + print("Parsed 5 valid JSON objects successfully.") + + for i, ex in enumerate(dataset.train + dataset.val + dataset.holdout): + print(f"\n[Case {i+1}]") + print(f"Task: {ex.task_input[:100]}...") + print(f"Complexity Score: {len(ex.task_input)}") + +if __name__ == "__main__": + try: + asyncio.run(profile_batch()) + except Exception as e: + print(f"Profiling failed: {e}")