
Commit 045c6cf

feat(gepa): implement tool-specific proposer for tool descriptions
- Add ToolProposer with GenerateImprovedToolDescription signature
- Implement routing logic to separate tools from signatures
- Tools use ToolProposer, signatures use custom or parent default
- Backward compatible: preserves existing custom_instruction_proposer behavior
- Add test verifying routing splits components correctly
1 parent aa53fe2 commit 045c6cf
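
The routing described in the bullets above rests on a naming convention: components whose names carry a `tool:` prefix are treated as tool descriptions, everything else is treated as a signature instruction. A minimal, self-contained sketch of that split (the component names are illustrative, not taken from a real program):

```python
# Sketch of the component routing introduced by this commit.
# Component names here are made up for illustration.
components_to_update = ["react", "tool:calculator", "tool:search"]

# Tool components are identified purely by the "tool:" name prefix.
tool_components = [c for c in components_to_update if c.startswith("tool:")]
sig_components = [c for c in components_to_update if not c.startswith("tool:")]

assert tool_components == ["tool:calculator", "tool:search"]
assert sig_components == ["react"]

# Each group goes to its own proposer; the results are merged into one mapping,
# roughly: new_texts = {**signature_proposer(...), **ToolProposer()(...)}
```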

File tree (3 files changed: +279 −18 lines)

- dspy/teleprompt/gepa/gepa_utils.py
- dspy/teleprompt/gepa/instruction_proposal.py
- tests/teleprompt/test_gepa_tool_optimization.py

dspy/teleprompt/gepa/gepa_utils.py

Lines changed: 72 additions & 18 deletions
```diff
@@ -15,13 +15,15 @@
 
 logger = logging.getLogger(__name__)
 
+
 class LoggerAdapter:
     def __init__(self, logger: logging.Logger):
         self.logger = logger
 
     def log(self, x: str):
         self.logger.info(x)
 
+
 DSPyTrace = list[tuple[Any, dict[str, Any], Prediction]]
 
 
@@ -31,15 +33,17 @@ class ReflectiveExample(TypedDict):
 
     Each example contains the predictor inputs, generated outputs, and feedback from evaluation.
     """
-    Inputs: dict[str, Any]  # Predictor inputs (may include str, dspy.Image, etc.)
-    Generated_Outputs: dict[str, Any] | str  # Success: dict with output fields, Failure: error message string
-    Feedback: str  # Always a string - from metric function or parsing error message
+
+    Inputs: dict[str, Any]  # Predictor inputs (may include str, dspy.Image, etc.)
+    Generated_Outputs: dict[str, Any] | str  # Success: dict with output fields, Failure: error message string
+    Feedback: str  # Always a string - from metric function or parsing error message
 
 
 class ScoreWithFeedback(Prediction):
     score: float
     feedback: str
 
+
 class PredictorFeedbackFn(Protocol):
     def __call__(
         predictor_output: dict[str, Any],
@@ -64,6 +68,7 @@ def __call__(
         """
         ...
 
+
 class DspyAdapter(GEPAAdapter[Example, TraceData, Prediction]):
     def __init__(
         self,
@@ -91,36 +96,80 @@ def __init__(
         self.warn_on_score_mismatch = warn_on_score_mismatch
         self.optimize_tool_descriptions = optimize_tool_descriptions
 
-        if self.custom_instruction_proposer is not None:
-            # We are only overriding the propose_new_texts method when a custom
-            # instruction proposer is provided. Otherwise, we use the GEPA
-            # default propose_new_texts.
+        if self.optimize_tool_descriptions or self.custom_instruction_proposer is not None:
+            # Set up combined proposer for tool optimization and/or custom instruction proposer.
+            # This routes components to appropriate proposers based on type:
+            #   - Signatures -> custom_instruction_proposer (if provided) OR parent default
+            #   - Tools -> ToolProposer (if optimize_tool_descriptions=True)
 
-            def custom_propose_new_texts(
+            # Determine which proposer handles signatures
+            if self.custom_instruction_proposer is not None:
+                signature_proposer = self.custom_instruction_proposer
+            else:
+                signature_proposer = super().propose_new_texts
+
+            def propose_new_texts(
                 candidate: dict[str, str],
                 reflective_dataset: dict[str, list[dict[str, Any]]],
-                components_to_update: list[str]
+                components_to_update: list[str],
             ) -> dict[str, str]:
+                """Propose new texts for both signatures and tools.
+
+                Splits components by type (tool: prefix vs signatures), calls appropriate
+                proposers, and merges results. Handles reflection_lm context if provided.
+                """
+                # Split by component type if tool optimization enabled
+                if self.optimize_tool_descriptions:
+                    tool_components = [c for c in components_to_update if c.startswith("tool:")]
+                    sig_components = [c for c in components_to_update if not c.startswith("tool:")]
+                else:
+                    tool_components = []
+                    sig_components = components_to_update
+
+                # Apply reflection_lm context to all proposer calls if provided
                 if self.reflection_lm is not None:
                     with dspy.context(lm=self.reflection_lm):
-                        return self.custom_instruction_proposer(
+                        sig_texts = signature_proposer(
                             candidate=candidate,
                             reflective_dataset=reflective_dataset,
-                            components_to_update=components_to_update
+                            components_to_update=sig_components,
                        )
+
+                        if tool_components:
+                            from .instruction_proposal import ToolProposer
+
+                            tool_texts = ToolProposer()(
+                                candidate=candidate,
+                                reflective_dataset=reflective_dataset,
+                                components_to_update=tool_components,
+                            )
+                            return {**sig_texts, **tool_texts}
+                        else:
+                            return sig_texts
                 else:
-                    return self.custom_instruction_proposer(
+                    sig_texts = signature_proposer(
                         candidate=candidate,
                         reflective_dataset=reflective_dataset,
-                        components_to_update=components_to_update
+                        components_to_update=sig_components,
                     )
 
-            self.propose_new_texts = custom_propose_new_texts
+                    if tool_components:
+                        from .instruction_proposal import ToolProposer
+
+                        tool_texts = ToolProposer()(
+                            candidate=candidate,
+                            reflective_dataset=reflective_dataset,
+                            components_to_update=tool_components,
+                        )
+                        return {**sig_texts, **tool_texts}
+                    else:
+                        return sig_texts
+
+            self.propose_new_texts = propose_new_texts
 
         # Cache predictor names/signatures
         self.named_predictors = list(self.student.named_predictors())
 
-
     def build_program(self, candidate: dict[str, str]):
         new_prog = self.student.deepcopy()
         for name, pred in new_prog.named_predictors():
@@ -176,16 +225,19 @@ def evaluate(self, batch, candidate, capture_traces=False):
             return_all_scores=True,
             failure_score=self.failure_score,
             provide_traceback=True,
-            max_errors=len(batch) * 100
+            max_errors=len(batch) * 100,
         )
         res = evaluator(program)
         outputs = [r[1] for r in res.results]
         scores = [r[2] for r in res.results]
         scores = [s["score"] if hasattr(s, "score") else s for s in scores]
         return EvaluationBatch(outputs=outputs, scores=scores, trajectories=None)
 
-    def make_reflective_dataset(self, candidate, eval_batch, components_to_update) -> dict[str, list[ReflectiveExample]]:
+    def make_reflective_dataset(
+        self, candidate, eval_batch, components_to_update
+    ) -> dict[str, list[ReflectiveExample]]:
         from dspy.teleprompt.bootstrap_trace import FailedPrediction
+
         program = self.build_program(candidate)
 
         ret_d: dict[str, list[ReflectiveExample]] = {}
@@ -284,7 +336,9 @@ def make_reflective_dataset(self, candidate, eval_batch, components_to_update) -
                     d["Feedback"] = fb["feedback"]
                     if fb["score"] != module_score:
                         if self.warn_on_score_mismatch:
-                            logger.warning("The score returned by the metric with pred_name is different from the overall metric score. This can indicate 2 things: Either the metric is non-deterministic (e.g., LLM-as-judge, Semantic score, etc.) or the metric returned a score specific to pred_name that differs from the module level score. Currently, GEPA does not support predictor level scoring (support coming soon), and only requires a feedback text to be provided, which can be specific to the predictor or program level. GEPA will ignore the differing score returned, and instead use module level score. You can safely ignore this warning if using a semantic metric, however, if this mismatch is caused due to predictor scoring, please return module-level scores. To disable this warning, set warn_on_score_mismatch=False.")
+                            logger.warning(
+                                "The score returned by the metric with pred_name is different from the overall metric score. This can indicate 2 things: Either the metric is non-deterministic (e.g., LLM-as-judge, Semantic score, etc.) or the metric returned a score specific to pred_name that differs from the module level score. Currently, GEPA does not support predictor level scoring (support coming soon), and only requires a feedback text to be provided, which can be specific to the predictor or program level. GEPA will ignore the differing score returned, and instead use module level score. You can safely ignore this warning if using a semantic metric, however, if this mismatch is caused due to predictor scoring, please return module-level scores. To disable this warning, set warn_on_score_mismatch=False."
+                            )
                            self.warn_on_score_mismatch = False
                        fb["score"] = module_score
```
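
To see how the new wiring is meant to be used, here is a condensed sketch of constructing the adapter with tool optimization enabled. The constructor arguments mirror those used in the test added by this commit; `calculator` and `simple_metric` are placeholder stand-ins, not part of the diff above.

```python
import dspy
from dspy.teleprompt.gepa.gepa_utils import DspyAdapter


def calculator(expression: str) -> str:
    """Toy tool used only for illustration."""
    return str(eval(expression))  # fine for a sketch; never eval untrusted input


def simple_metric(example, prediction, trace=None, pred_name=None, pred_trace=None):
    """Placeholder GEPA-style metric that always returns a perfect score."""
    return 1.0


calc_tool = dspy.Tool(calculator, name="calculator", desc="Does arithmetic")
react = dspy.ReAct("question -> answer", tools=[calc_tool])

adapter = DspyAdapter(
    student_module=react,
    metric_fn=simple_metric,
    feedback_map={},
    failure_score=0.0,
    optimize_tool_descriptions=True,  # enables the tool/signature routing shown above
    reflection_lm=None,               # or a dspy.LM to run proposals under a reflection model
)

# The adapter now owns a propose_new_texts that splits "tool:*" components from
# signature components and merges the two proposers' outputs into a single dict.
assert hasattr(adapter, "propose_new_texts")
```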

dspy/teleprompt/gepa/instruction_proposal.py

Lines changed: 138 additions & 0 deletions
```diff
@@ -310,3 +310,141 @@ def __call__(
             updated_components[component_name] = new_instruction
 
         return updated_components
+
+
+class GenerateImprovedToolDescriptionFromFeedback(dspy.Signature):
+    """I provided an assistant with the following description for a tool:
+    ```
+    <current_tool_description>
+    ```
+
+    This tool is available to the assistant. The following are examples of task inputs provided to the assistant, the assistant's decisions about which tools to use, and feedback on whether those decisions were correct:
+    ```
+    <examples_with_feedback>
+    ```
+
+    Your task is to write a better description for this tool.
+
+    Read the examples carefully and identify patterns in when the tool was used successfully versus when it was misused or overlooked. Identify any domain-specific information about the tool's capabilities or appropriate usage that may not be available to the assistant in the future. The assistant may have developed effective patterns for tool selection - if so, ensure the tool description supports those patterns.
+
+    Provide the new tool description within ``` blocks."""
+
+    current_tool_description = dspy.InputField(desc="The current description of the tool")
+    examples_with_feedback = dspy.InputField(desc="Examples showing tool usage decisions and feedback on correctness")
+
+    improved_tool_description = dspy.OutputField(
+        desc="An improved description that helps with tool selection decisions"
+    )
+
+
+class SingleComponentToolProposer(dspy.Module):
+    """dspy.Module for proposing improved tool descriptions based on feedback."""
+
+    def __init__(self):
+        super().__init__()
+        self.propose_description = dspy.Predict(GenerateImprovedToolDescriptionFromFeedback)
+
+    def forward(self, current_tool_description: str, reflective_dataset: list[ReflectiveExample]) -> str:
+        """Generate an improved tool description based on current description and feedback examples.
+
+        Args:
+            current_tool_description: The current description of the tool
+            reflective_dataset: List of examples with inputs, outputs, and feedback
+
+        Returns:
+            str: Improved tool description text
+        """
+        # Reuse formatting from SingleComponentMultiModalProposer
+        formatted_examples, _ = self._format_examples_for_instruction_generation(reflective_dataset)
+
+        result = self.propose_description(
+            current_tool_description=current_tool_description, examples_with_feedback=formatted_examples
+        )
+
+        return result.improved_tool_description
+
+    def _format_examples_for_instruction_generation(
+        self, reflective_dataset: list[ReflectiveExample]
+    ) -> tuple[str, dict[int, list[Type]]]:
+        """Format examples using GEPA's markdown structure.
+
+        Returns:
+            tuple: (formatted_text, image_map) where image_map is always empty for tools
+        """
+
+        def render_value(value, level=3):
+            if isinstance(value, dict):
+                s = ""
+                for k, v in value.items():
+                    s += f"{'#' * level} {k}\n"
+                    s += render_value(v, min(level + 1, 6))
+                if not value:
+                    s += "\n"
+                return s
+            elif isinstance(value, (list, tuple)):
+                s = ""
+                for i, item in enumerate(value):
+                    s += f"{'#' * level} Item {i + 1}\n"
+                    s += render_value(item, min(level + 1, 6))
+                if not value:
+                    s += "\n"
+                return s
+            else:
+                return f"{str(value).strip()}\n\n"
+
+        def convert_sample_to_markdown(sample, example_num):
+            s = f"# Example {example_num}\n"
+            for key, val in sample.items():
+                s += f"## {key}\n"
+                s += render_value(val, level=3)
+            return s
+
+        formatted_parts = []
+        for i, example_data in enumerate(reflective_dataset):
+            formatted_example = convert_sample_to_markdown(example_data, i + 1)
+            formatted_parts.append(formatted_example)
+
+        formatted_text = "\n\n".join(formatted_parts)
+        return formatted_text, {}
+
+
+class ToolProposer(ProposalFn):
+    """GEPA-compatible tool description proposer.
+
+    This class handles tool description optimization during GEPA optimization by using
+    a single-component proposer for each tool that needs to be updated.
+    """
+
+    def __init__(self):
+        self.single_proposer = SingleComponentToolProposer()
+
+    def __call__(
+        self,
+        candidate: dict[str, str],
+        reflective_dataset: dict[str, list[ReflectiveExample]],
+        components_to_update: list[str],
+    ) -> dict[str, str]:
+        """GEPA-compatible proposal function.
+
+        Args:
+            candidate: Current component name -> description mapping
+            reflective_dataset: Component name -> list of reflective examples
+            components_to_update: List of component names to update
+
+        Returns:
+            dict: Component name -> new description mapping
+        """
+        updated_components = {}
+
+        for component_name in components_to_update:
+            if component_name in candidate and component_name in reflective_dataset:
+                current_description = candidate[component_name]
+                component_reflective_data = reflective_dataset[component_name]
+
+                new_description = self.single_proposer(
+                    current_tool_description=current_description, reflective_dataset=component_reflective_data
+                )
+
+                updated_components[component_name] = new_description
+
+        return updated_components
```
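
The new ToolProposer can also be exercised on its own with a hand-built reflective dataset. A small sketch, assuming an LM has already been configured for the proposal call (the model name, the `tool:search` component, and the example contents are all illustrative):

```python
import dspy
from dspy.teleprompt.gepa.instruction_proposal import ToolProposer

# Illustrative model choice; any dspy-supported LM works here.
dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))

# Candidate maps component names to their current text; tools use the "tool:" prefix.
candidate = {"tool:search": "Searches the web."}

# Reflective examples follow the Inputs / Generated_Outputs / Feedback shape
# produced by make_reflective_dataset.
reflective_dataset = {
    "tool:search": [
        {
            "Inputs": {"question": "Who wrote Dune?"},
            "Generated_Outputs": {"answer": "Frank Herbert"},
            "Feedback": "Correct, but the assistant nearly answered without using the search tool.",
        }
    ]
}

new_texts = ToolProposer()(
    candidate=candidate,
    reflective_dataset=reflective_dataset,
    components_to_update=["tool:search"],
)
print(new_texts["tool:search"])  # the proposed, improved tool description
```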

tests/teleprompt/test_gepa_tool_optimization.py

Lines changed: 69 additions & 0 deletions
```diff
@@ -154,3 +154,72 @@ def forward(self, question):
     assert "search" in optimized.subagent.tools
     assert "calculator" in optimized.main_agent.tools
     assert "spawn_subagent" in optimized.main_agent.tools
+
+
+def test_tool_and_signature_optimization_with_proposer_routing():
+    """Test that routing logic correctly splits tools and signatures."""
+    from unittest.mock import Mock, patch
+
+    from dspy.teleprompt.gepa.gepa_utils import DspyAdapter
+
+    # Create module with BOTH signature and tools
+    calc_tool = dspy.Tool(calculator, name="calculator", desc="Original calculator description")
+    react = dspy.ReAct("question -> answer", tools=[calc_tool])
+
+    # Create adapter with tool optimization enabled
+    adapter = DspyAdapter(
+        student_module=react,
+        metric_fn=simple_metric,
+        feedback_map={},
+        failure_score=0.0,
+        optimize_tool_descriptions=True,
+        reflection_lm=None,
+    )
+
+    # Verify propose_new_texts was created
+    assert hasattr(adapter, "propose_new_texts"), "Routing logic should have set propose_new_texts"
+
+    # Mock the ToolProposer to verify it gets called with tools only
+    mock_tool_proposer_instance = Mock()
+    mock_tool_proposer_instance.return_value = {"tool:calculator": "Improved calculator description"}
+
+    mock_tool_proposer_class = Mock(return_value=mock_tool_proposer_instance)
+
+    # Mock parent propose_new_texts to verify it gets called with signatures only
+    mock_parent_propose = Mock(return_value={"react": "Improved signature instruction"})
+
+    with patch("dspy.teleprompt.gepa.instruction_proposal.ToolProposer", mock_tool_proposer_class):
+        with patch.object(adapter.__class__.__bases__[0], "propose_new_texts", mock_parent_propose, create=True):
+            # Rebuild adapter to pick up mocked parent
+            adapter_with_mock = DspyAdapter(
+                student_module=react,
+                metric_fn=simple_metric,
+                feedback_map={},
+                failure_score=0.0,
+                optimize_tool_descriptions=True,
+                reflection_lm=None,
+            )
+
+            candidate = {
+                "react": "Original signature",
+                "tool:calculator": "Original tool desc",
+            }
+
+            reflective_dataset = {
+                "react": [{"input": "test"}],
+                "tool:calculator": [{"input": "calc"}],
+            }
+
+            components = ["react", "tool:calculator"]
+
+            result = adapter_with_mock.propose_new_texts(candidate, reflective_dataset, components)
+
+            # Verify routing: ToolProposer was called with tools only
+            assert mock_tool_proposer_instance.called, "ToolProposer should have been called"
+            tool_call_args = mock_tool_proposer_instance.call_args[1]
+            assert "tool:calculator" in tool_call_args["components_to_update"]
+            assert "react" not in tool_call_args["components_to_update"]
+
+            # Verify both components in result
+            assert "react" in result
+            assert "tool:calculator" in result
```
