Commit b051ae2

[Feature] Add tool calls to validate() method (#99)
1 parent bb2d31c commit b051ae2

5 files changed: 88 additions, 5 deletions

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ classifiers = [
 ]
 dependencies = [
     "cleanlab-tlm~=1.1,>=1.1.14",
-    "codex-sdk==0.1.0a23",
+    "codex-sdk==0.1.0a24",
     "pydantic>=2.0.0, <3",
 ]

src/cleanlab_codex/project.py

Lines changed: 5 additions & 3 deletions
@@ -7,7 +7,7 @@
 from typing import Dict, Optional, Union, cast

 from codex import AuthenticationError
-from codex.types.project_validate_params import Response
+from codex.types.project_validate_params import Response, Tool

 from cleanlab_codex.internal.analytics import _AnalyticsMetadata
 from cleanlab_codex.internal.sdk_client import client_from_access_key
@@ -18,7 +18,7 @@

     from codex import Codex as _Codex
     from codex.types.project_validate_response import ProjectValidateResponse
-    from openai.types.chat import ChatCompletion, ChatCompletionMessageParam
+    from openai.types.chat import ChatCompletion, ChatCompletionMessageParam, ChatCompletionToolParam


 _ERROR_CREATE_ACCESS_KEY = (
@@ -154,6 +154,7 @@ def validate(
         context: str,
         rewritten_query: Optional[str] = None,
         metadata: Optional[object] = None,
+        tools: Optional[list[ChatCompletionToolParam]] = None,
         eval_scores: Optional[Dict[str, float]] = None,
     ) -> ProjectValidateResponse:
         """Evaluate the quality of an AI-generated `response` based on the same exact inputs that your LLM used to generate the response.
@@ -176,6 +177,7 @@
             context (str): All retrieved context (e.g., from your RAG/retrieval/search system) that was supplied as part of `messages` for generating the LLM `response`. Specifying the `context` (as a part of the full `messages` object) enables Cleanlab to run certain Evals and display the retrieved context in the Web Interface.
             rewritten_query (str, optional): An optional reformulation of `query` (e.g. to form a self-contained question out of a multi-turn conversation history) to improve retrieval quality. If you are using a query-rewriter in your RAG system, you can provide its output here. If not provided, Cleanlab may internally do its own query rewrite when necessary.
             metadata (object, optional): Arbitrary metadata to associate with this LLM `response` for logging/analytics inside the Project.
+            tools (list[ChatCompletionToolParam], optional): Optional definitions of tools that were provided to the LLM in the response-generation call. Should match the `tools` argument in OpenAI's Chat Completions API. When provided to the LLM, its response might be to call one of these tools rather than natural language.
             eval_scores (dict[str, float], optional): Pre-computed evaluation scores to bypass automatic scoring. Providing `eval_scores` for specific evaluations bypasses automated scoring and uses the supplied scores instead. If you already have them pre-computed, this can reduce runtime.

         Returns:
@@ -188,7 +190,6 @@

         When available, consider swapping your AI response with the expert answer before serving the response to your user.
         """
-
         return self._sdk_client.projects.validate(
             self._id,
             messages=messages,
@@ -197,6 +198,7 @@
             query=query,
             rewritten_question=rewritten_query,
             custom_metadata=metadata,
+            tools=[cast(Tool, tool) for tool in tools] if tools else None,
             eval_scores=eval_scores,
         )

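Note (editor's sketch, not part of the commit): a minimal usage example of the new `tools` argument, assuming you have a Codex project access key and an OpenAI API key configured. The access key, model name, tool definition, query, and retrieved context below are placeholder values, not anything taken from this diff.

from openai import OpenAI
from cleanlab_codex.project import Project

openai_client = OpenAI()
project = Project.from_access_key("<your-project-access-key>")  # placeholder access key

# Tool definitions in OpenAI Chat Completions format (placeholder tool).
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a given location.",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        },
    }
]

query = "What is the weather in Paris?"
context = "Retrieved docs about Paris weather..."  # from your RAG/retrieval system
messages = [
    {"role": "system", "content": f"Context: {context}"},
    {"role": "user", "content": query},
]

# Generate the LLM response with the tool definitions...
response = openai_client.chat.completions.create(model="gpt-4o-mini", messages=messages, tools=tools)

# ...then validate it, passing the same definitions through the new `tools` argument.
result = project.validate(
    messages=messages,
    response=response,
    query=query,
    context=context,
    tools=tools,
)
print(result.should_guardrail, result.escalated_to_sme, result.expert_answer)

Passing the same `tools` list to both the LLM call and `validate()` lets Cleanlab evaluate tool-call responses against the definitions the model actually saw.
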
tests/conftest.py

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@
     openai_messages_bad_no_user,
     openai_messages_conversational,
     openai_messages_single_turn,
+    openai_tools,
 )

 __all__ = [
@@ -14,4 +15,5 @@
     "openai_messages_conversational",
     "openai_messages_single_turn",
     "openai_messages_bad_no_user",
+    "openai_tools",
 ]

tests/fixtures/validate.py

Lines changed: 20 additions & 0 deletions
@@ -6,6 +6,7 @@
     ChatCompletionAssistantMessageParam,
     ChatCompletionMessageParam,
     ChatCompletionSystemMessageParam,
+    ChatCompletionToolParam,
     ChatCompletionUserMessageParam,
 )

@@ -38,6 +39,25 @@ def openai_chat_completion() -> ChatCompletion:
     return ChatCompletion.model_validate(raw_response)


+@pytest.fixture
+def openai_tools() -> list[ChatCompletionToolParam]:
+    """Fixture that returns a list containing one static fake OpenAI Tool object."""
+    raw_tool = {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather in a given location.",
+            "parameters": {
+                "type": "object",
+                "properties": {"location": {"type": "string", "description": "The location to get the weather for."}},
+                "required": ["location"],
+            },
+        },
+    }
+    openai_tool = cast(ChatCompletionToolParam, raw_tool)
+    return [openai_tool]
+
+
 @pytest.fixture
 def openai_messages_single_turn() -> list[ChatCompletionMessageParam]:
     """Fixture that returns a single-turn message format."""

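Note (editor's sketch, not part of the commit): the `validate()` docstring earlier in this diff mentions that a tool-enabled LLM response might be a tool call rather than natural language. A hypothetical example of such a response for the `get_weather` tool defined by this fixture, built the same way the existing `openai_chat_completion` fixture builds its fake response; the IDs, timestamp, and model name are made-up placeholders.

from openai.types.chat import ChatCompletion

# Hypothetical ChatCompletion where the model chose to call get_weather
# (finish_reason="tool_calls") instead of answering in natural language.
raw_tool_call_response = {
    "id": "chatcmpl-000",
    "object": "chat.completion",
    "created": 1700000000,
    "model": "gpt-4o-mini",
    "choices": [
        {
            "index": 0,
            "finish_reason": "tool_calls",
            "message": {
                "role": "assistant",
                "content": None,
                "tool_calls": [
                    {
                        "id": "call_000",
                        "type": "function",
                        "function": {"name": "get_weather", "arguments": '{"location": "Paris"}'},
                    }
                ],
            },
        }
    ],
}
tool_call_response = ChatCompletion.model_validate(raw_tool_call_response)

A response like this could be passed as `response=` to `Project.validate()` together with `tools=openai_tools`.
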
tests/test_project.py

Lines changed: 60 additions & 1 deletion
@@ -11,7 +11,7 @@
 )

 if TYPE_CHECKING:
-    from openai.types.chat import ChatCompletion, ChatCompletionMessageParam
+    from openai.types.chat import ChatCompletion, ChatCompletionMessageParam, ChatCompletionToolParam

 from cleanlab_codex.project import MissingProjectError, Project

@@ -75,6 +75,7 @@ def test_project_validate_with_dict_response(
         rewritten_question=None,
         custom_metadata=None,
         eval_scores=None,
+        tools=None,
     )

     # conversational
@@ -97,6 +98,7 @@
                 rewritten_question=None,
                 custom_metadata=None,
                 eval_scores=None,
+                tools=None,
             ),
             call(
                 FAKE_PROJECT_ID,
@@ -107,12 +109,69 @@
                 rewritten_question=None,
                 custom_metadata=None,
                 eval_scores=None,
+                tools=None,
             ),
         ]
     )
     assert mock_client_from_api_key.projects.validate.call_count == 2


+def test_project_validate_with_tools(
+    mock_client_from_api_key: MagicMock,
+    openai_chat_completion: "ChatCompletion",
+    openai_messages_single_turn: list["ChatCompletionMessageParam"],
+    openai_tools: list["ChatCompletionToolParam"],
+) -> None:
+    expected_result = ProjectValidateResponse(
+        is_bad_response=True,
+        expert_answer=None,
+        eval_scores={
+            "response_helpfulness": EvalScores(
+                score=0.8,
+                triggered=True,
+                triggered_escalation=False,
+                triggered_guardrail=False,
+            )
+        },
+        escalated_to_sme=True,
+        should_guardrail=False,
+    )
+    mock_client_from_api_key.projects.validate.return_value = expected_result
+    mock_client_from_api_key.projects.create.return_value.id = FAKE_PROJECT_ID
+    mock_client_from_api_key.organization_id = FAKE_ORGANIZATION_ID
+    project = Project.create(
+        mock_client_from_api_key,
+        FAKE_ORGANIZATION_ID,
+        FAKE_PROJECT_NAME,
+        FAKE_PROJECT_DESCRIPTION,
+    )
+
+    context = "Cities in France: Paris, Lyon, Marseille"
+    query = "What is the capital of France?"
+
+    # single turn
+    result = project.validate(
+        messages=openai_messages_single_turn,
+        response=openai_chat_completion,
+        tools=openai_tools,
+        context=context,
+        query=query,
+    )
+
+    assert result == expected_result
+    mock_client_from_api_key.projects.validate.assert_called_once_with(
+        FAKE_PROJECT_ID,
+        messages=openai_messages_single_turn,
+        response=openai_chat_completion,
+        context=context,
+        query=query,
+        tools=openai_tools,
+        rewritten_question=None,
+        custom_metadata=None,
+        eval_scores=None,
+    )
+
+
 def test_from_access_key(mock_client_from_access_key: MagicMock) -> None:
     mock_client_from_access_key.projects.access_keys.retrieve_project_id.return_value = (
         AccessKeyRetrieveProjectIDResponse(
