diff --git a/js/llm.test.ts b/js/llm.test.ts
index 758acec..fcdb30f 100644
--- a/js/llm.test.ts
+++ b/js/llm.test.ts
@@ -222,4 +222,122 @@ Issue Description: {{page_content}}
       expect(response.error).toBeUndefined();
     }
   });
+
+  test("LLMClassifierFromTemplate omits optional parameters when not specified", async () => {
+    let capturedRequestBody: any;
+
+    server.use(
+      http.post(
+        "https://api.openai.com/v1/chat/completions",
+        async ({ request }) => {
+          capturedRequestBody = await request.json();
+
+          return HttpResponse.json({
+            id: "chatcmpl-test",
+            object: "chat.completion",
+            created: 1234567890,
+            model: "gpt-4o",
+            choices: [
+              {
+                index: 0,
+                message: {
+                  role: "assistant",
+                  tool_calls: [
+                    {
+                      id: "call_test",
+                      type: "function",
+                      function: {
+                        name: "select_choice",
+                        arguments: JSON.stringify({ choice: "1" }),
+                      },
+                    },
+                  ],
+                },
+                finish_reason: "tool_calls",
+              },
+            ],
+          });
+        },
+      ),
+    );
+
+    init({
+      client: new OpenAI({
+        apiKey: "test-api-key",
+        baseURL: "https://api.openai.com/v1",
+      }),
+    });
+
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Test prompt: {{output}} vs {{expected}}",
+      choiceScores: { "1": 1, "2": 0 },
+    });
+
+    await classifier({ output: "test output", expected: "test expected" });
+
+    // Verify that max_tokens and temperature are NOT in the request
+    expect(capturedRequestBody.max_tokens).toBeUndefined();
+    expect(capturedRequestBody.temperature).toBeUndefined();
+  });
+
+  test("LLMClassifierFromTemplate includes parameters when specified", async () => {
+    let capturedRequestBody: any;
+
+    server.use(
+      http.post(
+        "https://api.openai.com/v1/chat/completions",
+        async ({ request }) => {
+          capturedRequestBody = await request.json();
+
+          return HttpResponse.json({
+            id: "chatcmpl-test",
+            object: "chat.completion",
+            created: 1234567890,
+            model: "gpt-4o",
+            choices: [
+              {
+                index: 0,
+                message: {
+                  role: "assistant",
+                  tool_calls: [
+                    {
+                      id: "call_test",
+                      type: "function",
+                      function: {
+                        name: "select_choice",
+                        arguments: JSON.stringify({ choice: "1" }),
+                      },
+                    },
+                  ],
+                },
+                finish_reason: "tool_calls",
+              },
+            ],
+          });
+        },
+      ),
+    );
+
+    init({
+      client: new OpenAI({
+        apiKey: "test-api-key",
+        baseURL: "https://api.openai.com/v1",
+      }),
+    });
+
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Test prompt: {{output}} vs {{expected}}",
+      choiceScores: { "1": 1, "2": 0 },
+      maxTokens: 256,
+      temperature: 0.5,
+    });
+
+    await classifier({ output: "test output", expected: "test expected" });
+
+    // Verify that max_tokens and temperature ARE in the request with correct values
+    expect(capturedRequestBody.max_tokens).toBe(256);
+    expect(capturedRequestBody.temperature).toBe(0.5);
+  });
 });
diff --git a/js/llm.ts b/js/llm.ts
index 28ec3d3..066e00f 100644
--- a/js/llm.ts
+++ b/js/llm.ts
@@ -108,10 +108,13 @@ export async function OpenAIClassifier(
     ...remainingRenderArgs
   } = remaining;

-  const extraArgs = {
-    temperature: temperature || 0,
-    max_tokens: maxTokens,
-  };
+  const extraArgs: { temperature?: number; max_tokens?: number } = {};
+  if (temperature !== undefined) {
+    extraArgs.temperature = temperature;
+  }
+  if (maxTokens !== undefined) {
+    extraArgs.max_tokens = maxTokens;
+  }

   const renderArgs = {
     output,
@@ -203,6 +206,7 @@ export function LLMClassifierFromTemplate({
   model = DEFAULT_MODEL,
   useCoT: useCoTArg,
   temperature,
+  maxTokens: maxTokensArg,
 }: {
   name: string;
   promptTemplate: string;
@@ -210,6 +214,7 @@ export function LLMClassifierFromTemplate({
   model?: string;
   useCoT?: boolean;
   temperature?: number;
+  maxTokens?: number;
 }): Scorer<string, LLMClassifierArgs<RenderArgs>> {
   const choiceStrings = Object.keys(choiceScores);
   const ret = async (
@@ -220,7 +225,7 @@ export function LLMClassifierFromTemplate({
     const prompt =
       promptTemplate + "\n" + (useCoT ? COT_SUFFIX : NO_COT_SUFFIX);

-    const maxTokens = 512;
+    const maxTokens = runtimeArgs.maxTokens ?? maxTokensArg;
     const messages: ChatCompletionMessageParam[] = [
       {
         role: "user",
@@ -263,6 +268,7 @@ export function LLMClassifierFromSpec(
     model: spec.model,
     useCoT: spec.use_cot,
     temperature: spec.temperature,
+    maxTokens: spec.max_tokens,
   });
 }

diff --git a/js/templates.ts b/js/templates.ts
index 636f712..3580e44 100644
--- a/js/templates.ts
+++ b/js/templates.ts
@@ -17,6 +17,7 @@ export const modelGradedSpecSchema = z.object({
   model: z.string().optional(),
   use_cot: z.boolean().optional(),
   temperature: z.number().optional(),
+  max_tokens: z.number().optional(),
 });

 export type ModelGradedSpec = z.infer<typeof modelGradedSpecSchema>;
diff --git a/py/autoevals/llm.py b/py/autoevals/llm.py
index 03252b0..d8a0324 100644
--- a/py/autoevals/llm.py
+++ b/py/autoevals/llm.py
@@ -3,8 +3,9 @@
 This module provides a collection of pre-built LLM scorers for common evaluation tasks.

 All evaluators accept the following common arguments:
-- model: Model to use (defaults to gpt-4)
-- temperature: Controls randomness (0-1, defaults to 0)
+- model: Model to use (defaults to gpt-4o)
+- temperature: Controls randomness (0-1). If not specified, uses the model's default.
+- max_tokens: Maximum tokens to generate. If not specified, uses the model's default.
 - client: OpenAI client (defaults to global client from init())

 Example:
@@ -151,7 +152,8 @@ def __init__(
             base_url=base_url,
             client=client,
         )
-        self.extra_args["temperature"] = temperature or 0
+        if temperature is not None:
+            self.extra_args["temperature"] = temperature


 class OpenAILLMClassifier(OpenAILLMScorer):
@@ -174,6 +176,7 @@ def __init__(
             client=client,
             api_key=api_key,
             base_url=base_url,
+            temperature=temperature,
         )

         self.name = name
@@ -182,9 +185,7 @@ def __init__(
         self.engine = engine
         self.messages = messages

-        self.extra_args["temperature"] = temperature or 0
-
-        if max_tokens:
+        if max_tokens is not None:
             self.extra_args["max_tokens"] = max(max_tokens, 5)

         self.render_args = {}
@@ -268,6 +269,7 @@ class ModelGradedSpec:
     engine: str | None = None
     use_cot: bool | None = None
     temperature: float | None = None
+    max_tokens: int | None = None


 class LLMClassifier(OpenAILLMClassifier):
@@ -306,8 +308,8 @@ class LLMClassifier(OpenAILLMClassifier):
         choice_scores: Mapping of choices to scores (e.g. `{"good": 1, "bad": 0}`)
         model: Model to use. Defaults to DEFAULT_MODEL.
         use_cot: Enable chain of thought reasoning. Defaults to True.
-        max_tokens: Maximum tokens to generate. Defaults to 512.
-        temperature: Controls randomness (0-1). Defaults to 0.
+        max_tokens: Maximum tokens to generate. If not specified, uses the model's default.
+        temperature: Controls randomness (0-1). If not specified, uses the model's default.
         engine: Deprecated by OpenAI. Use model instead.
         api_key: Deprecated. Use client instead.
         base_url: Deprecated. Use client instead.
@@ -324,8 +326,8 @@ def __init__(
         choice_scores,
         model=DEFAULT_MODEL,
         use_cot=True,
-        max_tokens=512,
-        temperature=0,
+        max_tokens=None,
+        temperature=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -359,7 +361,19 @@ def __init__(

     @classmethod
     def from_spec(cls, name: str, spec: ModelGradedSpec, client: Client | None = None, **kwargs):
-        return cls(name, spec.prompt, spec.choice_scores, client=client, **kwargs)
+        spec_kwargs = {}
+        if spec.model is not None:
+            spec_kwargs["model"] = spec.model
+        if spec.engine is not None:
+            spec_kwargs["engine"] = spec.engine
+        if spec.use_cot is not None:
+            spec_kwargs["use_cot"] = spec.use_cot
+        if spec.temperature is not None:
+            spec_kwargs["temperature"] = spec.temperature
+        if spec.max_tokens is not None:
+            spec_kwargs["max_tokens"] = spec.max_tokens
+        # kwargs can override spec values
+        return cls(name, spec.prompt, spec.choice_scores, client=client, **spec_kwargs, **kwargs)

     @classmethod
     def from_spec_file(cls, name: str, path: str, client: Client | None = None, **kwargs):
diff --git a/py/autoevals/test_llm.py b/py/autoevals/test_llm.py
index 19eabe6..543eafb 100644
--- a/py/autoevals/test_llm.py
+++ b/py/autoevals/test_llm.py
@@ -1,8 +1,10 @@
 import asyncio
+import json
 from typing import cast

 import pytest
 import respx
+from httpx import Response
 from openai import OpenAI
 from pydantic import BaseModel

@@ -354,3 +356,117 @@ def test_battle():
     print(response.as_json(indent=2))
     assert response.score == 0
     assert response.error is None
+
+
+@respx.mock
+def test_llm_classifier_omits_optional_parameters_when_not_specified():
+    """Test that max_tokens and temperature are not included in API request when not specified."""
+    captured_request_body = None
+
+    def capture_request(request):
+        nonlocal captured_request_body
+        captured_request_body = json.loads(request.content.decode("utf-8"))
+
+        return Response(
+            200,
+            json={
+                "id": "chatcmpl-test",
+                "object": "chat.completion",
+                "created": 1234567890,
+                "model": "gpt-4o",
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": None,
+                            "tool_calls": [
+                                {
+                                    "id": "call_test",
+                                    "type": "function",
+                                    "function": {"name": "select_choice", "arguments": '{"choice": "1"}'},
+                                }
+                            ],
+                        },
+                        "finish_reason": "tool_calls",
+                    }
+                ],
+                "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
+            },
+        )
+
+    respx.post("https://api.openai.com/v1/chat/completions").mock(side_effect=capture_request)
+
+    client = OpenAI(api_key="test-api-key", base_url="https://api.openai.com/v1")
+    init(client)
+
+    # Create classifier without specifying max_tokens or temperature
+    classifier = LLMClassifier(
+        "test",
+        "Test prompt: {{output}} vs {{expected}}",
+        {"1": 1, "2": 0},
+    )
+
+    classifier.eval(output="test output", expected="test expected")
+
+    # Verify that max_tokens and temperature are NOT in the request
+    assert "max_tokens" not in captured_request_body
+    assert "temperature" not in captured_request_body
+
+
+@respx.mock
+def test_llm_classifier_includes_parameters_when_specified():
+    """Test that max_tokens and temperature are included in API request when specified."""
+    captured_request_body = None
+
+    def capture_request(request):
+        nonlocal captured_request_body
+        captured_request_body = json.loads(request.content.decode("utf-8"))
+
+        return Response(
+            200,
+            json={
+                "id": "chatcmpl-test",
+                "object": "chat.completion",
+                "created": 1234567890,
+                "model": "gpt-4o",
+                "choices": [
+                    {
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": None,
+                            "tool_calls": [
+                                {
+                                    "id": "call_test",
+                                    "type": "function",
+                                    "function": {"name": "select_choice", "arguments": '{"choice": "1"}'},
+                                }
+                            ],
+                        },
+                        "finish_reason": "tool_calls",
+                    }
+                ],
+                "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
+            },
+        )
+
+    respx.post("https://api.openai.com/v1/chat/completions").mock(side_effect=capture_request)
+
+    client = OpenAI(api_key="test-api-key", base_url="https://api.openai.com/v1")
+    init(client)
+
+    # Create classifier with max_tokens and temperature specified
+    classifier = LLMClassifier(
+        "test",
+        "Test prompt: {{output}} vs {{expected}}",
+        {"1": 1, "2": 0},
+        max_tokens=256,
+        temperature=0.5,
+    )
+
+    classifier.eval(output="test output", expected="test expected")
+
+    # Verify that max_tokens and temperature ARE in the request with correct values
+    assert captured_request_body["max_tokens"] == 256
+    assert captured_request_body["temperature"] == 0.5