118 changes: 118 additions & 0 deletions js/llm.test.ts
@@ -222,4 +222,122 @@ Issue Description: {{page_content}}
expect(response.error).toBeUndefined();
}
});

test("LLMClassifierFromTemplate omits optional parameters when not specified", async () => {
let capturedRequestBody: any;

server.use(
http.post(
"https://api.openai.com/v1/chat/completions",
async ({ request }) => {
capturedRequestBody = await request.json();

return HttpResponse.json({
id: "chatcmpl-test",
object: "chat.completion",
created: 1234567890,
model: "gpt-4o",
choices: [
{
index: 0,
message: {
role: "assistant",
tool_calls: [
{
id: "call_test",
type: "function",
function: {
name: "select_choice",
arguments: JSON.stringify({ choice: "1" }),
},
},
],
},
finish_reason: "tool_calls",
},
],
});
},
),
);

init({
client: new OpenAI({
apiKey: "test-api-key",
baseURL: "https://api.openai.com/v1",
}),
});

const classifier = LLMClassifierFromTemplate({
name: "test",
promptTemplate: "Test prompt: {{output}} vs {{expected}}",
choiceScores: { "1": 1, "2": 0 },
});

await classifier({ output: "test output", expected: "test expected" });

// Verify that max_tokens and temperature are NOT in the request
expect(capturedRequestBody.max_tokens).toBeUndefined();
expect(capturedRequestBody.temperature).toBeUndefined();
});

test("LLMClassifierFromTemplate includes parameters when specified", async () => {
let capturedRequestBody: any;

server.use(
http.post(
"https://api.openai.com/v1/chat/completions",
async ({ request }) => {
capturedRequestBody = await request.json();

return HttpResponse.json({
id: "chatcmpl-test",
object: "chat.completion",
created: 1234567890,
model: "gpt-4o",
choices: [
{
index: 0,
message: {
role: "assistant",
tool_calls: [
{
id: "call_test",
type: "function",
function: {
name: "select_choice",
arguments: JSON.stringify({ choice: "1" }),
},
},
],
},
finish_reason: "tool_calls",
},
],
});
},
),
);

init({
client: new OpenAI({
apiKey: "test-api-key",
baseURL: "https://api.openai.com/v1",
}),
});

const classifier = LLMClassifierFromTemplate({
name: "test",
promptTemplate: "Test prompt: {{output}} vs {{expected}}",
choiceScores: { "1": 1, "2": 0 },
maxTokens: 256,
temperature: 0.5,
});

await classifier({ output: "test output", expected: "test expected" });

// Verify that max_tokens and temperature ARE in the request with correct values
expect(capturedRequestBody.max_tokens).toBe(256);
expect(capturedRequestBody.temperature).toBe(0.5);
});
});
16 changes: 11 additions & 5 deletions js/llm.ts
@@ -108,10 +108,13 @@ export async function OpenAIClassifier<RenderArgs, Output>(
...remainingRenderArgs
} = remaining;

const extraArgs = {
temperature: temperature || 0,
max_tokens: maxTokens,
};
const extraArgs: { temperature?: number; max_tokens?: number } = {};
if (temperature !== undefined) {
extraArgs.temperature = temperature;
}
if (maxTokens !== undefined) {
extraArgs.max_tokens = maxTokens;
}

const renderArgs = {
output,
@@ -203,13 +206,15 @@ export function LLMClassifierFromTemplate<RenderArgs>({
model = DEFAULT_MODEL,
useCoT: useCoTArg,
temperature,
maxTokens: maxTokensArg,
}: {
name: string;
promptTemplate: string;
choiceScores: Record<string, number>;
model?: string;
useCoT?: boolean;
temperature?: number;
maxTokens?: number;
}): Scorer<string, LLMClassifierArgs<RenderArgs>> {
const choiceStrings = Object.keys(choiceScores);
const ret = async (
@@ -220,7 +225,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
const prompt =
promptTemplate + "\n" + (useCoT ? COT_SUFFIX : NO_COT_SUFFIX);

const maxTokens = 512;
const maxTokens = runtimeArgs.maxTokens ?? maxTokensArg;
const messages: ChatCompletionMessageParam[] = [
{
role: "user",
@@ -263,6 +268,7 @@ export function LLMClassifierFromSpec<RenderArgs>(
model: spec.model,
useCoT: spec.use_cot,
temperature: spec.temperature,
maxTokens: spec.max_tokens,
});
}
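
For reviewers, a minimal caller-side sketch of the behavior this diff targets, assuming the package's public exports match the ones used in the tests above (init and LLMClassifierFromTemplate from "autoevals") and a standard OpenAI client; illustrative only, not part of the change:

import OpenAI from "openai";
import { init, LLMClassifierFromTemplate } from "autoevals";

async function main() {
  // Register a shared client once, as in the tests above.
  init({ client: new OpenAI({ apiKey: process.env.OPENAI_API_KEY }) });

  // No maxTokens/temperature: with this change the chat-completions request
  // omits max_tokens and temperature entirely, so the model defaults apply.
  const defaultGrader = LLMClassifierFromTemplate({
    name: "relevance",
    promptTemplate: "Is {{output}} consistent with {{expected}}? 1=yes, 2=no",
    choiceScores: { "1": 1, "2": 0 },
  });

  // Explicit values are forwarded verbatim as max_tokens / temperature.
  const boundedGrader = LLMClassifierFromTemplate({
    name: "relevance",
    promptTemplate: "Is {{output}} consistent with {{expected}}? 1=yes, 2=no",
    choiceScores: { "1": 1, "2": 0 },
    maxTokens: 256,
    temperature: 0,
  });

  console.log(await defaultGrader({ output: "answer A", expected: "answer B" }));
  console.log(await boundedGrader({ output: "answer A", expected: "answer B" }));
}

main();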

1 change: 1 addition & 0 deletions js/templates.ts
@@ -17,6 +17,7 @@ export const modelGradedSpecSchema = z.object({
model: z.string().optional(),
use_cot: z.boolean().optional(),
temperature: z.number().optional(),
max_tokens: z.number().optional(),
});

export type ModelGradedSpec = z.infer<typeof modelGradedSpecSchema>;
36 changes: 25 additions & 11 deletions py/autoevals/llm.py
@@ -3,8 +3,9 @@
This module provides a collection of pre-built LLM scorers for common evaluation tasks.

All evaluators accept the following common arguments:
- model: Model to use (defaults to gpt-4)
- temperature: Controls randomness (0-1, defaults to 0)
- model: Model to use (defaults to gpt-4o)
- temperature: Controls randomness (0-1). If not specified, uses the model's default.
- max_tokens: Maximum tokens to generate. If not specified, uses the model's default.
- client: OpenAI client (defaults to global client from init())

Example:
@@ -151,7 +152,8 @@ def __init__(
base_url=base_url,
client=client,
)
self.extra_args["temperature"] = temperature or 0
if temperature is not None:
self.extra_args["temperature"] = temperature


class OpenAILLMClassifier(OpenAILLMScorer):
@@ -174,6 +176,7 @@ def __init__(
client=client,
api_key=api_key,
base_url=base_url,
temperature=temperature,
)

self.name = name
@@ -182,9 +185,7 @@ def __init__(
self.engine = engine
self.messages = messages

self.extra_args["temperature"] = temperature or 0

if max_tokens:
if max_tokens is not None:
self.extra_args["max_tokens"] = max(max_tokens, 5)

self.render_args = {}
@@ -268,6 +269,7 @@ class ModelGradedSpec:
engine: str | None = None
use_cot: bool | None = None
temperature: float | None = None
max_tokens: int | None = None


class LLMClassifier(OpenAILLMClassifier):
@@ -306,8 +308,8 @@ class LLMClassifier(OpenAILLMClassifier):
choice_scores: Mapping of choices to scores (e.g. `{"good": 1, "bad": 0}`)
model: Model to use. Defaults to DEFAULT_MODEL.
use_cot: Enable chain of thought reasoning. Defaults to True.
max_tokens: Maximum tokens to generate. Defaults to 512.
temperature: Controls randomness (0-1). Defaults to 0.
max_tokens: Maximum tokens to generate. If not specified, uses the model's default.
temperature: Controls randomness (0-1). If not specified, uses the model's default.
engine: Deprecated by OpenAI. Use model instead.
api_key: Deprecated. Use client instead.
base_url: Deprecated. Use client instead.
@@ -324,8 +326,8 @@ def __init__(
choice_scores,
model=DEFAULT_MODEL,
use_cot=True,
max_tokens=512,
temperature=0,
max_tokens=None,
temperature=None,
engine=None,
api_key=None,
base_url=None,
@@ -359,7 +361,19 @@ def __init__(

@classmethod
def from_spec(cls, name: str, spec: ModelGradedSpec, client: Client | None = None, **kwargs):
return cls(name, spec.prompt, spec.choice_scores, client=client, **kwargs)
spec_kwargs = {}
if spec.model is not None:
spec_kwargs["model"] = spec.model
if spec.engine is not None:
spec_kwargs["engine"] = spec.engine
if spec.use_cot is not None:
spec_kwargs["use_cot"] = spec.use_cot
if spec.temperature is not None:
spec_kwargs["temperature"] = spec.temperature
if spec.max_tokens is not None:
spec_kwargs["max_tokens"] = spec.max_tokens
# kwargs can override spec values
return cls(name, spec.prompt, spec.choice_scores, client=client, **spec_kwargs, **kwargs)

@classmethod
def from_spec_file(cls, name: str, path: str, client: Client | None = None, **kwargs):