118 changes: 118 additions & 0 deletions js/llm.test.ts
@@ -222,4 +222,122 @@ Issue Description: {{page_content}}
expect(response.error).toBeUndefined();
}
});

test("LLMClassifierFromTemplate omits optional parameters when not specified", async () => {
let capturedRequestBody: any;

server.use(
http.post(
"https://api.openai.com/v1/chat/completions",
async ({ request }) => {
capturedRequestBody = await request.json();

return HttpResponse.json({
id: "chatcmpl-test",
object: "chat.completion",
created: 1234567890,
model: "gpt-4o",
choices: [
{
index: 0,
message: {
role: "assistant",
tool_calls: [
{
id: "call_test",
type: "function",
function: {
name: "select_choice",
arguments: JSON.stringify({ choice: "1" }),
},
},
],
},
finish_reason: "tool_calls",
},
],
});
},
),
);

init({
client: new OpenAI({
apiKey: "test-api-key",
baseURL: "https://api.openai.com/v1",
}),
});

const classifier = LLMClassifierFromTemplate({
name: "test",
promptTemplate: "Test prompt: {{output}} vs {{expected}}",
choiceScores: { "1": 1, "2": 0 },
});

await classifier({ output: "test output", expected: "test expected" });

// Verify that max_tokens and temperature are NOT in the request
expect(capturedRequestBody.max_tokens).toBeUndefined();
expect(capturedRequestBody.temperature).toBeUndefined();
});

test("LLMClassifierFromTemplate includes parameters when specified", async () => {
let capturedRequestBody: any;

server.use(
http.post(
"https://api.openai.com/v1/chat/completions",
async ({ request }) => {
capturedRequestBody = await request.json();

return HttpResponse.json({
id: "chatcmpl-test",
object: "chat.completion",
created: 1234567890,
model: "gpt-4o",
choices: [
{
index: 0,
message: {
role: "assistant",
tool_calls: [
{
id: "call_test",
type: "function",
function: {
name: "select_choice",
arguments: JSON.stringify({ choice: "1" }),
},
},
],
},
finish_reason: "tool_calls",
},
],
});
},
),
);

init({
client: new OpenAI({
apiKey: "test-api-key",
baseURL: "https://api.openai.com/v1",
}),
});

const classifier = LLMClassifierFromTemplate({
name: "test",
promptTemplate: "Test prompt: {{output}} vs {{expected}}",
choiceScores: { "1": 1, "2": 0 },
maxTokens: 256,
temperature: 0.5,
});

await classifier({ output: "test output", expected: "test expected" });

// Verify that max_tokens and temperature ARE in the request with correct values
expect(capturedRequestBody.max_tokens).toBe(256);
expect(capturedRequestBody.temperature).toBe(0.5);
});
});
16 changes: 11 additions & 5 deletions js/llm.ts
@@ -108,10 +108,13 @@ export async function OpenAIClassifier<RenderArgs, Output>(
...remainingRenderArgs
} = remaining;

const extraArgs = {
temperature: temperature || 0,
max_tokens: maxTokens,
};
const extraArgs: { temperature?: number; max_tokens?: number } = {};
if (temperature !== undefined) {
extraArgs.temperature = temperature;
}
if (maxTokens !== undefined) {
extraArgs.max_tokens = maxTokens;
}

const renderArgs = {
output,
@@ -203,13 +206,15 @@ export function LLMClassifierFromTemplate<RenderArgs>({
model = DEFAULT_MODEL,
useCoT: useCoTArg,
temperature,
maxTokens: maxTokensArg,
}: {
name: string;
promptTemplate: string;
choiceScores: Record<string, number>;
model?: string;
useCoT?: boolean;
temperature?: number;
maxTokens?: number;
}): Scorer<string, LLMClassifierArgs<RenderArgs>> {
const choiceStrings = Object.keys(choiceScores);
const ret = async (
@@ -220,7 +225,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
const prompt =
promptTemplate + "\n" + (useCoT ? COT_SUFFIX : NO_COT_SUFFIX);

const maxTokens = 512;
const maxTokens = runtimeArgs.maxTokens ?? maxTokensArg;
const messages: ChatCompletionMessageParam[] = [
{
role: "user",
@@ -263,6 +268,7 @@ export function LLMClassifierFromSpec<RenderArgs>(
model: spec.model,
useCoT: spec.use_cot,
temperature: spec.temperature,
maxTokens: spec.max_tokens,
});
}
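
For reviewers, a minimal caller-side sketch of the behavior this diff targets, assuming the package's public exports match the ones used in the tests above (init and LLMClassifierFromTemplate from "autoevals") and a standard OpenAI client; illustrative only, not part of the change:

import OpenAI from "openai";
import { init, LLMClassifierFromTemplate } from "autoevals";

async function main() {
  // Register a shared client once, as in the tests above.
  init({ client: new OpenAI({ apiKey: process.env.OPENAI_API_KEY }) });

  // No maxTokens/temperature: with this change the chat-completions request
  // omits max_tokens and temperature entirely, so the model defaults apply.
  const defaultGrader = LLMClassifierFromTemplate({
    name: "relevance",
    promptTemplate: "Is {{output}} consistent with {{expected}}? 1=yes, 2=no",
    choiceScores: { "1": 1, "2": 0 },
  });

  // Explicit values are forwarded verbatim as max_tokens / temperature.
  const boundedGrader = LLMClassifierFromTemplate({
    name: "relevance",
    promptTemplate: "Is {{output}} consistent with {{expected}}? 1=yes, 2=no",
    choiceScores: { "1": 1, "2": 0 },
    maxTokens: 256,
    temperature: 0,
  });

  console.log(await defaultGrader({ output: "answer A", expected: "answer B" }));
  console.log(await boundedGrader({ output: "answer A", expected: "answer B" }));
}

main();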

1 change: 1 addition & 0 deletions js/templates.ts
@@ -17,6 +17,7 @@ export const modelGradedSpecSchema = z.object({
model: z.string().optional(),
use_cot: z.boolean().optional(),
temperature: z.number().optional(),
max_tokens: z.number().optional(),
});

export type ModelGradedSpec = z.infer<typeof modelGradedSpecSchema>;
36 changes: 25 additions & 11 deletions py/autoevals/llm.py
@@ -3,8 +3,9 @@
This module provides a collection of pre-built LLM scorers for common evaluation tasks.

All evaluators accept the following common arguments:
- model: Model to use (defaults to gpt-4)
- temperature: Controls randomness (0-1, defaults to 0)
- model: Model to use (defaults to gpt-4o)
- temperature: Controls randomness (0-1). If not specified, uses the model's default.
- max_tokens: Maximum tokens to generate. If not specified, uses the model's default.
- client: OpenAI client (defaults to global client from init())

Example:
@@ -151,7 +152,8 @@ def __init__(
base_url=base_url,
client=client,
)
self.extra_args["temperature"] = temperature or 0
if temperature is not None:
self.extra_args["temperature"] = temperature


class OpenAILLMClassifier(OpenAILLMScorer):
@@ -174,6 +176,7 @@ def __init__(
client=client,
api_key=api_key,
base_url=base_url,
temperature=temperature,
)

self.name = name
@@ -182,9 +185,7 @@ def __init__(
self.engine = engine
self.messages = messages

self.extra_args["temperature"] = temperature or 0

if max_tokens:
if max_tokens is not None:
self.extra_args["max_tokens"] = max(max_tokens, 5)

self.render_args = {}
@@ -268,6 +269,7 @@ class ModelGradedSpec:
engine: str | None = None
use_cot: bool | None = None
temperature: float | None = None
max_tokens: int | None = None


class LLMClassifier(OpenAILLMClassifier):
@@ -306,8 +308,8 @@ class LLMClassifier(OpenAILLMClassifier):
choice_scores: Mapping of choices to scores (e.g. `{"good": 1, "bad": 0}`)
model: Model to use. Defaults to DEFAULT_MODEL.
use_cot: Enable chain of thought reasoning. Defaults to True.
max_tokens: Maximum tokens to generate. Defaults to 512.
temperature: Controls randomness (0-1). Defaults to 0.
max_tokens: Maximum tokens to generate. If not specified, uses the model's default.
temperature: Controls randomness (0-1). If not specified, uses the model's default.
engine: Deprecated by OpenAI. Use model instead.
api_key: Deprecated. Use client instead.
base_url: Deprecated. Use client instead.
@@ -324,8 +326,8 @@ def __init__(
choice_scores,
model=DEFAULT_MODEL,
use_cot=True,
max_tokens=512,
temperature=0,
max_tokens=None,
temperature=None,
engine=None,
api_key=None,
base_url=None,
@@ -359,7 +361,19 @@ def __init__(

@classmethod
def from_spec(cls, name: str, spec: ModelGradedSpec, client: Client | None = None, **kwargs):
return cls(name, spec.prompt, spec.choice_scores, client=client, **kwargs)
spec_kwargs = {}
if spec.model is not None:
spec_kwargs["model"] = spec.model
if spec.engine is not None:
spec_kwargs["engine"] = spec.engine
if spec.use_cot is not None:
spec_kwargs["use_cot"] = spec.use_cot
if spec.temperature is not None:
spec_kwargs["temperature"] = spec.temperature
if spec.max_tokens is not None:
spec_kwargs["max_tokens"] = spec.max_tokens
# kwargs can override spec values
return cls(name, spec.prompt, spec.choice_scores, client=client, **spec_kwargs, **kwargs)

@classmethod
def from_spec_file(cls, name: str, path: str, client: Client | None = None, **kwargs):