diff --git a/evals/src/autoevals.eval.ts b/evals/src/autoevals.eval.ts
index 0d6f55b..8dc3dc3 100644
--- a/evals/src/autoevals.eval.ts
+++ b/evals/src/autoevals.eval.ts
@@ -15,7 +15,6 @@ import {
   DEFAULT_MODEL,
   Factuality,
   NumericDiff,
-  Score,
 } from "autoevals";
 
 const experimentNamePrefix = process.env.EXPERIMENT_NAME;
@@ -77,15 +76,13 @@ Eval("Autoevals", {
       tags: [...(tags ?? []), name],
     }));
   }),
-  task: async (input, hooks) => {
+  task: async (input) => {
     const { scorer, ...rest } = input;
-    let result: Score | null = null;
-    try {
-      result = await runScorerT(scorer, rest);
-    } catch (e) {
-      hooks.meta({ error: `${e}` });
+    const result = await runScorerT(scorer, rest);
+    if (result.score === null) {
+      throw new Error(`Scorer ${scorer} did not return a score.`);
     }
-    return result?.score ?? -1;
+    return result.score;
   },
   scores: [NumericDiff],
   experimentName: experimentNamePrefix ?? undefined,
diff --git a/js/llm.ts b/js/llm.ts
index d8f1644..c6f83d4 100644
--- a/js/llm.ts
+++ b/js/llm.ts
@@ -20,7 +20,7 @@ export type LLMArgs = {
   temperature?: number;
 } & OpenAIAuth;
 
-export const DEFAULT_MODEL = "gpt-4o";
+export const DEFAULT_MODEL = "gpt-5-mini";
 
 const PLAIN_RESPONSE_SCHEMA = {
   properties: {
@@ -217,7 +217,6 @@ export function LLMClassifierFromTemplate({
   const prompt =
     promptTemplate + "\n" + (useCoT ? COT_SUFFIX : NO_COT_SUFFIX);
 
-  const maxTokens = 512;
   const messages: ChatCompletionMessageParam[] = [
     {
       role: "user",
@@ -231,7 +230,6 @@ export function LLMClassifierFromTemplate({
     choiceScores,
     classificationTools: buildClassificationTools(useCoT, choiceStrings),
     model,
-    maxTokens,
     temperature,
     __choices: choiceStrings,
     ...runtimeArgs,