braintrustdata · ankrgyl · Aug 8, 2025
diff --git a/evals/src/autoevals.eval.ts b/evals/src/autoevals.eval.ts
@@ -15,7 +15,6 @@ import {
   DEFAULT_MODEL,
   Factuality,
   NumericDiff,
-  Score,
 } from "autoevals";
 
 const experimentNamePrefix = process.env.EXPERIMENT_NAME;
@@ -77,15 +76,13 @@ Eval("Autoevals", {
           tags: [...(tags ?? []), name],
         }));
     }),
-  task: async (input, hooks) => {
+  task: async (input) => {
     const { scorer, ...rest } = input;
-    let result: Score | null = null;
-    try {
-      result = await runScorerT(scorer, rest);
-    } catch (e) {
-      hooks.meta({ error: `${e}` });
+    const result = await runScorerT(scorer, rest);
+    if (result.score === null) {
+      throw new Error(`Scorer ${scorer} did not return a score.`);
     }
-    return result?.score ?? -1;
+    return result.score;
   },
   scores: [NumericDiff],
   experimentName: experimentNamePrefix ?? undefined,

diff --git a/js/llm.ts b/js/llm.ts
@@ -20,7 +20,7 @@ export type LLMArgs = {
   temperature?: number;
 } & OpenAIAuth;
 
-export const DEFAULT_MODEL = "gpt-4o";
+export const DEFAULT_MODEL = "gpt-5-mini";
 
 const PLAIN_RESPONSE_SCHEMA = {
   properties: {
@@ -217,7 +217,6 @@ export function LLMClassifierFromTemplate<RenderArgs>({
     const prompt =
       promptTemplate + "\n" + (useCoT ? COT_SUFFIX : NO_COT_SUFFIX);
 
-    const maxTokens = 512;
     const messages: ChatCompletionMessageParam[] = [
       {
         role: "user",
@@ -231,7 +230,6 @@ export function LLMClassifierFromTemplate<RenderArgs>({
       choiceScores,
       classificationTools: buildClassificationTools(useCoT, choiceStrings),
       model,
-      maxTokens,
       temperature,
       __choices: choiceStrings,
       ...runtimeArgs,