diff --git a/.gitignore b/.gitignore index d3eb440..5fd66e7 100644 --- a/.gitignore +++ b/.gitignore @@ -108,6 +108,7 @@ dist # Astro generated types and cache **/.astro/ +packages/docs/src/content/docs/api/ # vitepress build output **/.vitepress/dist diff --git a/README.md b/README.md index 0147560..ba4c00c 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,20 @@ Monorepo for the explicit-run `vitest-evals` shape: - `apps/demo-openai-agents`: end-to-end OpenAI Agents demo evals with app-local refund tools +## Reading Guide + +- Start with the public docs site for the guided setup path: + `https://vitest-evals.sentry.dev/docs` +- Use [packages/vitest-evals/README.md](packages/vitest-evals/README.md) for + the core package authoring model and examples. +- Use [docs/github-actions.md](docs/github-actions.md) when changing CI + reporting behavior. +- Use [docs/architecture.md](docs/architecture.md) and + [docs/development-guide.md](docs/development-guide.md) before changing package + boundaries or product shape. +- Follow [policies/docs-writing.md](policies/docs-writing.md) for docs copy, + hierarchy, and visual consistency. + ## Workspace Layout ```text diff --git a/packages/docs/astro.config.mjs b/packages/docs/astro.config.mjs index 688bd70..e78498a 100644 --- a/packages/docs/astro.config.mjs +++ b/packages/docs/astro.config.mjs @@ -1,52 +1,109 @@ import mdx from "@astrojs/mdx"; -import { defineConfig, fontProviders } from "astro/config"; -import rehypeAutolinkHeadings from "rehype-autolink-headings"; -import rehypeSlug from "rehype-slug"; +import starlight from "@astrojs/starlight"; +import { defineConfig } from "astro/config"; +import starlightTypeDoc, { typeDocSidebarGroup } from "starlight-typedoc"; +import { + monochromeCodeTheme, + vitestEvalsStarlightTheme, +} from "./src/theme/vitest-evals-starlight-theme.mjs"; export default defineConfig({ site: "https://vitest-evals.sentry.dev", - integrations: [mdx()], - markdown: { - shikiConfig: { - theme: "vitesse-black", - }, - rehypePlugins: [ - rehypeSlug, - [ - rehypeAutolinkHeadings, - { - behavior: "prepend", - properties: { className: ["heading-anchor"] }, - content: { type: "text", value: "#" }, - }, - ], - ], + devToolbar: { + enabled: false, }, - experimental: { - fonts: [ - { - name: "Geist Mono", - provider: fontProviders.local(), - cssVariable: "--font-geist-mono", - options: { - variants: [ + integrations: [ + starlight({ + title: "vitest-evals", + description: "Harness-backed AI testing on top of Vitest.", + pagination: false, + components: { + ThemeSelect: "./src/components/StarlightThemeSelect.astro", + }, + sidebar: [ + { + label: "Documentation", + items: [ + { label: "Overview", link: "/docs" }, { - weight: 400, - style: "normal", - src: [ - "./node_modules/geist/dist/fonts/geist-mono/GeistMono-Regular.woff2", + label: "Harnesses", + items: [ + { label: "Overview", link: "/docs/harnesses" }, + { label: "AI SDK", link: "/docs/harnesses/ai-sdk" }, + { + label: "OpenAI Agents", + link: "/docs/harnesses/openai-agents", + }, + { label: "Pi", link: "/docs/harnesses/pi-ai" }, + { + label: "Custom Harnesses", + link: "/docs/harnesses/custom", + }, ], }, { - weight: 600, - style: "normal", - src: [ - "./node_modules/geist/dist/fonts/geist-mono/GeistMono-SemiBold.woff2", + label: "Judges", + items: [ + { label: "Overview", link: "/docs/judges" }, + { label: "ToolCallJudge", link: "/docs/judges/tool-call" }, + { + label: "StructuredOutputJudge", + link: "/docs/judges/structured-output", + }, + { label: "Custom Judges", link: "/docs/judges/custom" }, ], }, + { label: "Tool Replay", link: "/docs/tool-replay" }, + { label: "GitHub Reporting", link: "/docs/github" }, ], }, - }, - ], + { + label: "API Reference", + items: [{ label: "Overview", link: "/api" }, typeDocSidebarGroup], + }, + ], + social: [ + { + icon: "github", + label: "GitHub", + href: "https://github.com/getsentry/vitest-evals", + }, + ], + plugins: [ + vitestEvalsStarlightTheme(), + starlightTypeDoc({ + entryPoints: ["../vitest-evals/src/index.ts"], + tsconfig: "../vitest-evals/tsconfig.json", + output: "api", + pagination: false, + sidebar: { + label: "Exports", + }, + typeDoc: { + disableSources: true, + entryPointStrategy: "resolve", + intentionallyNotExported: [ + "OutputField", + "JudgeAssertionArgs", + "JudgeAssertionHarness", + "JudgeAssertionInput", + "JudgeAssertionMetadata", + "JudgeAssertionOutput", + "JudgeAssertionParams", + "JudgeForReceived", + "HarnessInput", + "HarnessMetadataFor", + "HarnessOutput", + ], + }, + }), + ], + }), + mdx(), + ], + markdown: { + shikiConfig: { + theme: monochromeCodeTheme, + }, }, }); diff --git a/packages/docs/package.json b/packages/docs/package.json index fe6ab40..22c960c 100644 --- a/packages/docs/package.json +++ b/packages/docs/package.json @@ -4,21 +4,20 @@ "type": "module", "version": "0.0.1", "scripts": { - "api:generate": "typedoc --options typedoc.json", - "dev": "pnpm run api:generate && astro dev", - "build": "pnpm run api:generate && astro build", + "dev": "astro dev", + "build": "astro build", "preview": "astro preview" }, "dependencies": { - "@astrojs/mdx": "^4.0.0", + "@astrojs/mdx": "^5.0.6", + "@astrojs/starlight": "^0.39.2", "@vercel/functions": "^2.0.0", - "astro": "^5.0.0", - "geist": "^1.5.1", - "rehype-autolink-headings": "^7.1.0", - "rehype-slug": "^6.0.0", + "astro": "^6.3.5", + "starlight-typedoc": "^0.22.0", "vitest-evals": "workspace:*" }, "devDependencies": { - "typedoc": "^0.28.19" + "typedoc": "^0.28.19", + "typedoc-plugin-markdown": "^4.6.0" } } diff --git a/packages/docs/public/favicon.svg b/packages/docs/public/favicon.svg new file mode 100644 index 0000000..a57c629 --- /dev/null +++ b/packages/docs/public/favicon.svg @@ -0,0 +1,4 @@ + + + + diff --git a/packages/docs/public/llms.txt b/packages/docs/public/llms.txt index 7015618..8f52d79 100644 --- a/packages/docs/public/llms.txt +++ b/packages/docs/public/llms.txt @@ -12,13 +12,13 @@ judges and reporters. Canonical docs: - Overview: https://vitest-evals.sentry.dev/ -- Getting Started: https://vitest-evals.sentry.dev/docs +- Documentation: https://vitest-evals.sentry.dev/docs - API reference: https://vitest-evals.sentry.dev/api - Repository: https://github.com/getsentry/vitest-evals -The canonical Getting Started page walks through install, package scripts, -`vitest.evals.config.ts`, harness setup, eval authoring, running evals locally, -judges, tool replay, and GitHub Actions wiring. +The canonical documentation page routes readers to the right harness, then the +harness pages follow the same Paris example from app shape through harness +configuration, eval authoring, and judging. ## When to use this library @@ -65,7 +65,7 @@ Package purposes: tool calls, runtime tools, and custom entrypoints. - `@vitest-evals/harness-openai-agents`: adapter for OpenAI Agents SDK agents, runners, local tool capture, and replay metadata. -- `@vitest-evals/harness-pi-ai`: adapter for pi-ai agents and replay-capable +- `@vitest-evals/harness-pi-ai`: adapter for Pi agents and replay-capable tool execution. - `@vitest-evals/github-reporter`: implementation package behind the GitHub Action. Most users should consume the action as `getsentry/vitest-evals@v0`. @@ -128,36 +128,30 @@ rules. ```ts import { expect } from "vitest"; import { + createJudge, describeEval, - StructuredOutputJudge, - ToolCallJudge, - toolCalls, + type JudgeContext, } from "vitest-evals"; -import { refundHarness } from "./refundHarness"; +import { qaHarness } from "./qaHarness"; + +const CapitalJudge = createJudge( + "CapitalJudge", + async ({ output }: JudgeContext) => ({ + score: output.toLowerCase().includes("paris") ? 1 : 0, + }), +); describeEval( - "refund agent", + "capital questions", { - harness: refundHarness, - judges: [ - ToolCallJudge(), - StructuredOutputJudge(), - ], + harness: qaHarness, + judges: [CapitalJudge], }, (it) => { - it("approves a refundable invoice", async ({ run }) => { - const result = await run("Refund invoice inv_123", { - metadata: { - expected: { status: "approved" }, - expectedTools: ["lookupInvoice", "createRefund"], - }, - }); - - expect(result.output).toMatchObject({ status: "approved" }); - expect(toolCalls(result.session).map((call) => call.name)).toEqual([ - "lookupInvoice", - "createRefund", - ]); + it("knows the capital of France", async ({ run }) => { + const result = await run("What is the capital of France?"); + + expect(result.output).toContain("Paris"); }); }, ); @@ -170,34 +164,27 @@ criteria in `metadata`. ```ts describeEval( - "refund agent", + "capital questions", { - harness: refundHarness, - judges: [ToolCallJudge()], + harness: qaHarness, + judges: [CapitalJudge], }, (it) => { it.for([ { - name: "approves refundable invoice", - input: "Refund invoice inv_123", - expected: { status: "approved" }, - expectedTools: ["lookupInvoice", "createRefund"], + name: "France", + input: "What is the capital of France?", + expectedAnswer: "Paris", }, { - name: "denies non-refundable invoice", - input: "Refund invoice inv_404", - expected: { status: "denied" }, - expectedTools: ["lookupInvoice"], + name: "Japan", + input: "What is the capital of Japan?", + expectedAnswer: "Tokyo", }, ])("$name", async ({ input, ...metadata }, { run }) => { const result = await run(input, { metadata }); - expect(result.output).toMatchObject({ - status: metadata.expected.status, - }); - expect(toolCalls(result.session).map((call) => call.name)).toEqual( - metadata.expectedTools, - ); + expect(result.output).toContain(metadata.expectedAnswer); }); }, ); @@ -299,23 +286,18 @@ lightweight result and normalizes it into a full `HarnessRun`. ```ts import { createHarness } from "vitest-evals"; -export const refundHarness = createHarness< - string, - { status: "approved" | "denied"; invoiceId: string } ->({ - name: "refund-app", - run: async ({ input, metadata, setArtifact }) => { - const output = await runRefundFlow(input, metadata); +export const qaHarness = createHarness({ + name: "qa-app", + run: async ({ input, setArtifact }) => { + const output = await answerQuestion(input); - setArtifact("case", { invoiceId: output.invoiceId }); + setArtifact("question", { input }); return { output, - toolCalls: output.toolCalls, usage: { provider: "openai", - model: "gpt-4.1", - totalTokens: output.totalTokens, + model: "gpt-4o-mini", }, }; }, @@ -409,7 +391,7 @@ limits and provider-specific runner options. Key `toolReplay` by the OpenAI function tool name. Use `output` to parse `result.finalOutput` or project native structured output into the app-facing value. -### pi-ai +### Pi Import: @@ -417,16 +399,16 @@ Import: import { piAiHarness } from "@vitest-evals/harness-pi-ai"; ``` -Use it when the system under test uses pi-ai agents. The adapter captures +Use it when the system under test uses Pi agents. The adapter captures messages, inferred or configured toolsets, native tool calls, usage, and replay metadata for opt-in tools. -Compatible apps expose a pi-ai agent, a `toolset`, or a `run(input, runtime)` +Compatible apps expose a Pi agent, a `toolset`, or a `run(input, runtime)` entrypoint. Keep the native toolset on the agent when it is discoverable; pass `tools` only when the app hides the tool surface. Use `runtime.tools` for replay-aware execution, `runtime.events` to record assistant messages and usage, and return `{ output }` when tests should assert on a parsed domain object. -Key `toolReplay` by the pi-ai tool name. +Key `toolReplay` by the Pi tool name. ### Custom harness @@ -469,18 +451,14 @@ Create a deterministic judge: ```ts import { createJudge, type JudgeContext } from "vitest-evals"; -type RefundOutput = { status: "approved" | "denied" }; -type RefundMetadata = { expectedStatus: RefundOutput["status"] }; - -export const RefundStatusJudge = createJudge( - "RefundStatusJudge", - async ({ - output, - metadata, - }: JudgeContext) => ({ - score: output.status === metadata.expectedStatus ? 1 : 0, +export const CapitalJudge = createJudge( + "CapitalJudge", + async ({ output }: JudgeContext) => ({ + score: output.toLowerCase().includes("paris") ? 1 : 0, metadata: { - rationale: `Expected ${metadata.expectedStatus}, got ${output.status}`, + rationale: output.toLowerCase().includes("paris") + ? "The answer names Paris." + : `Expected Paris, got: ${output}`, }, }), ); @@ -490,10 +468,10 @@ Use suite-level judges when every `run(...)` in the suite should be judged: ```ts describeEval( - "refund agent", + "capital questions", { - harness: refundHarness, - judges: [RefundStatusJudge], + harness: qaHarness, + judges: [CapitalJudge], judgeThreshold: 1, }, (it) => { @@ -505,7 +483,7 @@ describeEval( Use explicit judge assertions when a single test needs a specific judge: ```ts -await expect(result).toSatisfyJudge(RefundStatusJudge); +await expect(result).toSatisfyJudge(CapitalJudge); ``` Set `judgeThreshold: null` or assertion threshold options when you want to diff --git a/packages/docs/src/components/DocsManual.astro b/packages/docs/src/components/DocsManual.astro deleted file mode 100644 index 9b53534..0000000 --- a/packages/docs/src/components/DocsManual.astro +++ /dev/null @@ -1,746 +0,0 @@ ---- -import DocsPageShell from "./DocsPageShell.astro"; -import PackageManagerTabs from "./PackageManagerTabs.astro"; -import Terminal from "./Terminal.astro"; -import { VITEST_EVALS_ACTION } from "../utils/version"; - -const packageScripts = `{ - "scripts": { - "evals": "vitest run --config vitest.evals.config.ts", - "evals:record": "VITEST_EVALS_REPLAY_MODE=record vitest run --config vitest.evals.config.ts", - "evals:strict": "VITEST_EVALS_REPLAY_MODE=strict vitest run --config vitest.evals.config.ts" - } -}`; - -const vitestConfig = `import { defineConfig } from "vitest/config"; - -export default defineConfig({ - test: { - include: ["evals/**/*.eval.ts"], - testTimeout: 30_000, - hookTimeout: 30_000, - reporters: ["vitest-evals/reporter"], - env: { - VITEST_EVALS_REPLAY_MODE: - process.env.VITEST_EVALS_REPLAY_MODE ?? "off", - VITEST_EVALS_REPLAY_DIR: ".vitest-evals/recordings", - }, - }, -});`; - -const runCommand = `pnpm evals`; - -const aiSdkAgent = `import { openai } from "@ai-sdk/openai"; -import { generateText, stepCountIs } from "ai"; -import type { AiSdkRuntime, AiSdkToolset } from "@vitest-evals/harness-ai-sdk"; - -export function createRefundAgent() { - return { - run: (input: string, runtime: AiSdkRuntime) => - generateText({ - model: openai("gpt-4o-mini"), - prompt: input, - tools: runtime.tools, - stopWhen: stepCountIs(5), - }), - }; -}`; - -const aiSdkHarness = `import { aiSdkHarness } from "@vitest-evals/harness-ai-sdk"; -import { createRefundAgent, parseRefundDecision } from "../src/refundAgent"; -import { lookupInvoice, lookupInvoiceSchema } from "../src/tools"; - -const tools = { - lookupInvoice: { - inputSchema: lookupInvoiceSchema, - execute: lookupInvoice, - }, -}; - -export const refundHarness = aiSdkHarness({ - agent: () => createRefundAgent(), - tools, - toolReplay: { - lookupInvoice: true, - }, - output: ({ result }) => parseRefundDecision(result.text), -});`; - -const openAiAgentsAgent = `import { Agent, tool } from "@openai/agents"; -import { z } from "zod"; - -const lookupInvoice = tool({ - name: "lookup_invoice", - description: "Look up invoice facts.", - parameters: z.object({ - invoiceId: z.string(), - }), - async execute({ invoiceId }) { - return fetchInvoice(invoiceId); - }, -}); - -export function createRefundAgent() { - return new Agent({ - name: "refund_agent", - instructions: "Return a JSON refund decision.", - tools: [lookupInvoice], - }); -}`; - -const openAiAgentsHarness = `import { Runner } from "@openai/agents"; -import { openaiAgentsHarness } from "@vitest-evals/harness-openai-agents"; -import { createRefundAgent, parseRefundDecision } from "../src/refundAgent"; - -export const refundHarness = openaiAgentsHarness({ - agent: () => createRefundAgent(), - runner: () => - new Runner({ - modelProvider, - tracingDisabled: true, - }), - runOptions: { - maxTurns: 5, - }, - toolReplay: { - lookup_invoice: true, - }, - output: ({ result }) => parseRefundDecision(result.finalOutput), -});`; - -const piAiAgent = `import type { PiAiRuntime, PiAiToolset } from "@vitest-evals/harness-pi-ai"; - -type RefundAgentOptions = { - instructions: string; - metadata?: Record; -}; - -export function createRefundAgent(options: RefundAgentOptions) { - return { - toolset: refundTools, - async run(input: string, runtime: PiAiRuntime) { - const response = await runPiAgent({ - input, - instructions: options.instructions, - metadata: options.metadata, - tools: runtime.tools, - }); - - runtime.events.assistant(response.text, { - provider: response.provider, - model: response.model, - totalTokens: response.totalTokens, - }); - - return { - output: parseRefundDecision(response.text), - }; - }, - }; -}`; - -const piAiHarness = `import { piAiHarness } from "@vitest-evals/harness-pi-ai"; -import { createRefundAgent } from "../src/refundAgent"; - -export const refundHarness = piAiHarness({ - agent: ({ input, context }) => - createRefundAgent({ - instructions: buildInstructions(input), - metadata: context.metadata, - }), - toolReplay: { - lookupInvoice: true, - }, -});`; - -const customAgent = `type RefundDecision = { - status: "approved" | "denied"; - invoiceId: string; - toolCalls: Array<{ - name: string; - arguments?: Record; - result?: Record; - }>; - totalTokens?: number; -}; - -export async function runRefundFlow(input: string): Promise { - const invoice = await lookupInvoice(input); - const shouldRefund = invoice.status === "paid" && invoice.daysOld < 30; - - return { - status: shouldRefund ? "approved" : "denied", - invoiceId: invoice.id, - toolCalls: [ - { - name: "lookupInvoice", - arguments: { query: input }, - result: invoice, - }, - ], - totalTokens: invoice.totalTokens, - }; -}`; - -const customHarness = `import { createHarness, type JsonValue } from "vitest-evals"; -import { runRefundFlow } from "../src/refundFlow"; - -type RefundDecision = { - status: "approved" | "denied"; - invoiceId: string; - toolCalls: Array<{ - name: string; - arguments?: Record; - result?: JsonValue; - }>; - totalTokens?: number; -}; - -type RefundMetadata = { - expected: { status: RefundDecision["status"] }; -}; - -export const refundHarness = createHarness< - string, - RefundDecision, - RefundMetadata ->({ - name: "refund-app", - run: async ({ input, setArtifact }) => { - const output = await runRefundFlow(input); - - setArtifact("invoice", { id: output.invoiceId }); - - return { - output, - toolCalls: output.toolCalls, - usage: { - provider: "openai", - model: "gpt-4o-mini", - totalTokens: output.totalTokens, - }, - }; - }, -});`; - -const evalSuite = `import { expect } from "vitest"; -import { - describeEval, - StructuredOutputJudge, - ToolCallJudge, - toolCalls, -} from "vitest-evals"; -import { refundHarness } from "./refundHarness"; - -describeEval( - "refund agent", - { - harness: refundHarness, - judges: [ - ToolCallJudge({ ordered: true }), - StructuredOutputJudge(), - ], - }, - (it) => { - it.for([ - { - name: "approves refundable invoice", - input: "Refund invoice inv_123", - expected: { status: "approved" }, - expectedTools: ["lookupInvoice", "createRefund"], - }, - { - name: "denies non-refundable invoice", - input: "Refund invoice inv_404", - expected: { status: "denied" }, - expectedTools: ["lookupInvoice"], - }, - ])("$name", async ({ input, ...metadata }, { run }) => { - const result = await run(input, { metadata }); - - expect(result.output).toMatchObject(metadata.expected); - expect(toolCalls(result.session).map((call) => call.name)).toEqual( - metadata.expectedTools, - ); - }); - }, -);`; - -const customJudge = `import { createJudge, type JudgeContext } from "vitest-evals"; - -type RefundOutput = { status: "approved" | "denied" }; -type RefundMetadata = { expected: { status: RefundOutput["status"] } }; - -export const RefundStatusJudge = createJudge( - "RefundStatusJudge", - async ({ - output, - metadata, - }: JudgeContext) => ({ - score: output.status === metadata.expected.status ? 1 : 0, - metadata: { - rationale: - "Expected " + metadata.expected.status + ", got " + output.status, - }, - }), -);`; - -const explicitJudge = `it("records a rubric score", async ({ run }) => { - const result = await run("Refund invoice inv_123", { - metadata: { - expected: { status: "approved" }, - }, - }); - - await expect(result).toSatisfyJudge(RefundStatusJudge, { - threshold: null, - }); -});`; - -const simpleReplayConfig = `export const refundHarness = aiSdkHarness({ - agent: () => createRefundAgent(), - tools, - toolReplay: { - webSearch: true, - lookupInvoice: true, - }, - output: ({ result }) => parseRefundDecision(result.text), -});`; - -const advancedReplayConfig = `export const refundHarness = aiSdkHarness({ - agent: () => createRefundAgent(), - tools, - toolReplay: { - lookupInvoice: { - version: "v1", - key: (args) => ({ - invoiceId: args.invoiceId, - }), - sanitize: (recording) => ({ - ...recording, - output: redactInvoice(recording.output), - }), - }, - }, - output: ({ result }) => parseRefundDecision(result.text), -});`; - -const githubWorkflow = `name: evals - -on: - pull_request: - push: - branches: - - main - -jobs: - evals: - runs-on: ubuntu-latest - permissions: - contents: read - checks: write - steps: - - uses: actions/checkout@v4 - - uses: pnpm/action-setup@v4 - - uses: actions/setup-node@v4 - with: - node-version: 24 - cache: pnpm - - run: pnpm install - - - name: Run evals - run: | - pnpm exec vitest run --config vitest.evals.config.ts \\ - --reporter=vitest-evals/reporter \\ - --reporter=json \\ - --outputFile.json=vitest-results.json - - - uses: ${VITEST_EVALS_ACTION} - if: always() - with: - results: vitest-results.json - publish-check: true - fail-on-failures: true`; - -const shardedWorkflow = `- uses: actions/upload-artifact@v4 - with: - name: vitest-evals-\${{ matrix.shard }} - path: vitest-results-\${{ matrix.shard }}.json - -- uses: actions/download-artifact@v4 - with: - pattern: vitest-evals-* - path: eval-results - merge-multiple: true - -- uses: ${VITEST_EVALS_ACTION} - with: - results: eval-results/*.json - publish-check: true - fail-on-failures: true`; ---- - - -

Getting Started

-

Getting Started

-

- Add vitest-evals to an existing Vitest project, adapt your agent - with a harness, write normal test bodies around run(input), and - publish the same results locally and in GitHub Actions. -

- -

Install

-

- Install the core package plus the harness that matches your app runtime. - Swap the harness package in the command if you use OpenAI Agents or pi-ai. -

- -
-
-
AI SDK
-
- @vitest-evals/harness-ai-sdk -
-
-
-
OpenAI Agents
-
- @vitest-evals/harness-openai-agents -
-
-
-
pi-ai
-
- @vitest-evals/harness-pi-ai -
-
-
- -

Configure Vitest

-

- Keep evals on their own command and Vitest config. The separate config keeps - longer provider timeouts, eval-only includes, reporter setup, and replay - defaults out of unit tests. -

- -

- Then add the eval config. The reporter line gives local runs the eval - summary, while GitHub later adds the JSON reporter beside it. -

- - -

Harness Setup

-

- A harness is the adapter between your app and the eval runner. It executes - the app once, returns typed output for assertions, and normalizes - transcript, tool calls, usage, artifacts, and errors for judges and reports. -

-

- The examples below show the production-facing agent shape first, then the - eval-only harness file that adapts it. In your project, keep the agent close - to application code and put harness files next to evals. -

-

- Start with the section for your runtime. Each adapter returns the same - eval-facing HarnessRun shape, so the eval suite you write later - stays mostly the same. -

- -
-

AI SDK

-

- Use the AI SDK harness when your production code already calls - generateText, streamText, or an agent wrapper around - them. The compatible shape is small: expose run(input, runtime) - or generate(input, runtime), and read tools from - runtime.tools so the harness can capture and replay local tool - calls. -

-
    -
  • - Pass the local AI SDK tools through tools; those are the calls - the harness can normalize for ToolCallJudge(). -
  • -
  • - Use output when your app returns a raw AI SDK result and tests - need a domain value such as {`{ status: "approved" }`}. -
  • -
  • - Omit output if your app entrypoint already returns - {`{ output }`} or a full HarnessRun. -
  • -
-

- The first file is the kind of production-facing agent shape the harness - expects. The second file is the eval-only adapter. -

- -

- The harness provides the replay-aware tool map, chooses which tools can be - recorded, and converts the AI SDK result into the output your tests assert - on. -

- -
- -
-

OpenAI Agents

-

- Use the OpenAI Agents harness when your app already owns an - Agent and runs it with a Runner. The harness keeps - that model intact: it creates or receives the agent and runner, calls - runner.run(agent, input, options), captures output items and - local function tool activity, then lets output select the typed - value for test assertions. -

-
    -
  • - Use agent and runner factories when each eval case - should get fresh state. -
  • -
  • - Put turn limits and provider-specific runner settings in - runOptions, not inside the test body. -
  • -
  • - Key toolReplay by the OpenAI function tool name, such as - lookup_invoice. -
  • -
  • - Use output to parse result.finalOutput or to - project a structured SDK output into the app-facing value. -
  • -
-

- The agent file stays close to normal OpenAI Agents code; the harness file - only describes how evals should run and normalize it. -

- -

- The harness keeps the runner setup in one place, applies run options for - every case, and parses the final output into your domain type. -

- -
- -
-

pi-ai

-

- Use the pi-ai harness when your app exposes a pi-ai agent, a - toolset, or an app entrypoint that can accept a harness runtime. - Compatible agents usually implement run(input, runtime). The - harness provides replay-aware tools through runtime.tools and a - lightweight event recorder through runtime.events. -

-
    -
  • - Keep your native toolset on the agent when the harness can discover it. - Pass tools only when the app hides the tool surface. -
  • -
  • - Call runtime.events.assistant(...) when the pi-ai flow returns - text, provider, model, or usage data that should appear in reports. -
  • -
  • - Return {`{ output }`} from run when tests should - assert on a parsed domain object instead of raw assistant text. -
  • -
  • - Key toolReplay by the pi-ai tool name, such as - lookupInvoice. -
  • -
-

- The agent file shows the minimum runtime contract. The harness file wires - that contract into eval execution. -

- -

- The harness can create the agent per case, pass metadata into your agent - factory, and opt individual pi-ai tools into replay. -

- -
- -
-

Custom Harness

-

- Use createHarness(...) when the first-party adapters do not fit. - This is the right choice for workflow engines, RAG pipelines, CLIs, or - service functions that can return JSON-safe output, messages, tool calls, or - usage without going through a supported SDK adapter. -

-
    -
  • - Type the harness as createHarness<Input, Output, Metadata> - so run(input, {`{ metadata }`}), result.output, - and judges stay typed. -
  • -
  • - Return output for ordinary Vitest assertions and - toolCalls for deterministic tool judges. -
  • -
  • - Use setArtifact for JSON-safe debug details that belong in - reports but not in the app output contract. -
  • -
  • - Return a full HarnessRun only when you need complete control - over messages, usage, timings, artifacts, and errors. -
  • -
-

- A custom harness can wrap a plain application function. Keep that function - focused on production behavior, then normalize only the eval-facing details - in the harness file. -

- -

- The core package normalizes this lightweight result into the same - HarnessRun shape used by the first-party adapters. -

- -
- -

Write Evals

-

- Bind one harness per suite. Put the user prompt or event in - input, expected values in metadata, and use ordinary - Vitest assertions on result.output. -

- - -

Run Evals

-

- At this point the eval suite should run locally with the dedicated Vitest - config. Use replay modes only when you want to record or enforce cached tool - calls. -

- - -

Judges

-

- Judges score the run that the harness already captured. Suite-level judges - run after each run(...); explicit judge assertions are useful for - one-off checks or rubric scores you want to record without failing. -

-
-
-
ToolCallJudge()
-
- Reads expectedTools from metadata or matcher options and - checks tool names, order, and arguments. -
-
-
-
StructuredOutputJudge()
-
- Reads expected from metadata or matcher options and checks - JSON-safe output fields. -
-
-
- -

- Use explicit judge assertions when a test needs an extra judge or when you - want to record a score without making that score fail the test. -

- - -

Tool Replay

-

- Replay lets evals keep testing real agent behavior without paying for every - external dependency on every run. Use it for local tool calls that are - expensive, slow, rate-limited, or unstable but still useful to preserve in - the eval trace: web search requests, retrieval calls, third-party APIs, - browser fetches, or internal service lookups. -

-

- Opt in per tool from the harness, sanitize anything sensitive or - high-cardinality, commit useful recordings alongside the evals, then choose - a mode with VITEST_EVALS_REPLAY_MODE. Keep live provider model - calls live unless your app exposes them as local tools. -

- - - - - - - - - - - - - - - - - - - - - - - - - - -
ModeBehavior
offCall live tools and do not read or write recordings.
recordCall live tools and overwrite recordings.
autoReplay when a recording exists; otherwise call live and record.
strictRequire an existing recording and fail when one is missing.
-

- Use a replay config object when the cache key needs to ignore unstable - values, when recordings need a version, or when outputs need redaction. -

- - -

GitHub Integration

-

- Emit Vitest JSON, then run the action with if: always(). The - action reads the JSON file and publishes summaries, annotations, and optional - Check Runs. -

- -

- For sharded evals, upload one JSON result per matrix job and publish once - from a reducer job. -

- -
- - diff --git a/packages/docs/src/components/DocsPageShell.astro b/packages/docs/src/components/DocsPageShell.astro deleted file mode 100644 index 55ccf80..0000000 --- a/packages/docs/src/components/DocsPageShell.astro +++ /dev/null @@ -1,25 +0,0 @@ ---- -interface Props { - links: Array<{ href: string; label: string; level?: number }>; -} - -const { links } = Astro.props; ---- - -
- -
- -
-
diff --git a/packages/docs/src/components/PackageManagerTabs.astro b/packages/docs/src/components/PackageManagerTabs.astro index b9ea233..b9406f5 100644 --- a/packages/docs/src/components/PackageManagerTabs.astro +++ b/packages/docs/src/components/PackageManagerTabs.astro @@ -7,6 +7,12 @@ interface Props { const { npm, pnpm, bun } = Astro.props; +const managers = [ + { id: "npm", command: npm }, + { id: "pnpm", command: pnpm }, + { id: "bun", command: bun }, +]; + function formatCommand(command: string) { return command .split("\n") @@ -15,17 +21,22 @@ function formatCommand(command: string) { } --- -
+
- - - + { + managers.map((manager, index) => ( + + )) + }
-
-    
diff --git a/packages/docs/src/components/ReportPreview.astro b/packages/docs/src/components/ReportPreview.astro index 8b25b89..82032c5 100644 --- a/packages/docs/src/components/ReportPreview.astro +++ b/packages/docs/src/components/ReportPreview.astro @@ -1,37 +1,33 @@
PASS - apps/refund/evals/refund.eval.ts + apps/qa/evals/capital.eval.ts
- approves refundable invoice - 1,062 tok · 2 tools · 5.0s + knows the capital of France + 142 tok · 1 judge · 1.8s
- ToolCallJudge + CapitalJudge 1.00
- StructuredOutputJudge - 1.00 -
-
- lookupInvoice - createRefund + expected + Paris
- {`{ "status": "approved", "refundId": "rf_inv_123" }`} + {`{ "output": "Paris is the capital of France." }`}