From 19fdec5f07b34b569e2241e81439f2f0e4f46be6 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 9 Jan 2026 13:48:50 +1100 Subject: [PATCH 1/9] Rename rubric description to expected_outcome Align CLI and rubric generator output with expected_outcome and add score-range rubric proposal + mapping analysis. --- apps/cli/src/commands/generate/rubrics.ts | 4 +- examples/features/rubric/evals/dataset.yaml | 41 +++++++++-- .../changes/add-rubric-score-ranges/design.md | 73 +++++++++++++++++++ .../add-rubric-score-ranges/proposal.md | 54 ++++++++++++++ .../specs/rubric-evaluator/spec.md | 44 +++++++++++ .../specs/yaml-schema/spec.md | 53 ++++++++++++++ .../changes/add-rubric-score-ranges/tasks.md | 26 +++++++ .../src/evaluation/evaluators/llm-judge.ts | 6 +- .../evaluation/generators/rubric-generator.ts | 6 +- .../evaluation/loaders/evaluator-parser.ts | 4 +- packages/core/src/evaluation/types.ts | 2 +- packages/core/src/evaluation/yaml-parser.ts | 6 +- 12 files changed, 300 insertions(+), 19 deletions(-) create mode 100644 openspec/changes/add-rubric-score-ranges/design.md create mode 100644 openspec/changes/add-rubric-score-ranges/proposal.md create mode 100644 openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md create mode 100644 openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md create mode 100644 openspec/changes/add-rubric-score-ranges/tasks.md diff --git a/apps/cli/src/commands/generate/rubrics.ts b/apps/cli/src/commands/generate/rubrics.ts index 3f2993f6..c2807195 100644 --- a/apps/cli/src/commands/generate/rubrics.ts +++ b/apps/cli/src/commands/generate/rubrics.ts @@ -160,9 +160,9 @@ export async function generateRubricsCommand(options: GenerateRubricsOptions): P caseNode.set( 'rubrics', rubrics.map( - (r: { id: string; description: string; weight: number; required: boolean }) => ({ + (r: { id: string; expected_outcome: string; weight: number; required: boolean }) => ({ id: r.id, - description: r.description, + 
expected_outcome: r.expected_outcome, weight: r.weight, required: r.required, }), diff --git a/examples/features/rubric/evals/dataset.yaml b/examples/features/rubric/evals/dataset.yaml index b9b764f9..87837649 100644 --- a/examples/features/rubric/evals/dataset.yaml +++ b/examples/features/rubric/evals/dataset.yaml @@ -71,27 +71,27 @@ evalcases: # Detailed rubric objects with weights and required flags rubrics: - id: structure - description: Has clear headings and organization + expected_outcome: Has clear headings and organization weight: 1.0 required: true - id: success-codes - description: Covers 2xx success codes with examples + expected_outcome: Covers 2xx success codes with examples weight: 2.0 required: true - id: client-errors - description: Explains 4xx client error codes + expected_outcome: Explains 4xx client error codes weight: 2.0 required: true - id: server-errors - description: Explains 5xx server error codes + expected_outcome: Explains 5xx server error codes weight: 1.5 required: false - id: practical-examples - description: Includes practical use case examples + expected_outcome: Includes practical use case examples weight: 1.0 required: false @@ -177,3 +177,34 @@ evalcases: # No rubrics defined - will use default llm_judge evaluator # To generate rubrics: agentv generate rubrics evals/rubric-examples.yaml + + # ========================================== + # Example 5: Score-range rubrics (PROPOSED) + # Demonstrates: proposed 0–10 score_range rubrics with expected_outcome per range + # Status: This example matches the OpenSpec change proposal `add-rubric-score-ranges`. + # It is NOT supported by the current runtime until that change is implemented. + # ========================================== + - id: correctness-score-range-proposed + + expected_outcome: |- + Answer the question correctly and completely. + + input_messages: + - role: user + content: What is 15 + 27? 
+ + expected_messages: + - role: assistant + content: 42 + + # Proposed polymorphic `rubrics` entries. + # Each entry defines an inclusive integer score range (0..10) with a concrete expected outcome. + rubrics: + - score_range: [0, 2] + expected_outcome: Incorrect or nonsensical answer. + - score_range: [3, 6] + expected_outcome: Partially correct but has clear errors or missing reasoning. + - score_range: [7, 9] + expected_outcome: Correct answer with minor issues (e.g., unclear explanation). + - score_range: [10, 10] + expected_outcome: Fully correct and clear. diff --git a/openspec/changes/add-rubric-score-ranges/design.md b/openspec/changes/add-rubric-score-ranges/design.md new file mode 100644 index 00000000..5c9764db --- /dev/null +++ b/openspec/changes/add-rubric-score-ranges/design.md @@ -0,0 +1,73 @@ +## Context +AgentV currently supports rubric-based evaluation by converting `rubrics` into `llm_judge` checklist items. The judge returns per-item `satisfied: boolean` checks and the runtime computes a weighted fraction score in 0..1. + +External best practice (DeepEval/Confident AI) adds an additional pattern: **score-range rubrics**, where the judge chooses an integer score in 0..10 constrained by explicit ranges with concrete expected outcomes, then the framework normalizes to 0..1. + +## Decision +Keep a **single rubric system** by extending the existing `rubrics` field to support an additional rubric entry shape for **score-range scoring**, without removing the existing checklist scoring. + +This change also includes a **breaking rename** for checklist rubrics: `description` → `expected_outcome`. + +### Proposed YAML Shape +```yaml +evaluators: + - name: correctness + type: llm_judge + rubrics: + - score_range: [0, 2] + expected_outcome: Factually incorrect. + - score_range: [3, 6] + expected_outcome: Mostly correct but includes notable errors or omissions. + - score_range: [7, 9] + expected_outcome: Correct with minor missing details. 
+ - score_range: [10, 10] + expected_outcome: Fully correct and complete. +``` + +### Output Contract +- Judge returns `score` as an **integer** in `0..10`. +- AgentV normalizes to `0..1` by dividing by 10. +- Preserve existing verdict thresholds (`>=0.8 pass`, `>=0.6 borderline`, else fail) and required-item behavior for checklist mode. + +## Validation Rules +- Ranges are inclusive integer bounds. +- Bounds must be within 0..10. +- No overlap. +- Must cover 0..10 inclusive. +- Each range must have non-empty `expected_outcome`. + +## Backwards Compatibility +- If `rubrics` contains checklist entries, existing checklist behavior remains the default. +- If `rubrics` contains score-range entries, range scoring is used. +- To avoid ambiguous mixing, the proposal treats `rubrics` as **either all checklist or all score-range entries**. + +### Migration +- Replace checklist rubric object field `description:` with `expected_outcome:`. + +## Open Questions +- Should AgentV allow gaps (e.g., reserve 0 for “unscorable”), or strictly require full coverage? (Proposal defaults to strict full coverage to match the cited best practice.) +- Should mixed `rubrics` (checklist + score-range) be allowed, and if so how to combine them? (Proposal: disallow mixing for simplicity and determinism.) + +## Deterministic Mapping to Checklist (Weighted-Average) Rubrics + +### Can score-range rubrics be deterministically mapped to the existing weighted-average system? +Not in a semantics-preserving way. + +Score-range rubrics define a *single holistic ordinal grade* (an integer 0..10) with an expected outcome per interval. +Checklist rubrics define *multiple independent criteria* with per-criterion weights and required flags, and compute a weighted fraction. + +Because the score-range system does not provide per-criterion truth values (or even a breakdown of which expectations were met), there is no deterministic transformation from a range choice into a unique checklist satisfaction vector. 
+Any mapping from range → checklist would require adding assumptions (e.g., “a 7 implies all requirements A/B/C are satisfied”), which is equivalent to inventing extra semantics not present in the input. + +### Can checklist rubrics be deterministically mapped to score-range rubrics? +Only in a lossy, wrapper-style way. + +Given checklist results, AgentV can deterministically compute a normalized score $s \in [0,1]$ and then map it to a raw integer $r = \mathrm{round}(10s)$ (or $\lfloor 10s \rfloor$, etc.). +But that does not recreate the score-range *rubric definition* (expected outcomes per bucket), and it does not provide the core value of range rubrics: constraining the judge with explicit outcome descriptions per range. + +### Conclusion +The two rubric modes are not redundant: +- Checklist rubrics are best for requirement-driven grading (decomposable criteria, required flags, deterministic scoring). +- Score-range rubrics are best for holistic grading where the evaluator needs explicit outcome descriptions per band. + +The most practical unification is at the interface level: treat both as rubric-driven evaluators that produce a normalized $[0,1]$ score and a verdict, but keep both scoring modes as first-class options rather than making one a wrapper for the other. diff --git a/openspec/changes/add-rubric-score-ranges/proposal.md b/openspec/changes/add-rubric-score-ranges/proposal.md new file mode 100644 index 00000000..50f26d44 --- /dev/null +++ b/openspec/changes/add-rubric-score-ranges/proposal.md @@ -0,0 +1,54 @@ +# Change: Add 0–10 score-range rubrics for LLM judging + +## Why +AgentV’s current rubric support in `llm_judge` is a **binary checklist** (each rubric item is `satisfied: true|false` and the score is computed as a weighted fraction). This is great for requirements-style grading, but it does **not** support the “confine the judge into explicit score ranges” pattern used by common LLM-evals tooling. 
+ +Best-practice literature for LLM-as-a-judge rubric scoring (e.g., DeepEval/Confident AI) recommends: +- A **0–10 integer scale** (more reliable than floats for LLMs) +- Explicit **non-overlapping** `score_range` definitions +- Clear **expected outcomes per range**, not vague labels +- **Normalization to 0–1** for downstream aggregation + +Adding this as an **optional, backwards-compatible** scoring mode gives AgentV users a deterministic way to express custom metrics while keeping existing rubrics intact. + +## What Changes +- Extend the existing `rubrics` concept to support **two rubric shapes** under a single field: + - **Checklist rubrics** (breaking rename): `{ id, expected_outcome, weight, required }` + - **Score-range rubrics** (new, optional): `{ score_range: [start, end], expected_outcome }` over **0–10 inclusive** + + This keeps a single rubric system and a single evaluator implementation while covering both use cases. + +- When the evaluator is configured with score-range rubrics, it: + - Constrains the judge to output an integer **raw score 0–10** + - Normalizes to **0–1** (divide by 10) for the existing `EvaluationScore.score` +- Add validation rules: + - Ranges MUST be integers within **0..10** + - Ranges MUST NOT overlap + - Ranges MUST cover **0..10** (inclusive) + - Each range MUST include a non-empty `expected_outcome` +- Preserve the current behavior: + - Existing `llm_judge` freeform scoring (0–1) unchanged + - Existing `llm_judge` rubric checklist scoring logic unchanged (only the field name changes) + +## Breaking Changes +- **BREAKING**: Rename checklist rubric field `description` → `expected_outcome`. + - YAML before: + - `rubrics: [{ id: "x", description: "...", weight: 1, required: true }]` + - YAML after: + - `rubrics: [{ id: "x", expected_outcome: "...", weight: 1, required: true }]` + - CLI `generate rubrics` output changes accordingly. + +## Impact +- Affected specs: `rubric-evaluator`, `yaml-schema`. 
+- Affected code (expected): + - `packages/core/src/evaluation/types.ts` (new config/type) + - `packages/core/src/evaluation/yaml-parser.ts` (parsing inline config) + - `packages/core/src/evaluation/loaders/evaluator-parser.ts` (validation) + - `packages/core/src/evaluation/evaluators/llm-judge.ts` (prompt + scoring) + - `packages/core/src/evaluation/validation/*` (range validation helper) + - Tests under `packages/core/test/**` + +## Non-Goals +- Do not replace checklist rubrics. +- Do not change `EvaluationScore.score` away from 0–1. +- Do not add new CLI UX beyond schema support (future enhancement could generate range rubrics). diff --git a/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md b/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md new file mode 100644 index 00000000..3d0b2d48 --- /dev/null +++ b/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md @@ -0,0 +1,44 @@ +## MODIFIED Requirements + +### Requirement: Static Rubric Evaluation MUST support checklist and score-range rubrics +The evaluator SHALL support rubric-based grading using a single `rubrics` field in one of two shapes: + +1) **Checklist rubrics** (BREAKING rename): per-item boolean checks with weighted aggregation, using `expected_outcome` (formerly `description`). +2) **Score-range rubrics** (new, optional): a set of non-overlapping integer score ranges over 0–10 inclusive, each with an explicit `expected_outcome`. + +If score-range rubrics are configured, the evaluator SHALL instruct the judge to output a **single integer score** in 0..10 and then normalize it to 0..1 for the reported evaluation score. + +The system SHALL reject ambiguous configurations where `rubrics` mixes checklist and score-range entries. 
+ +#### Scenario: Checklist rubrics continue to work +- **GIVEN** an eval case with `rubrics` (id/description/weight/required) +- **WHEN** the rubric evaluator runs +- **THEN** it SHALL grade using per-item boolean checks +- **AND** the reported score SHALL be in 0..1 + +#### Scenario: Range rubrics constrain scoring +- **GIVEN** an eval case with `rubrics` consisting of multiple `score_range` entries and `expected_outcome` text +- **WHEN** the rubric evaluator runs +- **THEN** the judge SHALL be constrained to output an integer score in 0..10 +- **AND** the system SHALL normalize the score to 0..1 by dividing by 10 + +#### Scenario: Invalid range rubrics are rejected +- **GIVEN** a `score_rubric` with overlapping ranges or missing coverage of 0..10 +- **WHEN** the eval suite is loaded +- **THEN** validation SHALL fail +- **AND** the error message SHALL indicate the violated rule (overlap, bounds, or coverage) + +### Requirement: Structured Grading MUST produce validated results +The evaluator SHALL validate judge output against a schema appropriate to the configured mode. + +#### Scenario: Range rubric output schema +- **GIVEN** a range-rubric configuration +- **WHEN** the judge responds +- **THEN** the evaluator SHALL accept a JSON object matching: +```typescript +z.object({ + score: z.number().int().min(0).max(10), + reasoning: z.string().optional(), +}) +``` +- **AND** AgentV SHALL normalize `score / 10` into the standard 0..1 result. diff --git a/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md b/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md new file mode 100644 index 00000000..948d8f0c --- /dev/null +++ b/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md @@ -0,0 +1,53 @@ +## ADDED Requirements + +### Requirement: Checklist rubric field name MUST be `expected_outcome` +The YAML schema SHALL accept checklist rubric objects using `expected_outcome` (replacing the legacy `description`). 
+ +#### Scenario: Checklist rubric uses expected_outcome +- **GIVEN** a YAML eval case with: +```yaml +rubrics: + - id: structure + expected_outcome: Has clear headings and organization + weight: 1.0 + required: true +``` +- **WHEN** the YAML is parsed +- **THEN** schema validation succeeds + +### Requirement: Score-range rubrics MUST be supported for LLM judging +The YAML schema SHALL support configuring score-range rubrics for `llm_judge` evaluators via the existing `rubrics` field. + +#### Scenario: Configure score_rubric +- **GIVEN** a YAML eval case with: +```yaml +evaluators: + - name: correctness + type: llm_judge + rubrics: + - score_range: [0, 2] + expected_outcome: Factually incorrect. + - score_range: [3, 6] + expected_outcome: Mostly correct. + - score_range: [7, 9] + expected_outcome: Correct but missing minor details. + - score_range: [10, 10] + expected_outcome: Fully correct. +``` +- **WHEN** the YAML is parsed +- **THEN** the evaluator configuration SHALL include the provided score ranges + +#### Scenario: Reject overlapping score ranges +- **GIVEN** a YAML eval case with overlapping ranges +- **WHEN** the YAML is parsed +- **THEN** schema validation SHALL fail + +#### Scenario: Reject incomplete 0..10 coverage +- **GIVEN** a YAML eval case where score ranges do not cover 0..10 inclusive +- **WHEN** the YAML is parsed +- **THEN** schema validation SHALL fail + +#### Scenario: Reject empty expected_outcome +- **GIVEN** a YAML eval case where a range rubric entry has an empty `expected_outcome` +- **WHEN** the YAML is parsed +- **THEN** schema validation SHALL fail diff --git a/openspec/changes/add-rubric-score-ranges/tasks.md b/openspec/changes/add-rubric-score-ranges/tasks.md new file mode 100644 index 00000000..bd86dfb2 --- /dev/null +++ b/openspec/changes/add-rubric-score-ranges/tasks.md @@ -0,0 +1,26 @@ +## 1. 
Schema & Types +- [ ] 1.1 Add `ScoreRangeRubric` types (0–10 integer ranges) to core evaluation types +- [ ] 1.2 Extend evaluator config to accept optional `score_rubric` (or `score_ranges`) field + +## 2. Validation +- [ ] 2.1 Validate ranges are integers within 0..10 and start <= end +- [ ] 2.2 Validate non-overlap across ranges +- [ ] 2.3 Validate full coverage of 0..10 inclusive +- [ ] 2.4 Validate each range has non-empty `expected_outcome` + +## 3. LLM Judge Integration +- [ ] 3.1 Add prompt template for range-rubric scoring that requests integer `score` 0..10 +- [ ] 3.2 Normalize final score to 0..1 (divide by 10) and keep existing verdict logic +- [ ] 3.3 Store raw 0–10 score in `details` (or `evaluatorRawRequest/Response`) for debugging + +## 4. YAML Support +- [ ] 4.1 Support `score_rubric` in YAML evaluator config (snake_case) +- [ ] 4.2 Decide if inline `rubrics:` sugar can support range rubrics (or keep evaluator-only) + +## 5. Tests +- [ ] 5.1 Unit tests for validation (overlap, gaps, bounds) +- [ ] 5.2 Unit/integration tests for llm_judge parsing + normalization + +## 6. Docs +- [ ] 6.1 Update rubric-evaluator skill/reference docs to include range rubrics +- [ ] 6.2 Add examples of good/bad range definitions diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts index ab8a5dd5..fb643e8b 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge.ts @@ -231,7 +231,7 @@ export class LlmJudgeEvaluator implements Evaluator { for (const rubric of rubrics) { const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; const weightLabel = rubric.weight !== 1.0 ? 
` (weight: ${rubric.weight})` : ''; - parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`); + parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`); } parts.push('', 'For each rubric, determine if it is satisfied and provide brief reasoning.'); @@ -353,9 +353,9 @@ function calculateRubricScore( if (check.satisfied) { earnedWeight += rubric.weight; - hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`); + hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`); } else { - misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`); + misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`); if (rubric.required) { failedRequired = true; } diff --git a/packages/core/src/evaluation/generators/rubric-generator.ts b/packages/core/src/evaluation/generators/rubric-generator.ts index 54ee01c3..44cb54e2 100644 --- a/packages/core/src/evaluation/generators/rubric-generator.ts +++ b/packages/core/src/evaluation/generators/rubric-generator.ts @@ -6,7 +6,7 @@ import type { RubricItem } from '../types.js'; const rubricItemSchema = z.object({ id: z.string().describe('Short identifier for this rubric (e.g., clarity, completeness)'), - description: z.string().describe('What this rubric checks for'), + expected_outcome: z.string().describe('Concrete expected outcome for this rubric item'), weight: z.number().default(1.0).describe('Relative importance (default 1.0)'), required: z.boolean().default(true).describe('Whether this is a mandatory requirement'), }); @@ -43,7 +43,7 @@ You must return a valid JSON object matching this schema: "rubrics": [ { "id": "string (short identifier)", - "description": "string (what to check)", + "expected_outcome": "string (concrete expected outcome for this rubric item)", "weight": number (default 1.0), "required": boolean (default true) } @@ -86,7 +86,7 @@ function buildPrompt(expectedOutcome: string, question?: string, 
referenceAnswer 'Each rubric should:', '- Be specific and testable', '- Have a short, descriptive ID', - '- Include a clear description of what to check', + '- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)', '- Indicate if it is required (mandatory) or optional', '- Have an appropriate weight (default 1.0, use higher values for more important aspects)', '', diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index e5cfeb4c..54f45738 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -554,11 +554,11 @@ export async function parseEvaluators( .filter((r): r is JsonObject => isJsonObject(r)) .map((rubric, index) => ({ id: asString(rubric.id) ?? `rubric-${index + 1}`, - description: asString(rubric.description) ?? '', + expected_outcome: asString(rubric.expected_outcome) ?? '', weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0, required: typeof rubric.required === 'boolean' ? 
rubric.required : true, })) - .filter((r) => r.description.length > 0) + .filter((r) => r.expected_outcome.length > 0) : undefined; if (typeValue === 'rubric') { diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index c7f571f3..d5e3c3b1 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -202,7 +202,7 @@ export type LlmJudgeEvaluatorConfig = { export type RubricItem = { readonly id: string; - readonly description: string; + readonly expected_outcome: string; readonly weight: number; readonly required: boolean; }; diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 7633c4d5..98d5725d 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -224,19 +224,19 @@ export async function loadEvalCases( if (typeof rubric === 'string') { return { id: `rubric-${index + 1}`, - description: rubric, + expected_outcome: rubric, weight: 1.0, required: true, }; } return { id: asString(rubric.id) ?? `rubric-${index + 1}`, - description: asString(rubric.description) ?? '', + expected_outcome: asString(rubric.expected_outcome) ?? '', weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0, required: typeof rubric.required === 'boolean' ? 
rubric.required : true, }; }) - .filter((r) => r.description.length > 0); + .filter((r) => r.expected_outcome.length > 0); if (rubricItems.length > 0) { const rubricEvaluator: import('./types.js').LlmJudgeEvaluatorConfig = { From 129f5b05fb7e8eae85948a694cc6b6bde74c87b0 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 9 Jan 2026 13:49:59 +1100 Subject: [PATCH 2/9] Format package.json arrays for Biome --- apps/cli/package.json | 5 +---- packages/core/package.json | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/apps/cli/package.json b/apps/cli/package.json index c2392d0d..b440ea4f 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -14,10 +14,7 @@ "bin": { "agentv": "./dist/cli.js" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "scripts": { "dev": "bun --watch src/index.ts", "build": "tsup && bun run copy-readme", diff --git a/packages/core/package.json b/packages/core/package.json index 18b90569..94210544 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -36,10 +36,7 @@ "test:watch": "bun test --watch", "diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts" }, - "files": [ - "dist", - "README.md" - ], + "files": ["dist", "README.md"], "dependencies": { "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78", From 1db89fbdcea1a1894db32b8b3fe2e6aff445e677 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 9 Jan 2026 14:23:45 +1100 Subject: [PATCH 3/9] Amend score-range proposal: required_min_score + per-criterion score_ranges --- .../changes/add-rubric-score-ranges/design.md | 47 +++++++++++-------- .../add-rubric-score-ranges/proposal.md | 29 +++++++----- .../specs/rubric-evaluator/spec.md | 26 +++++----- .../specs/yaml-schema/spec.md | 41 +++++++++++----- .../changes/add-rubric-score-ranges/tasks.md | 22 +++++---- 5 files changed, 101 insertions(+), 64 deletions(-) diff --git a/openspec/changes/add-rubric-score-ranges/design.md 
b/openspec/changes/add-rubric-score-ranges/design.md index 5c9764db..5210937b 100644 --- a/openspec/changes/add-rubric-score-ranges/design.md +++ b/openspec/changes/add-rubric-score-ranges/design.md @@ -4,42 +4,51 @@ AgentV currently supports rubric-based evaluation by converting `rubrics` into ` External best practice (DeepEval/Confident AI) adds an additional pattern: **score-range rubrics**, where the judge chooses an integer score in 0..10 constrained by explicit ranges with concrete expected outcomes, then the framework normalizes to 0..1. ## Decision -Keep a **single rubric system** by extending the existing `rubrics` field to support an additional rubric entry shape for **score-range scoring**, without removing the existing checklist scoring. +Evolve to a **single rubric system** that supports both "DeepEval-style" banded scoring and multi-criterion weighted scoring by introducing **per-criterion score ranges**. + +Each rubric criterion keeps an `id` (and optional `weight`), but can optionally include `score_ranges` that define non-overlapping 0–10 bands with concrete expected outcomes. The judge returns an integer score 0..10 per criterion; the runtime normalizes each to 0..1 and aggregates deterministically. This change also includes a **breaking rename** for checklist rubrics: `description` → `expected_outcome`. +The existing `required: boolean` is replaced (in the proposed primary shape) by `required_min_score: int` gating. `required` remains accepted as a deprecated alias during migration. + ### Proposed YAML Shape ```yaml evaluators: - name: correctness type: llm_judge rubrics: - - score_range: [0, 2] - expected_outcome: Factually incorrect. - - score_range: [3, 6] - expected_outcome: Mostly correct but includes notable errors or omissions. - - score_range: [7, 9] - expected_outcome: Correct with minor missing details. - - score_range: [10, 10] - expected_outcome: Fully correct and complete. 
+ - id: correctness + weight: 1.0 + required_min_score: 10 + score_ranges: + - score_range: [0, 2] + expected_outcome: Factually incorrect. + - score_range: [3, 6] + expected_outcome: Mostly correct but includes notable errors or omissions. + - score_range: [7, 9] + expected_outcome: Correct with minor missing details. + - score_range: [10, 10] + expected_outcome: Fully correct and complete. ``` ### Output Contract -- Judge returns `score` as an **integer** in `0..10`. -- AgentV normalizes to `0..1` by dividing by 10. -- Preserve existing verdict thresholds (`>=0.8 pass`, `>=0.6 borderline`, else fail) and required-item behavior for checklist mode. +- Judge returns a **per-criterion** `score` as an integer in `0..10` for each rubric `id`. +- AgentV normalizes each to `0..1` by dividing by 10 and aggregates deterministically (weighted average). +- If any criterion has `required_min_score` and the returned score is below it, the verdict is forced to `fail`. +- Preserve existing verdict thresholds (`>=0.8 pass`, `>=0.6 borderline`, else fail). ## Validation Rules - Ranges are inclusive integer bounds. - Bounds must be within 0..10. -- No overlap. -- Must cover 0..10 inclusive. +- No overlap (within a given rubric criterion). +- Prefer full coverage of 0..10 inclusive (strict coverage recommended for determinism). - Each range must have non-empty `expected_outcome`. ## Backwards Compatibility -- If `rubrics` contains checklist entries, existing checklist behavior remains the default. -- If `rubrics` contains score-range entries, range scoring is used. -- To avoid ambiguous mixing, the proposal treats `rubrics` as **either all checklist or all score-range entries**. +- Existing checklist rubrics remain supported during migration. +- `required` is treated as a deprecated alias for `required_min_score: 10`. +- New rubric criteria may include `score_ranges` for banded 0–10 scoring. 
### Migration - Replace checklist rubric object field `description:` with `expected_outcome:`. @@ -53,8 +62,8 @@ evaluators: ### Can score-range rubrics be deterministically mapped to the existing weighted-average system? Not in a semantics-preserving way. -Score-range rubrics define a *single holistic ordinal grade* (an integer 0..10) with an expected outcome per interval. -Checklist rubrics define *multiple independent criteria* with per-criterion weights and required flags, and compute a weighted fraction. +Holistic score-range rubrics define a *single ordinal grade* (an integer 0..10) with an expected outcome per interval. +Checklist rubrics define *multiple independent criteria* with per-criterion weights and gating, and compute a weighted fraction. Because the score-range system does not provide per-criterion truth values (or even a breakdown of which expectations were met), there is no deterministic transformation from a range choice into a unique checklist satisfaction vector. Any mapping from range → checklist would require adding assumptions (e.g., “a 7 implies all requirements A/B/C are satisfied”), which is equivalent to inventing extra semantics not present in the input. diff --git a/openspec/changes/add-rubric-score-ranges/proposal.md b/openspec/changes/add-rubric-score-ranges/proposal.md index 50f26d44..e66ebd81 100644 --- a/openspec/changes/add-rubric-score-ranges/proposal.md +++ b/openspec/changes/add-rubric-score-ranges/proposal.md @@ -12,23 +12,23 @@ Best-practice literature for LLM-as-a-judge rubric scoring (e.g., DeepEval/Confi Adding this as an **optional, backwards-compatible** scoring mode gives AgentV users a deterministic way to express custom metrics while keeping existing rubrics intact. 
## What Changes -- Extend the existing `rubrics` concept to support **two rubric shapes** under a single field: - - **Checklist rubrics** (breaking rename): `{ id, expected_outcome, weight, required }` - - **Score-range rubrics** (new, optional): `{ score_range: [start, end], expected_outcome }` over **0–10 inclusive** +- Extend the existing `rubrics` concept to support **per-criterion score ranges** (analytic rubric scoring): + - Each rubric entry represents a criterion with an `id` and optional aggregation `weight`. + - Each criterion can include `score_ranges` (0–10 inclusive integer bands) with explicit `expected_outcome` text. + - The judge returns an integer score **0–10 per criterion**, which AgentV normalizes to **0–1** (divide by 10) and aggregates (weighted average). - This keeps a single rubric system and a single evaluator implementation while covering both use cases. +- Replace `required: boolean` with `required_min_score: int` (0–10) for gating. + - If a criterion has `required_min_score`, the overall verdict MUST be `fail` when the criterion score is below that threshold. 
-- When the evaluator is configured with score-range rubrics, it: - - Constrains the judge to output an integer **raw score 0–10** - - Normalizes to **0–1** (divide by 10) for the existing `EvaluationScore.score` -- Add validation rules: +- Add validation rules (for per-criterion score ranges): - Ranges MUST be integers within **0..10** - - Ranges MUST NOT overlap - - Ranges MUST cover **0..10** (inclusive) + - Ranges MUST NOT overlap within a criterion + - Ranges SHOULD cover **0..10** (inclusive) within a criterion (strict coverage is preferred for determinism) - Each range MUST include a non-empty `expected_outcome` -- Preserve the current behavior: - - Existing `llm_judge` freeform scoring (0–1) unchanged - - Existing `llm_judge` rubric checklist scoring logic unchanged (only the field name changes) + +- Backwards compatibility: + - Existing checklist rubrics remain supported during migration. + - `required` is treated as a deprecated alias for `required_min_score: 10`. ## Breaking Changes - **BREAKING**: Rename checklist rubric field `description` → `expected_outcome`. @@ -38,6 +38,9 @@ Adding this as an **optional, backwards-compatible** scoring mode gives AgentV u - `rubrics: [{ id: "x", expected_outcome: "...", weight: 1, required: true }]` - CLI `generate rubrics` output changes accordingly. +- **BREAKING (proposed new primary shape)**: Prefer `required_min_score` over `required`. + - `required` remains accepted as a deprecated alias during migration. + ## Impact - Affected specs: `rubric-evaluator`, `yaml-schema`. 
- Affected code (expected): diff --git a/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md b/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md index 3d0b2d48..5afc100b 100644 --- a/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md +++ b/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md @@ -1,14 +1,14 @@ ## MODIFIED Requirements ### Requirement: Static Rubric Evaluation MUST support checklist and score-range rubrics -The evaluator SHALL support rubric-based grading using a single `rubrics` field in one of two shapes: +The evaluator SHALL support rubric-based grading using rubric criteria entries. Each criterion may be: -1) **Checklist rubrics** (BREAKING rename): per-item boolean checks with weighted aggregation, using `expected_outcome` (formerly `description`). -2) **Score-range rubrics** (new, optional): a set of non-overlapping integer score ranges over 0–10 inclusive, each with an explicit `expected_outcome`. +1) **Checklist-style** (legacy): boolean checks per criterion using `expected_outcome` text. +2) **Score-range per criterion** (new): each criterion contains `score_ranges` defining non-overlapping integer ranges over 0–10 inclusive, each with an explicit `expected_outcome`. -If score-range rubrics are configured, the evaluator SHALL instruct the judge to output a **single integer score** in 0..10 and then normalize it to 0..1 for the reported evaluation score. +When score-ranges are present for a criterion, the evaluator SHALL instruct the judge to output an **integer score 0..10 for that criterion** and then normalize it to 0..1 for aggregation. -The system SHALL reject ambiguous configurations where `rubrics` mixes checklist and score-range entries. +The evaluator SHALL support `required_min_score` gating: if a criterion specifies `required_min_score` and the returned score is below it, the overall verdict SHALL be `fail`. 
#### Scenario: Checklist rubrics continue to work - **GIVEN** an eval case with `rubrics` (id/description/weight/required) @@ -17,10 +17,10 @@ The system SHALL reject ambiguous configurations where `rubrics` mixes checklist - **AND** the reported score SHALL be in 0..1 #### Scenario: Range rubrics constrain scoring -- **GIVEN** an eval case with `rubrics` consisting of multiple `score_range` entries and `expected_outcome` text +- **GIVEN** an eval case with `rubrics` where a criterion contains `score_ranges` entries and `expected_outcome` text - **WHEN** the rubric evaluator runs -- **THEN** the judge SHALL be constrained to output an integer score in 0..10 -- **AND** the system SHALL normalize the score to 0..1 by dividing by 10 +- **THEN** the judge SHALL be constrained to output an integer score in 0..10 for that criterion +- **AND** the system SHALL normalize each criterion score to 0..1 by dividing by 10 #### Scenario: Invalid range rubrics are rejected - **GIVEN** a `score_rubric` with overlapping ranges or missing coverage of 0..10 @@ -37,8 +37,12 @@ The evaluator SHALL validate judge output against a schema appropriate to the co - **THEN** the evaluator SHALL accept a JSON object matching: ```typescript z.object({ - score: z.number().int().min(0).max(10), - reasoning: z.string().optional(), + checks: z.array(z.object({ + id: z.string(), + score: z.number().int().min(0).max(10), + reasoning: z.string().optional(), + })), + overall_reasoning: z.string().optional(), }) ``` -- **AND** AgentV SHALL normalize `score / 10` into the standard 0..1 result. +- **AND** AgentV SHALL normalize per-criterion `score / 10` into the standard 0..1 result and aggregate. 
diff --git a/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md b/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md index 948d8f0c..eb085666 100644 --- a/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md +++ b/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md @@ -10,13 +10,28 @@ rubrics: - id: structure expected_outcome: Has clear headings and organization weight: 1.0 - required: true + required_min_score: 10 ``` - **WHEN** the YAML is parsed - **THEN** schema validation succeeds -### Requirement: Score-range rubrics MUST be supported for LLM judging -The YAML schema SHALL support configuring score-range rubrics for `llm_judge` evaluators via the existing `rubrics` field. +### Requirement: Rubric gating MUST support required_min_score +The YAML schema SHALL support `required_min_score` (0..10) on rubric criteria to enforce hard-gating. + +#### Scenario: required_min_score gates rubric criteria +- **GIVEN** a YAML eval case with: +```yaml +rubrics: + - id: correctness + weight: 2.0 + required_min_score: 10 + expected_outcome: Must be fully correct. +``` +- **WHEN** the YAML is parsed +- **THEN** schema validation succeeds + +### Requirement: Per-criterion score_ranges rubrics MUST be supported for LLM judging +The YAML schema SHALL support configuring per-criterion `score_ranges` for `llm_judge` evaluators via the existing `rubrics` field. #### Scenario: Configure score_rubric - **GIVEN** a YAML eval case with: @@ -25,14 +40,18 @@ evaluators: - name: correctness type: llm_judge rubrics: - - score_range: [0, 2] - expected_outcome: Factually incorrect. - - score_range: [3, 6] - expected_outcome: Mostly correct. - - score_range: [7, 9] - expected_outcome: Correct but missing minor details. - - score_range: [10, 10] - expected_outcome: Fully correct. + - id: correctness + weight: 1.0 + required_min_score: 10 + score_ranges: + - score_range: [0, 2] + expected_outcome: Factually incorrect. 
+ - score_range: [3, 6] + expected_outcome: Mostly correct. + - score_range: [7, 9] + expected_outcome: Correct but missing minor details. + - score_range: [10, 10] + expected_outcome: Fully correct. ``` - **WHEN** the YAML is parsed - **THEN** the evaluator configuration SHALL include the provided score ranges diff --git a/openspec/changes/add-rubric-score-ranges/tasks.md b/openspec/changes/add-rubric-score-ranges/tasks.md index bd86dfb2..258fe043 100644 --- a/openspec/changes/add-rubric-score-ranges/tasks.md +++ b/openspec/changes/add-rubric-score-ranges/tasks.md @@ -1,25 +1,27 @@ ## 1. Schema & Types -- [ ] 1.1 Add `ScoreRangeRubric` types (0–10 integer ranges) to core evaluation types -- [ ] 1.2 Extend evaluator config to accept optional `score_rubric` (or `score_ranges`) field +- [ ] 1.1 Add `ScoreRange` and `RubricCriterion` types (per-criterion 0–10 integer ranges) to core evaluation types +- [ ] 1.2 Extend rubric criteria to accept `score_ranges` and `required_min_score` (deprecate `required`) ## 2. Validation - [ ] 2.1 Validate ranges are integers within 0..10 and start <= end -- [ ] 2.2 Validate non-overlap across ranges -- [ ] 2.3 Validate full coverage of 0..10 inclusive +- [ ] 2.2 Validate non-overlap within each criterion's ranges +- [ ] 2.3 Validate (preferred) full coverage of 0..10 inclusive per criterion - [ ] 2.4 Validate each range has non-empty `expected_outcome` +- [ ] 2.5 Validate `required_min_score` is an integer within 0..10 ## 3. 
LLM Judge Integration -- [ ] 3.1 Add prompt template for range-rubric scoring that requests integer `score` 0..10 -- [ ] 3.2 Normalize final score to 0..1 (divide by 10) and keep existing verdict logic -- [ ] 3.3 Store raw 0–10 score in `details` (or `evaluatorRawRequest/Response`) for debugging +- [ ] 3.1 Add prompt template for per-criterion score-range scoring that requests integer `score` 0..10 per rubric `id` +- [ ] 3.2 Normalize criterion scores to 0..1 (divide by 10) and aggregate deterministically (weighted average) +- [ ] 3.3 Apply `required_min_score` gating (force fail when any gated criterion is below threshold) +- [ ] 3.4 Store raw 0–10 scores in `details` (or `evaluatorRawRequest/Response`) for debugging ## 4. YAML Support -- [ ] 4.1 Support `score_rubric` in YAML evaluator config (snake_case) -- [ ] 4.2 Decide if inline `rubrics:` sugar can support range rubrics (or keep evaluator-only) +- [ ] 4.1 Support `score_ranges` nested under each rubric criterion in YAML +- [ ] 4.2 Support `required_min_score` in YAML and treat legacy `required: true` as `required_min_score: 10` ## 5. Tests - [ ] 5.1 Unit tests for validation (overlap, gaps, bounds) -- [ ] 5.2 Unit/integration tests for llm_judge parsing + normalization +- [ ] 5.2 Unit/integration tests for llm_judge parsing + normalization + gating ## 6. 
Docs - [ ] 6.1 Update rubric-evaluator skill/reference docs to include range rubrics From 143fd1906c866cf23f4f12f1b00fbfa1cfb124d3 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 9 Jan 2026 14:25:45 +1100 Subject: [PATCH 4/9] Update score-range rubric example to score_ranges --- examples/features/rubric/evals/dataset.yaml | 27 ++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/examples/features/rubric/evals/dataset.yaml b/examples/features/rubric/evals/dataset.yaml index 87837649..6a81e557 100644 --- a/examples/features/rubric/evals/dataset.yaml +++ b/examples/features/rubric/evals/dataset.yaml @@ -180,7 +180,7 @@ evalcases: # ========================================== # Example 5: Score-range rubrics (PROPOSED) - # Demonstrates: proposed 0–10 score_range rubrics with expected_outcome per range + # Demonstrates: proposed per-criterion 0–10 score_ranges with required_min_score gating # Status: This example matches the OpenSpec change proposal `add-rubric-score-ranges`. # It is NOT supported by the current runtime until that change is implemented. # ========================================== @@ -197,14 +197,19 @@ evalcases: - role: assistant content: 42 - # Proposed polymorphic `rubrics` entries. - # Each entry defines an inclusive integer score range (0..10) with a concrete expected outcome. + # Proposed rubric criterion with score_ranges. + # The judge assigns an integer score 0..10 for each criterion id. + # AgentV normalizes each criterion score to 0..1 and aggregates deterministically. rubrics: - - score_range: [0, 2] - expected_outcome: Incorrect or nonsensical answer. - - score_range: [3, 6] - expected_outcome: Partially correct but has clear errors or missing reasoning. - - score_range: [7, 9] - expected_outcome: Correct answer with minor issues (e.g., unclear explanation). - - score_range: [10, 10] - expected_outcome: Fully correct and clear. 
+ - id: correctness + weight: 1.0 + required_min_score: 10 + score_ranges: + - score_range: [0, 2] + expected_outcome: Incorrect or nonsensical answer. + - score_range: [3, 6] + expected_outcome: Partially correct but has clear errors or missing reasoning. + - score_range: [7, 9] + expected_outcome: Correct answer with minor issues (e.g., unclear explanation). + - score_range: [10, 10] + expected_outcome: Fully correct and clear. From 751ffba6143898af7e5b42fc5a132541d2dd89c0 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 9 Jan 2026 14:30:41 +1100 Subject: [PATCH 5/9] Add multi-criteria score_ranges eval example --- examples/features/rubric/evals/dataset.yaml | 58 +++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/examples/features/rubric/evals/dataset.yaml b/examples/features/rubric/evals/dataset.yaml index 6a81e557..5e623fda 100644 --- a/examples/features/rubric/evals/dataset.yaml +++ b/examples/features/rubric/evals/dataset.yaml @@ -213,3 +213,61 @@ evalcases: expected_outcome: Correct answer with minor issues (e.g., unclear explanation). - score_range: [10, 10] expected_outcome: Fully correct and clear. + + # ========================================== + # Example 6: Multi-criteria score_ranges (PROPOSED) + # Demonstrates: multiple rubric ids, each with 0–10 score_ranges, then weighted aggregation. + # Real-world intent: grading a summary on both factual accuracy and brevity. + # Status: Proposed only; not supported until `add-rubric-score-ranges` is implemented. + # ========================================== + - id: summary-multi-criteria-score-ranges-proposed + + expected_outcome: |- + Provide an accurate summary in under 50 words. + + input_messages: + - role: user + content: |- + Summarize this article in under 50 words: + + Climate change is accelerating faster than predicted. Recent studies show + Arctic ice melting at unprecedented rates, sea levels rising, and extreme + weather events becoming more frequent. 
Scientists urge immediate action to + reduce carbon emissions and transition to renewable energy sources. + + expected_messages: + - role: assistant + content: |- + Climate change is accelerating with rapid Arctic ice loss, rising seas, and + more extreme weather. Scientists urge urgent emissions cuts and a transition + to renewable energy. + + rubrics: + - id: factual_accuracy + weight: 2.0 + required_min_score: 8 + score_ranges: + - score_range: [0, 2] + expected_outcome: Contains major factual errors or contradicts the article. + - score_range: [3, 5] + expected_outcome: Mostly on-topic but includes at least one clear factual error or misstates a key claim. + - score_range: [6, 7] + expected_outcome: Generally accurate but misses an important point or slightly distorts emphasis. + - score_range: [8, 9] + expected_outcome: Accurate and covers the key points with only minor omissions. + - score_range: [10, 10] + expected_outcome: Fully accurate, captures all key points with no distortions. + + - id: brevity_and_clarity + weight: 1.0 + score_ranges: + - score_range: [0, 2] + expected_outcome: Exceeds 50 words or is hard to understand. + - score_range: [3, 5] + expected_outcome: Under 50 words but unclear, repetitive, or poorly structured. + - score_range: [6, 7] + expected_outcome: Under 50 words and mostly clear, but could be more concise or better phrased. + - score_range: [8, 9] + expected_outcome: Under 50 words, clear and concise. + - score_range: [10, 10] + expected_outcome: Under 50 words, exceptionally clear, concise, and well phrased. 
From a81e1032317057ae18e91e7a9ad4c91371ac2193 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 9 Jan 2026 14:31:19 +1100 Subject: [PATCH 6/9] Remove redundant single-criterion score_ranges example --- examples/features/rubric/evals/dataset.yaml | 38 +-------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/examples/features/rubric/evals/dataset.yaml b/examples/features/rubric/evals/dataset.yaml index 5e623fda..6b902412 100644 --- a/examples/features/rubric/evals/dataset.yaml +++ b/examples/features/rubric/evals/dataset.yaml @@ -179,43 +179,7 @@ evalcases: # To generate rubrics: agentv generate rubrics evals/rubric-examples.yaml # ========================================== - # Example 5: Score-range rubrics (PROPOSED) - # Demonstrates: proposed per-criterion 0–10 score_ranges with required_min_score gating - # Status: This example matches the OpenSpec change proposal `add-rubric-score-ranges`. - # It is NOT supported by the current runtime until that change is implemented. - # ========================================== - - id: correctness-score-range-proposed - - expected_outcome: |- - Answer the question correctly and completely. - - input_messages: - - role: user - content: What is 15 + 27? - - expected_messages: - - role: assistant - content: 42 - - # Proposed rubric criterion with score_ranges. - # The judge assigns an integer score 0..10 for each criterion id. - # AgentV normalizes each criterion score to 0..1 and aggregates deterministically. - rubrics: - - id: correctness - weight: 1.0 - required_min_score: 10 - score_ranges: - - score_range: [0, 2] - expected_outcome: Incorrect or nonsensical answer. - - score_range: [3, 6] - expected_outcome: Partially correct but has clear errors or missing reasoning. - - score_range: [7, 9] - expected_outcome: Correct answer with minor issues (e.g., unclear explanation). - - score_range: [10, 10] - expected_outcome: Fully correct and clear. 
- - # ========================================== - # Example 6: Multi-criteria score_ranges (PROPOSED) + # Example 5: Multi-criteria score_ranges (PROPOSED) # Demonstrates: multiple rubric ids, each with 0–10 score_ranges, then weighted aggregation. # Real-world intent: grading a summary on both factual accuracy and brevity. # Status: Proposed only; not supported until `add-rubric-score-ranges` is implemented. From 3fa4d06b63029cb699128fcf1f27468d7eb08de5 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 21 Jan 2026 12:19:00 +0000 Subject: [PATCH 7/9] Implement score_ranges rubrics with backward-compatible description alias - Add description as backward-compatible alias for expected_outcome in all parsers (evaluator-parser, yaml-parser, jsonl-parser) - Add ScoreRange and extended RubricItem types with score_ranges and required_min_score - Implement score_ranges parsing with validation: - Ranges must be integers 0-10 - Ranges must not overlap - Ranges must cover 0-10 inclusive - Each range requires non-empty expected_outcome - Implement score-range evaluation in llm-judge: - Detect score_ranges rubrics automatically - Build specialized prompt for 0-10 integer scoring - Normalize scores to 0-1 (divide by 10) - Apply required_min_score gating (legacy required: true = required_min_score: 10) - Update test to use expected_outcome field name --- .../src/evaluation/evaluators/llm-judge.ts | 233 ++++++++++++++++++ .../evaluation/loaders/evaluator-parser.ts | 208 +++++++++++++++- .../src/evaluation/loaders/jsonl-parser.ts | 9 +- packages/core/src/evaluation/types.ts | 40 ++- packages/core/src/evaluation/yaml-parser.ts | 5 +- .../core/test/evaluation/evaluators.test.ts | 4 +- 6 files changed, 482 insertions(+), 17 deletions(-) diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts index fb643e8b..0974f2ef 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge.ts +++ 
b/packages/core/src/evaluation/evaluators/llm-judge.ts @@ -57,6 +57,21 @@ const rubricEvaluationSchema = z.object({ overall_reasoning: z.string().describe('Overall assessment summary (1-2 sentences)'), }); +/** + * Schema for score-range rubric evaluation. + * Each check returns an integer score 0-10 instead of boolean satisfied. + */ +const scoreRangeCheckResultSchema = z.object({ + id: z.string().describe('The ID of the rubric criterion being scored'), + score: z.number().int().min(0).max(10).describe('Integer score 0-10 for this criterion'), + reasoning: z.string().describe('Brief explanation (1-2 sentences) for this score').optional(), +}); + +const scoreRangeEvaluationSchema = z.object({ + checks: z.array(scoreRangeCheckResultSchema).describe('Scores for each rubric criterion'), + overall_reasoning: z.string().describe('Overall assessment summary (1-2 sentences)').optional(), +}); + export { freeformEvaluationSchema }; export class LlmJudgeEvaluator implements Evaluator { @@ -175,6 +190,13 @@ export class LlmJudgeEvaluator implements Evaluator { ); } + // Detect if any rubric uses score_ranges (analytic rubric mode) + const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0); + + if (hasScoreRanges) { + return this.evaluateWithScoreRanges(context, judgeProvider, rubrics); + } + const prompt = this.buildRubricPrompt(context, rubrics); const systemPrompt = buildRubricOutputSchema(); @@ -205,6 +227,112 @@ export class LlmJudgeEvaluator implements Evaluator { }; } + /** + * Evaluate using score-range rubrics (analytic rubric scoring). + * Each criterion is scored 0-10 and normalized to 0-1. 
+ */ + private async evaluateWithScoreRanges( + context: EvaluationContext, + judgeProvider: Provider, + rubrics: readonly RubricItem[], + ): Promise { + const prompt = this.buildScoreRangePrompt(context, rubrics); + const systemPrompt = buildScoreRangeOutputSchema(); + + const evaluatorRawRequest: JsonObject = { + userPrompt: prompt, + systemPrompt, + target: judgeProvider.targetName, + }; + + const { data } = await this.runWithRetry({ + context, + judgeProvider, + systemPrompt, + userPrompt: prompt, + schema: scoreRangeEvaluationSchema, + }); + + const { score, verdict, hits, misses, details } = calculateScoreRangeResult(data, rubrics); + + return { + score, + verdict, + hits, + misses, + expectedAspectCount: rubrics.length, + reasoning: data.overall_reasoning, + evaluatorRawRequest, + details, + }; + } + + /** + * Build prompt for score-range rubric evaluation. + */ + private buildScoreRangePrompt( + context: EvaluationContext, + rubrics: readonly RubricItem[], + ): string { + const formattedQuestion = + context.promptInputs.question && context.promptInputs.question.trim().length > 0 + ? context.promptInputs.question + : context.evalCase.question; + + const parts: string[] = [ + 'You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.', + 'For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.', + '', + '[[ ## question ## ]]', + formattedQuestion, + '', + '[[ ## expected_outcome ## ]]', + context.evalCase.expected_outcome, + '', + ]; + + if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { + parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); + } + + parts.push( + '[[ ## candidate_answer ## ]]', + context.candidate, + '', + '[[ ## scoring_criteria ## ]]', + ); + + for (const rubric of rubrics) { + const weightLabel = rubric.weight !== 1.0 ? 
` (weight: ${rubric.weight})` : ''; + const minScoreLabel = + rubric.required_min_score !== undefined + ? ` [REQUIRED: min score ${rubric.required_min_score}]` + : ''; + + parts.push('', `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`); + + if (rubric.expected_outcome) { + parts.push(`Description: ${rubric.expected_outcome}`); + } + + if (rubric.score_ranges && rubric.score_ranges.length > 0) { + parts.push('Score ranges:'); + for (const range of rubric.score_ranges) { + const [min, max] = range.score_range; + const rangeLabel = min === max ? `${min}` : `${min}-${max}`; + parts.push(` - Score ${rangeLabel}: ${range.expected_outcome}`); + } + } + } + + parts.push( + '', + 'For each criterion, provide an integer score 0-10 that matches one of its defined score ranges.', + ); + + return parts.join('\n'); + } + private buildRubricPrompt(context: EvaluationContext, rubrics: readonly RubricItem[]): string { const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 @@ -366,3 +494,108 @@ function calculateRubricScore( const verdict = failedRequired ? 'fail' : scoreToVerdict(score); return { score, verdict, hits, misses }; } + +/** + * Build the output schema for score-range rubric evaluation. + */ +function buildScoreRangeOutputSchema(): string { + return `You are an expert evaluator. Score the candidate answer on each criterion. +You must return a valid JSON object matching this schema: +{ + "checks": [ + { + "id": "string (criterion id)", + "score": integer (0-10), + "reasoning": "string (brief explanation for score)" + } + ], + "overall_reasoning": "string (summary, optional)" +} + +Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.`; +} + +/** + * Calculate score from score-range rubric evaluation results. 
+ * - Normalizes each criterion score (0-10) to 0-1 by dividing by 10 + * - Computes weighted average across criteria + * - Applies required_min_score gating (force fail if below threshold) + */ +function calculateScoreRangeResult( + result: z.infer, + rubrics: readonly RubricItem[], +): { + score: number; + verdict: 'pass' | 'fail' | 'borderline'; + hits: string[]; + misses: string[]; + details: JsonObject; +} { + const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric])); + const hits: string[] = []; + const misses: string[] = []; + const rawScores: Record = {}; + let totalWeight = 0; + let weightedScoreSum = 0; + let failedRequired = false; + + for (const check of result.checks) { + const rubric = rubricMap.get(check.id); + if (!rubric) { + continue; + } + + const rawScore = Math.max(0, Math.min(10, check.score)); // Clamp to 0-10 + const normalizedScore = rawScore / 10; // Normalize to 0-1 + rawScores[rubric.id] = rawScore; + + totalWeight += rubric.weight; + weightedScoreSum += normalizedScore * rubric.weight; + + // Determine required minimum score: + // - If required_min_score is set, use it directly + // - If required is true (legacy), treat as required_min_score: 10 + // - Otherwise, no gating + let requiredMinScore: number | undefined; + if (rubric.required_min_score !== undefined) { + requiredMinScore = rubric.required_min_score; + } else if (rubric.required === true) { + requiredMinScore = 10; // Legacy: required: true means must score 10/10 + } + + // Find the matching score range description for reporting + const matchingRange = rubric.score_ranges?.find( + (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1], + ); + const rangeDescription = matchingRange?.expected_outcome ?? ''; + const criterionLabel = rubric.expected_outcome ?? rubric.id; + + const reasoningText = check.reasoning ? 
`: ${check.reasoning}` : ''; + const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`; + + // Check gating + if (requiredMinScore !== undefined && rawScore < requiredMinScore) { + failedRequired = true; + misses.push(scoreInfo); + } else if (rawScore >= 7) { + hits.push(scoreInfo); + } else { + misses.push(scoreInfo); + } + } + + const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0; + const verdict = failedRequired ? 'fail' : scoreToVerdict(score); + + return { + score, + verdict, + hits, + misses, + details: { + raw_scores: rawScores, + normalization: 'score / 10', + aggregation: 'weighted_average', + }, + }; +} diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 54f45738..91a546a5 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -550,15 +550,7 @@ export async function parseEvaluators( const rawRubrics = rawEvaluator.rubrics; const parsedRubrics = Array.isArray(rawRubrics) - ? rawRubrics - .filter((r): r is JsonObject => isJsonObject(r)) - .map((rubric, index) => ({ - id: asString(rubric.id) ?? `rubric-${index + 1}`, - expected_outcome: asString(rubric.expected_outcome) ?? '', - weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0, - required: typeof rubric.required === 'boolean' ? rubric.required : true, - })) - .filter((r) => r.expected_outcome.length > 0) + ? parseRubricItems(rawRubrics, name, evalId) : undefined; if (typeValue === 'rubric') { @@ -714,3 +706,201 @@ function isValidFieldAggregationType( ): value is import('../types.js').FieldAggregationType { return typeof value === 'string' && VALID_FIELD_AGGREGATION_TYPES.has(value); } + +/** + * Parse rubric items from raw YAML/JSON data. + * Supports both checklist rubrics and score-range rubrics. 
+ */ +function parseRubricItems( + rawRubrics: readonly unknown[], + evaluatorName: string, + evalId: string, +): import('../types.js').RubricItem[] | undefined { + const items: import('../types.js').RubricItem[] = []; + + for (const [index, rawRubric] of rawRubrics.entries()) { + if (!isJsonObject(rawRubric)) { + logWarning( + `Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' (expected object)`, + ); + continue; + } + + const id = asString(rawRubric.id) ?? `rubric-${index + 1}`; + // Support both expected_outcome and description (backward compatibility) + const expectedOutcome = + asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? ''; + const weight = typeof rawRubric.weight === 'number' ? rawRubric.weight : 1.0; + + // Parse required_min_score (new) or required (legacy backward compat) + let requiredMinScore: number | undefined; + let required: boolean | undefined; + + if (typeof rawRubric.required_min_score === 'number') { + const minScore = rawRubric.required_min_score; + if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) { + throw new Error( + `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`, + ); + } + requiredMinScore = minScore; + } + + if (typeof rawRubric.required === 'boolean') { + required = rawRubric.required; + } + + // Parse score_ranges if present + let scoreRanges: import('../types.js').ScoreRange[] | undefined; + const rawScoreRanges = rawRubric.score_ranges; + + if (rawScoreRanges !== undefined) { + if (!Array.isArray(rawScoreRanges)) { + throw new Error( + `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`, + ); + } + + scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId); + + // For score-range rubrics, expected_outcome at rubric level is optional + items.push({ + id, + weight, + ...(expectedOutcome.length > 0 ? 
{ expected_outcome: expectedOutcome } : {}), + ...(required !== undefined ? { required } : {}), + ...(requiredMinScore !== undefined ? { required_min_score: requiredMinScore } : {}), + score_ranges: scoreRanges, + }); + } else { + // Checklist rubric: expected_outcome is required + if (expectedOutcome.length === 0) { + logWarning( + `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`, + ); + continue; + } + + items.push({ + id, + expected_outcome: expectedOutcome, + weight, + // Default to required: true if not specified (backward compatibility) + required: required ?? true, + ...(requiredMinScore !== undefined ? { required_min_score: requiredMinScore } : {}), + }); + } + } + + return items.length > 0 ? items : undefined; +} + +/** + * Parse and validate score ranges for a rubric criterion. + * Validates: + * - Ranges are [min, max] with integers 0-10 + * - min <= max + * - Non-overlapping ranges + * - Full coverage of 0-10 (throws if any score is left uncovered) + * - Each range has non-empty expected_outcome + */ +function parseScoreRanges( + rawRanges: readonly unknown[], + rubricId: string, + evaluatorName: string, + evalId: string, +): import('../types.js').ScoreRange[] { + const ranges: import('../types.js').ScoreRange[] = []; + + for (const [index, rawRange] of rawRanges.entries()) { + if (!isJsonObject(rawRange)) { + throw new Error( + `Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`, + ); + } + + const scoreRangeValue = rawRange.score_range; + if ( + !Array.isArray(scoreRangeValue) || + scoreRangeValue.length !== 2 || + typeof scoreRangeValue[0] !== 'number' || + typeof scoreRangeValue[1] !== 'number' + ) { + throw new Error( + `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`, + ); + } + + const [min, max] = scoreRangeValue; + + // 
Validate integers in 0-10 range + if (!Number.isInteger(min) || !Number.isInteger(max)) { + throw new Error( + `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`, + ); + } + + if (min < 0 || min > 10 || max < 0 || max > 10) { + throw new Error( + `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`, + ); + } + + if (min > max) { + throw new Error( + `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`, + ); + } + + // Validate expected_outcome + const expectedOutcome = + asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? ''; + if (expectedOutcome.length === 0) { + throw new Error( + `Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`, + ); + } + + ranges.push({ + score_range: [min, max] as const, + expected_outcome: expectedOutcome, + }); + } + + // Validate non-overlapping ranges + const sortedRanges = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]); + for (let i = 1; i < sortedRanges.length; i++) { + const prev = sortedRanges[i - 1]; + const curr = sortedRanges[i]; + if (curr.score_range[0] <= prev.score_range[1]) { + throw new Error( + `Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': ` + + `[${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`, + ); + } + } + + // Validate full coverage of 0-10 (strict requirement per spec) + const covered = new Set(); + for (const range of ranges) { + for (let i = range.score_range[0]; i <= range.score_range[1]; i++) { + covered.add(i); + } + } + + const missing: number[] = []; + for (let i = 0; i 
<= 10; i++) { + if (!covered.has(i)) { + missing.push(i); + } + } + + if (missing.length > 0) { + throw new Error( + `Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': ` + + `missing coverage for scores: ${missing.join(', ')}. Ranges must cover all integers 0-10.`, + ); + } + + return ranges; +} diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts index 4f2b9b73..185a33b8 100644 --- a/packages/core/src/evaluation/loaders/jsonl-parser.ts +++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts @@ -268,19 +268,22 @@ export async function loadEvalCasesFromJsonl( if (typeof rubric === 'string') { return { id: `rubric-${index + 1}`, - description: rubric, + expected_outcome: rubric, weight: 1.0, required: true, }; } + // Support both expected_outcome and description (backward compatibility) + const expectedOutcome = + asString(rubric.expected_outcome) ?? asString(rubric.description) ?? ''; return { id: asString(rubric.id) ?? `rubric-${index + 1}`, - description: asString(rubric.description) ?? '', + expected_outcome: expectedOutcome, weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0, required: typeof rubric.required === 'boolean' ? rubric.required : true, }; }) - .filter((r) => r.description.length > 0); + .filter((r) => r.expected_outcome.length > 0); if (rubricItems.length > 0) { const rubricEvaluator: import('../types.js').LlmJudgeEvaluatorConfig = { diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index d5e3c3b1..f41a63fb 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -200,11 +200,47 @@ export type LlmJudgeEvaluatorConfig = { readonly weight?: number; }; +/** + * Score range definition for analytic rubric scoring. + * Each range maps an integer score band (0-10) to an expected outcome description. 
+ */ +export type ScoreRange = { + /** Inclusive integer range [min, max] within 0-10 */ + readonly score_range: readonly [number, number]; + /** Description of what this score range represents */ + readonly expected_outcome: string; +}; + +/** + * Rubric item for LLM judge evaluation. + * Supports two modes: + * - Checklist mode: boolean satisfied/not-satisfied with `expected_outcome` + * - Score-range mode: 0-10 integer scoring with `score_ranges` + */ export type RubricItem = { readonly id: string; - readonly expected_outcome: string; + /** + * For checklist rubrics: the expected outcome text (required). + * For score-range rubrics: optional overall criterion description. + */ + readonly expected_outcome?: string; readonly weight: number; - readonly required: boolean; + /** + * Legacy boolean gating (deprecated, treated as required_min_score: 10). + * Use required_min_score instead for finer control. + */ + readonly required?: boolean; + /** + * Minimum score (0-10) required to pass this criterion. + * If the criterion score is below this threshold, the overall verdict is 'fail'. + */ + readonly required_min_score?: number; + /** + * Score range definitions for analytic rubric scoring. + * When present, the judge outputs an integer 0-10 score per criterion. + * Ranges must be non-overlapping and cover 0-10 inclusive. + */ + readonly score_ranges?: readonly ScoreRange[]; }; export type CompositeAggregatorConfig = diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index 98d5725d..e66b67a7 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -229,9 +229,12 @@ export async function loadEvalCases( required: true, }; } + // Support both expected_outcome and description (backward compatibility) + const expectedOutcome = + asString(rubric.expected_outcome) ?? asString(rubric.description) ?? ''; return { id: asString(rubric.id) ?? 
`rubric-${index + 1}`, - expected_outcome: asString(rubric.expected_outcome) ?? '', + expected_outcome: expectedOutcome, weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0, required: typeof rubric.required === 'boolean' ? rubric.required : true, }; diff --git a/packages/core/test/evaluation/evaluators.test.ts b/packages/core/test/evaluation/evaluators.test.ts index 532239b7..3fd5de55 100644 --- a/packages/core/test/evaluation/evaluators.test.ts +++ b/packages/core/test/evaluation/evaluators.test.ts @@ -367,8 +367,8 @@ describe('LlmJudgeEvaluator', () => { name: 'rubric', type: 'llm_judge', rubrics: [ - { id: 'r1', description: 'Mentions logging', weight: 1.0, required: true }, - { id: 'r2', description: 'Mentions tests', weight: 1.0, required: false }, + { id: 'r1', expected_outcome: 'Mentions logging', weight: 1.0, required: true }, + { id: 'r2', expected_outcome: 'Mentions tests', weight: 1.0, required: false }, ], }, }); From e549315bde9fd419f435880d1663553565e02c9b Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 21 Jan 2026 12:37:20 +0000 Subject: [PATCH 8/9] Add score_ranges tests and documentation - Add 4 minimal unit tests for score_ranges validation: - Valid score_ranges parsing with required_min_score - Overlapping ranges validation error - Incomplete coverage validation error - Backward-compatible description alias - Update rubric-evaluator.md skill documentation: - Add Score-Range Rubrics (Analytic Mode) section - Document score_ranges validation rules - Add required_min_score field to table - Add "When to Use Each Mode" guidance - Note description as backward-compatible alias --- .../references/rubric-evaluator.md | 103 +++++++++++++--- .../loaders/evaluator-parser.test.ts | 115 ++++++++++++++++++ 2 files changed, 200 insertions(+), 18 deletions(-) diff --git a/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md b/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md index f400892d..75e1869f 100644 --- 
a/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +++ b/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md @@ -23,7 +23,7 @@ evalcases: - States time complexity correctly ``` -### Detailed Rubric Objects +### Detailed Rubric Objects (Checklist Mode) Use objects for fine-grained control over weights and requirements: @@ -31,58 +31,125 @@ Use objects for fine-grained control over weights and requirements: evalcases: - id: technical-guide expected_outcome: Write a comprehensive HTTP status codes guide - + input_messages: - role: user content: Write a guide explaining HTTP status codes - + rubrics: - id: structure - description: Has clear headings and organization + expected_outcome: Has clear headings and organization weight: 1.0 required: true - + - id: success-codes - description: Covers 2xx success codes with examples + expected_outcome: Covers 2xx success codes with examples weight: 2.0 required: true - + - id: client-errors - description: Explains 4xx client error codes + expected_outcome: Explains 4xx client error codes weight: 2.0 required: true - + - id: server-errors - description: Explains 5xx server error codes + expected_outcome: Explains 5xx server error codes weight: 1.5 required: false - + - id: practical-examples - description: Includes practical use case examples + expected_outcome: Includes practical use case examples weight: 1.0 required: false ``` +### Score-Range Rubrics (Analytic Mode) + +For more granular scoring, use `score_ranges` to define 0-10 integer scoring per criterion: + +```yaml +evalcases: + - id: code-review + expected_outcome: Review the code for correctness and style + + input_messages: + - role: user + content: Review this Python function for issues + + rubrics: + - id: correctness + weight: 2.0 + required_min_score: 7 # Fail if score < 7 + score_ranges: + - score_range: [0, 2] + expected_outcome: Contains critical bugs or errors + - score_range: [3, 5] + expected_outcome: Has minor bugs or edge 
case issues + - score_range: [6, 8] + expected_outcome: Functionally correct with minor issues + - score_range: [9, 10] + expected_outcome: Fully correct implementation + + - id: style + weight: 1.0 + score_ranges: + - score_range: [0, 3] + expected_outcome: Poor style, hard to read + - score_range: [4, 6] + expected_outcome: Acceptable style with issues + - score_range: [7, 10] + expected_outcome: Clean, idiomatic code +``` + +**Score-range validation rules:** +- Ranges must be integers within 0-10 +- Ranges must not overlap +- Ranges must cover all values 0-10 (no gaps) +- Each range must have a non-empty `expected_outcome` + ## Rubric Object Fields | Field | Type | Default | Description | |-------|------|---------|-------------| | `id` | string | auto-generated | Unique identifier for the rubric | -| `description` | string | required | The criterion being evaluated | +| `expected_outcome` | string | required* | The criterion being evaluated (*optional if `score_ranges` used) | | `weight` | number | 1.0 | Relative importance (higher = more impact on score) | -| `required` | boolean | true | If true, failing this rubric forces verdict to 'fail' | +| `required` | boolean | true | If true, failing this rubric forces verdict to 'fail' (checklist mode) | +| `required_min_score` | integer | - | Minimum 0-10 score required to pass (score-range mode) | +| `score_ranges` | array | - | Score range definitions for analytic rubric scoring | + +> **Note:** `description` is supported as a backward-compatible alias for `expected_outcome`. 
## Scoring and Verdicts -**Score Calculation:** +### Checklist Mode (boolean) ``` score = (sum of satisfied weights) / (total weights) ``` +### Score-Range Mode (0-10 integers) +``` +normalized_score = raw_score / 10 # Convert 0-10 to 0-1 +final_score = weighted_average(normalized_scores) +``` + **Verdict Rules:** -- `pass`: Score ≥ 0.8 AND all required rubrics satisfied -- `borderline`: Score ≥ 0.6 AND all required rubrics satisfied -- `fail`: Score < 0.6 OR any required rubric failed +- `pass`: Score ≥ 0.8 AND all gating criteria satisfied +- `borderline`: Score ≥ 0.6 AND all gating criteria satisfied +- `fail`: Score < 0.6 OR any gating criterion failed + +**Gating:** +- Checklist mode: `required: true` means must be satisfied +- Score-range mode: `required_min_score: N` means score must be ≥ N + +## When to Use Each Mode + +| Use Case | Mode | Why | +|----------|------|-----| +| Binary pass/fail criteria | Checklist | Simple yes/no evaluation | +| Quality gradient | Score-range | Captures nuance (poor → excellent) | +| Critical requirements | Checklist + `required: true` | Hard gating on must-haves | +| Minimum quality bar | Score-range + `required_min_score` | Flexible threshold gating | ## Combining Rubrics with Other Evaluators diff --git a/packages/core/test/evaluation/loaders/evaluator-parser.test.ts b/packages/core/test/evaluation/loaders/evaluator-parser.test.ts index 73ae1ebc..63ed2136 100644 --- a/packages/core/test/evaluation/loaders/evaluator-parser.test.ts +++ b/packages/core/test/evaluation/loaders/evaluator-parser.test.ts @@ -312,6 +312,121 @@ describe('parseEvaluators - code_judge config pass-through', () => { }); }); +describe('parseEvaluators - score_ranges rubrics', () => { + it('parses valid score_ranges with required_min_score', async () => { + const rawEvalCase = { + evaluators: [ + { + name: 'correctness', + type: 'llm_judge', + rubrics: [ + { + id: 'accuracy', + weight: 2.0, + required_min_score: 7, + score_ranges: [ + { score_range: [0, 
3], expected_outcome: 'Incorrect' }, + { score_range: [4, 6], expected_outcome: 'Partially correct' }, + { score_range: [7, 9], expected_outcome: 'Mostly correct' }, + { score_range: [10, 10], expected_outcome: 'Fully correct' }, + ], + }, + ], + }, + ], + }; + + const evaluators = await parseEvaluators(rawEvalCase, undefined, [process.cwd()], 'test-case'); + + expect(evaluators).toHaveLength(1); + const config = evaluators?.[0]; + expect(config?.type).toBe('llm_judge'); + if (config?.type === 'llm_judge') { + expect(config.rubrics).toHaveLength(1); + const rubric = config.rubrics?.[0]; + expect(rubric?.id).toBe('accuracy'); + expect(rubric?.weight).toBe(2.0); + expect(rubric?.required_min_score).toBe(7); + expect(rubric?.score_ranges).toHaveLength(4); + } + }); + + it('throws on overlapping score_ranges', async () => { + const rawEvalCase = { + evaluators: [ + { + name: 'overlapping', + type: 'llm_judge', + rubrics: [ + { + id: 'test', + score_ranges: [ + { score_range: [0, 5], expected_outcome: 'Low' }, + { score_range: [4, 10], expected_outcome: 'High' }, // Overlaps at 4-5 + ], + }, + ], + }, + ], + }; + + await expect( + parseEvaluators(rawEvalCase, undefined, [process.cwd()], 'test-case'), + ).rejects.toThrow(/overlapping/i); + }); + + it('throws on incomplete score_ranges coverage', async () => { + const rawEvalCase = { + evaluators: [ + { + name: 'incomplete', + type: 'llm_judge', + rubrics: [ + { + id: 'test', + score_ranges: [ + { score_range: [0, 3], expected_outcome: 'Low' }, + { score_range: [7, 10], expected_outcome: 'High' }, // Missing 4-6 + ], + }, + ], + }, + ], + }; + + await expect( + parseEvaluators(rawEvalCase, undefined, [process.cwd()], 'test-case'), + ).rejects.toThrow(/coverage/i); + }); + + it('supports description as backward-compatible alias for expected_outcome', async () => { + const rawEvalCase = { + evaluators: [ + { + name: 'legacy', + type: 'llm_judge', + rubrics: [ + { + id: 'r1', + description: 'Must be polite', // Legacy field 
name + weight: 1.0, + required: true, + }, + ], + }, + ], + }; + + const evaluators = await parseEvaluators(rawEvalCase, undefined, [process.cwd()], 'test-case'); + + expect(evaluators).toHaveLength(1); + const config = evaluators?.[0]; + if (config?.type === 'llm_judge') { + expect(config.rubrics?.[0]?.expected_outcome).toBe('Must be polite'); + } + }); +}); + describe('parseEvaluators - token_usage', () => { it('parses token_usage evaluator with limits', async () => { const rawEvalCase = { From 282cdf97966650dd6bb9ee2e1b98d4c247983d68 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 21 Jan 2026 12:42:11 +0000 Subject: [PATCH 9/9] Add changeset for score_ranges rubrics feature --- .changeset/score-ranges-rubrics.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 .changeset/score-ranges-rubrics.md diff --git a/.changeset/score-ranges-rubrics.md b/.changeset/score-ranges-rubrics.md new file mode 100644 index 00000000..889e201c --- /dev/null +++ b/.changeset/score-ranges-rubrics.md @@ -0,0 +1,13 @@ +--- +"@agentv/core": minor +"agentv": minor +--- + +Add score_ranges rubrics for analytic LLM judge evaluation + +- Add `score_ranges` field for 0-10 integer scoring per rubric criterion +- Add `required_min_score` field for flexible gating (replaces boolean `required`) +- Add `description` as backward-compatible alias for `expected_outcome` +- Validate score ranges: integers 0-10, non-overlapping, full coverage +- Normalize scores to 0-1 (divide by 10) with weighted aggregation +- Legacy `required: true` treated as `required_min_score: 10`