From 19fdec5f07b34b569e2241e81439f2f0e4f46be6 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 9 Jan 2026 13:48:50 +1100
Subject: [PATCH 1/9] Rename rubric description to expected_outcome

Align CLI and rubric generator output with expected_outcome and add score-range rubric proposal + mapping analysis.
---
 apps/cli/src/commands/generate/rubrics.ts     |  4 +-
 examples/features/rubric/evals/dataset.yaml   | 41 +++++++++--
 .../changes/add-rubric-score-ranges/design.md | 73 +++++++++++++++++++
 .../add-rubric-score-ranges/proposal.md       | 54 ++++++++++++++
 .../specs/rubric-evaluator/spec.md            | 44 +++++++++++
 .../specs/yaml-schema/spec.md                 | 53 ++++++++++++++
 .../changes/add-rubric-score-ranges/tasks.md  | 26 +++++++
 .../src/evaluation/evaluators/llm-judge.ts    |  6 +-
 .../evaluation/generators/rubric-generator.ts |  6 +-
 .../evaluation/loaders/evaluator-parser.ts    |  4 +-
 packages/core/src/evaluation/types.ts         |  2 +-
 packages/core/src/evaluation/yaml-parser.ts   |  6 +-
 12 files changed, 300 insertions(+), 19 deletions(-)
 create mode 100644 openspec/changes/add-rubric-score-ranges/design.md
 create mode 100644 openspec/changes/add-rubric-score-ranges/proposal.md
 create mode 100644 openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md
 create mode 100644 openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md
 create mode 100644 openspec/changes/add-rubric-score-ranges/tasks.md

diff --git a/apps/cli/src/commands/generate/rubrics.ts b/apps/cli/src/commands/generate/rubrics.ts
index 3f2993f6..c2807195 100644
--- a/apps/cli/src/commands/generate/rubrics.ts
+++ b/apps/cli/src/commands/generate/rubrics.ts
@@ -160,9 +160,9 @@ export async function generateRubricsCommand(options: GenerateRubricsOptions): P
       caseNode.set(
         'rubrics',
         rubrics.map(
-          (r: { id: string; description: string; weight: number; required: boolean }) => ({
+          (r: { id: string; expected_outcome: string; weight: number; required: boolean }) => ({
             id: r.id,
-            description: r.description,
+            expected_outcome: r.expected_outcome,
             weight: r.weight,
             required: r.required,
           }),
diff --git a/examples/features/rubric/evals/dataset.yaml b/examples/features/rubric/evals/dataset.yaml
index b9b764f9..87837649 100644
--- a/examples/features/rubric/evals/dataset.yaml
+++ b/examples/features/rubric/evals/dataset.yaml
@@ -71,27 +71,27 @@ evalcases:
     # Detailed rubric objects with weights and required flags
     rubrics:
       - id: structure
-        description: Has clear headings and organization
+        expected_outcome: Has clear headings and organization
         weight: 1.0
         required: true
 
       - id: success-codes
-        description: Covers 2xx success codes with examples
+        expected_outcome: Covers 2xx success codes with examples
         weight: 2.0
         required: true
 
       - id: client-errors
-        description: Explains 4xx client error codes
+        expected_outcome: Explains 4xx client error codes
         weight: 2.0
         required: true
 
       - id: server-errors
-        description: Explains 5xx server error codes
+        expected_outcome: Explains 5xx server error codes
         weight: 1.5
         required: false
 
       - id: practical-examples
-        description: Includes practical use case examples
+        expected_outcome: Includes practical use case examples
         weight: 1.0
         required: false
 
@@ -177,3 +177,34 @@ evalcases:
 
     # No rubrics defined - will use default llm_judge evaluator
     # To generate rubrics: agentv generate rubrics evals/rubric-examples.yaml
+
+  # ==========================================
+  # Example 5: Score-range rubrics (PROPOSED)
+  # Demonstrates: proposed 0–10 score_range rubrics with expected_outcome per range
+  # Status: This example matches the OpenSpec change proposal `add-rubric-score-ranges`.
+  #         It is NOT supported by the current runtime until that change is implemented.
+  # ==========================================
+  - id: correctness-score-range-proposed
+
+    expected_outcome: |-
+      Answer the question correctly and completely.
+
+    input_messages:
+      - role: user
+        content: What is 15 + 27?
+
+    expected_messages:
+      - role: assistant
+        content: 42
+
+    # Proposed polymorphic `rubrics` entries.
+    # Each entry defines an inclusive integer score range (0..10) with a concrete expected outcome.
+    rubrics:
+      - score_range: [0, 2]
+        expected_outcome: Incorrect or nonsensical answer.
+      - score_range: [3, 6]
+        expected_outcome: Partially correct but has clear errors or missing reasoning.
+      - score_range: [7, 9]
+        expected_outcome: Correct answer with minor issues (e.g., unclear explanation).
+      - score_range: [10, 10]
+        expected_outcome: Fully correct and clear.
diff --git a/openspec/changes/add-rubric-score-ranges/design.md b/openspec/changes/add-rubric-score-ranges/design.md
new file mode 100644
index 00000000..5c9764db
--- /dev/null
+++ b/openspec/changes/add-rubric-score-ranges/design.md
@@ -0,0 +1,73 @@
+## Context
+AgentV currently supports rubric-based evaluation by converting `rubrics` into `llm_judge` checklist items. The judge returns per-item `satisfied: boolean` checks and the runtime computes a weighted fraction score in 0..1.
+
+External best practice (DeepEval/Confident AI) adds an additional pattern: **score-range rubrics**, where the judge chooses an integer score in 0..10 constrained by explicit ranges with concrete expected outcomes, then the framework normalizes to 0..1.
+
+## Decision
+Keep a **single rubric system** by extending the existing `rubrics` field to support an additional rubric entry shape for **score-range scoring**, without removing the existing checklist scoring.
+
+This change also includes a **breaking rename** for checklist rubrics: `description` → `expected_outcome`.
+
+### Proposed YAML Shape
+```yaml
+evaluators:
+  - name: correctness
+    type: llm_judge
+    rubrics:
+      - score_range: [0, 2]
+        expected_outcome: Factually incorrect.
+      - score_range: [3, 6]
+        expected_outcome: Mostly correct but includes notable errors or omissions.
+      - score_range: [7, 9]
+        expected_outcome: Correct with minor missing details.
+      - score_range: [10, 10]
+        expected_outcome: Fully correct and complete.
+```
+
+### Output Contract
+- Judge returns `score` as an **integer** in `0..10`.
+- AgentV normalizes to `0..1` by dividing by 10.
+- Preserve existing verdict thresholds (`>=0.8 pass`, `>=0.6 borderline`, else fail) and required-item behavior for checklist mode.
+
+## Validation Rules
+- Ranges are inclusive integer bounds.
+- Bounds must be within 0..10.
+- No overlap.
+- Must cover 0..10 inclusive.
+- Each range must have non-empty `expected_outcome`.
+
+## Backwards Compatibility
+- If `rubrics` contains checklist entries, existing checklist behavior remains the default.
+- If `rubrics` contains score-range entries, range scoring is used.
+- To avoid ambiguous mixing, the proposal treats `rubrics` as **either all checklist or all score-range entries**.
+
+### Migration
+- Replace checklist rubric object field `description:` with `expected_outcome:`.
+
+## Open Questions
+- Should AgentV allow gaps (e.g., reserve 0 for “unscorable”), or strictly require full coverage? (Proposal defaults to strict full coverage to match the cited best practice.)
+- Should mixed `rubrics` (checklist + score-range) be allowed, and if so how to combine them? (Proposal: disallow mixing for simplicity and determinism.)
+
+## Deterministic Mapping to Checklist (Weighted-Average) Rubrics
+
+### Can score-range rubrics be deterministically mapped to the existing weighted-average system?
+Not in a semantics-preserving way.
+
+Score-range rubrics define a *single holistic ordinal grade* (an integer 0..10) with an expected outcome per interval.
+Checklist rubrics define *multiple independent criteria* with per-criterion weights and required flags, and compute a weighted fraction.
+
+Because the score-range system does not provide per-criterion truth values (or even a breakdown of which expectations were met), there is no deterministic transformation from a range choice into a unique checklist satisfaction vector.
+Any mapping from range → checklist would require adding assumptions (e.g., “a 7 implies all requirements A/B/C are satisfied”), which is equivalent to inventing extra semantics not present in the input.
+
+### Can checklist rubrics be deterministically mapped to score-range rubrics?
+Only in a lossy, wrapper-style way.
+
+Given checklist results, AgentV can deterministically compute a normalized score $s \in [0,1]$ and then map it to a raw integer $r = \mathrm{round}(10s)$ (or $\lfloor 10s \rfloor$, etc.).
+But that does not recreate the score-range *rubric definition* (expected outcomes per bucket), and it does not provide the core value of range rubrics: constraining the judge with explicit outcome descriptions per range.
+
+### Conclusion
+The two rubric modes are not redundant:
+- Checklist rubrics are best for requirement-driven grading (decomposable criteria, required flags, deterministic scoring).
+- Score-range rubrics are best for holistic grading where the evaluator needs explicit outcome descriptions per band.
+
+The most practical unification is at the interface level: treat both as rubric-driven evaluators that produce a normalized $[0,1]$ score and a verdict, but keep both scoring modes as first-class options rather than making one a wrapper for the other.
diff --git a/openspec/changes/add-rubric-score-ranges/proposal.md b/openspec/changes/add-rubric-score-ranges/proposal.md
new file mode 100644
index 00000000..50f26d44
--- /dev/null
+++ b/openspec/changes/add-rubric-score-ranges/proposal.md
@@ -0,0 +1,54 @@
+# Change: Add 0–10 score-range rubrics for LLM judging
+
+## Why
+AgentV’s current rubric support in `llm_judge` is a **binary checklist** (each rubric item is `satisfied: true|false` and the score is computed as a weighted fraction). This is great for requirements-style grading, but it does **not** support the “confine the judge into explicit score ranges” pattern used by common LLM-evals tooling.
+
+Best-practice literature for LLM-as-a-judge rubric scoring (e.g., DeepEval/Confident AI) recommends:
+- A **0–10 integer scale** (more reliable than floats for LLMs)
+- Explicit **non-overlapping** `score_range` definitions
+- Clear **expected outcomes per range**, not vague labels
+- **Normalization to 0–1** for downstream aggregation
+
+Adding this as an **optional, backwards-compatible** scoring mode gives AgentV users a deterministic way to express custom metrics while keeping existing rubrics intact.
+
+## What Changes
+- Extend the existing `rubrics` concept to support **two rubric shapes** under a single field:
+  - **Checklist rubrics** (breaking rename): `{ id, expected_outcome, weight, required }`
+  - **Score-range rubrics** (new, optional): `{ score_range: [start, end], expected_outcome }` over **0–10 inclusive**
+
+  This keeps a single rubric system and a single evaluator implementation while covering both use cases.
+
+- When the evaluator is configured with score-range rubrics, it:
+  - Constrains the judge to output an integer **raw score 0–10**
+  - Normalizes to **0–1** (divide by 10) for the existing `EvaluationScore.score`
+- Add validation rules:
+  - Ranges MUST be integers within **0..10**
+  - Ranges MUST NOT overlap
+  - Ranges MUST cover **0..10** (inclusive)
+  - Each range MUST include a non-empty `expected_outcome`
+- Preserve the current behavior:
+  - Existing `llm_judge` freeform scoring (0–1) unchanged
+  - Existing `llm_judge` rubric checklist scoring logic unchanged (only the field name changes)
+
+## Breaking Changes
+- **BREAKING**: Rename checklist rubric field `description` → `expected_outcome`.
+  - YAML before:
+    - `rubrics: [{ id: "x", description: "...", weight: 1, required: true }]`
+  - YAML after:
+    - `rubrics: [{ id: "x", expected_outcome: "...", weight: 1, required: true }]`
+  - CLI `generate rubrics` output changes accordingly.
+
+## Impact
+- Affected specs: `rubric-evaluator`, `yaml-schema`.
+- Affected code (expected):
+  - `packages/core/src/evaluation/types.ts` (new config/type)
+  - `packages/core/src/evaluation/yaml-parser.ts` (parsing inline config)
+  - `packages/core/src/evaluation/loaders/evaluator-parser.ts` (validation)
+  - `packages/core/src/evaluation/evaluators/llm-judge.ts` (prompt + scoring)
+  - `packages/core/src/evaluation/validation/*` (range validation helper)
+  - Tests under `packages/core/test/**`
+
+## Non-Goals
+- Do not replace checklist rubrics.
+- Do not change `EvaluationScore.score` away from 0–1.
+- Do not add new CLI UX beyond schema support (future enhancement could generate range rubrics).
diff --git a/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md b/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md
new file mode 100644
index 00000000..3d0b2d48
--- /dev/null
+++ b/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md
@@ -0,0 +1,44 @@
+## MODIFIED Requirements
+
+### Requirement: Static Rubric Evaluation MUST support checklist and score-range rubrics
+The evaluator SHALL support rubric-based grading using a single `rubrics` field in one of two shapes:
+
+1) **Checklist rubrics** (BREAKING rename): per-item boolean checks with weighted aggregation, using `expected_outcome` (formerly `description`).
+2) **Score-range rubrics** (new, optional): a set of non-overlapping integer score ranges over 0–10 inclusive, each with an explicit `expected_outcome`.
+
+If score-range rubrics are configured, the evaluator SHALL instruct the judge to output a **single integer score** in 0..10 and then normalize it to 0..1 for the reported evaluation score.
+
+The system SHALL reject ambiguous configurations where `rubrics` mixes checklist and score-range entries.
+
+#### Scenario: Checklist rubrics continue to work
+- **GIVEN** an eval case with `rubrics` (id/expected_outcome/weight/required)
+- **WHEN** the rubric evaluator runs
+- **THEN** it SHALL grade using per-item boolean checks
+- **AND** the reported score SHALL be in 0..1
+
+#### Scenario: Range rubrics constrain scoring
+- **GIVEN** an eval case with `rubrics` consisting of multiple `score_range` entries and `expected_outcome` text
+- **WHEN** the rubric evaluator runs
+- **THEN** the judge SHALL be constrained to output an integer score in 0..10
+- **AND** the system SHALL normalize the score to 0..1 by dividing by 10
+
+#### Scenario: Invalid range rubrics are rejected
+- **GIVEN** a `rubrics` list of `score_range` entries with overlapping ranges or missing coverage of 0..10
+- **WHEN** the eval suite is loaded
+- **THEN** validation SHALL fail
+- **AND** the error message SHALL indicate the violated rule (overlap, bounds, or coverage)
+
+### Requirement: Structured Grading MUST produce validated results
+The evaluator SHALL validate judge output against a schema appropriate to the configured mode.
+
+#### Scenario: Range rubric output schema
+- **GIVEN** a range-rubric configuration
+- **WHEN** the judge responds
+- **THEN** the evaluator SHALL accept a JSON object matching:
+```typescript
+z.object({
+  score: z.number().int().min(0).max(10),
+  reasoning: z.string().optional(),
+})
+```
+- **AND** AgentV SHALL normalize `score / 10` into the standard 0..1 result.
diff --git a/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md b/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md
new file mode 100644
index 00000000..948d8f0c
--- /dev/null
+++ b/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md
@@ -0,0 +1,53 @@
+## ADDED Requirements
+
+### Requirement: Checklist rubric field name MUST be `expected_outcome`
+The YAML schema SHALL accept checklist rubric objects using `expected_outcome` (replacing the legacy `description`).
+
+#### Scenario: Checklist rubric uses expected_outcome
+- **GIVEN** a YAML eval case with:
+```yaml
+rubrics:
+  - id: structure
+    expected_outcome: Has clear headings and organization
+    weight: 1.0
+    required: true
+```
+- **WHEN** the YAML is parsed
+- **THEN** schema validation succeeds
+
+### Requirement: Score-range rubrics MUST be supported for LLM judging
+The YAML schema SHALL support configuring score-range rubrics for `llm_judge` evaluators via the existing `rubrics` field.
+
+#### Scenario: Configure score-range rubrics
+- **GIVEN** a YAML eval case with:
+```yaml
+evaluators:
+  - name: correctness
+    type: llm_judge
+    rubrics:
+      - score_range: [0, 2]
+        expected_outcome: Factually incorrect.
+      - score_range: [3, 6]
+        expected_outcome: Mostly correct.
+      - score_range: [7, 9]
+        expected_outcome: Correct but missing minor details.
+      - score_range: [10, 10]
+        expected_outcome: Fully correct.
+```
+- **WHEN** the YAML is parsed
+- **THEN** the evaluator configuration SHALL include the provided score ranges
+
+#### Scenario: Reject overlapping score ranges
+- **GIVEN** a YAML eval case with overlapping ranges
+- **WHEN** the YAML is parsed
+- **THEN** schema validation SHALL fail
+
+#### Scenario: Reject incomplete 0..10 coverage
+- **GIVEN** a YAML eval case where score ranges do not cover 0..10 inclusive
+- **WHEN** the YAML is parsed
+- **THEN** schema validation SHALL fail
+
+#### Scenario: Reject empty expected_outcome
+- **GIVEN** a YAML eval case where a range rubric entry has an empty `expected_outcome`
+- **WHEN** the YAML is parsed
+- **THEN** schema validation SHALL fail
diff --git a/openspec/changes/add-rubric-score-ranges/tasks.md b/openspec/changes/add-rubric-score-ranges/tasks.md
new file mode 100644
index 00000000..bd86dfb2
--- /dev/null
+++ b/openspec/changes/add-rubric-score-ranges/tasks.md
@@ -0,0 +1,26 @@
+## 1. Schema & Types
+- [ ] 1.1 Add `ScoreRangeRubric` types (0–10 integer ranges) to core evaluation types
+- [ ] 1.2 Extend evaluator config to accept optional `score_rubric` (or `score_ranges`) field
+
+## 2. Validation
+- [ ] 2.1 Validate ranges are integers within 0..10 and start <= end
+- [ ] 2.2 Validate non-overlap across ranges
+- [ ] 2.3 Validate full coverage of 0..10 inclusive
+- [ ] 2.4 Validate each range has non-empty `expected_outcome`
+
+## 3. LLM Judge Integration
+- [ ] 3.1 Add prompt template for range-rubric scoring that requests integer `score` 0..10
+- [ ] 3.2 Normalize final score to 0..1 (divide by 10) and keep existing verdict logic
+- [ ] 3.3 Store raw 0–10 score in `details` (or `evaluatorRawRequest/Response`) for debugging
+
+## 4. YAML Support
+- [ ] 4.1 Support `score_rubric` in YAML evaluator config (snake_case)
+- [ ] 4.2 Decide if inline `rubrics:` sugar can support range rubrics (or keep evaluator-only)
+
+## 5. Tests
+- [ ] 5.1 Unit tests for validation (overlap, gaps, bounds)
+- [ ] 5.2 Unit/integration tests for llm_judge parsing + normalization
+
+## 6. Docs
+- [ ] 6.1 Update rubric-evaluator skill/reference docs to include range rubrics
+- [ ] 6.2 Add examples of good/bad range definitions
diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts
index ab8a5dd5..fb643e8b 100644
--- a/packages/core/src/evaluation/evaluators/llm-judge.ts
+++ b/packages/core/src/evaluation/evaluators/llm-judge.ts
@@ -231,7 +231,7 @@ export class LlmJudgeEvaluator implements Evaluator {
     for (const rubric of rubrics) {
       const requiredLabel = rubric.required ? ' (REQUIRED)' : '';
       const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : '';
-      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
     }
 
     parts.push('', 'For each rubric, determine if it is satisfied and provide brief reasoning.');
@@ -353,9 +353,9 @@ function calculateRubricScore(
 
     if (check.satisfied) {
       earnedWeight += rubric.weight;
-      hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
     } else {
-      misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
       if (rubric.required) {
         failedRequired = true;
       }
diff --git a/packages/core/src/evaluation/generators/rubric-generator.ts b/packages/core/src/evaluation/generators/rubric-generator.ts
index 54ee01c3..44cb54e2 100644
--- a/packages/core/src/evaluation/generators/rubric-generator.ts
+++ b/packages/core/src/evaluation/generators/rubric-generator.ts
@@ -6,7 +6,7 @@ import type { RubricItem } from '../types.js';
 
 const rubricItemSchema = z.object({
   id: z.string().describe('Short identifier for this rubric (e.g., clarity, completeness)'),
-  description: z.string().describe('What this rubric checks for'),
+  expected_outcome: z.string().describe('Concrete expected outcome for this rubric item'),
   weight: z.number().default(1.0).describe('Relative importance (default 1.0)'),
   required: z.boolean().default(true).describe('Whether this is a mandatory requirement'),
 });
@@ -43,7 +43,7 @@ You must return a valid JSON object matching this schema:
   "rubrics": [
     {
       "id": "string (short identifier)",
-      "description": "string (what to check)",
+      "expected_outcome": "string (concrete expected outcome for this rubric item)",
       "weight": number (default 1.0),
       "required": boolean (default true)
     }
@@ -86,7 +86,7 @@ function buildPrompt(expectedOutcome: string, question?: string, referenceAnswer
     'Each rubric should:',
     '- Be specific and testable',
     '- Have a short, descriptive ID',
-    '- Include a clear description of what to check',
+    '- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)',
     '- Indicate if it is required (mandatory) or optional',
     '- Have an appropriate weight (default 1.0, use higher values for more important aspects)',
     '',
diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts
index e5cfeb4c..54f45738 100644
--- a/packages/core/src/evaluation/loaders/evaluator-parser.ts
+++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts
@@ -554,11 +554,11 @@ export async function parseEvaluators(
           .filter((r): r is JsonObject => isJsonObject(r))
           .map((rubric, index) => ({
             id: asString(rubric.id) ?? `rubric-${index + 1}`,
-            description: asString(rubric.description) ?? '',
+            expected_outcome: asString(rubric.expected_outcome) ?? '',
             weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0,
             required: typeof rubric.required === 'boolean' ? rubric.required : true,
           }))
-          .filter((r) => r.description.length > 0)
+          .filter((r) => r.expected_outcome.length > 0)
       : undefined;
 
     if (typeValue === 'rubric') {
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index c7f571f3..d5e3c3b1 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -202,7 +202,7 @@ export type LlmJudgeEvaluatorConfig = {
 
 export type RubricItem = {
   readonly id: string;
-  readonly description: string;
+  readonly expected_outcome: string;
   readonly weight: number;
   readonly required: boolean;
 };
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index 7633c4d5..98d5725d 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -224,19 +224,19 @@ export async function loadEvalCases(
           if (typeof rubric === 'string') {
             return {
               id: `rubric-${index + 1}`,
-              description: rubric,
+              expected_outcome: rubric,
               weight: 1.0,
               required: true,
             };
           }
           return {
             id: asString(rubric.id) ?? `rubric-${index + 1}`,
-            description: asString(rubric.description) ?? '',
+            expected_outcome: asString(rubric.expected_outcome) ?? '',
             weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0,
             required: typeof rubric.required === 'boolean' ? rubric.required : true,
           };
         })
-        .filter((r) => r.description.length > 0);
+        .filter((r) => r.expected_outcome.length > 0);
 
       if (rubricItems.length > 0) {
         const rubricEvaluator: import('./types.js').LlmJudgeEvaluatorConfig = {

From 129f5b05fb7e8eae85948a694cc6b6bde74c87b0 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 9 Jan 2026 13:49:59 +1100
Subject: [PATCH 2/9] Format package.json arrays for Biome

---
 apps/cli/package.json      | 5 +----
 packages/core/package.json | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/apps/cli/package.json b/apps/cli/package.json
index c2392d0d..b440ea4f 100644
--- a/apps/cli/package.json
+++ b/apps/cli/package.json
@@ -14,10 +14,7 @@
   "bin": {
     "agentv": "./dist/cli.js"
   },
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "scripts": {
     "dev": "bun --watch src/index.ts",
     "build": "tsup && bun run copy-readme",
diff --git a/packages/core/package.json b/packages/core/package.json
index 18b90569..94210544 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -36,10 +36,7 @@
     "test:watch": "bun test --watch",
     "diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts"
   },
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "dependencies": {
     "@ai-sdk/anthropic": "^2.0.53",
     "@ai-sdk/azure": "^2.0.78",

From 1db89fbdcea1a1894db32b8b3fe2e6aff445e677 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 9 Jan 2026 14:23:45 +1100
Subject: [PATCH 3/9] Amend score-range proposal: required_min_score +
 per-criterion score_ranges

---
 .../changes/add-rubric-score-ranges/design.md | 47 +++++++++++--------
 .../add-rubric-score-ranges/proposal.md       | 29 +++++++-----
 .../specs/rubric-evaluator/spec.md            | 26 +++++-----
 .../specs/yaml-schema/spec.md                 | 41 +++++++++++-----
 .../changes/add-rubric-score-ranges/tasks.md  | 22 +++++----
 5 files changed, 101 insertions(+), 64 deletions(-)

diff --git a/openspec/changes/add-rubric-score-ranges/design.md b/openspec/changes/add-rubric-score-ranges/design.md
index 5c9764db..5210937b 100644
--- a/openspec/changes/add-rubric-score-ranges/design.md
+++ b/openspec/changes/add-rubric-score-ranges/design.md
@@ -4,42 +4,51 @@ AgentV currently supports rubric-based evaluation by converting `rubrics` into `
 External best practice (DeepEval/Confident AI) adds an additional pattern: **score-range rubrics**, where the judge chooses an integer score in 0..10 constrained by explicit ranges with concrete expected outcomes, then the framework normalizes to 0..1.
 
 ## Decision
-Keep a **single rubric system** by extending the existing `rubrics` field to support an additional rubric entry shape for **score-range scoring**, without removing the existing checklist scoring.
+Evolve to a **single rubric system** that supports both "DeepEval-style" banded scoring and multi-criterion weighted scoring by introducing **per-criterion score ranges**.
+
+Each rubric criterion keeps an `id` (and optional `weight`), but can optionally include `score_ranges` that define non-overlapping 0–10 bands with concrete expected outcomes. The judge returns an integer score 0..10 per criterion; the runtime normalizes each to 0..1 and aggregates deterministically.
 
 This change also includes a **breaking rename** for checklist rubrics: `description` → `expected_outcome`.
 
+The existing `required: boolean` is replaced (in the proposed primary shape) by `required_min_score: int` gating. `required` remains accepted as a deprecated alias during migration.
+
 ### Proposed YAML Shape
 ```yaml
 evaluators:
   - name: correctness
     type: llm_judge
     rubrics:
-      - score_range: [0, 2]
-        expected_outcome: Factually incorrect.
-      - score_range: [3, 6]
-        expected_outcome: Mostly correct but includes notable errors or omissions.
-      - score_range: [7, 9]
-        expected_outcome: Correct with minor missing details.
-      - score_range: [10, 10]
-        expected_outcome: Fully correct and complete.
+      - id: correctness
+        weight: 1.0
+        required_min_score: 10
+        score_ranges:
+          - score_range: [0, 2]
+            expected_outcome: Factually incorrect.
+          - score_range: [3, 6]
+            expected_outcome: Mostly correct but includes notable errors or omissions.
+          - score_range: [7, 9]
+            expected_outcome: Correct with minor missing details.
+          - score_range: [10, 10]
+            expected_outcome: Fully correct and complete.
 ```
 
 ### Output Contract
-- Judge returns `score` as an **integer** in `0..10`.
-- AgentV normalizes to `0..1` by dividing by 10.
-- Preserve existing verdict thresholds (`>=0.8 pass`, `>=0.6 borderline`, else fail) and required-item behavior for checklist mode.
+- Judge returns a **per-criterion** `score` as an integer in `0..10` for each rubric `id`.
+- AgentV normalizes each to `0..1` by dividing by 10 and aggregates deterministically (weighted average).
+- If any criterion has `required_min_score` and the returned score is below it, the verdict is forced to `fail`.
+- Preserve existing verdict thresholds (`>=0.8 pass`, `>=0.6 borderline`, else fail).
 
 ## Validation Rules
 - Ranges are inclusive integer bounds.
 - Bounds must be within 0..10.
-- No overlap.
-- Must cover 0..10 inclusive.
+- No overlap (within a given rubric criterion).
+- Prefer full coverage of 0..10 inclusive (strict coverage recommended for determinism).
 - Each range must have non-empty `expected_outcome`.
 
 ## Backwards Compatibility
-- If `rubrics` contains checklist entries, existing checklist behavior remains the default.
-- If `rubrics` contains score-range entries, range scoring is used.
-- To avoid ambiguous mixing, the proposal treats `rubrics` as **either all checklist or all score-range entries**.
+- Existing checklist rubrics remain supported during migration.
+- `required: true` is treated as a deprecated alias for `required_min_score: 10`; `required: false` applies no gating.
+- New rubric criteria may include `score_ranges` for banded 0–10 scoring.
 
 ### Migration
 - Replace checklist rubric object field `description:` with `expected_outcome:`.
@@ -53,8 +62,8 @@ evaluators:
 ### Can score-range rubrics be deterministically mapped to the existing weighted-average system?
 Not in a semantics-preserving way.
 
-Score-range rubrics define a *single holistic ordinal grade* (an integer 0..10) with an expected outcome per interval.
-Checklist rubrics define *multiple independent criteria* with per-criterion weights and required flags, and compute a weighted fraction.
+Holistic score-range rubrics define a *single ordinal grade* (an integer 0..10) with an expected outcome per interval.
+Checklist rubrics define *multiple independent criteria* with per-criterion weights and gating, and compute a weighted fraction.
 
 Because the score-range system does not provide per-criterion truth values (or even a breakdown of which expectations were met), there is no deterministic transformation from a range choice into a unique checklist satisfaction vector.
 Any mapping from range → checklist would require adding assumptions (e.g., “a 7 implies all requirements A/B/C are satisfied”), which is equivalent to inventing extra semantics not present in the input.
diff --git a/openspec/changes/add-rubric-score-ranges/proposal.md b/openspec/changes/add-rubric-score-ranges/proposal.md
index 50f26d44..e66ebd81 100644
--- a/openspec/changes/add-rubric-score-ranges/proposal.md
+++ b/openspec/changes/add-rubric-score-ranges/proposal.md
@@ -12,23 +12,23 @@ Best-practice literature for LLM-as-a-judge rubric scoring (e.g., DeepEval/Confi
 Adding this as an **optional, backwards-compatible** scoring mode gives AgentV users a deterministic way to express custom metrics while keeping existing rubrics intact.
 
 ## What Changes
-- Extend the existing `rubrics` concept to support **two rubric shapes** under a single field:
-  - **Checklist rubrics** (breaking rename): `{ id, expected_outcome, weight, required }`
-  - **Score-range rubrics** (new, optional): `{ score_range: [start, end], expected_outcome }` over **0–10 inclusive**
+- Extend the existing `rubrics` concept to support **per-criterion score ranges** (analytic rubric scoring):
+  - Each rubric entry represents a criterion with an `id` and optional aggregation `weight`.
+  - Each criterion can include `score_ranges` (0–10 inclusive integer bands) with explicit `expected_outcome` text.
+  - The judge returns an integer score **0–10 per criterion**, which AgentV normalizes to **0–1** (divide by 10) and aggregates (weighted average).
 
-  This keeps a single rubric system and a single evaluator implementation while covering both use cases.
+- Replace `required: boolean` with `required_min_score: int` (0–10) for gating.
+  - If a criterion has `required_min_score`, the overall verdict MUST be `fail` when the criterion score is below that threshold.
 
-- When the evaluator is configured with score-range rubrics, it:
-  - Constrains the judge to output an integer **raw score 0–10**
-  - Normalizes to **0–1** (divide by 10) for the existing `EvaluationScore.score`
-- Add validation rules:
+- Add validation rules (for per-criterion score ranges):
   - Ranges MUST be integers within **0..10**
-  - Ranges MUST NOT overlap
-  - Ranges MUST cover **0..10** (inclusive)
+  - Ranges MUST NOT overlap within a criterion
+  - Ranges SHOULD cover **0..10** (inclusive) within a criterion (strict coverage is preferred for determinism)
   - Each range MUST include a non-empty `expected_outcome`
-- Preserve the current behavior:
-  - Existing `llm_judge` freeform scoring (0–1) unchanged
-  - Existing `llm_judge` rubric checklist scoring logic unchanged (only the field name changes)
+
+- Backwards compatibility:
+  - Existing checklist rubrics remain supported during migration.
+  - `required` is treated as a deprecated alias for `required_min_score: 10`.
 
 ## Breaking Changes
 - **BREAKING**: Rename checklist rubric field `description` → `expected_outcome`.
@@ -38,6 +38,9 @@ Adding this as an **optional, backwards-compatible** scoring mode gives AgentV u
     - `rubrics: [{ id: "x", expected_outcome: "...", weight: 1, required: true }]`
   - CLI `generate rubrics` output changes accordingly.
 
+- **DEPRECATION (proposed new primary shape; not breaking during migration)**: Prefer `required_min_score` over `required`.
+  - `required` remains accepted as a deprecated alias during migration.
+
 ## Impact
 - Affected specs: `rubric-evaluator`, `yaml-schema`.
 - Affected code (expected):
diff --git a/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md b/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md
index 3d0b2d48..5afc100b 100644
--- a/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md
+++ b/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md
@@ -1,14 +1,14 @@
 ## MODIFIED Requirements
 
 ### Requirement: Static Rubric Evaluation MUST support checklist and score-range rubrics
-The evaluator SHALL support rubric-based grading using a single `rubrics` field in one of two shapes:
+The evaluator SHALL support rubric-based grading using rubric criteria entries. Each criterion may be:
 
-1) **Checklist rubrics** (BREAKING rename): per-item boolean checks with weighted aggregation, using `expected_outcome` (formerly `description`).
-2) **Score-range rubrics** (new, optional): a set of non-overlapping integer score ranges over 0–10 inclusive, each with an explicit `expected_outcome`.
+1) **Checklist-style** (legacy): boolean checks per criterion using `expected_outcome` text.
+2) **Score-range per criterion** (new): each criterion contains `score_ranges` defining non-overlapping integer ranges over 0–10 inclusive, each with an explicit `expected_outcome`.
 
-If score-range rubrics are configured, the evaluator SHALL instruct the judge to output a **single integer score** in 0..10 and then normalize it to 0..1 for the reported evaluation score.
+When `score_ranges` are present for a criterion, the evaluator SHALL instruct the judge to output an **integer score 0..10 for that criterion** and SHALL normalize it to 0..1 (divide by 10) for aggregation.
 
-The system SHALL reject ambiguous configurations where `rubrics` mixes checklist and score-range entries.
+The evaluator SHALL support `required_min_score` gating: if a criterion specifies `required_min_score` and the returned score is below it, the overall verdict SHALL be `fail`.
 
 #### Scenario: Checklist rubrics continue to work
 - **GIVEN** an eval case with `rubrics` (id/description/weight/required)
@@ -17,10 +17,10 @@ The system SHALL reject ambiguous configurations where `rubrics` mixes checklist
 - **AND** the reported score SHALL be in 0..1
 
 #### Scenario: Range rubrics constrain scoring
-- **GIVEN** an eval case with `rubrics` consisting of multiple `score_range` entries and `expected_outcome` text
+- **GIVEN** an eval case with `rubrics` where a criterion contains `score_ranges` entries and `expected_outcome` text
 - **WHEN** the rubric evaluator runs
-- **THEN** the judge SHALL be constrained to output an integer score in 0..10
-- **AND** the system SHALL normalize the score to 0..1 by dividing by 10
+- **THEN** the judge SHALL be constrained to output an integer score in 0..10 for that criterion
+- **AND** the system SHALL normalize each criterion score to 0..1 by dividing by 10
 
 #### Scenario: Invalid range rubrics are rejected
 - **GIVEN** a `score_rubric` with overlapping ranges or missing coverage of 0..10
@@ -37,8 +37,12 @@ The evaluator SHALL validate judge output against a schema appropriate to the co
 - **THEN** the evaluator SHALL accept a JSON object matching:
 ```typescript
 z.object({
-  score: z.number().int().min(0).max(10),
-  reasoning: z.string().optional(),
+  checks: z.array(z.object({
+    id: z.string(),
+    score: z.number().int().min(0).max(10),
+    reasoning: z.string().optional(),
+  })),
+  overall_reasoning: z.string().optional(),
 })
 ```
-- **AND** AgentV SHALL normalize `score / 10` into the standard 0..1 result.
+- **AND** AgentV SHALL normalize each criterion score to 0..1 (`score / 10`) and aggregate the normalized scores (weighted average) into the standard 0..1 result.
diff --git a/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md b/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md
index 948d8f0c..eb085666 100644
--- a/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md
+++ b/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md
@@ -10,13 +10,28 @@ rubrics:
   - id: structure
     expected_outcome: Has clear headings and organization
     weight: 1.0
-    required: true
+    required_min_score: 10
 ```
 - **WHEN** the YAML is parsed
 - **THEN** schema validation succeeds
 
-### Requirement: Score-range rubrics MUST be supported for LLM judging
-The YAML schema SHALL support configuring score-range rubrics for `llm_judge` evaluators via the existing `rubrics` field.
+### Requirement: Rubric gating MUST support required_min_score
+The YAML schema SHALL support `required_min_score` (0..10) on rubric criteria to enforce hard-gating.
+
+#### Scenario: required_min_score gates rubric criteria
+- **GIVEN** a YAML eval case with:
+```yaml
+rubrics:
+  - id: correctness
+    weight: 2.0
+    required_min_score: 10
+    expected_outcome: Must be fully correct.
+```
+- **WHEN** the YAML is parsed
+- **THEN** schema validation succeeds
+
+### Requirement: Per-criterion score_ranges rubrics MUST be supported for LLM judging
+The YAML schema SHALL support configuring per-criterion `score_ranges` for `llm_judge` evaluators via the existing `rubrics` field.
 
 #### Scenario: Configure score_rubric
 - **GIVEN** a YAML eval case with:
@@ -25,14 +40,18 @@ evaluators:
   - name: correctness
     type: llm_judge
     rubrics:
-      - score_range: [0, 2]
-        expected_outcome: Factually incorrect.
-      - score_range: [3, 6]
-        expected_outcome: Mostly correct.
-      - score_range: [7, 9]
-        expected_outcome: Correct but missing minor details.
-      - score_range: [10, 10]
-        expected_outcome: Fully correct.
+      - id: correctness
+        weight: 1.0
+        required_min_score: 10
+        score_ranges:
+          - score_range: [0, 2]
+            expected_outcome: Factually incorrect.
+          - score_range: [3, 6]
+            expected_outcome: Mostly correct.
+          - score_range: [7, 9]
+            expected_outcome: Correct but missing minor details.
+          - score_range: [10, 10]
+            expected_outcome: Fully correct.
 ```
 - **WHEN** the YAML is parsed
 - **THEN** the evaluator configuration SHALL include the provided score ranges
diff --git a/openspec/changes/add-rubric-score-ranges/tasks.md b/openspec/changes/add-rubric-score-ranges/tasks.md
index bd86dfb2..258fe043 100644
--- a/openspec/changes/add-rubric-score-ranges/tasks.md
+++ b/openspec/changes/add-rubric-score-ranges/tasks.md
@@ -1,25 +1,27 @@
 ## 1. Schema & Types
-- [ ] 1.1 Add `ScoreRangeRubric` types (0–10 integer ranges) to core evaluation types
-- [ ] 1.2 Extend evaluator config to accept optional `score_rubric` (or `score_ranges`) field
+- [ ] 1.1 Add `ScoreRange` and `RubricCriterion` types (per-criterion 0–10 integer ranges) to core evaluation types
+- [ ] 1.2 Extend rubric criteria to accept `score_ranges` and `required_min_score` (deprecate `required`)
 
 ## 2. Validation
 - [ ] 2.1 Validate ranges are integers within 0..10 and start <= end
-- [ ] 2.2 Validate non-overlap across ranges
-- [ ] 2.3 Validate full coverage of 0..10 inclusive
+- [ ] 2.2 Validate non-overlap within each criterion's ranges
+- [ ] 2.3 Validate (preferred) full coverage of 0..10 inclusive per criterion
 - [ ] 2.4 Validate each range has non-empty `expected_outcome`
+- [ ] 2.5 Validate `required_min_score` is an integer within 0..10
 
 ## 3. LLM Judge Integration
-- [ ] 3.1 Add prompt template for range-rubric scoring that requests integer `score` 0..10
-- [ ] 3.2 Normalize final score to 0..1 (divide by 10) and keep existing verdict logic
-- [ ] 3.3 Store raw 0–10 score in `details` (or `evaluatorRawRequest/Response`) for debugging
+- [ ] 3.1 Add prompt template for per-criterion score-range scoring that requests integer `score` 0..10 per rubric `id`
+- [ ] 3.2 Normalize criterion scores to 0..1 (divide by 10) and aggregate deterministically (weighted average)
+- [ ] 3.3 Apply `required_min_score` gating (force fail when any gated criterion is below threshold)
+- [ ] 3.4 Store raw 0–10 scores in `details` (or `evaluatorRawRequest/Response`) for debugging
 
 ## 4. YAML Support
-- [ ] 4.1 Support `score_rubric` in YAML evaluator config (snake_case)
-- [ ] 4.2 Decide if inline `rubrics:` sugar can support range rubrics (or keep evaluator-only)
+- [ ] 4.1 Support `score_ranges` nested under each rubric criterion in YAML
+- [ ] 4.2 Support `required_min_score` in YAML and treat legacy `required: true` as `required_min_score: 10`
 
 ## 5. Tests
 - [ ] 5.1 Unit tests for validation (overlap, gaps, bounds)
-- [ ] 5.2 Unit/integration tests for llm_judge parsing + normalization
+- [ ] 5.2 Unit/integration tests for llm_judge parsing + normalization + gating
 
 ## 6. Docs
 - [ ] 6.1 Update rubric-evaluator skill/reference docs to include range rubrics

From 143fd1906c866cf23f4f12f1b00fbfa1cfb124d3 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 9 Jan 2026 14:25:45 +1100
Subject: [PATCH 4/9] Update score-range rubric example to score_ranges

---
 examples/features/rubric/evals/dataset.yaml | 27 ++++++++++++---------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/examples/features/rubric/evals/dataset.yaml b/examples/features/rubric/evals/dataset.yaml
index 87837649..6a81e557 100644
--- a/examples/features/rubric/evals/dataset.yaml
+++ b/examples/features/rubric/evals/dataset.yaml
@@ -180,7 +180,7 @@ evalcases:
 
   # ==========================================
   # Example 5: Score-range rubrics (PROPOSED)
-  # Demonstrates: proposed 0–10 score_range rubrics with expected_outcome per range
+  # Demonstrates: proposed per-criterion 0–10 score_ranges with required_min_score gating
   # Status: This example matches the OpenSpec change proposal `add-rubric-score-ranges`.
   #         It is NOT supported by the current runtime until that change is implemented.
   # ==========================================
@@ -197,14 +197,19 @@ evalcases:
       - role: assistant
         content: 42
 
-    # Proposed polymorphic `rubrics` entries.
-    # Each entry defines an inclusive integer score range (0..10) with a concrete expected outcome.
+    # Proposed rubric criterion with score_ranges.
+    # The judge assigns an integer score 0..10 for each criterion id.
+    # AgentV normalizes each criterion score to 0..1 and aggregates deterministically.
     rubrics:
-      - score_range: [0, 2]
-        expected_outcome: Incorrect or nonsensical answer.
-      - score_range: [3, 6]
-        expected_outcome: Partially correct but has clear errors or missing reasoning.
-      - score_range: [7, 9]
-        expected_outcome: Correct answer with minor issues (e.g., unclear explanation).
-      - score_range: [10, 10]
-        expected_outcome: Fully correct and clear.
+      - id: correctness
+        weight: 1.0
+        required_min_score: 10
+        score_ranges:
+          - score_range: [0, 2]
+            expected_outcome: Incorrect or nonsensical answer.
+          - score_range: [3, 6]
+            expected_outcome: Partially correct but has clear errors or missing reasoning.
+          - score_range: [7, 9]
+            expected_outcome: Correct answer with minor issues (e.g., unclear explanation).
+          - score_range: [10, 10]
+            expected_outcome: Fully correct and clear.

From 751ffba6143898af7e5b42fc5a132541d2dd89c0 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 9 Jan 2026 14:30:41 +1100
Subject: [PATCH 5/9] Add multi-criteria score_ranges eval example

---
 examples/features/rubric/evals/dataset.yaml | 58 +++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/examples/features/rubric/evals/dataset.yaml b/examples/features/rubric/evals/dataset.yaml
index 6a81e557..5e623fda 100644
--- a/examples/features/rubric/evals/dataset.yaml
+++ b/examples/features/rubric/evals/dataset.yaml
@@ -213,3 +213,61 @@ evalcases:
             expected_outcome: Correct answer with minor issues (e.g., unclear explanation).
           - score_range: [10, 10]
             expected_outcome: Fully correct and clear.
+
+  # ==========================================
+  # Example 6: Multi-criteria score_ranges (PROPOSED)
+  # Demonstrates: multiple rubric ids, each with 0–10 score_ranges, then weighted aggregation.
+  # Real-world intent: grading a summary on both factual accuracy and brevity.
+  # Status: Proposed only; not supported until `add-rubric-score-ranges` is implemented.
+  # ==========================================
+  - id: summary-multi-criteria-score-ranges-proposed
+
+    expected_outcome: |-
+      Provide an accurate summary in under 50 words.
+
+    input_messages:
+      - role: user
+        content: |-
+          Summarize this article in under 50 words:
+
+          Climate change is accelerating faster than predicted. Recent studies show
+          Arctic ice melting at unprecedented rates, sea levels rising, and extreme
+          weather events becoming more frequent. Scientists urge immediate action to
+          reduce carbon emissions and transition to renewable energy sources.
+
+    expected_messages:
+      - role: assistant
+        content: |-
+          Climate change is accelerating with rapid Arctic ice loss, rising seas, and
+          more extreme weather. Scientists urge urgent emissions cuts and a transition
+          to renewable energy.
+
+    rubrics:
+      - id: factual_accuracy
+        weight: 2.0
+        required_min_score: 8
+        score_ranges:
+          - score_range: [0, 2]
+            expected_outcome: Contains major factual errors or contradicts the article.
+          - score_range: [3, 5]
+            expected_outcome: Mostly on-topic but includes at least one clear factual error or misstates a key claim.
+          - score_range: [6, 7]
+            expected_outcome: Generally accurate but misses an important point or slightly distorts emphasis.
+          - score_range: [8, 9]
+            expected_outcome: Accurate and covers the key points with only minor omissions.
+          - score_range: [10, 10]
+            expected_outcome: Fully accurate, captures all key points with no distortions.
+
+      - id: brevity_and_clarity
+        weight: 1.0
+        score_ranges:
+          - score_range: [0, 2]
+            expected_outcome: Exceeds 50 words or is hard to understand.
+          - score_range: [3, 5]
+            expected_outcome: Under 50 words but unclear, repetitive, or poorly structured.
+          - score_range: [6, 7]
+            expected_outcome: Under 50 words and mostly clear, but could be more concise or better phrased.
+          - score_range: [8, 9]
+            expected_outcome: Under 50 words, clear and concise.
+          - score_range: [10, 10]
+            expected_outcome: Under 50 words, exceptionally clear, concise, and well phrased.

From a81e1032317057ae18e91e7a9ad4c91371ac2193 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Fri, 9 Jan 2026 14:31:19 +1100
Subject: [PATCH 6/9] Remove redundant single-criterion score_ranges example

---
 examples/features/rubric/evals/dataset.yaml | 38 +--------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

diff --git a/examples/features/rubric/evals/dataset.yaml b/examples/features/rubric/evals/dataset.yaml
index 5e623fda..6b902412 100644
--- a/examples/features/rubric/evals/dataset.yaml
+++ b/examples/features/rubric/evals/dataset.yaml
@@ -179,43 +179,7 @@ evalcases:
     # To generate rubrics: agentv generate rubrics evals/rubric-examples.yaml
 
   # ==========================================
-  # Example 5: Score-range rubrics (PROPOSED)
-  # Demonstrates: proposed per-criterion 0–10 score_ranges with required_min_score gating
-  # Status: This example matches the OpenSpec change proposal `add-rubric-score-ranges`.
-  #         It is NOT supported by the current runtime until that change is implemented.
-  # ==========================================
-  - id: correctness-score-range-proposed
-
-    expected_outcome: |-
-      Answer the question correctly and completely.
-
-    input_messages:
-      - role: user
-        content: What is 15 + 27?
-
-    expected_messages:
-      - role: assistant
-        content: 42
-
-    # Proposed rubric criterion with score_ranges.
-    # The judge assigns an integer score 0..10 for each criterion id.
-    # AgentV normalizes each criterion score to 0..1 and aggregates deterministically.
-    rubrics:
-      - id: correctness
-        weight: 1.0
-        required_min_score: 10
-        score_ranges:
-          - score_range: [0, 2]
-            expected_outcome: Incorrect or nonsensical answer.
-          - score_range: [3, 6]
-            expected_outcome: Partially correct but has clear errors or missing reasoning.
-          - score_range: [7, 9]
-            expected_outcome: Correct answer with minor issues (e.g., unclear explanation).
-          - score_range: [10, 10]
-            expected_outcome: Fully correct and clear.
-
-  # ==========================================
-  # Example 6: Multi-criteria score_ranges (PROPOSED)
+  # Example 5: Multi-criteria score_ranges (PROPOSED)
   # Demonstrates: multiple rubric ids, each with 0–10 score_ranges, then weighted aggregation.
   # Real-world intent: grading a summary on both factual accuracy and brevity.
   # Status: Proposed only; not supported until `add-rubric-score-ranges` is implemented.

From 3fa4d06b63029cb699128fcf1f27468d7eb08de5 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Wed, 21 Jan 2026 12:19:00 +0000
Subject: [PATCH 7/9] Implement score_ranges rubrics with backward-compatible
 description alias

- Add description as backward-compatible alias for expected_outcome in all parsers
  (evaluator-parser, yaml-parser, jsonl-parser)
- Add ScoreRange and extended RubricItem types with score_ranges and required_min_score
- Implement score_ranges parsing with validation:
  - Ranges must be integers 0-10
  - Ranges must not overlap
  - Ranges must cover 0-10 inclusive
  - Each range requires non-empty expected_outcome
- Implement score-range evaluation in llm-judge:
  - Detect score_ranges rubrics automatically
  - Build specialized prompt for 0-10 integer scoring
  - Normalize scores to 0-1 (divide by 10)
  - Apply required_min_score gating (legacy required: true = required_min_score: 10)
- Update test to use expected_outcome field name
---
 .../src/evaluation/evaluators/llm-judge.ts    | 233 ++++++++++++++++++
 .../evaluation/loaders/evaluator-parser.ts    | 208 +++++++++++++++-
 .../src/evaluation/loaders/jsonl-parser.ts    |   9 +-
 packages/core/src/evaluation/types.ts         |  40 ++-
 packages/core/src/evaluation/yaml-parser.ts   |   5 +-
 .../core/test/evaluation/evaluators.test.ts   |   4 +-
 6 files changed, 482 insertions(+), 17 deletions(-)

diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts
index fb643e8b..0974f2ef 100644
--- a/packages/core/src/evaluation/evaluators/llm-judge.ts
+++ b/packages/core/src/evaluation/evaluators/llm-judge.ts
@@ -57,6 +57,21 @@ const rubricEvaluationSchema = z.object({
   overall_reasoning: z.string().describe('Overall assessment summary (1-2 sentences)'),
 });
 
+/**
+ * Judge output schemas for score-range rubrics. Each check names a criterion `id` and an
+ * integer `score` validated to 0-10 (not a boolean); both reasoning fields are optional.
+ */
+const scoreRangeCheckResultSchema = z.object({
+  id: z.string().describe('The ID of the rubric criterion being scored'),
+  score: z.number().int().min(0).max(10).describe('Integer score 0-10 for this criterion'),
+  reasoning: z.string().describe('Brief explanation (1-2 sentences) for this score').optional(),
+});
+
+const scoreRangeEvaluationSchema = z.object({
+  checks: z.array(scoreRangeCheckResultSchema).describe('Scores for each rubric criterion'),
+  overall_reasoning: z.string().describe('Overall assessment summary (1-2 sentences)').optional(),
+});
+
 export { freeformEvaluationSchema };
 
 export class LlmJudgeEvaluator implements Evaluator {
@@ -175,6 +190,13 @@ export class LlmJudgeEvaluator implements Evaluator {
       );
     }
 
+    // Detect if any rubric uses score_ranges (analytic rubric mode)
+    const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
+
+    if (hasScoreRanges) {
+      return this.evaluateWithScoreRanges(context, judgeProvider, rubrics);
+    }
+
     const prompt = this.buildRubricPrompt(context, rubrics);
     const systemPrompt = buildRubricOutputSchema();
 
@@ -205,6 +227,112 @@ export class LlmJudgeEvaluator implements Evaluator {
     };
   }
 
+  /**
+   * Runs score-range (analytic) grading: the judge scores every criterion 0-10 and
+   * calculateScoreRangeResult converts those scores into the final score and verdict.
+   */
+  private async evaluateWithScoreRanges(
+    context: EvaluationContext,
+    judgeProvider: Provider,
+    rubrics: readonly RubricItem[],
+  ): Promise<EvaluationScore> {
+    const userPrompt = this.buildScoreRangePrompt(context, rubrics);
+    const systemPrompt = buildScoreRangeOutputSchema();
+
+    // Snapshot of what is sent to the judge, returned alongside the result.
+    const evaluatorRawRequest: JsonObject = {
+      userPrompt,
+      systemPrompt,
+      target: judgeProvider.targetName,
+    };
+
+    const judged = await this.runWithRetry({
+      context,
+      judgeProvider,
+      systemPrompt,
+      userPrompt,
+      schema: scoreRangeEvaluationSchema,
+    });
+    const outcome = calculateScoreRangeResult(judged.data, rubrics);
+
+    return {
+      score: outcome.score,
+      verdict: outcome.verdict,
+      hits: outcome.hits,
+      misses: outcome.misses,
+      expectedAspectCount: rubrics.length,
+      reasoning: judged.data.overall_reasoning,
+      evaluatorRawRequest,
+      details: outcome.details,
+    };
+  }
+
+  /**
+   * Build the judge prompt for score-range rubric evaluation: question, expected outcome,
+   * optional reference answer, candidate answer, then each criterion with its score ranges.
+   * Legacy `required: true` is labelled as min score 10, the same threshold scoring gates on.
+   */
+  private buildScoreRangePrompt(
+    context: EvaluationContext,
+    rubrics: readonly RubricItem[],
+  ): string {
+    const formattedQuestion =
+      context.promptInputs.question && context.promptInputs.question.trim().length > 0
+        ? context.promptInputs.question
+        : context.evalCase.question;
+
+    const parts: string[] = [
+      'You are an expert evaluator. Score the candidate answer on each criterion below using the provided score ranges.',
+      'For each criterion, output an integer score from 0 to 10 based on which score range best matches the answer.',
+      '',
+      '[[ ## question ## ]]',
+      formattedQuestion,
+      '',
+      '[[ ## expected_outcome ## ]]',
+      context.evalCase.expected_outcome,
+      '',
+    ];
+
+    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+      parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, '');
+    }
+
+    parts.push(
+      '[[ ## candidate_answer ## ]]',
+      context.candidate,
+      '',
+      '[[ ## scoring_criteria ## ]]',
+    );
+
+    for (const rubric of rubrics) {
+      const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : '';
+      // Same precedence as scoring: explicit required_min_score, else legacy required => 10.
+      const minScore = rubric.required_min_score ?? (rubric.required === true ? 10 : undefined);
+      const minScoreLabel = minScore !== undefined ? ` [REQUIRED: min score ${minScore}]` : '';
+
+      parts.push('', `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
+      if (rubric.expected_outcome) {
+        parts.push(`Description: ${rubric.expected_outcome}`);
+      }
+
+      if (rubric.score_ranges && rubric.score_ranges.length > 0) {
+        parts.push('Score ranges:');
+        for (const range of rubric.score_ranges) {
+          const [min, max] = range.score_range;
+          const rangeLabel = min === max ? `${min}` : `${min}-${max}`;
+          parts.push(`  - Score ${rangeLabel}: ${range.expected_outcome}`);
+        }
+      }
+    }
+
+    parts.push(
+      '',
+      'For each criterion, provide an integer score 0-10 that matches one of its defined score ranges.',
+    );
+
+    return parts.join('\n');
+  }
+
   private buildRubricPrompt(context: EvaluationContext, rubrics: readonly RubricItem[]): string {
     const formattedQuestion =
       context.promptInputs.question && context.promptInputs.question.trim().length > 0
@@ -366,3 +494,108 @@ function calculateRubricScore(
   const verdict = failedRequired ? 'fail' : scoreToVerdict(score);
   return { score, verdict, hits, misses };
 }
+
+/** Returns the system prompt describing the JSON object the score-range judge must output. */
+function buildScoreRangeOutputSchema(): string {
+  return [
+    'You are an expert evaluator. Score the candidate answer on each criterion.',
+    'You must return a valid JSON object matching this schema:',
+    '{',
+    '  "checks": [',
+    '    {',
+    '      "id": "string (criterion id)",',
+    '      "score": integer (0-10),',
+    '      "reasoning": "string (brief explanation for score)"',
+    '    }',
+    '  ],',
+    '  "overall_reasoning": "string (summary, optional)"',
+    '}',
+    '',
+    'Important: The "score" must be an integer from 0 to 10 that falls within one of the defined score ranges for that criterion.',
+  ].join('\n');
+}
+
+/**
+ * Calculate score from score-range rubric evaluation results.
+ * - Normalizes each criterion score (0-10) to 0-1 and computes the weighted average
+ * - Gates on required_min_score (legacy `required: true` = 10): below it forces `fail`
+ * - Fails closed: a criterion the judge left unscored counts as 0 and is still gated
+ * - A repeated check id is counted once; checks for unknown ids are ignored
+ */
+function calculateScoreRangeResult(
+  result: z.infer<typeof scoreRangeEvaluationSchema>,
+  rubrics: readonly RubricItem[],
+): {
+  score: number;
+  verdict: 'pass' | 'fail' | 'borderline';
+  hits: string[];
+  misses: string[];
+  details: JsonObject;
+} {
+  // Index judge checks by id; a repeated id is counted once (the last entry wins).
+  const checkById = new Map(result.checks.map((check) => [check.id, check]));
+
+  const hits: string[] = [];
+  const misses: string[] = [];
+  const rawScores: Record<string, number> = {};
+  const unscoredCriteria: string[] = [];
+  let totalWeight = 0;
+  let weightedScoreSum = 0;
+  let failedRequired = false;
+
+  // Walk the configured rubrics rather than the judge output so nothing is skipped.
+  for (const rubric of rubrics) {
+    const check = checkById.get(rubric.id);
+    // Missing check => 0, so an omitted criterion cannot lift the average or dodge gating.
+    const rawScore = check ? Math.max(0, Math.min(10, check.score)) : 0; // Clamp to 0-10
+    if (check) {
+      rawScores[rubric.id] = rawScore;
+    } else {
+      unscoredCriteria.push(rubric.id);
+    }
+
+    totalWeight += rubric.weight;
+    weightedScoreSum += (rawScore / 10) * rubric.weight; // Normalize to 0-1, then weight
+
+    // Explicit required_min_score wins; legacy required: true means must score 10/10.
+    const requiredMinScore =
+      rubric.required_min_score ?? (rubric.required === true ? 10 : undefined);
+
+    // Find the matching score range description for reporting
+    const matchingRange = rubric.score_ranges?.find(
+      (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1],
+    );
+    const rangeNote = check ? (matchingRange?.expected_outcome ?? '') : 'not scored by judge';
+    const rangeSuffix = rangeNote ? ` (${rangeNote})` : ''; // No empty "()" without a band
+    const criterionLabel = rubric.expected_outcome ?? rubric.id;
+    const reasoningText = check?.reasoning ? `: ${check.reasoning}` : '';
+    const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10${rangeSuffix}${reasoningText}`;
+
+    // Gating failures are always misses and force the verdict to fail;
+    // otherwise a raw score of 7 or more is reported as a hit.
+    if (requiredMinScore !== undefined && rawScore < requiredMinScore) {
+      failedRequired = true;
+      misses.push(scoreInfo);
+    } else if (rawScore >= 7) {
+      hits.push(scoreInfo);
+    } else {
+      misses.push(scoreInfo);
+    }
+  }
+
+  const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
+  const verdict = failedRequired ? 'fail' : scoreToVerdict(score);
+
+  return {
+    score,
+    verdict,
+    hits,
+    misses,
+    details: {
+      raw_scores: rawScores,
+      unscored_criteria: unscoredCriteria,
+      normalization: 'score / 10',
+      aggregation: 'weighted_average',
+    },
+  };
+}
diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts
index 54f45738..91a546a5 100644
--- a/packages/core/src/evaluation/loaders/evaluator-parser.ts
+++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts
@@ -550,15 +550,7 @@ export async function parseEvaluators(
 
     const rawRubrics = rawEvaluator.rubrics;
     const parsedRubrics = Array.isArray(rawRubrics)
-      ? rawRubrics
-          .filter((r): r is JsonObject => isJsonObject(r))
-          .map((rubric, index) => ({
-            id: asString(rubric.id) ?? `rubric-${index + 1}`,
-            expected_outcome: asString(rubric.expected_outcome) ?? '',
-            weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0,
-            required: typeof rubric.required === 'boolean' ? rubric.required : true,
-          }))
-          .filter((r) => r.expected_outcome.length > 0)
+      ? parseRubricItems(rawRubrics, name, evalId)
       : undefined;
 
     if (typeValue === 'rubric') {
@@ -714,3 +706,201 @@ function isValidFieldAggregationType(
 ): value is import('../types.js').FieldAggregationType {
   return typeof value === 'string' && VALID_FIELD_AGGREGATION_TYPES.has(value);
 }
+
+/**
+ * Parse rubric items from raw YAML/JSON data.
+ * Supports both checklist rubrics and score-range rubrics.
+ */
+function parseRubricItems(
+  rawRubrics: readonly unknown[],
+  evaluatorName: string,
+  evalId: string,
+): import('../types.js').RubricItem[] | undefined {
+  const items: import('../types.js').RubricItem[] = [];
+
+  for (const [index, rawRubric] of rawRubrics.entries()) {
+    if (!isJsonObject(rawRubric)) {
+      logWarning(
+        `Skipping invalid rubric entry at index ${index} in evaluator '${evaluatorName}' (expected object)`,
+      );
+      continue;
+    }
+
+    const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
+    // Support both expected_outcome and description (backward compatibility)
+    const expectedOutcome =
+      asString(rawRubric.expected_outcome) ?? asString(rawRubric.description) ?? '';
+    const weight = typeof rawRubric.weight === 'number' ? rawRubric.weight : 1.0;
+
+    // Parse required_min_score (new) or required (legacy backward compat)
+    let requiredMinScore: number | undefined;
+    let required: boolean | undefined;
+
+    if (typeof rawRubric.required_min_score === 'number') {
+      const minScore = rawRubric.required_min_score;
+      if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
+        throw new Error(
+          `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`,
+        );
+      }
+      requiredMinScore = minScore;
+    }
+
+    if (typeof rawRubric.required === 'boolean') {
+      required = rawRubric.required;
+    }
+
+    // Parse score_ranges if present
+    let scoreRanges: import('../types.js').ScoreRange[] | undefined;
+    const rawScoreRanges = rawRubric.score_ranges;
+
+    if (rawScoreRanges !== undefined) {
+      if (!Array.isArray(rawScoreRanges)) {
+        throw new Error(
+          `Invalid score_ranges for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an array`,
+        );
+      }
+
+      scoreRanges = parseScoreRanges(rawScoreRanges, id, evaluatorName, evalId);
+
+      // For score-range rubrics, expected_outcome at rubric level is optional
+      items.push({
+        id,
+        weight,
+        ...(expectedOutcome.length > 0 ? { expected_outcome: expectedOutcome } : {}),
+        ...(required !== undefined ? { required } : {}),
+        ...(requiredMinScore !== undefined ? { required_min_score: requiredMinScore } : {}),
+        score_ranges: scoreRanges,
+      });
+    } else {
+      // Checklist rubric: expected_outcome is required
+      if (expectedOutcome.length === 0) {
+        logWarning(
+          `Skipping rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': missing expected_outcome`,
+        );
+        continue;
+      }
+
+      items.push({
+        id,
+        expected_outcome: expectedOutcome,
+        weight,
+        // Default to required: true if not specified (backward compatibility)
+        required: required ?? true,
+        ...(requiredMinScore !== undefined ? { required_min_score: requiredMinScore } : {}),
+      });
+    }
+  }
+
+  return items.length > 0 ? items : undefined;
+}
+
+/**
+ * Parse and validate score ranges for a rubric criterion.
+ * Validates:
+ * - Ranges are [min, max] with integers 0-10
+ * - min <= max
+ * - Non-overlapping ranges
+ * - Full coverage of 0-10 (throws if any integer score is left uncovered)
+ * - Each range has non-empty expected_outcome
+ */
+function parseScoreRanges(
+  rawRanges: readonly unknown[],
+  rubricId: string,
+  evaluatorName: string,
+  evalId: string,
+): import('../types.js').ScoreRange[] {
+  const ranges: import('../types.js').ScoreRange[] = [];
+
+  for (const [index, rawRange] of rawRanges.entries()) {
+    if (!isJsonObject(rawRange)) {
+      throw new Error(
+        `Invalid score_range entry at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': expected object`,
+      );
+    }
+
+    const scoreRangeValue = rawRange.score_range;
+    if (
+      !Array.isArray(scoreRangeValue) ||
+      scoreRangeValue.length !== 2 ||
+      typeof scoreRangeValue[0] !== 'number' ||
+      typeof scoreRangeValue[1] !== 'number'
+    ) {
+      throw new Error(
+        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be [min, max] array of two numbers`,
+      );
+    }
+
+    const [min, max] = scoreRangeValue;
+
+    // Validate integers in 0-10 range
+    if (!Number.isInteger(min) || !Number.isInteger(max)) {
+      throw new Error(
+        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be integers (got [${min}, ${max}])`,
+      );
+    }
+
+    if (min < 0 || min > 10 || max < 0 || max > 10) {
+      throw new Error(
+        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': values must be 0-10 (got [${min}, ${max}])`,
+      );
+    }
+
+    if (min > max) {
+      throw new Error(
+        `Invalid score_range at index ${index} for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': min must be <= max (got [${min}, ${max}])`,
+      );
+    }
+
+    // Validate expected_outcome
+    const expectedOutcome =
+      asString(rawRange.expected_outcome) ?? asString(rawRange.description) ?? '';
+    if (expectedOutcome.length === 0) {
+      throw new Error(
+        `Missing expected_outcome for score_range [${min}, ${max}] in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}'`,
+      );
+    }
+
+    ranges.push({
+      score_range: [min, max] as const,
+      expected_outcome: expectedOutcome,
+    });
+  }
+
+  // Validate non-overlapping ranges
+  const sortedRanges = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]);
+  for (let i = 1; i < sortedRanges.length; i++) {
+    const prev = sortedRanges[i - 1];
+    const curr = sortedRanges[i];
+    if (curr.score_range[0] <= prev.score_range[1]) {
+      throw new Error(
+        `Overlapping score_ranges in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': ` +
+          `[${prev.score_range[0]}, ${prev.score_range[1]}] overlaps with [${curr.score_range[0]}, ${curr.score_range[1]}]`,
+      );
+    }
+  }
+
+  // Validate full coverage of 0-10 (strict requirement per spec)
+  const covered = new Set<number>();
+  for (const range of ranges) {
+    for (let i = range.score_range[0]; i <= range.score_range[1]; i++) {
+      covered.add(i);
+    }
+  }
+
+  const missing: number[] = [];
+  for (let i = 0; i <= 10; i++) {
+    if (!covered.has(i)) {
+      missing.push(i);
+    }
+  }
+
+  if (missing.length > 0) {
+    throw new Error(
+      `Incomplete score_ranges coverage in rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': ` +
+        `missing coverage for scores: ${missing.join(', ')}. Ranges must cover all integers 0-10.`,
+    );
+  }
+
+  return ranges;
+}
diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts
index 4f2b9b73..185a33b8 100644
--- a/packages/core/src/evaluation/loaders/jsonl-parser.ts
+++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts
@@ -268,19 +268,22 @@ export async function loadEvalCasesFromJsonl(
           if (typeof rubric === 'string') {
             return {
               id: `rubric-${index + 1}`,
-              description: rubric,
+              expected_outcome: rubric,
               weight: 1.0,
               required: true,
             };
           }
+          // Support both expected_outcome and description (backward compatibility)
+          const expectedOutcome =
+            asString(rubric.expected_outcome) ?? asString(rubric.description) ?? '';
           return {
             id: asString(rubric.id) ?? `rubric-${index + 1}`,
-            description: asString(rubric.description) ?? '',
+            expected_outcome: expectedOutcome,
             weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0,
             required: typeof rubric.required === 'boolean' ? rubric.required : true,
           };
         })
-        .filter((r) => r.description.length > 0);
+        .filter((r) => r.expected_outcome.length > 0);
 
       if (rubricItems.length > 0) {
         const rubricEvaluator: import('../types.js').LlmJudgeEvaluatorConfig = {
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index d5e3c3b1..f41a63fb 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -200,11 +200,47 @@ export type LlmJudgeEvaluatorConfig = {
   readonly weight?: number;
 };
 
+/**
+ * Score range definition for analytic rubric scoring.
+ * Each range maps an integer score band (0-10) to an expected outcome description.
+ */
+export type ScoreRange = {
+  /** Inclusive integer range [min, max] within 0-10 */
+  readonly score_range: readonly [number, number];
+  /** Description of what this score range represents */
+  readonly expected_outcome: string;
+};
+
+/**
+ * Rubric item for LLM judge evaluation.
+ * Supports two modes:
+ * - Checklist mode: boolean satisfied/not-satisfied with `expected_outcome`
+ * - Score-range mode: 0-10 integer scoring with `score_ranges`
+ */
 export type RubricItem = {
   readonly id: string;
-  readonly expected_outcome: string;
+  /**
+   * For checklist rubrics: the expected outcome text (required).
+   * For score-range rubrics: optional overall criterion description.
+   */
+  readonly expected_outcome?: string;
   readonly weight: number;
-  readonly required: boolean;
+  /**
+   * Legacy boolean gating (deprecated, treated as required_min_score: 10).
+   * Use required_min_score instead for finer control.
+   */
+  readonly required?: boolean;
+  /**
+   * Minimum score (0-10) required to pass this criterion.
+   * If the criterion score is below this threshold, the overall verdict is 'fail'.
+   */
+  readonly required_min_score?: number;
+  /**
+   * Score range definitions for analytic rubric scoring.
+   * When present, the judge outputs an integer 0-10 score per criterion.
+   * Ranges must be non-overlapping and cover 0-10 inclusive.
+   */
+  readonly score_ranges?: readonly ScoreRange[];
 };
 
 export type CompositeAggregatorConfig =
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index 98d5725d..e66b67a7 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -229,9 +229,12 @@ export async function loadEvalCases(
               required: true,
             };
           }
+          // Support both expected_outcome and description (backward compatibility)
+          const expectedOutcome =
+            asString(rubric.expected_outcome) ?? asString(rubric.description) ?? '';
           return {
             id: asString(rubric.id) ?? `rubric-${index + 1}`,
-            expected_outcome: asString(rubric.expected_outcome) ?? '',
+            expected_outcome: expectedOutcome,
             weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0,
             required: typeof rubric.required === 'boolean' ? rubric.required : true,
           };
diff --git a/packages/core/test/evaluation/evaluators.test.ts b/packages/core/test/evaluation/evaluators.test.ts
index 532239b7..3fd5de55 100644
--- a/packages/core/test/evaluation/evaluators.test.ts
+++ b/packages/core/test/evaluation/evaluators.test.ts
@@ -367,8 +367,8 @@ describe('LlmJudgeEvaluator', () => {
         name: 'rubric',
         type: 'llm_judge',
         rubrics: [
-          { id: 'r1', description: 'Mentions logging', weight: 1.0, required: true },
-          { id: 'r2', description: 'Mentions tests', weight: 1.0, required: false },
+          { id: 'r1', expected_outcome: 'Mentions logging', weight: 1.0, required: true },
+          { id: 'r2', expected_outcome: 'Mentions tests', weight: 1.0, required: false },
         ],
       },
     });

From e549315bde9fd419f435880d1663553565e02c9b Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Wed, 21 Jan 2026 12:37:20 +0000
Subject: [PATCH 8/9] Add score_ranges tests and documentation

- Add 4 minimal unit tests for score_ranges validation:
  - Valid score_ranges parsing with required_min_score
  - Overlapping ranges validation error
  - Incomplete coverage validation error
  - Backward-compatible description alias
- Update rubric-evaluator.md skill documentation:
  - Add Score-Range Rubrics (Analytic Mode) section
  - Document score_ranges validation rules
  - Add required_min_score field to table
  - Add "When to Use Each Mode" guidance
  - Note description as backward-compatible alias
---
 .../references/rubric-evaluator.md            | 103 +++++++++++++---
 .../loaders/evaluator-parser.test.ts          | 115 ++++++++++++++++++
 2 files changed, 200 insertions(+), 18 deletions(-)

diff --git a/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md b/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md
index f400892d..75e1869f 100644
--- a/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md
+++ b/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md
@@ -23,7 +23,7 @@ evalcases:
       - States time complexity correctly
 ```
 
-### Detailed Rubric Objects
+### Detailed Rubric Objects (Checklist Mode)
 
 Use objects for fine-grained control over weights and requirements:
 
@@ -31,58 +31,125 @@ Use objects for fine-grained control over weights and requirements:
 evalcases:
   - id: technical-guide
     expected_outcome: Write a comprehensive HTTP status codes guide
-    
+
     input_messages:
       - role: user
         content: Write a guide explaining HTTP status codes
-    
+
     rubrics:
       - id: structure
-        description: Has clear headings and organization
+        expected_outcome: Has clear headings and organization
         weight: 1.0
         required: true
-        
+
       - id: success-codes
-        description: Covers 2xx success codes with examples
+        expected_outcome: Covers 2xx success codes with examples
         weight: 2.0
         required: true
-        
+
       - id: client-errors
-        description: Explains 4xx client error codes
+        expected_outcome: Explains 4xx client error codes
         weight: 2.0
         required: true
-        
+
       - id: server-errors
-        description: Explains 5xx server error codes
+        expected_outcome: Explains 5xx server error codes
         weight: 1.5
         required: false
-        
+
       - id: practical-examples
-        description: Includes practical use case examples
+        expected_outcome: Includes practical use case examples
         weight: 1.0
         required: false
 ```
 
+### Score-Range Rubrics (Analytic Mode)
+
+For more granular scoring, use `score_ranges` to define 0-10 integer scoring per criterion:
+
+```yaml
+evalcases:
+  - id: code-review
+    expected_outcome: Review the code for correctness and style
+
+    input_messages:
+      - role: user
+        content: Review this Python function for issues
+
+    rubrics:
+      - id: correctness
+        weight: 2.0
+        required_min_score: 7  # Fail if score < 7
+        score_ranges:
+          - score_range: [0, 2]
+            expected_outcome: Contains critical bugs or errors
+          - score_range: [3, 5]
+            expected_outcome: Has minor bugs or edge case issues
+          - score_range: [6, 8]
+            expected_outcome: Functionally correct with minor issues
+          - score_range: [9, 10]
+            expected_outcome: Fully correct implementation
+
+      - id: style
+        weight: 1.0
+        score_ranges:
+          - score_range: [0, 3]
+            expected_outcome: Poor style, hard to read
+          - score_range: [4, 6]
+            expected_outcome: Acceptable style with issues
+          - score_range: [7, 10]
+            expected_outcome: Clean, idiomatic code
+```
+
+**Score-range validation rules:**
+- Ranges must be integers within 0-10
+- Ranges must not overlap
+- Ranges must cover all values 0-10 (no gaps)
+- Each range must have a non-empty `expected_outcome`
+
 ## Rubric Object Fields
 
 | Field | Type | Default | Description |
 |-------|------|---------|-------------|
 | `id` | string | auto-generated | Unique identifier for the rubric |
-| `description` | string | required | The criterion being evaluated |
+| `expected_outcome` | string | required* | The criterion being evaluated (*optional if `score_ranges` used) |
 | `weight` | number | 1.0 | Relative importance (higher = more impact on score) |
-| `required` | boolean | true | If true, failing this rubric forces verdict to 'fail' |
+| `required` | boolean | true | If true, failing this rubric forces verdict to 'fail' (checklist mode) |
+| `required_min_score` | integer | - | Minimum 0-10 score required to pass (score-range mode) |
+| `score_ranges` | array | - | Score range definitions for analytic rubric scoring |
+
+> **Note:** `description` is supported as a backward-compatible alias for `expected_outcome`.
 
 ## Scoring and Verdicts
 
-**Score Calculation:**
+### Checklist Mode (boolean)
 ```
 score = (sum of satisfied weights) / (total weights)
 ```
 
+### Score-Range Mode (0-10 integers)
+```
+normalized_score = raw_score / 10  # Convert 0-10 to 0-1
+final_score = weighted_average(normalized_scores)
+```
+
 **Verdict Rules:**
-- `pass`: Score ≥ 0.8 AND all required rubrics satisfied
-- `borderline`: Score ≥ 0.6 AND all required rubrics satisfied  
-- `fail`: Score < 0.6 OR any required rubric failed
+- `pass`: Score ≥ 0.8 AND all gating criteria satisfied
+- `borderline`: 0.6 ≤ Score < 0.8 AND all gating criteria satisfied
+- `fail`: Score < 0.6 OR any gating criterion failed
+
+**Gating:**
+- Checklist mode: `required: true` means must be satisfied
+- Score-range mode: `required_min_score: N` means score must be ≥ N
+
+## When to Use Each Mode
+
+| Use Case | Mode | Why |
+|----------|------|-----|
+| Binary pass/fail criteria | Checklist | Simple yes/no evaluation |
+| Quality gradient | Score-range | Captures nuance (poor → excellent) |
+| Critical requirements | Checklist + `required: true` | Hard gating on must-haves |
+| Minimum quality bar | Score-range + `required_min_score` | Flexible threshold gating |
 
 ## Combining Rubrics with Other Evaluators
 
diff --git a/packages/core/test/evaluation/loaders/evaluator-parser.test.ts b/packages/core/test/evaluation/loaders/evaluator-parser.test.ts
index 73ae1ebc..63ed2136 100644
--- a/packages/core/test/evaluation/loaders/evaluator-parser.test.ts
+++ b/packages/core/test/evaluation/loaders/evaluator-parser.test.ts
@@ -312,6 +312,121 @@ describe('parseEvaluators - code_judge config pass-through', () => {
   });
 });
 
+describe('parseEvaluators - score_ranges rubrics', () => {
+  it('parses valid score_ranges with required_min_score', async () => {
+    const rawEvalCase = {
+      evaluators: [
+        {
+          name: 'correctness',
+          type: 'llm_judge',
+          rubrics: [
+            {
+              id: 'accuracy',
+              weight: 2.0,
+              required_min_score: 7,
+              score_ranges: [
+                { score_range: [0, 3], expected_outcome: 'Incorrect' },
+                { score_range: [4, 6], expected_outcome: 'Partially correct' },
+                { score_range: [7, 9], expected_outcome: 'Mostly correct' },
+                { score_range: [10, 10], expected_outcome: 'Fully correct' },
+              ],
+            },
+          ],
+        },
+      ],
+    };
+
+    const evaluators = await parseEvaluators(rawEvalCase, undefined, [process.cwd()], 'test-case');
+
+    expect(evaluators).toHaveLength(1);
+    const config = evaluators?.[0];
+    expect(config?.type).toBe('llm_judge');
+    if (config?.type === 'llm_judge') {
+      expect(config.rubrics).toHaveLength(1);
+      const rubric = config.rubrics?.[0];
+      expect(rubric?.id).toBe('accuracy');
+      expect(rubric?.weight).toBe(2.0);
+      expect(rubric?.required_min_score).toBe(7);
+      expect(rubric?.score_ranges).toHaveLength(4);
+    }
+  });
+
+  it('throws on overlapping score_ranges', async () => {
+    const rawEvalCase = {
+      evaluators: [
+        {
+          name: 'overlapping',
+          type: 'llm_judge',
+          rubrics: [
+            {
+              id: 'test',
+              score_ranges: [
+                { score_range: [0, 5], expected_outcome: 'Low' },
+                { score_range: [4, 10], expected_outcome: 'High' }, // Overlaps at 4-5
+              ],
+            },
+          ],
+        },
+      ],
+    };
+
+    await expect(
+      parseEvaluators(rawEvalCase, undefined, [process.cwd()], 'test-case'),
+    ).rejects.toThrow(/overlapping/i);
+  });
+
+  it('throws on incomplete score_ranges coverage', async () => {
+    const rawEvalCase = {
+      evaluators: [
+        {
+          name: 'incomplete',
+          type: 'llm_judge',
+          rubrics: [
+            {
+              id: 'test',
+              score_ranges: [
+                { score_range: [0, 3], expected_outcome: 'Low' },
+                { score_range: [7, 10], expected_outcome: 'High' }, // Missing 4-6
+              ],
+            },
+          ],
+        },
+      ],
+    };
+
+    await expect(
+      parseEvaluators(rawEvalCase, undefined, [process.cwd()], 'test-case'),
+    ).rejects.toThrow(/coverage/i);
+  });
+
+  it('supports description as backward-compatible alias for expected_outcome', async () => {
+    const rawEvalCase = {
+      evaluators: [
+        {
+          name: 'legacy',
+          type: 'llm_judge',
+          rubrics: [
+            {
+              id: 'r1',
+              description: 'Must be polite', // Legacy field name
+              weight: 1.0,
+              required: true,
+            },
+          ],
+        },
+      ],
+    };
+
+    const evaluators = await parseEvaluators(rawEvalCase, undefined, [process.cwd()], 'test-case');
+
+    expect(evaluators).toHaveLength(1);
+    const config = evaluators?.[0];
+    if (config?.type === 'llm_judge') {
+      expect(config.rubrics?.[0]?.expected_outcome).toBe('Must be polite');
+    }
+  });
+});
+
 describe('parseEvaluators - token_usage', () => {
   it('parses token_usage evaluator with limits', async () => {
     const rawEvalCase = {

From 282cdf97966650dd6bb9ee2e1b98d4c247983d68 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Wed, 21 Jan 2026 12:42:11 +0000
Subject: [PATCH 9/9] Add changeset for score_ranges rubrics feature

---
 .changeset/score-ranges-rubrics.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 .changeset/score-ranges-rubrics.md

diff --git a/.changeset/score-ranges-rubrics.md b/.changeset/score-ranges-rubrics.md
new file mode 100644
index 00000000..889e201c
--- /dev/null
+++ b/.changeset/score-ranges-rubrics.md
@@ -0,0 +1,13 @@
+---
+"@agentv/core": minor
+"agentv": minor
+---
+
+Add score_ranges rubrics for analytic LLM judge evaluation
+
+- Add `score_ranges` field for 0-10 integer scoring per rubric criterion
+- Add `required_min_score` field for flexible gating (supersedes boolean `required`, which is deprecated but still accepted)
+- Add `description` as backward-compatible alias for `expected_outcome`
+- Validate score ranges: integers 0-10, non-overlapping, full coverage
+- Normalize scores to 0-1 (divide by 10) with weighted aggregation
+- Legacy `required: true` treated as `required_min_score: 10`