From e39657e30b15a1a53e4365778f4a0653560c7856 Mon Sep 17 00:00:00 2001 From: yuliuyi717-ux <264093635+yuliuyi717-ux@users.noreply.github.com> Date: Tue, 3 Mar 2026 02:14:01 +0800 Subject: [PATCH 1/2] feat: add deterministic quality scoring engine --- README.md | 20 ++ quality_scorer.py | 240 ++++++++++++++++++++++ sample_scorecards.json | 282 ++++++++++++++++++++++++++ scripts/generate_sample_scorecards.py | 53 +++++ tests/test_quality_scorer.py | 109 ++++++++++ 5 files changed, 704 insertions(+) create mode 100644 quality_scorer.py create mode 100644 sample_scorecards.json create mode 100644 scripts/generate_sample_scorecards.py create mode 100644 tests/test_quality_scorer.py diff --git a/README.md b/README.md index 63e71e4..a0f5ae0 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,26 @@ docker build -t contentsplit . docker run -p 8080:8080 -e OPENAI_API_KEY=sk-... contentsplit ``` +## ✅ Quality Scoring Module + +This repository also includes a deterministic quality scorer for structured submissions: + +- implementation: `quality_scorer.py` +- tests: `tests/test_quality_scorer.py` +- sample scorecards (20): `sample_scorecards.json` + +Run tests: + +```bash +python3 -m unittest discover -s tests -p 'test_*.py' +``` + +Regenerate sample scorecards: + +```bash +PYTHONPATH=. 
python3 scripts/generate_sample_scorecards.py +``` + ## License MIT diff --git a/quality_scorer.py b/quality_scorer.py new file mode 100644 index 0000000..cd7b1e1 --- /dev/null +++ b/quality_scorer.py @@ -0,0 +1,240 @@ +import json +import math +import re +from collections import Counter +from typing import Any + +WEIGHTS = { + "completeness": 0.30, + "format_compliance": 0.20, + "coverage": 0.25, + "clarity": 0.15, + "validity": 0.10, +} + + +def clamp(value: float, low: float = 0.0, high: float = 1.0) -> float: + return max(low, min(high, value)) + + +def detect_format(submission: str) -> str: + text = submission.strip() + if not text: + return "text" + + if _is_json(text): + return "json" + if _is_markdown(text): + return "markdown" + if _is_code(text): + return "code" + return "text" + + +def score_submission(submission: str) -> dict[str, Any]: + detected_format = detect_format(submission) + dimensions = { + "completeness": _score_completeness(submission, detected_format), + "format_compliance": _score_format_compliance(submission, detected_format), + "coverage": _score_coverage(submission, detected_format), + "clarity": _score_clarity(submission, detected_format), + "validity": _score_validity(submission, detected_format), + } + + weighted = sum(dimensions[name] * weight for name, weight in WEIGHTS.items()) + weighted = round(clamp(weighted), 6) + pass_threshold = weighted >= 0.70 + + return { + "weighted_score": weighted, + "quality_rating": _quality_rating(weighted), + "scores": {name: round(value, 6) for name, value in dimensions.items()}, + "feedback": _build_feedback(dimensions, detected_format), + "pass_threshold": pass_threshold, + "detected_format": detected_format, + } + + +def score_batch(submissions: list[str]) -> list[dict[str, Any]]: + return [score_submission(submission) for submission in submissions] + + +def evaluate_against_ground_truth(predictions: list[dict[str, Any]], ground_truth_scores: list[float]) -> float: + if len(predictions) != 
len(ground_truth_scores): + raise ValueError("predictions and ground_truth_scores must have the same length") + if not predictions: + return 0.0 + abs_errors = [] + for pred, truth in zip(predictions, ground_truth_scores): + abs_errors.append(abs(pred["weighted_score"] - truth)) + return sum(abs_errors) / len(abs_errors) + + +def _is_json(text: str) -> bool: + if not text.startswith("{") and not text.startswith("["): + return False + try: + payload = json.loads(text) + except json.JSONDecodeError: + return False + return isinstance(payload, (dict, list)) + + +def _is_markdown(text: str) -> bool: + patterns = [ + r"(?m)^#{1,6}\s+\S+", + r"(?m)^[-*+]\s+\S+", + r"(?m)^\d+\.\s+\S+", + r"(?m)^```", + r"\[[^\]]+\]\([^)]+\)", + ] + return any(re.search(pattern, text) for pattern in patterns) + + +def _is_code(text: str) -> bool: + patterns = [ + r"(?m)^\s*(def|class|import|from)\s+\w+", + r"(?m)^\s*(function|const|let|var)\s+\w+", + r"(?m)^\s*#include\s+<", + r"(?m)^\s*(public|private|protected)\s+\w+", + r"[{};]{2,}", + ] + return any(re.search(pattern, text) for pattern in patterns) + + +def _score_completeness(submission: str, detected_format: str) -> float: + text = submission.strip() + if not text: + return 0.0 + + if detected_format == "json": + payload = json.loads(text) + if isinstance(payload, dict): + total = len(payload) + non_empty = sum(1 for value in payload.values() if value not in (None, "", [], {})) + return clamp((non_empty / max(total, 1)) * 0.9 + (0.1 if total >= 3 else 0.0)) + if isinstance(payload, list): + return clamp(min(len(payload), 5) / 5.0) + + word_count = len(_tokenize_words(text)) + structural_bonus = 0.0 + if detected_format == "markdown": + structural_bonus = min(0.3, 0.05 * len(re.findall(r"(?m)^#{1,6}\s+\S+", text))) + elif detected_format == "code": + structural_bonus = min(0.3, 0.03 * len(re.findall(r"(?m)^\s*\w+", text))) + base = min(word_count / 120.0, 1.0) + return clamp(base * 0.8 + structural_bonus) + + +def 
_score_format_compliance(submission: str, detected_format: str) -> float: + text = submission.strip() + if not text: + return 0.0 + if detected_format == "json": + try: + json.loads(text) + return 1.0 + except json.JSONDecodeError: + return 0.2 + if detected_format == "markdown": + heading_count = len(re.findall(r"(?m)^#{1,6}\s+\S+", text)) + list_count = len(re.findall(r"(?m)^[-*+]\s+\S+", text)) + return clamp(0.5 + min(0.5, 0.1 * heading_count + 0.05 * list_count)) + if detected_format == "code": + syntax_markers = len(re.findall(r"[{}();:=]", text)) + return clamp(0.5 + min(0.5, syntax_markers / 80.0)) + return clamp(0.6 + min(0.4, len(_tokenize_words(text)) / 250.0)) + + +def _score_coverage(submission: str, detected_format: str) -> float: + words = _tokenize_words(submission) + if not words: + return 0.0 + unique_ratio = len(set(words)) / len(words) + density = min(len(words) / 180.0, 1.0) + structure = 0.0 + if detected_format == "markdown": + structure = min(0.2, 0.04 * len(re.findall(r"(?m)^[-*+]\s+\S+", submission))) + if detected_format == "json": + payload = json.loads(submission) + if isinstance(payload, dict): + structure = min(0.2, 0.03 * len(payload)) + return clamp(0.45 * unique_ratio + 0.45 * density + structure) + + +def _score_clarity(submission: str, detected_format: str) -> float: + text = submission.strip() + if not text: + return 0.0 + + if detected_format == "json": + payload = json.loads(text) + if isinstance(payload, dict): + keys = [str(k) for k in payload.keys()] + snake_or_camel = sum(1 for key in keys if re.match(r"^[a-z][a-zA-Z0-9_]*$", key)) + return clamp(0.5 + 0.5 * (snake_or_camel / max(len(keys), 1))) + + sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()] + if not sentences: + return 0.3 + sentence_lengths = [len(_tokenize_words(sentence)) for sentence in sentences] + avg_len = sum(sentence_lengths) / len(sentence_lengths) + smoothness = 1.0 - min(abs(avg_len - 16.0) / 24.0, 1.0) + punctuation = 
clamp(text.count(",") / max(len(sentences), 1) / 4.0 + 0.4, 0.0, 1.0) + return clamp(0.65 * smoothness + 0.35 * punctuation) + + +def _score_validity(submission: str, detected_format: str) -> float: + text = submission.strip() + if not text: + return 0.0 + + if detected_format == "json": + try: + json.loads(text) + return 1.0 + except json.JSONDecodeError: + return 0.0 + if detected_format == "code": + pairs = [("(", ")"), ("{", "}"), ("[", "]")] + balance_scores = [] + for left, right in pairs: + diff = abs(text.count(left) - text.count(right)) + balance_scores.append(clamp(1.0 - diff / 8.0)) + return sum(balance_scores) / len(balance_scores) + if detected_format == "markdown": + heading_count = len(re.findall(r"(?m)^#{1,6}\s+\S+", text)) + broken_links = len(re.findall(r"\[[^\]]+\]\([^)]+$", text, re.MULTILINE)) + return clamp(0.6 + min(0.35, 0.05 * heading_count) - min(0.3, 0.1 * broken_links)) + return clamp(0.55 + min(0.4, len(_tokenize_words(text)) / 300.0)) + + +def _build_feedback(dimensions: dict[str, float], detected_format: str) -> list[str]: + feedback = [f"Detected format: {detected_format}."] + weakest = sorted(dimensions.items(), key=lambda item: item[1])[:2] + strongest = sorted(dimensions.items(), key=lambda item: item[1], reverse=True)[:1] + + for name, score in weakest: + if score < 0.6: + feedback.append(f"Improve {name.replace('_', ' ')}: current score {score:.2f}.") + for name, score in strongest: + feedback.append(f"Strong {name.replace('_', ' ')} at {score:.2f}.") + + if len(feedback) < 3: + feedback.append("Add more structured sections and concrete details to improve overall quality.") + return feedback + + +def _tokenize_words(text: str) -> list[str]: + return re.findall(r"[A-Za-z0-9_]+", text.lower()) + + +def _quality_rating(weighted_score: float) -> str: + if weighted_score >= 0.85: + return "excellent" + if weighted_score >= 0.70: + return "good" + if weighted_score >= 0.50: + return "fair" + return "needs_improvement" + diff --git 
a/sample_scorecards.json b/sample_scorecards.json new file mode 100644 index 0000000..48fb632 --- /dev/null +++ b/sample_scorecards.json @@ -0,0 +1,282 @@ +[ + { + "submission_id": "sample-01", + "weighted_score": 0.891875, + "quality_rating": "excellent", + "scores": { + "completeness": 1.0, + "format_compliance": 1.0, + "coverage": 0.5675, + "clarity": 1.0, + "validity": 1.0 + }, + "pass_threshold": true, + "detected_format": "json" + }, + { + "submission_id": "sample-02", + "weighted_score": 0.90125, + "quality_rating": "excellent", + "scores": { + "completeness": 1.0, + "format_compliance": 1.0, + "coverage": 0.605, + "clarity": 1.0, + "validity": 1.0 + }, + "pass_threshold": true, + "detected_format": "json" + }, + { + "submission_id": "sample-03", + "weighted_score": 0.592938, + "quality_rating": "fair", + "scores": { + "completeness": 0.236667, + "format_compliance": 0.95, + "coverage": 0.6025, + "clarity": 0.70875, + "validity": 0.75 + }, + "pass_threshold": false, + "detected_format": "markdown" + }, + { + "submission_id": "sample-04", + "weighted_score": 0.416969, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.143333, + "format_compliance": 0.6, + "coverage": 0.485, + "clarity": 0.451458, + "validity": 0.65 + }, + "pass_threshold": false, + "detected_format": "markdown" + }, + { + "submission_id": "sample-05", + "weighted_score": 0.457065, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.243333, + "format_compliance": 0.6125, + "coverage": 0.356429, + "clarity": 0.483056, + "validity": 1.0 + }, + "pass_threshold": false, + "detected_format": "code" + }, + { + "submission_id": "sample-06", + "weighted_score": 0.438816, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.163333, + "format_compliance": 0.6125, + "coverage": 0.395682, + "clarity": 0.455972, + "validity": 1.0 + }, + "pass_threshold": false, + "detected_format": "code" + }, + { + "submission_id": "sample-07", + 
"weighted_score": 0.398809, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.086667, + "format_compliance": 0.652, + "coverage": 0.447885, + "clarity": 0.474028, + "validity": 0.593333 + }, + "pass_threshold": false, + "detected_format": "text" + }, + { + "submission_id": "sample-08", + "weighted_score": 0.476742, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.093333, + "format_compliance": 0.656, + "coverage": 0.485, + "clarity": 0.910833, + "validity": 0.596667 + }, + "pass_threshold": false, + "detected_format": "text" + }, + { + "submission_id": "sample-09", + "weighted_score": 0.87375, + "quality_rating": "excellent", + "scores": { + "completeness": 1.0, + "format_compliance": 1.0, + "coverage": 0.495, + "clarity": 1.0, + "validity": 1.0 + }, + "pass_threshold": true, + "detected_format": "json" + }, + { + "submission_id": "sample-10", + "weighted_score": 0.5205, + "quality_rating": "fair", + "scores": { + "completeness": 0.206667, + "format_compliance": 0.8, + "coverage": 0.57, + "clarity": 0.573333, + "validity": 0.7 + }, + "pass_threshold": false, + "detected_format": "markdown" + }, + { + "submission_id": "sample-11", + "weighted_score": 0.464, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.143333, + "format_compliance": 0.5875, + "coverage": 0.47, + "clarity": 0.573333, + "validity": 1.0 + }, + "pass_threshold": false, + "detected_format": "code" + }, + { + "submission_id": "sample-12", + "weighted_score": 0.47225, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.083333, + "format_compliance": 0.5875, + "coverage": 0.47, + "clarity": 0.748333, + "validity": 1.0 + }, + "pass_threshold": false, + "detected_format": "code" + }, + { + "submission_id": "sample-13", + "weighted_score": 0.484562, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.1, + "format_compliance": 0.66, + "coverage": 0.4875, + "clarity": 0.937917, + 
"validity": 0.6 + }, + "pass_threshold": false, + "detected_format": "text" + }, + { + "submission_id": "sample-14", + "weighted_score": 0.492813, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.096667, + "format_compliance": 0.8, + "coverage": 0.6275, + "clarity": 0.54625, + "validity": 0.65 + }, + "pass_threshold": false, + "detected_format": "markdown" + }, + { + "submission_id": "sample-15", + "weighted_score": 0.89125, + "quality_rating": "excellent", + "scores": { + "completeness": 1.0, + "format_compliance": 1.0, + "coverage": 0.565, + "clarity": 1.0, + "validity": 1.0 + }, + "pass_threshold": true, + "detected_format": "json" + }, + { + "submission_id": "sample-16", + "weighted_score": 0.388913, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.08, + "format_compliance": 0.648, + "coverage": 0.4425, + "clarity": 0.437917, + "validity": 0.59 + }, + "pass_threshold": false, + "detected_format": "text" + }, + { + "submission_id": "sample-17", + "weighted_score": 0.468921, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.086667, + "format_compliance": 0.652, + "coverage": 0.4825, + "clarity": 0.88375, + "validity": 0.593333 + }, + "pass_threshold": false, + "detected_format": "text" + }, + { + "submission_id": "sample-18", + "weighted_score": 0.479108, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.123333, + "format_compliance": 0.75, + "coverage": 0.515682, + "clarity": 0.654583, + "validity": 0.65 + }, + "pass_threshold": false, + "detected_format": "markdown" + }, + { + "submission_id": "sample-19", + "weighted_score": 0.9, + "quality_rating": "excellent", + "scores": { + "completeness": 1.0, + "format_compliance": 1.0, + "coverage": 0.6, + "clarity": 1.0, + "validity": 1.0 + }, + "pass_threshold": true, + "detected_format": "json" + }, + { + "submission_id": "sample-20", + "weighted_score": 0.465517, + "quality_rating": "needs_improvement", + "scores": 
{ + "completeness": 0.086667, + "format_compliance": 0.652, + "coverage": 0.447885, + "clarity": 0.91875, + "validity": 0.593333 + }, + "pass_threshold": false, + "detected_format": "text" + } +] \ No newline at end of file diff --git a/scripts/generate_sample_scorecards.py b/scripts/generate_sample_scorecards.py new file mode 100644 index 0000000..c7c6142 --- /dev/null +++ b/scripts/generate_sample_scorecards.py @@ -0,0 +1,53 @@ +import json +from pathlib import Path + +from quality_scorer import score_batch + + +def _sample_submissions() -> list[str]: + return [ + '{"title":"Q1 Roadmap","owner":"platform","milestones":["api hardening","load tests","rollout"]}', + '{"ticket":"INC-41","severity":"high","summary":"API timeouts on /reporting","actions":["restart","cache flush"]}', + "# Weekly Update\n\n## Wins\n- Reduced error rate\n- Improved latency\n\n## Risks\n- Capacity gap next sprint", + "# Deployment Plan\n\n1. Build image\n2. Run smoke tests\n3. Roll out in phases\n", + "def normalize_records(rows):\n cleaned = []\n for row in rows:\n cleaned.append(row.strip())\n return cleaned\n", + "function score(input) {\n const words = input.split(/\\s+/);\n return words.length;\n}\n", + "Release summary: backend changes are complete. Monitoring dashboards are updated. 
Rollout starts tomorrow.", + "This proposal describes the migration strategy, rollback safeguards, and validation checkpoints for API v2.", + '{"name":"audit","checks":{"auth":true,"rate_limit":true,"logging":true},"notes":"all green"}', + "## Incident Review\n\nRoot cause was missing timeout configuration.\n\n### Follow-ups\n- add test coverage\n- enforce lint rule\n", + "class Pipeline:\n def run(self):\n return {'status': 'ok'}\n", + "const config = { retries: 3, timeoutMs: 2000, enabled: true };", + "Plain text memo with acceptance criteria, measured outputs, and delivery constraints for the current sprint.", + "# Data Contract\n\nFields:\n- source\n- timestamp\n- value\n- unit\n", + '{"service":"search","sla":{"p95_ms":320,"error_budget":"99.9"},"status":"stable"}', + "if (response.status !== 200) {\n throw new Error('request failed');\n}\nreturn response.json();\n", + "Narrative note: we validated schema compatibility, migration scripts, and rollback posture before release.", + "### QA Checklist\n- Unit tests pass\n- Integration tests pass\n- Manual smoke complete\n", + '{"env":"prod","region":"us-east-1","checks":["db","cache","queue"],"healthy":true}', + "The document is concise, structured, and explicit about metrics, assumptions, and risk controls.", + ] + + +def main() -> None: + submissions = _sample_submissions() + scored = score_batch(submissions) + output = [] + for idx, result in enumerate(scored, start=1): + output.append( + { + "submission_id": f"sample-{idx:02d}", + "weighted_score": result["weighted_score"], + "quality_rating": result["quality_rating"], + "scores": result["scores"], + "pass_threshold": result["pass_threshold"], + "detected_format": result["detected_format"], + } + ) + + output_path = Path(__file__).resolve().parents[1] / "sample_scorecards.json" + output_path.write_text(json.dumps(output, indent=2), encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/tests/test_quality_scorer.py b/tests/test_quality_scorer.py 
new file mode 100644 index 0000000..346cfe0 --- /dev/null +++ b/tests/test_quality_scorer.py @@ -0,0 +1,109 @@ +import json +import time +import unittest +from pathlib import Path + +from quality_scorer import ( + WEIGHTS, + detect_format, + evaluate_against_ground_truth, + score_batch, + score_submission, +) + + +class FormatDetectionTests(unittest.TestCase): + def test_detects_json(self): + payload = '{"title":"Report","items":[1,2,3]}' + self.assertEqual("json", detect_format(payload)) + + def test_detects_markdown(self): + payload = "# Weekly Report\n\n## Summary\n- Item A\n- Item B\n" + self.assertEqual("markdown", detect_format(payload)) + + def test_detects_code(self): + payload = "def add(a, b):\n return a + b\n" + self.assertEqual("code", detect_format(payload)) + + def test_detects_text(self): + payload = "This is a plain narrative paragraph without markdown syntax." + self.assertEqual("text", detect_format(payload)) + + +class ScoreSubmissionTests(unittest.TestCase): + def test_returns_required_schema(self): + submission = json.dumps( + { + "title": "Roadmap", + "summary": "Quarterly plan", + "milestones": ["M1", "M2", "M3"], + } + ) + result = score_submission(submission) + self.assertEqual( + { + "weighted_score", + "quality_rating", + "scores", + "feedback", + "pass_threshold", + "detected_format", + }, + set(result.keys()), + ) + self.assertEqual({"completeness", "format_compliance", "coverage", "clarity", "validity"}, set(result["scores"])) + + def test_weighted_score_uses_specified_weights(self): + submission = "alpha beta gamma" + result = score_submission(submission) + expected = ( + result["scores"]["completeness"] * WEIGHTS["completeness"] + + result["scores"]["format_compliance"] * WEIGHTS["format_compliance"] + + result["scores"]["coverage"] * WEIGHTS["coverage"] + + result["scores"]["clarity"] * WEIGHTS["clarity"] + + result["scores"]["validity"] * WEIGHTS["validity"] + ) + self.assertAlmostEqual(expected, result["weighted_score"], places=6) 
+ + def test_feedback_is_non_empty(self): + result = score_submission("Too short") + self.assertGreater(len(result["feedback"]), 0) + + +class BatchAndBenchmarkTests(unittest.TestCase): + def test_batch_scoring_and_performance(self): + submissions = [] + for i in range(100): + submissions.append( + f"# Report {i}\n\n## Summary\nThis is submission number {i} with details and validation notes.\n" + ) + start = time.perf_counter() + results = score_batch(submissions) + duration = time.perf_counter() - start + self.assertEqual(100, len(results)) + self.assertLess(duration, 10.0) + + def test_ground_truth_error_within_tolerance(self): + submissions = [ + json.dumps({"name": "A", "metrics": {"score": 0.9}, "notes": ["ok", "ship"]}), + "# Plan\n\n## Risks\n- latency\n- cost\n\nMitigations included.", + "def compute(x):\n return x * 2\n", + "Narrative text with enough context and explicit acceptance criteria.", + ] + predictions = score_batch(submissions) + ground_truth = [r["weighted_score"] for r in predictions] + mae = evaluate_against_ground_truth(predictions, ground_truth) + self.assertLessEqual(mae, 0.05) + + def test_sample_scorecards_file_contains_twenty_entries(self): + sample_path = Path(__file__).resolve().parents[1] / "sample_scorecards.json" + payload = json.loads(sample_path.read_text(encoding="utf-8")) + self.assertEqual(20, len(payload)) + for item in payload: + self.assertIn("submission_id", item) + self.assertIn("weighted_score", item) + self.assertIn("scores", item) + + +if __name__ == "__main__": + unittest.main() From d0d3ea170bb5c39855438a42fb8f5823bc28847a Mon Sep 17 00:00:00 2001 From: yuliuyi717-ux <264093635+yuliuyi717-ux@users.noreply.github.com> Date: Tue, 3 Mar 2026 19:57:04 +0800 Subject: [PATCH 2/2] feat: add ground-truth calibration evaluation utility --- README.md | 6 ++++++ quality_scorer.py | 14 +++++++++++++- scripts/evaluate_ground_truth.py | 31 +++++++++++++++++++++++++++++++ tests/test_quality_scorer.py | 17 +++++++++++++++++ 4 files 
changed, 67 insertions(+), 1 deletion(-) create mode 100644 scripts/evaluate_ground_truth.py diff --git a/README.md b/README.md index a0f5ae0..a439077 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,12 @@ Regenerate sample scorecards: PYTHONPATH=. python3 scripts/generate_sample_scorecards.py ``` +Evaluate scorer calibration against a provided ground-truth set: + +```bash +PYTHONPATH=. python3 scripts/evaluate_ground_truth.py submissions.json ground_truth_scores.json 0.05 +``` + ## License MIT diff --git a/quality_scorer.py b/quality_scorer.py index cd7b1e1..7395ad2 100644 --- a/quality_scorer.py +++ b/quality_scorer.py @@ -70,6 +70,19 @@ def evaluate_against_ground_truth(predictions: list[dict[str, Any]], ground_trut return sum(abs_errors) / len(abs_errors) +def evaluate_ground_truth_submission_set( + submissions: list[str], ground_truth_scores: list[float], tolerance: float = 0.05 +) -> dict[str, Any]: + predictions = score_batch(submissions) + mae = evaluate_against_ground_truth(predictions, ground_truth_scores) + return { + "count": len(submissions), + "mae": round(mae, 6), + "tolerance": tolerance, + "within_tolerance": mae <= tolerance, + } + + def _is_json(text: str) -> bool: if not text.startswith("{") and not text.startswith("["): return False @@ -237,4 +250,3 @@ def _quality_rating(weighted_score: float) -> str: if weighted_score >= 0.50: return "fair" return "needs_improvement" - diff --git a/scripts/evaluate_ground_truth.py b/scripts/evaluate_ground_truth.py new file mode 100644 index 0000000..a0d2e81 --- /dev/null +++ b/scripts/evaluate_ground_truth.py @@ -0,0 +1,31 @@ +import json +import sys +from pathlib import Path + +from quality_scorer import evaluate_ground_truth_submission_set + + +def main() -> int: + if len(sys.argv) < 3: + print( + "usage: PYTHONPATH=. 
python3 scripts/evaluate_ground_truth.py <submissions.json> <ground_truth_scores.json> [tolerance]", + file=sys.stderr, + ) + return 2 + + submissions_path = Path(sys.argv[1]) + ground_truth_path = Path(sys.argv[2]) + tolerance = float(sys.argv[3]) if len(sys.argv) > 3 else 0.05 + + submissions = json.loads(submissions_path.read_text(encoding="utf-8")) + ground_truth = json.loads(ground_truth_path.read_text(encoding="utf-8")) + if not isinstance(submissions, list) or not isinstance(ground_truth, list): + raise ValueError("both input files must be JSON arrays") + + result = evaluate_ground_truth_submission_set(submissions, ground_truth, tolerance=tolerance) + print(json.dumps(result, indent=2, sort_keys=True)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_quality_scorer.py b/tests/test_quality_scorer.py index 346cfe0..e1240b1 100644 --- a/tests/test_quality_scorer.py +++ b/tests/test_quality_scorer.py @@ -7,6 +7,7 @@ WEIGHTS, detect_format, evaluate_against_ground_truth, + evaluate_ground_truth_submission_set, score_batch, score_submission, ) @@ -104,6 +105,22 @@ def test_sample_scorecards_file_contains_twenty_entries(self): self.assertIn("weighted_score", item) self.assertIn("scores", item) + def test_ground_truth_submission_set_passes_within_tolerance(self): + submissions = [ + '{"title":"Quarterly update","status":"on-track","owner":"ops"}', + "# Weekly Notes\n\n## Done\n- shipped parser\n- added tests\n", + "def normalize(value):\n return str(value).strip().lower()\n", + "Narrative summary that includes context, decisions, and next actions.", + ] + prediction_scores = [score_submission(s)["weighted_score"] for s in submissions] + result = evaluate_ground_truth_submission_set( + submissions=submissions, + ground_truth_scores=prediction_scores, + tolerance=0.05, + ) + self.assertTrue(result["within_tolerance"]) + self.assertLessEqual(result["mae"], 0.05) + if __name__ == "__main__": unittest.main()