From e39657e30b15a1a53e4365778f4a0653560c7856 Mon Sep 17 00:00:00 2001 From: yuliuyi717-ux <264093635+yuliuyi717-ux@users.noreply.github.com> Date: Tue, 3 Mar 2026 02:14:01 +0800 Subject: [PATCH 1/2] feat: add deterministic quality scoring engine --- README.md | 20 ++ quality_scorer.py | 240 ++++++++++++++++++++++ sample_scorecards.json | 282 ++++++++++++++++++++++++++ scripts/generate_sample_scorecards.py | 53 +++++ tests/test_quality_scorer.py | 109 ++++++++++ 5 files changed, 704 insertions(+) create mode 100644 quality_scorer.py create mode 100644 sample_scorecards.json create mode 100644 scripts/generate_sample_scorecards.py create mode 100644 tests/test_quality_scorer.py diff --git a/README.md b/README.md index 63e71e4..a0f5ae0 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,26 @@ docker build -t contentsplit . docker run -p 8080:8080 -e OPENAI_API_KEY=sk-... contentsplit ``` +## ✅ Quality Scoring Module + +This repository also includes a deterministic quality scorer for structured submissions: + +- implementation: `quality_scorer.py` +- tests: `tests/test_quality_scorer.py` +- sample scorecards (20): `sample_scorecards.json` + +Run tests: + +```bash +python3 -m unittest discover -s tests -p 'test_*.py' +``` + +Regenerate sample scorecards: + +```bash +PYTHONPATH=. 
python3 scripts/generate_sample_scorecards.py +``` + ## License MIT diff --git a/quality_scorer.py b/quality_scorer.py new file mode 100644 index 0000000..cd7b1e1 --- /dev/null +++ b/quality_scorer.py @@ -0,0 +1,240 @@ +import json +import math +import re +from collections import Counter +from typing import Any + +WEIGHTS = { + "completeness": 0.30, + "format_compliance": 0.20, + "coverage": 0.25, + "clarity": 0.15, + "validity": 0.10, +} + + +def clamp(value: float, low: float = 0.0, high: float = 1.0) -> float: + return max(low, min(high, value)) + + +def detect_format(submission: str) -> str: + text = submission.strip() + if not text: + return "text" + + if _is_json(text): + return "json" + if _is_markdown(text): + return "markdown" + if _is_code(text): + return "code" + return "text" + + +def score_submission(submission: str) -> dict[str, Any]: + detected_format = detect_format(submission) + dimensions = { + "completeness": _score_completeness(submission, detected_format), + "format_compliance": _score_format_compliance(submission, detected_format), + "coverage": _score_coverage(submission, detected_format), + "clarity": _score_clarity(submission, detected_format), + "validity": _score_validity(submission, detected_format), + } + + weighted = sum(dimensions[name] * weight for name, weight in WEIGHTS.items()) + weighted = round(clamp(weighted), 6) + pass_threshold = weighted >= 0.70 + + return { + "weighted_score": weighted, + "quality_rating": _quality_rating(weighted), + "scores": {name: round(value, 6) for name, value in dimensions.items()}, + "feedback": _build_feedback(dimensions, detected_format), + "pass_threshold": pass_threshold, + "detected_format": detected_format, + } + + +def score_batch(submissions: list[str]) -> list[dict[str, Any]]: + return [score_submission(submission) for submission in submissions] + + +def evaluate_against_ground_truth(predictions: list[dict[str, Any]], ground_truth_scores: list[float]) -> float: + if len(predictions) != 
len(ground_truth_scores): + raise ValueError("predictions and ground_truth_scores must have the same length") + if not predictions: + return 0.0 + abs_errors = [] + for pred, truth in zip(predictions, ground_truth_scores): + abs_errors.append(abs(pred["weighted_score"] - truth)) + return sum(abs_errors) / len(abs_errors) + + +def _is_json(text: str) -> bool: + if not text.startswith("{") and not text.startswith("["): + return False + try: + payload = json.loads(text) + except json.JSONDecodeError: + return False + return isinstance(payload, (dict, list)) + + +def _is_markdown(text: str) -> bool: + patterns = [ + r"(?m)^#{1,6}\s+\S+", + r"(?m)^[-*+]\s+\S+", + r"(?m)^\d+\.\s+\S+", + r"(?m)^```", + r"\[[^\]]+\]\([^)]+\)", + ] + return any(re.search(pattern, text) for pattern in patterns) + + +def _is_code(text: str) -> bool: + patterns = [ + r"(?m)^\s*(def|class|import|from)\s+\w+", + r"(?m)^\s*(function|const|let|var)\s+\w+", + r"(?m)^\s*#include\s+<", + r"(?m)^\s*(public|private|protected)\s+\w+", + r"[{};]{2,}", + ] + return any(re.search(pattern, text) for pattern in patterns) + + +def _score_completeness(submission: str, detected_format: str) -> float: + text = submission.strip() + if not text: + return 0.0 + + if detected_format == "json": + payload = json.loads(text) + if isinstance(payload, dict): + total = len(payload) + non_empty = sum(1 for value in payload.values() if value not in (None, "", [], {})) + return clamp((non_empty / max(total, 1)) * 0.9 + (0.1 if total >= 3 else 0.0)) + if isinstance(payload, list): + return clamp(min(len(payload), 5) / 5.0) + + word_count = len(_tokenize_words(text)) + structural_bonus = 0.0 + if detected_format == "markdown": + structural_bonus = min(0.3, 0.05 * len(re.findall(r"(?m)^#{1,6}\s+\S+", text))) + elif detected_format == "code": + structural_bonus = min(0.3, 0.03 * len(re.findall(r"(?m)^\s*\w+", text))) + base = min(word_count / 120.0, 1.0) + return clamp(base * 0.8 + structural_bonus) + + +def 
_score_format_compliance(submission: str, detected_format: str) -> float: + text = submission.strip() + if not text: + return 0.0 + if detected_format == "json": + try: + json.loads(text) + return 1.0 + except json.JSONDecodeError: + return 0.2 + if detected_format == "markdown": + heading_count = len(re.findall(r"(?m)^#{1,6}\s+\S+", text)) + list_count = len(re.findall(r"(?m)^[-*+]\s+\S+", text)) + return clamp(0.5 + min(0.5, 0.1 * heading_count + 0.05 * list_count)) + if detected_format == "code": + syntax_markers = len(re.findall(r"[{}();:=]", text)) + return clamp(0.5 + min(0.5, syntax_markers / 80.0)) + return clamp(0.6 + min(0.4, len(_tokenize_words(text)) / 250.0)) + + +def _score_coverage(submission: str, detected_format: str) -> float: + words = _tokenize_words(submission) + if not words: + return 0.0 + unique_ratio = len(set(words)) / len(words) + density = min(len(words) / 180.0, 1.0) + structure = 0.0 + if detected_format == "markdown": + structure = min(0.2, 0.04 * len(re.findall(r"(?m)^[-*+]\s+\S+", submission))) + if detected_format == "json": + payload = json.loads(submission) + if isinstance(payload, dict): + structure = min(0.2, 0.03 * len(payload)) + return clamp(0.45 * unique_ratio + 0.45 * density + structure) + + +def _score_clarity(submission: str, detected_format: str) -> float: + text = submission.strip() + if not text: + return 0.0 + + if detected_format == "json": + payload = json.loads(text) + if isinstance(payload, dict): + keys = [str(k) for k in payload.keys()] + snake_or_camel = sum(1 for key in keys if re.match(r"^[a-z][a-zA-Z0-9_]*$", key)) + return clamp(0.5 + 0.5 * (snake_or_camel / max(len(keys), 1))) + + sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()] + if not sentences: + return 0.3 + sentence_lengths = [len(_tokenize_words(sentence)) for sentence in sentences] + avg_len = sum(sentence_lengths) / len(sentence_lengths) + smoothness = 1.0 - min(abs(avg_len - 16.0) / 24.0, 1.0) + punctuation = 
clamp(text.count(",") / max(len(sentences), 1) / 4.0 + 0.4, 0.0, 1.0) + return clamp(0.65 * smoothness + 0.35 * punctuation) + + +def _score_validity(submission: str, detected_format: str) -> float: + text = submission.strip() + if not text: + return 0.0 + + if detected_format == "json": + try: + json.loads(text) + return 1.0 + except json.JSONDecodeError: + return 0.0 + if detected_format == "code": + pairs = [("(", ")"), ("{", "}"), ("[", "]")] + balance_scores = [] + for left, right in pairs: + diff = abs(text.count(left) - text.count(right)) + balance_scores.append(clamp(1.0 - diff / 8.0)) + return sum(balance_scores) / len(balance_scores) + if detected_format == "markdown": + heading_count = len(re.findall(r"(?m)^#{1,6}\s+\S+", text)) + broken_links = len(re.findall(r"\[[^\]]+\]\([^)]+$", text, re.MULTILINE)) + return clamp(0.6 + min(0.35, 0.05 * heading_count) - min(0.3, 0.1 * broken_links)) + return clamp(0.55 + min(0.4, len(_tokenize_words(text)) / 300.0)) + + +def _build_feedback(dimensions: dict[str, float], detected_format: str) -> list[str]: + feedback = [f"Detected format: {detected_format}."] + weakest = sorted(dimensions.items(), key=lambda item: item[1])[:2] + strongest = sorted(dimensions.items(), key=lambda item: item[1], reverse=True)[:1] + + for name, score in weakest: + if score < 0.6: + feedback.append(f"Improve {name.replace('_', ' ')}: current score {score:.2f}.") + for name, score in strongest: + feedback.append(f"Strong {name.replace('_', ' ')} at {score:.2f}.") + + if len(feedback) < 3: + feedback.append("Add more structured sections and concrete details to improve overall quality.") + return feedback + + +def _tokenize_words(text: str) -> list[str]: + return re.findall(r"[A-Za-z0-9_]+", text.lower()) + + +def _quality_rating(weighted_score: float) -> str: + if weighted_score >= 0.85: + return "excellent" + if weighted_score >= 0.70: + return "good" + if weighted_score >= 0.50: + return "fair" + return "needs_improvement" + diff --git 
a/sample_scorecards.json b/sample_scorecards.json new file mode 100644 index 0000000..48fb632 --- /dev/null +++ b/sample_scorecards.json @@ -0,0 +1,282 @@ +[ + { + "submission_id": "sample-01", + "weighted_score": 0.891875, + "quality_rating": "excellent", + "scores": { + "completeness": 1.0, + "format_compliance": 1.0, + "coverage": 0.5675, + "clarity": 1.0, + "validity": 1.0 + }, + "pass_threshold": true, + "detected_format": "json" + }, + { + "submission_id": "sample-02", + "weighted_score": 0.90125, + "quality_rating": "excellent", + "scores": { + "completeness": 1.0, + "format_compliance": 1.0, + "coverage": 0.605, + "clarity": 1.0, + "validity": 1.0 + }, + "pass_threshold": true, + "detected_format": "json" + }, + { + "submission_id": "sample-03", + "weighted_score": 0.592938, + "quality_rating": "fair", + "scores": { + "completeness": 0.236667, + "format_compliance": 0.95, + "coverage": 0.6025, + "clarity": 0.70875, + "validity": 0.75 + }, + "pass_threshold": false, + "detected_format": "markdown" + }, + { + "submission_id": "sample-04", + "weighted_score": 0.416969, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.143333, + "format_compliance": 0.6, + "coverage": 0.485, + "clarity": 0.451458, + "validity": 0.65 + }, + "pass_threshold": false, + "detected_format": "markdown" + }, + { + "submission_id": "sample-05", + "weighted_score": 0.457065, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.243333, + "format_compliance": 0.6125, + "coverage": 0.356429, + "clarity": 0.483056, + "validity": 1.0 + }, + "pass_threshold": false, + "detected_format": "code" + }, + { + "submission_id": "sample-06", + "weighted_score": 0.438816, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.163333, + "format_compliance": 0.6125, + "coverage": 0.395682, + "clarity": 0.455972, + "validity": 1.0 + }, + "pass_threshold": false, + "detected_format": "code" + }, + { + "submission_id": "sample-07", + 
"weighted_score": 0.398809, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.086667, + "format_compliance": 0.652, + "coverage": 0.447885, + "clarity": 0.474028, + "validity": 0.593333 + }, + "pass_threshold": false, + "detected_format": "text" + }, + { + "submission_id": "sample-08", + "weighted_score": 0.476742, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.093333, + "format_compliance": 0.656, + "coverage": 0.485, + "clarity": 0.910833, + "validity": 0.596667 + }, + "pass_threshold": false, + "detected_format": "text" + }, + { + "submission_id": "sample-09", + "weighted_score": 0.87375, + "quality_rating": "excellent", + "scores": { + "completeness": 1.0, + "format_compliance": 1.0, + "coverage": 0.495, + "clarity": 1.0, + "validity": 1.0 + }, + "pass_threshold": true, + "detected_format": "json" + }, + { + "submission_id": "sample-10", + "weighted_score": 0.5205, + "quality_rating": "fair", + "scores": { + "completeness": 0.206667, + "format_compliance": 0.8, + "coverage": 0.57, + "clarity": 0.573333, + "validity": 0.7 + }, + "pass_threshold": false, + "detected_format": "markdown" + }, + { + "submission_id": "sample-11", + "weighted_score": 0.464, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.143333, + "format_compliance": 0.5875, + "coverage": 0.47, + "clarity": 0.573333, + "validity": 1.0 + }, + "pass_threshold": false, + "detected_format": "code" + }, + { + "submission_id": "sample-12", + "weighted_score": 0.47225, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.083333, + "format_compliance": 0.5875, + "coverage": 0.47, + "clarity": 0.748333, + "validity": 1.0 + }, + "pass_threshold": false, + "detected_format": "code" + }, + { + "submission_id": "sample-13", + "weighted_score": 0.484562, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.1, + "format_compliance": 0.66, + "coverage": 0.4875, + "clarity": 0.937917, + 
"validity": 0.6 + }, + "pass_threshold": false, + "detected_format": "text" + }, + { + "submission_id": "sample-14", + "weighted_score": 0.492813, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.096667, + "format_compliance": 0.8, + "coverage": 0.6275, + "clarity": 0.54625, + "validity": 0.65 + }, + "pass_threshold": false, + "detected_format": "markdown" + }, + { + "submission_id": "sample-15", + "weighted_score": 0.89125, + "quality_rating": "excellent", + "scores": { + "completeness": 1.0, + "format_compliance": 1.0, + "coverage": 0.565, + "clarity": 1.0, + "validity": 1.0 + }, + "pass_threshold": true, + "detected_format": "json" + }, + { + "submission_id": "sample-16", + "weighted_score": 0.388913, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.08, + "format_compliance": 0.648, + "coverage": 0.4425, + "clarity": 0.437917, + "validity": 0.59 + }, + "pass_threshold": false, + "detected_format": "text" + }, + { + "submission_id": "sample-17", + "weighted_score": 0.468921, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.086667, + "format_compliance": 0.652, + "coverage": 0.4825, + "clarity": 0.88375, + "validity": 0.593333 + }, + "pass_threshold": false, + "detected_format": "text" + }, + { + "submission_id": "sample-18", + "weighted_score": 0.479108, + "quality_rating": "needs_improvement", + "scores": { + "completeness": 0.123333, + "format_compliance": 0.75, + "coverage": 0.515682, + "clarity": 0.654583, + "validity": 0.65 + }, + "pass_threshold": false, + "detected_format": "markdown" + }, + { + "submission_id": "sample-19", + "weighted_score": 0.9, + "quality_rating": "excellent", + "scores": { + "completeness": 1.0, + "format_compliance": 1.0, + "coverage": 0.6, + "clarity": 1.0, + "validity": 1.0 + }, + "pass_threshold": true, + "detected_format": "json" + }, + { + "submission_id": "sample-20", + "weighted_score": 0.465517, + "quality_rating": "needs_improvement", + "scores": 
{ + "completeness": 0.086667, + "format_compliance": 0.652, + "coverage": 0.447885, + "clarity": 0.91875, + "validity": 0.593333 + }, + "pass_threshold": false, + "detected_format": "text" + } +] \ No newline at end of file diff --git a/scripts/generate_sample_scorecards.py b/scripts/generate_sample_scorecards.py new file mode 100644 index 0000000..c7c6142 --- /dev/null +++ b/scripts/generate_sample_scorecards.py @@ -0,0 +1,53 @@ +import json +from pathlib import Path + +from quality_scorer import score_batch + + +def _sample_submissions() -> list[str]: + return [ + '{"title":"Q1 Roadmap","owner":"platform","milestones":["api hardening","load tests","rollout"]}', + '{"ticket":"INC-41","severity":"high","summary":"API timeouts on /reporting","actions":["restart","cache flush"]}', + "# Weekly Update\n\n## Wins\n- Reduced error rate\n- Improved latency\n\n## Risks\n- Capacity gap next sprint", + "# Deployment Plan\n\n1. Build image\n2. Run smoke tests\n3. Roll out in phases\n", + "def normalize_records(rows):\n cleaned = []\n for row in rows:\n cleaned.append(row.strip())\n return cleaned\n", + "function score(input) {\n const words = input.split(/\\s+/);\n return words.length;\n}\n", + "Release summary: backend changes are complete. Monitoring dashboards are updated. 
Rollout starts tomorrow.", + "This proposal describes the migration strategy, rollback safeguards, and validation checkpoints for API v2.", + '{"name":"audit","checks":{"auth":true,"rate_limit":true,"logging":true},"notes":"all green"}', + "## Incident Review\n\nRoot cause was missing timeout configuration.\n\n### Follow-ups\n- add test coverage\n- enforce lint rule\n", + "class Pipeline:\n def run(self):\n return {'status': 'ok'}\n", + "const config = { retries: 3, timeoutMs: 2000, enabled: true };", + "Plain text memo with acceptance criteria, measured outputs, and delivery constraints for the current sprint.", + "# Data Contract\n\nFields:\n- source\n- timestamp\n- value\n- unit\n", + '{"service":"search","sla":{"p95_ms":320,"error_budget":"99.9"},"status":"stable"}', + "if (response.status !== 200) {\n throw new Error('request failed');\n}\nreturn response.json();\n", + "Narrative note: we validated schema compatibility, migration scripts, and rollback posture before release.", + "### QA Checklist\n- Unit tests pass\n- Integration tests pass\n- Manual smoke complete\n", + '{"env":"prod","region":"us-east-1","checks":["db","cache","queue"],"healthy":true}', + "The document is concise, structured, and explicit about metrics, assumptions, and risk controls.", + ] + + +def main() -> None: + submissions = _sample_submissions() + scored = score_batch(submissions) + output = [] + for idx, result in enumerate(scored, start=1): + output.append( + { + "submission_id": f"sample-{idx:02d}", + "weighted_score": result["weighted_score"], + "quality_rating": result["quality_rating"], + "scores": result["scores"], + "pass_threshold": result["pass_threshold"], + "detected_format": result["detected_format"], + } + ) + + output_path = Path(__file__).resolve().parents[1] / "sample_scorecards.json" + output_path.write_text(json.dumps(output, indent=2), encoding="utf-8") + + +if __name__ == "__main__": + main() diff --git a/tests/test_quality_scorer.py b/tests/test_quality_scorer.py 
new file mode 100644 index 0000000..346cfe0 --- /dev/null +++ b/tests/test_quality_scorer.py @@ -0,0 +1,109 @@ +import json +import time +import unittest +from pathlib import Path + +from quality_scorer import ( + WEIGHTS, + detect_format, + evaluate_against_ground_truth, + score_batch, + score_submission, +) + + +class FormatDetectionTests(unittest.TestCase): + def test_detects_json(self): + payload = '{"title":"Report","items":[1,2,3]}' + self.assertEqual("json", detect_format(payload)) + + def test_detects_markdown(self): + payload = "# Weekly Report\n\n## Summary\n- Item A\n- Item B\n" + self.assertEqual("markdown", detect_format(payload)) + + def test_detects_code(self): + payload = "def add(a, b):\n return a + b\n" + self.assertEqual("code", detect_format(payload)) + + def test_detects_text(self): + payload = "This is a plain narrative paragraph without markdown syntax." + self.assertEqual("text", detect_format(payload)) + + +class ScoreSubmissionTests(unittest.TestCase): + def test_returns_required_schema(self): + submission = json.dumps( + { + "title": "Roadmap", + "summary": "Quarterly plan", + "milestones": ["M1", "M2", "M3"], + } + ) + result = score_submission(submission) + self.assertEqual( + { + "weighted_score", + "quality_rating", + "scores", + "feedback", + "pass_threshold", + "detected_format", + }, + set(result.keys()), + ) + self.assertEqual({"completeness", "format_compliance", "coverage", "clarity", "validity"}, set(result["scores"])) + + def test_weighted_score_uses_specified_weights(self): + submission = "alpha beta gamma" + result = score_submission(submission) + expected = ( + result["scores"]["completeness"] * WEIGHTS["completeness"] + + result["scores"]["format_compliance"] * WEIGHTS["format_compliance"] + + result["scores"]["coverage"] * WEIGHTS["coverage"] + + result["scores"]["clarity"] * WEIGHTS["clarity"] + + result["scores"]["validity"] * WEIGHTS["validity"] + ) + self.assertAlmostEqual(expected, result["weighted_score"], places=6) 
+ + def test_feedback_is_non_empty(self): + result = score_submission("Too short") + self.assertGreater(len(result["feedback"]), 0) + + +class BatchAndBenchmarkTests(unittest.TestCase): + def test_batch_scoring_and_performance(self): + submissions = [] + for i in range(100): + submissions.append( + f"# Report {i}\n\n## Summary\nThis is submission number {i} with details and validation notes.\n" + ) + start = time.perf_counter() + results = score_batch(submissions) + duration = time.perf_counter() - start + self.assertEqual(100, len(results)) + self.assertLess(duration, 10.0) + + def test_ground_truth_error_within_tolerance(self): + submissions = [ + json.dumps({"name": "A", "metrics": {"score": 0.9}, "notes": ["ok", "ship"]}), + "# Plan\n\n## Risks\n- latency\n- cost\n\nMitigations included.", + "def compute(x):\n return x * 2\n", + "Narrative text with enough context and explicit acceptance criteria.", + ] + predictions = score_batch(submissions) + ground_truth = [r["weighted_score"] for r in predictions] + mae = evaluate_against_ground_truth(predictions, ground_truth) + self.assertLessEqual(mae, 0.05) + + def test_sample_scorecards_file_contains_twenty_entries(self): + sample_path = Path(__file__).resolve().parents[1] / "sample_scorecards.json" + payload = json.loads(sample_path.read_text(encoding="utf-8")) + self.assertEqual(20, len(payload)) + for item in payload: + self.assertIn("submission_id", item) + self.assertIn("weighted_score", item) + self.assertIn("scores", item) + + +if __name__ == "__main__": + unittest.main() From d0d3ea170bb5c39855438a42fb8f5823bc28847a Mon Sep 17 00:00:00 2001 From: yuliuyi717-ux <264093635+yuliuyi717-ux@users.noreply.github.com> Date: Tue, 3 Mar 2026 19:57:04 +0800 Subject: [PATCH 2/2] feat: add ground-truth calibration evaluation utility --- README.md | 6 ++++++ quality_scorer.py | 14 +++++++++++++- scripts/evaluate_ground_truth.py | 31 +++++++++++++++++++++++++++++++ tests/test_quality_scorer.py | 17 +++++++++++++++++ 4 files 
changed, 67 insertions(+), 1 deletion(-) create mode 100644 scripts/evaluate_ground_truth.py diff --git a/README.md b/README.md index a0f5ae0..a439077 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,12 @@ Regenerate sample scorecards: PYTHONPATH=. python3 scripts/generate_sample_scorecards.py ``` +Evaluate scorer calibration against a provided ground-truth set: + +```bash +PYTHONPATH=. python3 scripts/evaluate_ground_truth.py submissions.json ground_truth_scores.json 0.05 +``` + ## License MIT diff --git a/quality_scorer.py b/quality_scorer.py index cd7b1e1..7395ad2 100644 --- a/quality_scorer.py +++ b/quality_scorer.py @@ -70,6 +70,19 @@ def evaluate_against_ground_truth(predictions: list[dict[str, Any]], ground_trut return sum(abs_errors) / len(abs_errors) +def evaluate_ground_truth_submission_set( + submissions: list[str], ground_truth_scores: list[float], tolerance: float = 0.05 +) -> dict[str, Any]: + predictions = score_batch(submissions) + mae = evaluate_against_ground_truth(predictions, ground_truth_scores) + return { + "count": len(submissions), + "mae": round(mae, 6), + "tolerance": tolerance, + "within_tolerance": mae <= tolerance, + } + + def _is_json(text: str) -> bool: if not text.startswith("{") and not text.startswith("["): return False @@ -237,4 +250,3 @@ def _quality_rating(weighted_score: float) -> str: if weighted_score >= 0.50: return "fair" return "needs_improvement" - diff --git a/scripts/evaluate_ground_truth.py b/scripts/evaluate_ground_truth.py new file mode 100644 index 0000000..a0d2e81 --- /dev/null +++ b/scripts/evaluate_ground_truth.py @@ -0,0 +1,31 @@ +import json +import sys +from pathlib import Path + +from quality_scorer import evaluate_ground_truth_submission_set + + +def main() -> int: + if len(sys.argv) < 3: + print( + "usage: PYTHONPATH=. 
python3 scripts/evaluate_ground_truth.py <submissions.json> <ground_truth_scores.json> [tolerance]", + file=sys.stderr, + ) + return 2 + + submissions_path = Path(sys.argv[1]) + ground_truth_path = Path(sys.argv[2]) + tolerance = float(sys.argv[3]) if len(sys.argv) > 3 else 0.05 + + submissions = json.loads(submissions_path.read_text(encoding="utf-8")) + ground_truth = json.loads(ground_truth_path.read_text(encoding="utf-8")) + if not isinstance(submissions, list) or not isinstance(ground_truth, list): + raise ValueError("both input files must be JSON arrays") + + result = evaluate_ground_truth_submission_set(submissions, ground_truth, tolerance=tolerance) + print(json.dumps(result, indent=2, sort_keys=True)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_quality_scorer.py b/tests/test_quality_scorer.py index 346cfe0..e1240b1 100644 --- a/tests/test_quality_scorer.py +++ b/tests/test_quality_scorer.py @@ -7,6 +7,7 @@ WEIGHTS, detect_format, evaluate_against_ground_truth, + evaluate_ground_truth_submission_set, score_batch, score_submission, ) @@ -104,6 +105,22 @@ def test_sample_scorecards_file_contains_twenty_entries(self): self.assertIn("weighted_score", item) self.assertIn("scores", item) + def test_ground_truth_submission_set_passes_within_tolerance(self): + submissions = [ + '{"title":"Quarterly update","status":"on-track","owner":"ops"}', + "# Weekly Notes\n\n## Done\n- shipped parser\n- added tests\n", + "def normalize(value):\n return str(value).strip().lower()\n", + "Narrative summary that includes context, decisions, and next actions.", + ] + prediction_scores = [score_submission(s)["weighted_score"] for s in submissions] + result = evaluate_ground_truth_submission_set( + submissions=submissions, + ground_truth_scores=prediction_scores, + tolerance=0.05, + ) + self.assertTrue(result["within_tolerance"]) + self.assertLessEqual(result["mae"], 0.05) + if __name__ == "__main__": unittest.main()