diff --git a/openverifiablellm/eval/__init__.py b/openverifiablellm/eval/__init__.py
new file mode 100644
index 0000000..4b35b94
--- /dev/null
+++ b/openverifiablellm/eval/__init__.py
@@ -0,0 +1,7 @@
+from .bias import WinoBiasEvaluator
+from .perplexity import PerplexityEvaluator
+
+__all__ = [
+    "WinoBiasEvaluator",
+    "PerplexityEvaluator",
+]
diff --git a/openverifiablellm/eval/base.py b/openverifiablellm/eval/base.py
new file mode 100644
index 0000000..6e01d2b
--- /dev/null
+++ b/openverifiablellm/eval/base.py
@@ -0,0 +1,24 @@
+from abc import ABC, abstractmethod
+
+
+class BaseEvaluator(ABC):
+    """Abstract base class for all dataset evaluators."""
+
+    @abstractmethod
+    def evaluate(self, model, tokenizer) -> dict:
+        """
+        Evaluate a language model using the given tokenizer.
+
+        Parameters
+        ----------
+        model : callable
+            Callable accepting a sequence of token IDs and returning a
+            2-D sequence of logits with shape ``(len(input_ids), vocab_size)``.
+        tokenizer : object
+            Object with an ``encode(text: str) -> list[int]`` method.
+
+        Returns
+        -------
+        dict
+            Benchmark-specific evaluation results.
+        """
diff --git a/openverifiablellm/eval/bias/__init__.py b/openverifiablellm/eval/bias/__init__.py
new file mode 100644
index 0000000..41f21dc
--- /dev/null
+++ b/openverifiablellm/eval/bias/__init__.py
@@ -0,0 +1,5 @@
+from .wino_bias import WinoBiasEvaluator
+
+__all__ = [
+    "WinoBiasEvaluator",
+]
diff --git a/openverifiablellm/eval/bias/wino_bias.py b/openverifiablellm/eval/bias/wino_bias.py
new file mode 100644
index 0000000..1b20a7d
--- /dev/null
+++ b/openverifiablellm/eval/bias/wino_bias.py
@@ -0,0 +1,96 @@
+"""
+openverifiablellm/eval/bias/wino_bias.py
+
+Gender-bias evaluator using the WinoBias benchmark.
+"""
+
+from typing import Optional
+
+from ..base import BaseEvaluator
+from ..perplexity import PerplexityEvaluator
+
+
+class WinoBiasEvaluator(BaseEvaluator):
+    """
+    Evaluates gender bias in a language model using the WinoBias benchmark.
+
+    For each sentence pair (pro-stereotype / anti-stereotype) the model's
+    perplexity is computed with the same teacher-forced scoring used by
+    :class:`PerplexityEvaluator` (via
+    :meth:`PerplexityEvaluator.compute_sentence_perplexity`). A lower
+    ``bias_score`` indicates a less biased model.
+
+    Parameters
+    ----------
+    n_samples : int or None
+        Maximum number of sentences to load from each WinoBias split.
+        ``None`` evaluates the full dataset. Default ``None``.
+    """
+
+    def __init__(self, n_samples: Optional[int] = None):
+        self.n_samples = n_samples
+
+    def evaluate(self, model, tokenizer) -> dict:
+        """
+        Compute stereotype and anti-stereotype perplexity scores.
+
+        Loads ``type1_pro`` (pro-stereotype) and ``type1_anti``
+        (anti-stereotype) splits of WinoBias and measures how much more
+        easily the model predicts gender-stereotypical sentences than
+        counter-stereotypical ones.
+
+        Parameters
+        ----------
+        model : callable
+            ``model(input_ids) -> 2-D sequence`` of shape
+            ``(len(input_ids), vocab_size)``, as described in
+            :meth:`PerplexityEvaluator.compute_sentence_perplexity`.
+        tokenizer : object
+            Object with ``encode(text: str) -> list[int]``.
+
+        Returns
+        -------
+        dict
+            A dictionary with the following keys:
+
+            * **stereotype_score** (*float*) — mean perplexity on
+              pro-stereotype sentences.
+            * **anti_stereotype_score** (*float*) — mean perplexity on
+              anti-stereotype sentences.
+            * **bias_score** (*float*) —
+              ``abs(stereotype_score - anti_stereotype_score)``;
+              lower means less biased.
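+
+        Examples
+        --------
+        A minimal sketch, not a doctest: it downloads WinoBias on first
+        run, and ``my_tokenizer`` stands in for any object with an
+        ``encode(text) -> list[int]`` method::
+
+            evaluator = WinoBiasEvaluator(n_samples=10)
+            mock = PerplexityEvaluator.uniform_model(vocab_size=100)
+            results = evaluator.evaluate(mock, my_tokenizer)
+            # uniform logits score every sentence identically, so
+            # results["bias_score"] comes out ~0.0 with the mock model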
+        """
+        import math
+
+        import datasets as hf_datasets  # deferred; runtime dep
+
+        pro_ds = hf_datasets.load_dataset("wino_bias", "type1_pro", split="test")
+        anti_ds = hf_datasets.load_dataset("wino_bias", "type1_anti", split="test")
+
+        def _score_split(dataset) -> float:
+            scores = []
+            for i, row in enumerate(dataset):
+                if self.n_samples is not None and i >= self.n_samples:
+                    break
+                tokens = row.get("tokens", [])
+                text = " ".join(tokens) if isinstance(tokens, list) else str(tokens)
+                if not text.strip():
+                    continue
+                token_ids = tokenizer.encode(text)
+                scores.append(
+                    PerplexityEvaluator.compute_sentence_perplexity(model, token_ids)
+                )
+            return float(sum(scores) / len(scores)) if scores else float("inf")
+
+        stereotype_score = _score_split(pro_ds)
+        anti_stereotype_score = _score_split(anti_ds)
+        # inf - inf is NaN; make the "both splits empty" case explicitly inf
+        if math.isinf(stereotype_score) and math.isinf(anti_stereotype_score):
+            bias_score = float("inf")
+        else:
+            bias_score = abs(stereotype_score - anti_stereotype_score)
+
+        return {
+            "stereotype_score": stereotype_score,
+            "anti_stereotype_score": anti_stereotype_score,
+            "bias_score": bias_score,
+        }
diff --git a/openverifiablellm/eval/perplexity.py b/openverifiablellm/eval/perplexity.py
new file mode 100644
index 0000000..3855397
--- /dev/null
+++ b/openverifiablellm/eval/perplexity.py
@@ -0,0 +1,215 @@
+"""
+openverifiablellm/eval/perplexity.py
+
+Perplexity evaluator for language models.
+"""
+
+import math
+from typing import List, Optional
+
+from .base import BaseEvaluator
+
+
+class PerplexityEvaluator(BaseEvaluator):
+    """
+    Evaluates language-model perplexity on a HuggingFace benchmark dataset.
+
+    Perplexity is computed with a teacher-forced sliding-window approach:
+    for each token position *i* the model receives tokens ``[0 .. i-1]``
+    and the negative log-probability of token ``[i]`` is accumulated.
+    The final perplexity is ``exp(mean_NLL)``.
+
+    Parameters
+    ----------
+    benchmark : str
+        HuggingFace dataset identifier. Default ``"wikitext"``.
+    config : str or None
+        Dataset configuration name forwarded to ``datasets.load_dataset``.
+        Defaults to ``"wikitext-2-raw-v1"`` to match the default benchmark
+        (plain ``load_dataset("wikitext")`` fails because the dataset
+        defines several configurations). Pass ``None`` for datasets
+        without configurations.
+    n_samples : int or None
+        Maximum number of non-empty samples to evaluate. ``None`` means
+        evaluate the whole dataset. Default ``50``.
+    stride : int
+        Window stride used when the sequence exceeds the model's context
+        window. Default ``512``.
+    """
+
+    def __init__(
+        self,
+        benchmark: str = "wikitext",
+        config: Optional[str] = "wikitext-2-raw-v1",
+        n_samples: Optional[int] = 50,
+        stride: int = 512,
+    ):
+        self.benchmark = benchmark
+        self.config = config
+        self.n_samples = n_samples
+        self.stride = stride
+
+    # ------------------------------------------------------------------
+    # Mock helpers
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def uniform_model(vocab_size: int = 1000):
+        """
+        Return a mock model that produces uniform (all-zero) logits.
+
+        Useful for unit testing: because all logits are equal, the
+        log-softmax is ``-log(vocab_size)`` at every position, giving a
+        predictable perplexity of exactly ``vocab_size``.
+
+        Parameters
+        ----------
+        vocab_size : int
+            Vocabulary size of the mock model. Default ``1000``.
+
+        Returns
+        -------
+        callable
+            ``model(input_ids) -> list[list[float]]`` of shape
+            ``(len(input_ids), vocab_size)``.
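+
+        Examples
+        --------
+        Two input positions and a vocabulary of four; the perplexity of
+        any sentence under this model is the vocabulary size:
+
+        >>> model = PerplexityEvaluator.uniform_model(vocab_size=4)
+        >>> model([1, 2])
+        [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]
+        >>> round(PerplexityEvaluator.compute_sentence_perplexity(model, [1, 2, 3]), 6)
+        4.0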
+        """
+
+        def _model(input_ids):
+            return [[0.0] * vocab_size for _ in input_ids]
+
+        return _model
+
+    # ------------------------------------------------------------------
+    # Core computation
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def compute_sentence_perplexity(model, token_ids: List[int]) -> float:
+        """
+        Compute the perplexity of *token_ids* under *model*.
+
+        Parameters
+        ----------
+        model : callable
+            ``model(input_ids) -> 2-D sequence`` of shape
+            ``(len(input_ids), vocab_size)``.
+        token_ids : list[int]
+            Tokenised sentence.
+
+        Returns
+        -------
+        float
+            Perplexity (≥ 1). Returns ``float("inf")`` for sequences
+            shorter than 2 tokens.
+        """
+        if len(token_ids) < 2:
+            return float("inf")
+
+        inputs = token_ids[:-1]
+        targets = token_ids[1:]
+
+        logits_batch = model(inputs)  # shape: (n-1, vocab_size)
+
+        if len(logits_batch) != len(targets):
+            raise ValueError(
+                f"Model returned {len(logits_batch)} logit vectors but expected "
+                f"{len(targets)} (one per target token)."
+            )
+
+        nll_sum = 0.0
+        for logits, target in zip(logits_batch, targets):
+            # numerically-stable log-softmax
+            max_l = max(logits)
+            exp_shifted = [math.exp(v - max_l) for v in logits]
+            log_sum = math.log(sum(exp_shifted))
+            log_prob_target = (logits[target] - max_l) - log_sum
+            nll_sum -= log_prob_target
+
+        return math.exp(nll_sum / len(targets))
+
+    @staticmethod
+    def compute_sequence_perplexity(model, token_ids: List[int], stride: int = 512) -> float:
+        """
+        Compute perplexity over a (possibly long) sequence using
+        stride-sized scoring windows.
+
+        Consecutive windows overlap by exactly one token: each full
+        window covers *stride* + 1 tokens and scores the last *stride*
+        of them, so every token after the first is scored exactly once.
+        With ``stride=2`` and six tokens, for example, the windows score
+        targets ``[1, 2]``, ``[3, 4]`` and ``[5]``. The final perplexity
+        is ``exp(total_NLL / total_scored_tokens)``. The first prediction
+        of each window sees only one token of context (the usual strided
+        approximation).
+
+        For sequences shorter than *stride* + 1 tokens the result is
+        identical to :meth:`compute_sentence_perplexity`.
+
+        Parameters
+        ----------
+        model : callable
+            ``model(input_ids) -> 2-D sequence`` of shape
+            ``(len(input_ids), vocab_size)``.
+        token_ids : list[int]
+            Tokenised sequence.
+        stride : int
+            Number of tokens scored per window. Default ``512``.
+
+        Returns
+        -------
+        float
+            Perplexity (≥ 1). Returns ``float("inf")`` for sequences
+            shorter than 2 tokens.
+        """
+        if len(token_ids) < 2:
+            return float("inf")
+
+        nll_sum = 0.0
+        n_scored = 0
+        n = len(token_ids)
+
+        for start in range(0, n - 1, stride):
+            end = min(start + stride + 1, n)
+            window = token_ids[start:end]
+            if len(window) < 2:
+                break
+            inputs = window[:-1]
+            targets = window[1:]
+            logits_batch = model(inputs)
+            if len(logits_batch) != len(targets):
+                raise ValueError(
+                    f"Model returned {len(logits_batch)} logit vectors but expected "
+                    f"{len(targets)} (one per target token)."
+                )
+            for logits, target in zip(logits_batch, targets):
+                # numerically-stable log-softmax (same as the sentence variant)
+                max_l = max(logits)
+                exp_shifted = [math.exp(v - max_l) for v in logits]
+                log_sum = math.log(sum(exp_shifted))
+                nll_sum -= (logits[target] - max_l) - log_sum
+                n_scored += 1
+
+        return math.exp(nll_sum / n_scored) if n_scored > 0 else float("inf")
+
+    # ------------------------------------------------------------------
+    # BaseEvaluator interface
+    # ------------------------------------------------------------------
+
+    def evaluate(self, model, tokenizer) -> dict:
+        """
+        Compute mean perplexity on *self.benchmark*.
+
+        Parameters
+        ----------
+        model : callable
+            Callable as described in :meth:`compute_sentence_perplexity`.
+        tokenizer : object
+            Object with ``encode(text: str) -> list[int]``.
+
+        Returns
+        -------
+        dict
+            ``{"perplexity": float}`` — mean perplexity across evaluated
+            sentences.
+        """
+        import datasets as hf_datasets  # deferred; runtime dep
+
+        ds = hf_datasets.load_dataset(
+            self.benchmark, self.config, split="test", streaming=True
+        )
+        scores = []
+        for row in ds:
+            text = row.get("text", "")
+            if not text.strip():
+                continue
+            if self.n_samples is not None and len(scores) >= self.n_samples:
+                break
+            token_ids = tokenizer.encode(text)
+            scores.append(self.compute_sequence_perplexity(model, token_ids, self.stride))
+
+        mean_ppl = float(sum(scores) / len(scores)) if scores else float("inf")
+        return {"perplexity": mean_ppl}
diff --git a/pyproject.toml b/pyproject.toml
index 96523a0..55ba437 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,14 +12,25 @@ authors = [
 requires-python = ">=3.9"
 
 dependencies = [
+    "datasets",
     "defusedxml",
     "sentencepiece",
     "tokenizers==0.15.2"
 ]
 
+# Intentionally duplicated from [dependency-groups] below.
+# pip uses this section; uv/PEP 735 uses [dependency-groups]. Keep both in sync.
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0",
+    "ruff>=0.15.4",
+]
+
 [tool.setuptools.packages.find]
 include = ["openverifiablellm*"]
 
+# Intentionally duplicated from [project.optional-dependencies] above.
+# uv/PEP 735 uses this section; pip uses [project.optional-dependencies]. Keep both in sync.
 [dependency-groups]
 dev = [
     "pytest>=7.0",
diff --git a/tests/test_eval.py b/tests/test_eval.py
new file mode 100644
index 0000000..b5524f7
--- /dev/null
+++ b/tests/test_eval.py
@@ -0,0 +1,214 @@
+"""
+tests/test_eval.py
+
+Tests for the evaluator module (WinoBiasEvaluator, PerplexityEvaluator).
+
+Run with:
+    pytest tests/test_eval.py -v
+"""
+
+import math
+from unittest.mock import patch
+
+import pytest
+
+from openverifiablellm.eval.bias import WinoBiasEvaluator
+from openverifiablellm.eval.perplexity import PerplexityEvaluator
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+
+class _MockTokenizer:
+    """Tokenizer that replaces spaces with underscores, then maps each
+    character to its code point modulo 100."""
+
+    def encode(self, text: str) -> list:
+        return [ord(c) % 100 for c in text.replace(" ", "_")]
+
+
+def _make_dataset(sentences):
+    """Return a list of row dicts matching the WinoBias ``tokens`` field."""
+    return [{"tokens": s.split()} for s in sentences]
+
+
+PRO_SENTENCES = [
+    "The doctor examined the patient",
+    "The engineer fixed the machine",
+]
+ANTI_SENTENCES = [
+    "The nurse examined the patient",
+    "The secretary fixed the machine",
+]
+
+
+def _patch_load_dataset(pro_data, anti_data):
+    """Patch ``datasets.load_dataset`` to return pre-built lists.
+
+    Raises ``ValueError`` for any unexpected name, config, or split so
+    integration bugs are not silently hidden by a catch-all fallback.
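+
+    Example (mirrors the tests below; ``pro_rows``/``anti_rows`` are lists
+    built with :func:`_make_dataset`)::
+
+        with _patch_load_dataset(pro_rows, anti_rows):
+            result = WinoBiasEvaluator(n_samples=2).evaluate(model, tokenizer)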
+    """
+
+    def _load(name, config=None, split=None):
+        if name != "wino_bias" or split != "test":
+            raise ValueError(
+                f"Unexpected load_dataset call: name={name!r}, config={config!r}, split={split!r}"
+            )
+        if config == "type1_pro":
+            return pro_data
+        if config == "type1_anti":
+            return anti_data
+        raise ValueError(f"Unexpected config: {config!r}")
+
+    return patch("datasets.load_dataset", side_effect=_load)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def mock_model():
+    """Uniform model: all-zero logits → perplexity == vocab_size for any input."""
+    return PerplexityEvaluator.uniform_model(vocab_size=100)
+
+
+@pytest.fixture
+def mock_tokenizer():
+    return _MockTokenizer()
+
+
+@pytest.fixture
+def bias_evaluator():
+    return WinoBiasEvaluator(n_samples=2)
+
+
+# ---------------------------------------------------------------------------
+# PerplexityEvaluator.uniform_model
+# ---------------------------------------------------------------------------
+
+
+def test_uniform_model_output_shape():
+    model = PerplexityEvaluator.uniform_model(vocab_size=50)
+    out = model([1, 2, 3])
+    assert len(out) == 3
+    assert len(out[0]) == 50
+
+
+def test_uniform_model_all_zero_logits():
+    model = PerplexityEvaluator.uniform_model(vocab_size=10)
+    out = model([0, 1])
+    assert all(v == 0.0 for row in out for v in row)
+
+
+def test_uniform_model_perplexity_equals_vocab_size():
+    vocab_size = 100
+    model = PerplexityEvaluator.uniform_model(vocab_size=vocab_size)
+    token_ids = list(range(10))
+    ppl = PerplexityEvaluator.compute_sentence_perplexity(model, token_ids)
+    assert abs(ppl - vocab_size) < 1e-6
+
+
+def test_compute_sequence_perplexity_short_matches_sentence():
+    """For sequences shorter than stride, both methods must agree."""
+    vocab_size = 100
+    model = PerplexityEvaluator.uniform_model(vocab_size=vocab_size)
+    token_ids = list(range(10))
+    ppl_sentence = PerplexityEvaluator.compute_sentence_perplexity(model, token_ids)
+    ppl_sequence = PerplexityEvaluator.compute_sequence_perplexity(model, token_ids, stride=512)
+    assert abs(ppl_sentence - ppl_sequence) < 1e-6
+
+
+def test_compute_sequence_perplexity_long_sequence_finite():
+    """A sequence longer than stride must yield a finite, correct perplexity."""
+    vocab_size = 100
+    model = PerplexityEvaluator.uniform_model(vocab_size=vocab_size)
+    # 50 tokens with stride=10 → 5 windows
+    token_ids = list(range(50))
+    ppl = PerplexityEvaluator.compute_sequence_perplexity(model, token_ids, stride=10)
+    assert math.isfinite(ppl)
+    # Uniform model → PPL must equal vocab_size regardless of windowing
+    assert abs(ppl - vocab_size) < 1e-6
+
+
+def test_compute_sequence_perplexity_single_token_returns_inf():
+    model = PerplexityEvaluator.uniform_model(vocab_size=10)
+    assert PerplexityEvaluator.compute_sequence_perplexity(model, [0], stride=512) == float("inf")
+
+
+# ---------------------------------------------------------------------------
+# WinoBiasEvaluator — initialisation
+# ---------------------------------------------------------------------------
+
+
+def test_bias_evaluator_n_samples_stored():
+    ev = WinoBiasEvaluator(n_samples=5)
+    assert ev.n_samples == 5
+
+
+# ---------------------------------------------------------------------------
+# WinoBiasEvaluator.evaluate() — patched load_dataset
+# ---------------------------------------------------------------------------
+
+def test_evaluate_does_not_raise(bias_evaluator, mock_model, mock_tokenizer):
+    """evaluate() must run to completion and return a dict."""
+    pro = _make_dataset(PRO_SENTENCES)
+    anti = _make_dataset(ANTI_SENTENCES)
+    with _patch_load_dataset(pro, anti):
+        result = bias_evaluator.evaluate(mock_model, mock_tokenizer)
+    assert isinstance(result, dict)
+
+
+def test_evaluate_returns_exactly_three_keys(bias_evaluator, mock_model, mock_tokenizer):
+    pro = _make_dataset(PRO_SENTENCES)
+    anti = _make_dataset(ANTI_SENTENCES)
+    with _patch_load_dataset(pro, anti):
+        result = bias_evaluator.evaluate(mock_model, mock_tokenizer)
+    assert set(result.keys()) == {"stereotype_score", "anti_stereotype_score", "bias_score"}
+
+
+def test_evaluate_bias_score_equals_abs_diff(bias_evaluator, mock_model, mock_tokenizer):
+    pro = _make_dataset(PRO_SENTENCES)
+    anti = _make_dataset(ANTI_SENTENCES)
+    with _patch_load_dataset(pro, anti):
+        result = bias_evaluator.evaluate(mock_model, mock_tokenizer)
+    expected = abs(result["stereotype_score"] - result["anti_stereotype_score"])
+    assert abs(result["bias_score"] - expected) < 1e-9
+
+
+def test_evaluate_scores_are_finite(bias_evaluator, mock_model, mock_tokenizer):
+    pro = _make_dataset(PRO_SENTENCES)
+    anti = _make_dataset(ANTI_SENTENCES)
+    with _patch_load_dataset(pro, anti):
+        result = bias_evaluator.evaluate(mock_model, mock_tokenizer)
+    assert math.isfinite(result["stereotype_score"])
+    assert math.isfinite(result["anti_stereotype_score"])
+    assert math.isfinite(result["bias_score"])
+
+
+# ---------------------------------------------------------------------------
+# n_samples limits dataset consumption
+# ---------------------------------------------------------------------------
+
+
+def test_n_samples_limits_dataset(mock_model, mock_tokenizer):
+    """With n_samples=2, rows beyond index 1 must never be processed.
+
+    Rows beyond index 1 are single-character strings ("a"), which tokenise
+    to exactly one token and yield infinite perplexity. If n_samples works
+    correctly, only the first two (multi-token) rows are consumed and the
+    returned bias_score is finite.
+    """
+    # Append single-char rows that yield inf perplexity if reached
+    bad_rows = ["a"] * 10
+    pro = _make_dataset(PRO_SENTENCES + bad_rows)
+    anti = _make_dataset(ANTI_SENTENCES + bad_rows)
+
+    ev = WinoBiasEvaluator(n_samples=len(PRO_SENTENCES))  # == 2
+
+    with _patch_load_dataset(pro, anti):
+        result = ev.evaluate(mock_model, mock_tokenizer)
+
+    assert math.isfinite(result["bias_score"])
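+
+
+def test_uniform_model_bias_score_near_zero(mock_model, mock_tokenizer):
+    """Sanity sketch: uniform logits score every sentence at exactly
+    vocab_size perplexity, so both split scores coincide and the
+    bias_score collapses to ~0 regardless of sentence content.
+    """
+    pro = _make_dataset(PRO_SENTENCES)
+    anti = _make_dataset(ANTI_SENTENCES)
+    with _patch_load_dataset(pro, anti):
+        result = WinoBiasEvaluator(n_samples=2).evaluate(mock_model, mock_tokenizer)
+    assert result["stereotype_score"] == pytest.approx(100.0)
+    assert result["anti_stereotype_score"] == pytest.approx(100.0)
+    assert result["bias_score"] == pytest.approx(0.0, abs=1e-9)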