-
-
Notifications
You must be signed in to change notification settings - Fork 28
Feat/bias evaluator WinoBias (Gender) #83
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
12d5d55
b3ef1fb
3e8addc
33a36dd
c6288c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
"""Public API of the evaluation package.

Re-exports the concrete evaluators so callers can import them from this
package directly instead of from the submodules that define them.
"""
from .bias import WinoBiasEvaluator
from .perplexity import PerplexityEvaluator

# Names exported via ``from <package> import *``.
__all__ = [
    "WinoBiasEvaluator",
    "PerplexityEvaluator",
]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| from abc import ABC, abstractmethod | ||
|
|
||
|
|
||
class BaseEvaluator(ABC):
    """Common interface implemented by every dataset evaluator."""

    @abstractmethod
    def evaluate(self, model, tokenizer) -> dict:
        """
        Run the benchmark against *model* with the given *tokenizer*.

        Parameters
        ----------
        model : callable
            Maps a sequence of token IDs to a 2-D sequence of logits of
            shape ``(len(input_ids), vocab_size)``.
        tokenizer : object
            Provides an ``encode(text: str) -> list[int]`` method.

        Returns
        -------
        dict
            Results specific to the concrete benchmark.
        """
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
"""Bias-evaluation subpackage.

Currently exposes only the WinoBias gender-bias evaluator.
"""
from .wino_bias import WinoBiasEvaluator

# Names exported via ``from <subpackage> import *``.
__all__ = [
    "WinoBiasEvaluator",
]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,96 @@ | ||
| """ | ||
| openverifiablellm/eval/bias/wino_bias.py | ||
|
|
||
| Gender-bias evaluator using the WinoBias benchmark. | ||
| """ | ||
|
|
||
| from typing import Optional | ||
|
|
||
| from ..base import BaseEvaluator | ||
| from ..perplexity import PerplexityEvaluator | ||
|
|
||
|
|
||
class WinoBiasEvaluator(BaseEvaluator):
    """
    Evaluates gender bias in a language model using the WinoBias benchmark.

    For each sentence pair (pro-stereotype / anti-stereotype) the model's
    perplexity is computed via the same sliding-window method used by
    :class:`PerplexityEvaluator`. A lower ``bias_score`` indicates a less
    biased model.

    Parameters
    ----------
    n_samples : int or None
        Maximum number of sentences to load from each WinoBias split.
        ``None`` evaluates the full dataset. Default ``None``.
    """

    def __init__(self, n_samples: Optional[int] = None):
        # None means "score every sentence in each split".
        self.n_samples = n_samples

    def evaluate(self, model, tokenizer) -> dict:
        """
        Compute stereotype and anti-stereotype perplexity scores.

        Loads ``type1_pro`` (pro-stereotype) and ``type1_anti``
        (anti-stereotype) splits of WinoBias and measures how much more
        easily the model predicts gender-stereotypical sentences than
        counter-stereotypical ones.

        Sentences whose perplexity is non-finite (e.g. fewer than two
        tokens) are skipped, so a single malformed row cannot drag a
        split's mean to infinity.

        Parameters
        ----------
        model : callable
            ``model(input_ids) -> 2-D sequence`` of shape
            ``(len(input_ids), vocab_size)``, as described in
            :meth:`PerplexityEvaluator.compute_sentence_perplexity`.
        tokenizer : object
            Object with ``encode(text: str) -> list[int]``.

        Returns
        -------
        dict
            A dictionary with the following keys:

            * **stereotype_score** (*float*) — mean perplexity on
              pro-stereotype sentences.
            * **anti_stereotype_score** (*float*) — mean perplexity on
              anti-stereotype sentences.
            * **bias_score** (*float*) —
              ``abs(stereotype_score - anti_stereotype_score)``;
              lower means less biased.
        """
        # Both imports are deferred: ``datasets`` is a heavy runtime dep,
        # and keeping ``math`` here mirrors that pattern while staying at
        # the top of the method rather than buried between statements.
        import math
        import datasets as hf_datasets

        pro_ds = hf_datasets.load_dataset("wino_bias", "type1_pro", split="test")
        anti_ds = hf_datasets.load_dataset("wino_bias", "type1_anti", split="test")

        def _score_split(dataset) -> float:
            # Mean perplexity over at most ``self.n_samples`` sentences;
            # float("inf") when no sentence produced a finite score.
            scores = []
            for i, row in enumerate(dataset):
                if self.n_samples is not None and i >= self.n_samples:
                    break
                tokens = row.get("tokens", [])
                text = " ".join(tokens) if isinstance(tokens, list) else str(tokens)
                if not text.strip():
                    continue
                token_ids = tokenizer.encode(text)
                ppl = PerplexityEvaluator.compute_sentence_perplexity(model, token_ids)
                # Filter non-finite values so one degenerate sentence does
                # not make the whole split's mean infinite.
                if math.isfinite(ppl):
                    scores.append(ppl)
            return float(sum(scores) / len(scores)) if scores else float("inf")

        stereotype_score = _score_split(pro_ds)
        anti_stereotype_score = _score_split(anti_ds)
        # abs(inf - inf) would be NaN; report inf explicitly when neither
        # split produced a finite mean.
        if math.isinf(stereotype_score) and math.isinf(anti_stereotype_score):
            bias_score = float("inf")
        else:
            bias_score = abs(stereotype_score - anti_stereotype_score)

        return {
            "stereotype_score": stereotype_score,
            "anti_stereotype_score": anti_stereotype_score,
            "bias_score": bias_score,
        }
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,215 @@ | ||
| """ | ||
| openverifiablellm/eval/perplexity.py | ||
|
|
||
| Perplexity evaluator for language models. | ||
| """ | ||
|
|
||
| import math | ||
| from typing import List, Optional | ||
|
|
||
| from .base import BaseEvaluator | ||
|
|
||
|
|
||
class PerplexityEvaluator(BaseEvaluator):
    """
    Evaluates language-model perplexity on a HuggingFace benchmark dataset.

    Perplexity is computed with a teacher-forced sliding-window approach:
    for each token position *i* the model receives tokens ``[0 .. i-1]``
    and the negative log-probability of token ``[i]`` is accumulated.
    The final perplexity is ``exp(mean_NLL)``.

    Parameters
    ----------
    benchmark : str
        HuggingFace dataset identifier. Default ``"wikitext"``.
    n_samples : int or None
        Maximum number of non-empty samples to evaluate. ``None`` means
        evaluate the whole dataset. Default ``50``.
    stride : int
        Window stride used when the sequence exceeds the model's context
        window. Default ``512``.
    """

    def __init__(
        self,
        benchmark: str = "wikitext",
        n_samples: Optional[int] = 50,
        stride: int = 512,
    ):
        self.benchmark = benchmark
        self.n_samples = n_samples
        self.stride = stride

    # ------------------------------------------------------------------
    # Mock helpers
    # ------------------------------------------------------------------

    @staticmethod
    def uniform_model(vocab_size: int = 1000):
        """
        Return a mock model that produces uniform (all-zero) logits.

        Useful for unit testing: because all logits are equal, the
        log-softmax is ``-log(vocab_size)`` at every position, giving a
        predictable perplexity of exactly ``vocab_size``.

        Parameters
        ----------
        vocab_size : int
            Vocabulary size of the mock model. Default ``1000``.

        Returns
        -------
        callable
            ``model(input_ids) -> list[list[float]]`` of shape
            ``(len(input_ids), vocab_size)``.
        """

        def _model(input_ids):
            return [[0.0] * vocab_size for _ in input_ids]

        return _model

    # ------------------------------------------------------------------
    # Core computation
    # ------------------------------------------------------------------

    @staticmethod
    def _window_nll(model, inputs: List[int], targets: List[int]) -> float:
        """
        Sum of negative log-probabilities for one teacher-forced window.

        Shared by :meth:`compute_sentence_perplexity` and
        :meth:`compute_sequence_perplexity` so the numerically-stable
        log-softmax lives in exactly one place.

        Parameters
        ----------
        model : callable
            ``model(input_ids) -> 2-D sequence`` of shape
            ``(len(input_ids), vocab_size)``.
        inputs : list[int]
            Context tokens fed to the model.
        targets : list[int]
            Tokens to be scored, one per logit vector.

        Returns
        -------
        float
            Total negative log-likelihood of *targets*.

        Raises
        ------
        ValueError
            If the model does not return one logit vector per target.
        """
        logits_batch = model(inputs)  # shape: (len(inputs), vocab_size)

        if len(logits_batch) != len(targets):
            raise ValueError(
                f"Model returned {len(logits_batch)} logit vectors but expected "
                f"{len(targets)} (one per target token)."
            )

        nll_sum = 0.0
        for logits, target in zip(logits_batch, targets):
            # numerically-stable log-softmax: shift by the max logit
            max_l = max(logits)
            exp_shifted = [math.exp(v - max_l) for v in logits]
            log_sum = math.log(sum(exp_shifted))
            log_prob_target = (logits[target] - max_l) - log_sum
            nll_sum -= log_prob_target
        return nll_sum

    @staticmethod
    def compute_sentence_perplexity(model, token_ids: List[int]) -> float:
        """
        Compute the perplexity of *token_ids* under *model*.

        Parameters
        ----------
        model : callable
            ``model(input_ids) -> 2-D sequence`` of shape
            ``(len(input_ids), vocab_size)``.
        token_ids : list[int]
            Tokenised sentence.

        Returns
        -------
        float
            Perplexity (≥ 1). Returns ``float("inf")`` for sequences
            shorter than 2 tokens.
        """
        if len(token_ids) < 2:
            return float("inf")

        targets = token_ids[1:]
        nll_sum = PerplexityEvaluator._window_nll(model, token_ids[:-1], targets)
        return math.exp(nll_sum / len(targets))

    @staticmethod
    def compute_sequence_perplexity(model, token_ids: List[int], stride: int = 512) -> float:
        """
        Compute perplexity over a (possibly long) sequence using non-overlapping
        stride-sized windows.

        The sequence is partitioned into windows of *stride* tokens. Each
        window contributes its token predictions to a pooled NLL. The final
        perplexity is ``exp(total_NLL / total_scored_tokens)``.

        For sequences shorter than *stride* + 1 tokens the result is
        identical to :meth:`compute_sentence_perplexity`.

        Parameters
        ----------
        model : callable
            ``model(input_ids) -> 2-D sequence`` of shape
            ``(len(input_ids), vocab_size)``.
        token_ids : list[int]
            Tokenised sequence.
        stride : int
            Number of tokens scored per window. Default ``512``.

        Returns
        -------
        float
            Perplexity (≥ 1). Returns ``float("inf")`` for sequences
            shorter than 2 tokens.
        """
        if len(token_ids) < 2:
            return float("inf")

        nll_sum = 0.0
        n_scored = 0
        n = len(token_ids)

        for start in range(0, n - 1, stride):
            # Each window holds ``stride`` scored tokens plus one leading
            # context token; the last token of one window is the first
            # (unscored) token of the next, so no token is scored twice.
            end = min(start + stride + 1, n)
            window = token_ids[start:end]
            if len(window) < 2:
                break
            targets = window[1:]
            nll_sum += PerplexityEvaluator._window_nll(model, window[:-1], targets)
            n_scored += len(targets)

        return math.exp(nll_sum / n_scored) if n_scored > 0 else float("inf")

    # ------------------------------------------------------------------
    # BaseEvaluator interface
    # ------------------------------------------------------------------

    def evaluate(self, model, tokenizer) -> dict:
        """
        Compute mean perplexity on *self.benchmark*.

        Parameters
        ----------
        model : callable
            Callable as described in :meth:`compute_sentence_perplexity`.
        tokenizer : object
            Object with ``encode(text: str) -> list[int]``.

        Returns
        -------
        dict
            ``{"perplexity": float}`` — mean perplexity across evaluated
            sentences.
        """
        import datasets as hf_datasets  # deferred; runtime dep

        ds = hf_datasets.load_dataset(self.benchmark, split="test", streaming=True)
        scores = []
        for row in ds:
            text = row.get("text", "")
            if not text.strip():
                continue
            if self.n_samples is not None and len(scores) >= self.n_samples:
                break
            token_ids = tokenizer.encode(text)
            scores.append(self.compute_sequence_perplexity(model, token_ids, self.stride))

        mean_ppl = float(sum(scores) / len(scores)) if scores else float("inf")
        return {"perplexity": mean_ppl}
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -12,14 +12,25 @@ authors = [ | |||||||||||||||||||||||||||||
| requires-python = ">=3.9" | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| dependencies = [ | ||||||||||||||||||||||||||||||
| "datasets", | ||||||||||||||||||||||||||||||
| "defusedxml", | ||||||||||||||||||||||||||||||
| "sentencepiece", | ||||||||||||||||||||||||||||||
| "tokenizers==0.15.2" | ||||||||||||||||||||||||||||||
| ] | ||||||||||||||||||||||||||||||
|
Comment on lines
14
to
19
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: find . -name "pyproject.toml" -type fRepository: AOSSIE-Org/OpenVerifiableLLM Length of output: 88 🏁 Script executed: cat -n ./pyproject.tomlRepository: AOSSIE-Org/OpenVerifiableLLM Length of output: 1676 Add These are core LLM dependencies for this project and should be in the required dependency list, not optional. Suggested patch dependencies = [
"datasets",
+ "numpy",
+ "torch",
"defusedxml",
"sentencepiece",
"tokenizers==0.15.2"
]📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| # Intentionally duplicated from [dependency-groups] below. | ||||||||||||||||||||||||||||||
| # pip uses this section; uv/PEP 735 uses [dependency-groups]. Keep both in sync. | ||||||||||||||||||||||||||||||
| [project.optional-dependencies] | ||||||||||||||||||||||||||||||
| dev = [ | ||||||||||||||||||||||||||||||
| "pytest>=7.0", | ||||||||||||||||||||||||||||||
| "ruff>=0.15.4", | ||||||||||||||||||||||||||||||
| ] | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| [tool.setuptools.packages.find] | ||||||||||||||||||||||||||||||
| include = ["openverifiablellm*"] | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
| # Intentionally duplicated from [project.optional-dependencies] above. | ||||||||||||||||||||||||||||||
| # uv/PEP 735 uses this section; pip uses [project.optional-dependencies]. Keep both in sync. | ||||||||||||||||||||||||||||||
| [dependency-groups] | ||||||||||||||||||||||||||||||
| dev = [ | ||||||||||||||||||||||||||||||
| "pytest>=7.0", | ||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider filtering
``inf`` values before computing the mean.
compute_sentence_perplexityreturnsfloat("inf")for any sentence (e.g., sequences with < 2 tokens), the entire split score becomesinfsincesum([..., inf, ...])isinf. While WinoBias sentences are typically well-formed, malformed or edge-case entries could skew the entire evaluation.🛡️ Suggested defensive approach
This filters out infinite values, computing the mean only over valid perplexity scores.
🤖 Prompt for AI Agents