Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions openverifiablellm/eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .bias import WinoBiasEvaluator
from .perplexity import PerplexityEvaluator

__all__ = [
"WinoBiasEvaluator",
"PerplexityEvaluator",
]
24 changes: 24 additions & 0 deletions openverifiablellm/eval/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from abc import ABC, abstractmethod


class BaseEvaluator(ABC):
    """Common interface that every dataset evaluator implements."""

    @abstractmethod
    def evaluate(self, model, tokenizer) -> dict:
        """
        Run the benchmark against *model* using *tokenizer*.

        Parameters
        ----------
        model : callable
            Maps a sequence of token IDs to a 2-D sequence of logits with
            shape ``(len(input_ids), vocab_size)``.
        tokenizer : object
            Provides an ``encode(text: str) -> list[int]`` method.

        Returns
        -------
        dict
            Benchmark-specific evaluation results.
        """
5 changes: 5 additions & 0 deletions openverifiablellm/eval/bias/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .wino_bias import WinoBiasEvaluator

__all__ = [
"WinoBiasEvaluator",
]
96 changes: 96 additions & 0 deletions openverifiablellm/eval/bias/wino_bias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""
openverifiablellm/eval/bias/wino_bias.py

Gender-bias evaluator using the WinoBias benchmark.
"""

from typing import Optional

from ..base import BaseEvaluator
from ..perplexity import PerplexityEvaluator


class WinoBiasEvaluator(BaseEvaluator):
    """
    Evaluates gender bias in a language model using the WinoBias benchmark.

    For each sentence pair (pro-stereotype / anti-stereotype) the model's
    perplexity is computed via the same sliding-window method used by
    :class:`PerplexityEvaluator`. A lower ``bias_score`` indicates a less
    biased model.

    Parameters
    ----------
    n_samples : int or None
        Maximum number of sentences to load from each WinoBias split.
        ``None`` evaluates the full dataset. Default ``None``.
    """

    def __init__(self, n_samples: Optional[int] = None):
        # Cap on sentences scored per split; ``None`` means no cap.
        self.n_samples = n_samples

    def evaluate(self, model, tokenizer) -> dict:
        """
        Compute stereotype and anti-stereotype perplexity scores.

        Loads ``type1_pro`` (pro-stereotype) and ``type1_anti``
        (anti-stereotype) splits of WinoBias and measures how much more
        easily the model predicts gender-stereotypical sentences than
        counter-stereotypical ones.

        Parameters
        ----------
        model : callable
            ``model(input_ids) -> 2-D sequence`` of shape
            ``(len(input_ids), vocab_size)``, as described in
            :meth:`PerplexityEvaluator.compute_sentence_perplexity`.
        tokenizer : object
            Object with ``encode(text: str) -> list[int]``.

        Returns
        -------
        dict
            A dictionary with the following keys:

            * **stereotype_score** (*float*) — mean perplexity on
              pro-stereotype sentences (finite scores only).
            * **anti_stereotype_score** (*float*) — mean perplexity on
              anti-stereotype sentences (finite scores only).
            * **bias_score** (*float*) —
              ``abs(stereotype_score - anti_stereotype_score)``;
              lower means less biased.
        """
        import math

        import datasets as hf_datasets  # deferred; runtime dep

        pro_ds = hf_datasets.load_dataset("wino_bias", "type1_pro", split="test")
        anti_ds = hf_datasets.load_dataset("wino_bias", "type1_anti", split="test")

        def _score_split(dataset) -> float:
            # Mean perplexity over (at most ``n_samples``) sentences of one split.
            scores = []
            for i, row in enumerate(dataset):
                if self.n_samples is not None and i >= self.n_samples:
                    break
                tokens = row.get("tokens", [])
                text = " ".join(tokens) if isinstance(tokens, list) else str(tokens)
                if not text.strip():
                    continue
                token_ids = tokenizer.encode(text)
                scores.append(
                    PerplexityEvaluator.compute_sentence_perplexity(model, token_ids)
                )
            # compute_sentence_perplexity returns inf for sequences shorter
            # than 2 tokens; a single such entry would drag the whole mean to
            # inf, so average only the finite scores.
            finite_scores = [s for s in scores if math.isfinite(s)]
            if not finite_scores:
                return float("inf")
            return float(sum(finite_scores) / len(finite_scores))

        stereotype_score = _score_split(pro_ds)
        anti_stereotype_score = _score_split(anti_ds)
        if math.isinf(stereotype_score) and math.isinf(anti_stereotype_score):
            # inf - inf would be NaN; report inf when neither split scored.
            bias_score = float("inf")
        else:
            bias_score = abs(stereotype_score - anti_stereotype_score)

        return {
            "stereotype_score": stereotype_score,
            "anti_stereotype_score": anti_stereotype_score,
            "bias_score": bias_score,
        }
215 changes: 215 additions & 0 deletions openverifiablellm/eval/perplexity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
"""
openverifiablellm/eval/perplexity.py

Perplexity evaluator for language models.
"""

import math
from typing import List, Optional

from .base import BaseEvaluator


class PerplexityEvaluator(BaseEvaluator):
    """
    Evaluates language-model perplexity on a HuggingFace benchmark dataset.

    Perplexity is computed with a teacher-forced sliding-window approach:
    for each token position *i* the model receives tokens ``[0 .. i-1]``
    and the negative log-probability of token ``[i]`` is accumulated.
    The final perplexity is ``exp(mean_NLL)``.

    Parameters
    ----------
    benchmark : str
        HuggingFace dataset identifier. Default ``"wikitext"``.
    n_samples : int or None
        Maximum number of non-empty samples to evaluate. ``None`` means
        evaluate the whole dataset. Default ``50``.
    stride : int
        Window stride used when the sequence exceeds the model's context
        window. Default ``512``.
    """

    def __init__(
        self,
        benchmark: str = "wikitext",
        n_samples: Optional[int] = 50,
        stride: int = 512,
    ):
        self.benchmark = benchmark
        self.n_samples = n_samples
        self.stride = stride

    # ------------------------------------------------------------------
    # Mock helpers
    # ------------------------------------------------------------------

    @staticmethod
    def uniform_model(vocab_size: int = 1000):
        """
        Return a mock model that produces uniform (all-zero) logits.

        Useful for unit testing: because all logits are equal, the
        log-softmax is ``-log(vocab_size)`` at every position, giving a
        predictable perplexity of exactly ``vocab_size``.

        Parameters
        ----------
        vocab_size : int
            Vocabulary size of the mock model. Default ``1000``.

        Returns
        -------
        callable
            ``model(input_ids) -> list[list[float]]`` of shape
            ``(len(input_ids), vocab_size)``.
        """

        def _model(input_ids):
            return [[0.0] * vocab_size for _ in input_ids]

        return _model

    # ------------------------------------------------------------------
    # Core computation
    # ------------------------------------------------------------------

    @staticmethod
    def compute_sentence_perplexity(model, token_ids: List[int]) -> float:
        """
        Compute the perplexity of *token_ids* under *model*.

        Parameters
        ----------
        model : callable
            ``model(input_ids) -> 2-D sequence`` of shape
            ``(len(input_ids), vocab_size)``.
        token_ids : list[int]
            Tokenised sentence.

        Returns
        -------
        float
            Perplexity (≥ 1). Returns ``float("inf")`` for sequences
            shorter than 2 tokens.

        Raises
        ------
        ValueError
            If the model returns a number of logit vectors different from
            the number of target tokens.
        """
        if len(token_ids) < 2:
            return float("inf")

        inputs = token_ids[:-1]
        targets = token_ids[1:]

        logits_batch = model(inputs)  # shape: (n-1, vocab_size)

        if len(logits_batch) != len(targets):
            raise ValueError(
                f"Model returned {len(logits_batch)} logit vectors but expected "
                f"{len(targets)} (one per target token)."
            )

        nll_sum = 0.0
        for logits, target in zip(logits_batch, targets):
            # numerically-stable log-softmax: shift by the max logit so
            # exp() never overflows
            max_l = max(logits)
            exp_shifted = [math.exp(v - max_l) for v in logits]
            log_sum = math.log(sum(exp_shifted))
            log_prob_target = (logits[target] - max_l) - log_sum
            nll_sum -= log_prob_target

        return math.exp(nll_sum / len(targets))

    @staticmethod
    def compute_sequence_perplexity(model, token_ids: List[int], stride: int = 512) -> float:
        """
        Compute perplexity over a (possibly long) sequence using non-overlapping
        stride-sized windows.

        The sequence is partitioned into windows of *stride* tokens. Each
        window contributes its token predictions to a pooled NLL. The final
        perplexity is ``exp(total_NLL / total_scored_tokens)``.

        For sequences shorter than *stride* + 1 tokens the result is
        identical to :meth:`compute_sentence_perplexity`.

        Parameters
        ----------
        model : callable
            ``model(input_ids) -> 2-D sequence`` of shape
            ``(len(input_ids), vocab_size)``.
        token_ids : list[int]
            Tokenised sequence.
        stride : int
            Number of tokens scored per window. Default ``512``.

        Returns
        -------
        float
            Perplexity (≥ 1). Returns ``float("inf")`` for sequences
            shorter than 2 tokens.

        Raises
        ------
        ValueError
            If the model returns a number of logit vectors different from
            the number of target tokens in a window.
        """
        if len(token_ids) < 2:
            return float("inf")

        nll_sum = 0.0
        n_scored = 0
        n = len(token_ids)

        # Windows overlap by exactly one token: the last token of window k
        # becomes the first *input* of window k+1, so every token 1..n-1 is
        # scored exactly once.
        for start in range(0, n - 1, stride):
            end = min(start + stride + 1, n)
            window = token_ids[start:end]
            if len(window) < 2:
                break
            inputs = window[:-1]
            targets = window[1:]
            logits_batch = model(inputs)
            if len(logits_batch) != len(targets):
                raise ValueError(
                    f"Model returned {len(logits_batch)} logit vectors but expected "
                    f"{len(targets)} (one per target token)."
                )
            for logits, target in zip(logits_batch, targets):
                max_l = max(logits)
                exp_shifted = [math.exp(v - max_l) for v in logits]
                log_sum = math.log(sum(exp_shifted))
                nll_sum -= (logits[target] - max_l) - log_sum
                n_scored += 1

        return math.exp(nll_sum / n_scored) if n_scored > 0 else float("inf")

    # ------------------------------------------------------------------
    # BaseEvaluator interface
    # ------------------------------------------------------------------

    def evaluate(self, model, tokenizer) -> dict:
        """
        Compute mean perplexity on *self.benchmark*.

        Parameters
        ----------
        model : callable
            Callable as described in :meth:`compute_sentence_perplexity`.
        tokenizer : object
            Object with ``encode(text: str) -> list[int]``.

        Returns
        -------
        dict
            ``{"perplexity": float}`` — mean perplexity across evaluated
            sentences (finite scores only; ``inf`` if none are finite).
        """
        import datasets as hf_datasets  # deferred; runtime dep

        ds = hf_datasets.load_dataset(self.benchmark, split="test", streaming=True)
        scores = []
        for row in ds:
            text = row.get("text", "")
            if not text.strip():
                continue
            if self.n_samples is not None and len(scores) >= self.n_samples:
                break
            token_ids = tokenizer.encode(text)
            scores.append(self.compute_sequence_perplexity(model, token_ids, self.stride))

        # A non-empty text may still tokenize to fewer than 2 tokens, in
        # which case compute_sequence_perplexity returns inf; one such entry
        # would drag the whole mean to inf, so average only finite scores.
        finite_scores = [s for s in scores if math.isfinite(s)]
        if finite_scores:
            mean_ppl = float(sum(finite_scores) / len(finite_scores))
        else:
            mean_ppl = float("inf")
        return {"perplexity": mean_ppl}
11 changes: 11 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,25 @@ authors = [
requires-python = ">=3.9"

dependencies = [
"datasets",
"defusedxml",
"sentencepiece",
"tokenizers==0.15.2"
]
Comment on lines 14 to 19
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

find . -name "pyproject.toml" -type f

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 88


🏁 Script executed:

cat -n ./pyproject.toml

Repository: AOSSIE-Org/OpenVerifiableLLM

Length of output: 1676


Add numpy and torch to required dependencies.

These are core LLM dependencies for this project and should be in the required dependency list, not optional.

Suggested patch
 dependencies = [
     "datasets",
+    "numpy",
+    "torch",
     "defusedxml",
     "sentencepiece",
     "tokenizers==0.15.2"
 ]
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
dependencies = [
"datasets",
"defusedxml",
"sentencepiece",
"tokenizers==0.15.2"
]
dependencies = [
"datasets",
"numpy",
"torch",
"defusedxml",
"sentencepiece",
"tokenizers==0.15.2"
]
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pyproject.toml` around lines 14 - 19, The dependencies list in pyproject.toml
is missing core LLM packages; add "numpy" and "torch" to the existing
dependencies array (alongside "datasets", "defusedxml", "sentencepiece",
"tokenizers==0.15.2") so they are installed as required dependencies; update the
dependencies section to include "numpy" and "torch" entries and ensure versions
are specified if necessary for compatibility.


# Intentionally duplicated from [dependency-groups] below.
# pip uses this section; uv/PEP 735 uses [dependency-groups]. Keep both in sync.
[project.optional-dependencies]
dev = [
"pytest>=7.0",
"ruff>=0.15.4",
]

[tool.setuptools.packages.find]
include = ["openverifiablellm*"]

# Intentionally duplicated from [project.optional-dependencies] above.
# uv/PEP 735 uses this section; pip uses [project.optional-dependencies]. Keep both in sync.
[dependency-groups]
dev = [
"pytest>=7.0",
Expand Down
Loading
Loading