diff --git a/.gitignore b/.gitignore
index c55e460..705653c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -333,4 +333,5 @@ __pycache__/
*.pyd
*.bz2
-*.venv
\ No newline at end of file
+*.venv
+venv/
\ No newline at end of file
diff --git a/openverifiablellm/tokenizer/base.py b/openverifiablellm/tokenizer/base.py
index 5a8d3fd..8751dcb 100644
--- a/openverifiablellm/tokenizer/base.py
+++ b/openverifiablellm/tokenizer/base.py
@@ -19,13 +19,30 @@ def __init__(self, vocab_size: int, min_frequency: int):
@abstractmethod
def train(self, text_file: Path, save_path: Path):
- """Train tokenizer and save model."""
+ """Train tokenizer on a text corpus and save artifacts to save_path."""
+ pass
+
+ @abstractmethod
+ def encode(self, text: str) -> list[int]:
+ """Encode text into a list of integer token ids."""
+ pass
+
+ @abstractmethod
+ def decode(self, ids: list[int]) -> str:
+ """Decode a list of integer token ids back into text."""
+ pass
+
+ @abstractmethod
+ def load(self, tokenizer_dir: Path):
+ """Load a previously trained tokenizer from disk."""
pass
@abstractmethod
def get_vocab_path(self, tokenizer_dir: Path) -> Path:
+ """Return path to the vocabulary file."""
pass
@abstractmethod
- def get_merges_path(self, tokenizer_dir: Path):
+ def get_merges_path(self, tokenizer_dir: Path) -> Path | None:
+ """Return path to the merges file, or None if not applicable."""
pass
diff --git a/openverifiablellm/tokenizer/bpe_tokenizer.py b/openverifiablellm/tokenizer/bpe_tokenizer.py
index ac9fd7b..2b665b4 100644
--- a/openverifiablellm/tokenizer/bpe_tokenizer.py
+++ b/openverifiablellm/tokenizer/bpe_tokenizer.py
@@ -8,7 +8,43 @@
class BPETokenizer(BaseTokenizer):
+ """
+ Byte-level BPE tokenizer implementation.
+
+ Wraps HuggingFace's ByteLevelBPETokenizer and implements
+ the full BaseTokenizer interface including train, encode,
+ decode, and load.
+ """
+
+    def __init__(self, vocab_size: int, min_frequency: int) -> None:
+        super().__init__(vocab_size, min_frequency)
+        self._tokenizer = None  # set by train() or load(); None means encode/decode must fail
+
+ # ------------------------------------------------------------------
+ # Training
+ # ------------------------------------------------------------------
+
def train(self, text_file: Path, save_path: Path):
+ """
+ Train BPE tokenizer on text corpus and save artifacts.
+
+ Args:
+ text_file: Path to training text corpus.
+ save_path: Directory to save vocab.json and merges.txt.
+
+ Raises:
+ FileNotFoundError: If text_file does not exist or is not a file.
+ """
+
+ text_file = Path(text_file)
+ save_path = Path(save_path)
+
+ if not text_file.is_file():
+ raise FileNotFoundError(
+ f"Training file not found at {text_file}. Please provide a valid text corpus file."
+ )
+
+ save_path.mkdir(parents=True, exist_ok=True)
tokenizer = ByteLevelBPETokenizer()
@@ -21,8 +57,109 @@ def train(self, text_file: Path, save_path: Path):
tokenizer.save_model(str(save_path))
+ self._tokenizer = tokenizer
+
+ # ------------------------------------------------------------------
+ # Encode / Decode
+ # ------------------------------------------------------------------
+
+    def encode(self, text: str) -> list[int]:
+        """
+        Tokenize a string and return the ids of the resulting tokens.
+
+        Args:
+            text: String to tokenize.
+
+        Returns:
+            The ``ids`` of the encoding produced by the wrapped tokenizer.
+
+        Raises:
+            RuntimeError: If no tokenizer has been trained or loaded.
+        """
+        self._check_loaded()
+        encoding = self._tokenizer.encode(text)
+        return encoding.ids
+
+    def decode(self, ids: list[int]) -> str:
+        """
+        Turn a sequence of token ids back into a string.
+
+        Args:
+            ids: Token ids to decode.
+
+        Returns:
+            Text produced by the wrapped tokenizer; special tokens are kept.
+
+        Raises:
+            RuntimeError: If no tokenizer has been trained or loaded.
+        """
+        self._check_loaded()
+        backend = self._tokenizer
+        return backend.decode(ids, skip_special_tokens=False)
+
+ # ------------------------------------------------------------------
+ # Load
+ # ------------------------------------------------------------------
+
+    def load(self, tokenizer_dir: Path):
+        """
+        Load a previously trained BPE tokenizer from disk.
+
+        Args:
+            tokenizer_dir: Directory containing vocab.json and merges.txt.
+
+        Raises:
+            FileNotFoundError: If vocab.json or merges.txt are not found.
+        """
+        # Resolve artifacts through the path helpers so load() and the
+        # helpers cannot drift apart; both coerce their argument with Path().
+        vocab_path = self.get_vocab_path(tokenizer_dir)
+        merges_path = self.get_merges_path(tokenizer_dir)
+
+        if not vocab_path.is_file():
+            raise FileNotFoundError(
+                f"vocab.json not found at {vocab_path}. Please train the tokenizer first."
+            )
+
+        if not merges_path.is_file():
+            raise FileNotFoundError(
+                f"merges.txt not found at {merges_path}. Please train the tokenizer first."
+            )
+
+        tokenizer = ByteLevelBPETokenizer(vocab=str(vocab_path), merges=str(merges_path))
+
+        # Loading from vocab/merges alone does not mark any token as special,
+        # so register SPECIAL_TOKENS again on the freshly built tokenizer.
+        tokenizer.add_special_tokens(SPECIAL_TOKENS)
+
+        # Publish only a fully configured tokenizer; a failure above leaves state as-is.
+        self._tokenizer = tokenizer
+
+ # ------------------------------------------------------------------
+ # Artifact paths
+ # ------------------------------------------------------------------
+
     def get_vocab_path(self, tokenizer_dir: Path) -> Path:
-        return tokenizer_dir / "vocab.json"
+        """Return ``tokenizer_dir / "vocab.json"``; the argument is coerced with Path()."""
+        return Path(tokenizer_dir) / "vocab.json"
     def get_merges_path(self, tokenizer_dir: Path) -> Path:
-        return tokenizer_dir / "merges.txt"
+        """Return ``tokenizer_dir / "merges.txt"``; the argument is coerced with Path()."""
+        return Path(tokenizer_dir) / "merges.txt"
+
+ # ------------------------------------------------------------------
+ # Internal helpers
+ # ------------------------------------------------------------------
+
+    def _check_loaded(self):
+        """
+        Guard for encode()/decode(): fail fast when no tokenizer is available.
+
+        Raises:
+            RuntimeError: If ``self._tokenizer`` is still None.
+        """
+        if self._tokenizer is not None:
+            return
+        raise RuntimeError(
+            "BPE tokenizer is not loaded. Call train() or load() before encode/decode."
+        )
diff --git a/openverifiablellm/verify.py b/openverifiablellm/verify.py
index ad7f789..cc0c558 100644
--- a/openverifiablellm/verify.py
+++ b/openverifiablellm/verify.py
@@ -386,15 +386,17 @@ def verify_preprocessing(
"environment_hash",
expected=manifest.get("environment_hash"),
actual=current_env["environment_hash"],
- detail="Environment fingerprint comparison"
+ detail="Environment fingerprint comparison",
)
else:
- report.add(CheckResult(
- name="environment_hash",
- status=CheckStatus.SKIP,
- detail="Field absent from manifest (older version)"
- ))
-
+ report.add(
+ CheckResult(
+ name="environment_hash",
+ status=CheckStatus.SKIP,
+ detail="Field absent from manifest (older version)",
+ )
+ )
+
# 4. Re-run preprocessing in an isolated temp directory
tmp_dir = Path(tempfile.mkdtemp(prefix="ovllm_verify_"))
try:
diff --git a/tests/test_bpebase.py b/tests/test_bpebase.py
new file mode 100644
index 0000000..d990e9e
--- /dev/null
+++ b/tests/test_bpebase.py
@@ -0,0 +1,253 @@
+import pytest
+
+from openverifiablellm.tokenizer.bpe_tokenizer import BPETokenizer
+
+# ------------------------------------------------------------------
+# Fixtures
+# ------------------------------------------------------------------
+
+
+@pytest.fixture
+def sample_text_file(tmp_path):
+    """Write a small, repeated Wikipedia-style corpus to disk and return its path."""
+    sentences = [
+        "Wikipedia is a free online encyclopedia.\n",
+        "It is written collaboratively by volunteers.\n",
+        "Anyone can edit Wikipedia articles.\n",
+        "Wikipedia was launched on January 15 2001.\n",
+        "It is one of the most popular websites in the world.\n",
+    ]
+
+    corpus_path = tmp_path / "sample.txt"
+    corpus_path.write_text("".join(sentences) * 500, encoding="utf-8")
+    return corpus_path
+
+
+@pytest.fixture
+def trained_tokenizer(tmp_path, sample_text_file):
+    """Train a BPETokenizer on the sample corpus and return the artifact directory."""
+    artifact_dir = tmp_path / "tokenizer"
+    BPETokenizer(vocab_size=1000, min_frequency=2).train(sample_text_file, artifact_dir)
+    return artifact_dir
+
+
+# ------------------------------------------------------------------
+# Training tests
+# ------------------------------------------------------------------
+
+
+def test_bpe_train_creates_artifacts(tmp_path, sample_text_file):
+    """After train(), both vocab.json and merges.txt exist in save_path."""
+    out_dir = tmp_path / "tokenizer"
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+
+    bpe.train(sample_text_file, out_dir)
+
+    for artifact in ("vocab.json", "merges.txt"):
+        assert (out_dir / artifact).is_file()
+
+
+def test_bpe_train_creates_save_directory(tmp_path, sample_text_file):
+    """A missing, nested save_path is created by train()."""
+    nested_dir = tmp_path / "nested" / "tokenizer" / "dir"
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+
+    assert not nested_dir.exists()
+
+    bpe.train(sample_text_file, nested_dir)
+
+    assert nested_dir.exists()
+
+
+def test_bpe_train_raises_file_not_found(tmp_path):
+    """A non-existent corpus path makes train() raise FileNotFoundError."""
+    missing_corpus = tmp_path / "nonexistent.txt"
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+    with pytest.raises(FileNotFoundError, match="Training file not found"):
+        bpe.train(missing_corpus, tmp_path / "tokenizer")
+
+
+def test_bpe_train_raises_if_directory_passed(tmp_path):
+    """Passing a directory as text_file makes train() raise FileNotFoundError."""
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+    out_dir = tmp_path / "tokenizer"
+    with pytest.raises(FileNotFoundError, match="Training file not found"):
+        bpe.train(tmp_path, out_dir)
+
+
+# ------------------------------------------------------------------
+# Encode / Decode tests
+# ------------------------------------------------------------------
+
+
+def test_bpe_encode_returns_list_of_ints(trained_tokenizer):
+    """encode() yields a list whose elements are all ints."""
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+    bpe.load(trained_tokenizer)
+
+    token_ids = bpe.encode("hello world")
+
+    assert isinstance(token_ids, list)
+    assert all(isinstance(token_id, int) for token_id in token_ids)
+
+
+def test_bpe_encode_decode_roundtrip(trained_tokenizer):
+    """decode(encode(text)) equals text, ignoring surrounding whitespace."""
+    original = "Wikipedia is a free online encyclopedia"
+
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+    bpe.load(trained_tokenizer)
+
+    restored = bpe.decode(bpe.encode(original))
+
+    assert restored.strip() == original.strip()
+
+
+def test_bpe_encode_works_after_train(tmp_path, sample_text_file):
+    """A freshly trained instance can encode without an explicit load()."""
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+    bpe.train(sample_text_file, tmp_path / "tokenizer")
+
+    token_ids = bpe.encode("hello world")
+
+    assert isinstance(token_ids, list)
+    assert len(token_ids) > 0
+
+
+def test_bpe_encode_raises_if_not_loaded():
+    """encode() on an instance that was never trained or loaded raises RuntimeError."""
+    unloaded = BPETokenizer(vocab_size=1000, min_frequency=2)
+
+    with pytest.raises(RuntimeError, match="not loaded"):
+        unloaded.encode("hello world")
+
+
+def test_bpe_decode_raises_if_not_loaded():
+    """decode() on an instance that was never trained or loaded raises RuntimeError."""
+    unloaded = BPETokenizer(vocab_size=1000, min_frequency=2)
+
+    with pytest.raises(RuntimeError, match="not loaded"):
+        unloaded.decode([1, 2, 3])
+
+
+# ------------------------------------------------------------------
+# Load tests
+# ------------------------------------------------------------------
+
+
+def test_bpe_load_from_disk(trained_tokenizer):
+    """load() populates the wrapped tokenizer from a trained artifact directory."""
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+    bpe.load(trained_tokenizer)
+
+    assert bpe._tokenizer is not None
+
+
+def test_bpe_encode_works_after_load(trained_tokenizer):
+    """An instance restored with load() encodes text into a non-empty id list."""
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+    bpe.load(trained_tokenizer)
+
+    token_ids = bpe.encode("hello world")
+
+    assert isinstance(token_ids, list)
+    assert len(token_ids) > 0
+
+
+def test_bpe_load_raises_if_vocab_missing(tmp_path):
+    """load() on a directory without vocab.json raises FileNotFoundError."""
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+
+    with pytest.raises(FileNotFoundError, match=r"vocab\.json not found"):
+        bpe.load(tmp_path)
+
+
+def test_bpe_load_raises_if_merges_missing(tmp_path):
+    """load() raises FileNotFoundError when only vocab.json is present."""
+    # Provide vocab.json alone so the merges.txt check is the one that fails.
+    vocab_file = tmp_path / "vocab.json"
+    vocab_file.write_text("{}", encoding="utf-8")
+
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+    with pytest.raises(FileNotFoundError, match=r"merges\.txt not found"):
+        bpe.load(tmp_path)
+
+
+# ------------------------------------------------------------------
+# Artifact path tests
+# ------------------------------------------------------------------
+
+
+def test_bpe_get_vocab_path(tmp_path):
+    """get_vocab_path() points at vocab.json inside the given directory."""
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+    expected = tmp_path / "vocab.json"
+
+    assert bpe.get_vocab_path(tmp_path) == expected
+
+
+def test_bpe_get_merges_path(tmp_path):
+    """get_merges_path() points at merges.txt inside the given directory."""
+    bpe = BPETokenizer(vocab_size=1000, min_frequency=2)
+    expected = tmp_path / "merges.txt"
+
+    assert bpe.get_merges_path(tmp_path) == expected
+
+
+# ------------------------------------------------------------------
+# Special tokens tests
+# ------------------------------------------------------------------
+
+
+def test_bpe_special_tokens_in_vocabulary(trained_tokenizer):
+    """Every entry of SPECIAL_TOKENS should be a key of the trained vocab.json."""
+    import json
+
+    from openverifiablellm.tokenizer.bpe_tokenizer import SPECIAL_TOKENS
+
+    vocab = json.loads((trained_tokenizer / "vocab.json").read_text(encoding="utf-8"))
+
+    # Look tokens up as parsed vocab keys: a substring test on the raw JSON
+    # (or on an empty literal) passes vacuously. Also reject an empty token list.
+    assert len(SPECIAL_TOKENS) > 0
+    missing = [token for token in SPECIAL_TOKENS if str(token) not in vocab]
+    assert not missing, f"special tokens missing from vocab.json: {missing}"
+
+
+# ------------------------------------------------------------------
+# Determinism tests
+# ------------------------------------------------------------------
+
+
+def test_bpe_training_is_deterministic(tmp_path, sample_text_file):
+    """Training twice on the same corpus yields identical vocab.json and merges.txt."""
+    save_paths = [tmp_path / "tokenizer_1", tmp_path / "tokenizer_2"]
+
+    for save_path in save_paths:
+        BPETokenizer(vocab_size=1000, min_frequency=2).train(sample_text_file, save_path)
+
+    first, second = save_paths
+
+    # merges.txt is loaded alongside vocab.json by load(), so both artifacts
+    # must match for two training runs to be interchangeable.
+    for artifact in ("vocab.json", "merges.txt"):
+        expected = (first / artifact).read_text(encoding="utf-8")
+        actual = (second / artifact).read_text(encoding="utf-8")
+        assert actual == expected, f"{artifact} differs between identical training runs"
+
+
+# ------------------------------------------------------------------
+# Constructor validation tests
+# ------------------------------------------------------------------
+
+
+def test_bpe_raises_if_vocab_size_zero():
+    """A vocab_size of 0 is rejected at construction time with ValueError."""
+    with pytest.raises(ValueError, match="vocab_size must be > 0"):
+        BPETokenizer(min_frequency=2, vocab_size=0)
+
+
+def test_bpe_raises_if_min_frequency_zero():
+    """A min_frequency of 0 is rejected at construction time with ValueError."""
+    with pytest.raises(ValueError, match="min_frequency must be > 0"):
+        BPETokenizer(min_frequency=0, vocab_size=1000)