21 commits
42f8ca0
ANS weight compression: 1.6 MB (13.9%) lossless savings over LZMA
OE-GOD Apr 9, 2026
70f5985
Add ready-to-run ANS experiment script for 8xH100
OE-GOD Apr 10, 2026
91879fb
Add 5-run design space sweep for ANS advantage
OE-GOD Apr 10, 2026
67f5f94
Integrate ANS into #1 entry (PR 1517) — USE_ANS=1 to enable
OE-GOD Apr 11, 2026
7042566
Add post-TTT GPTQ calibration (POST_TTT_GPTQ=1) — matches Hessians to…
OE-GOD Apr 11, 2026
2a0b935
Add progressive recurrence (PROGRESSIVE_RECUR=1, RECUR_MAX_K=4, RECUR…
OE-GOD Apr 11, 2026
ea25cfd
Add quantization optimization sweep: k-sweep, ANS vs Brotli, high WD,…
OE-GOD Apr 12, 2026
334fd30
Fix post-TTT GPTQ: handle 1D tensor in collect_hessians_from_tokens
OE-GOD Apr 13, 2026
292fddc
Fix post-TTT GPTQ: cast val_tokens to int64 for embedding layer
OE-GOD Apr 13, 2026
7b9dcb7
Add casefold retokenization pipeline — CPU-only data prep for casefol…
OE-GOD Apr 13, 2026
67a59af
Add fast multiprocessing casefold tokenization
OE-GOD Apr 13, 2026
95950db
Add streaming casefold tokenization — no RAM explosion, writes shards…
OE-GOD Apr 13, 2026
8cd62ec
Add PR 813 BackoffNgramMixer training script (0.6671 BPB)
OE-GOD Apr 14, 2026
706c525
Add per-matrix quantization test (Q,K at int4/int5 vs uniform int6)
OE-GOD Apr 14, 2026
0cebe52
Add diagnostic: analyze WHERE and WHY the model fails before trying t…
OE-GOD Apr 14, 2026
2ff2f72
Add word-start loss weighting (WORD_START_WEIGHT=1.5) — data-driven f…
OE-GOD Apr 14, 2026
bc5d0b2
Add PR 1493 merged #1 entry (decoded + raw)
OE-GOD Apr 14, 2026
3677596
Add GDN-Hybrid architecture from PR 1545 (1.028 BPB, non-transformer)
OE-GOD Apr 15, 2026
aa00007
Score-first TTT: compliant with Condition 3 (score-before-update)
OE-GOD Apr 18, 2026
caa4a9f
Fix score-first TTT: use global chunk split with per-rank distribution
OE-GOD Apr 18, 2026
090291a
Fix nonlocal scope for tensor accumulators in _score_range
OE-GOD Apr 18, 2026
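
Of the techniques listed above, the word-start loss weighting commit (2ff2f72, WORD_START_WEIGHT=1.5) is simple enough to sketch: tokens that begin a new word get their loss upweighted. The snippet below is an illustration of that idea only, not the PR's implementation; the function name, the boolean word-start mask, and the sum-of-weights denominator are all assumptions.

import torch
import torch.nn.functional as F

def word_start_weighted_loss(logits, targets, is_word_start, weight=1.5):
    # logits: (N, vocab); targets: (N,); is_word_start: (N,) bool mask marking tokens
    # whose SentencePiece piece starts a new word (e.g. pieces beginning with "▁").
    per_tok = F.cross_entropy(logits, targets, reduction="none")
    w = torch.where(is_word_start,
                    torch.full_like(per_tok, weight),
                    torch.ones_like(per_tok))
    # Normalizing by the weight sum keeps the loss scale comparable to an unweighted mean.
    return (per_tok * w).sum() / w.sum()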
104 changes: 104 additions & 0 deletions casefold_fast.py
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""Fast casefold retokenization using multiprocessing."""
import json
import os
import sys
import time
import unicodedata
from multiprocessing import Pool, cpu_count
from pathlib import Path

import numpy as np
import sentencepiece as spm

DOCS_PATH = Path("data/docs_selected.jsonl")
TOKENIZER_PATH = Path("data/tokenizers/fineweb_8192_bpe_casefold.model")
DATASET_DIR = Path("data/datasets/fineweb10B_sp8192_casefold")
NUM_VAL_DOCS = 50_000
SHARD_SIZE = 10**8
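# 10**8 uint16 tokens per shard ≈ 200 MB per .bin file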

# Global tokenizer (loaded per process)
_sp = None

def _init_worker():
    global _sp
    _sp = spm.SentencePieceProcessor(model_file=str(TOKENIZER_PATH))

def _process_line(line):
    doc = json.loads(line)
    text = doc.get("text", "")
    if not text:
        return None
    cf = unicodedata.normalize('NFKC', text).lower()
    tokens = _sp.encode(cf)
    return tokens

def main():
    if not TOKENIZER_PATH.exists():
        print(f"Tokenizer not found: {TOKENIZER_PATH}")
        print("Run: python3 casefold_retokenize.py --train-tokenizer")
        sys.exit(1)

    DATASET_DIR.mkdir(parents=True, exist_ok=True)
    nproc = cpu_count() or 8
    print(f"=== Fast casefold tokenization ({nproc} workers) ===")

    # Read all lines
    print(f"Reading {DOCS_PATH}...")
    with open(DOCS_PATH) as f:
        lines = f.readlines()
    total = len(lines)
    print(f"Total docs: {total:,}")

    # Process in parallel
    t0 = time.time()
    val_tokens = []
    train_tokens = []

    batch_size = 10000
    processed = 0

    with Pool(nproc, initializer=_init_worker) as pool:
        for i in range(0, total, batch_size):
            batch = lines[i:i+batch_size]
            results = pool.map(_process_line, batch)
            for tokens in results:
                if tokens is None:
                    continue
                if processed < NUM_VAL_DOCS:
                    val_tokens.extend(tokens)
                else:
                    train_tokens.extend(tokens)
                processed += 1

            elapsed = time.time() - t0
            rate = processed / elapsed
            if (i // batch_size) % 10 == 0:
                print(f" {processed:,}/{total:,} ({rate:.0f} docs/s, {len(val_tokens)+len(train_tokens):,} tokens)")

    elapsed = time.time() - t0
    print(f"\nDone: {processed:,} docs, {len(val_tokens)+len(train_tokens):,} tokens in {elapsed:.0f}s")

    # Save val
    val_arr = np.array(val_tokens, dtype=np.uint16)
    val_path = DATASET_DIR / "fineweb_val_000000.bin"
    val_arr.tofile(str(val_path))
    print(f"Val: {val_path} ({len(val_arr):,} tokens)")

    # Save train shards
    train_arr = np.array(train_tokens, dtype=np.uint16)
    n_shards = max(1, (len(train_arr) + SHARD_SIZE - 1) // SHARD_SIZE)
    for i in range(n_shards):
        s = i * SHARD_SIZE
        e = min(s + SHARD_SIZE, len(train_arr))
        shard = train_arr[s:e]
        path = DATASET_DIR / f"fineweb_train_{i:06d}.bin"
        shard.tofile(str(path))
        print(f"Train shard {i}: {path} ({len(shard):,} tokens)")

    print(f"\nDataset: {DATASET_DIR}")
    print(f"To train:")
    print(f" VOCAB_SIZE=8192 DATA_PATH={DATASET_DIR}/ TOKENIZER_PATH={TOKENIZER_PATH} USE_ANS=1 torchrun --standalone --nproc_per_node=8 train_gpt_ans.py")


if __name__ == "__main__":
    main()
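
A quick sanity check for the shards this script writes (a minimal sketch, not part of the PR; it assumes the uint16 .bin layout and the tokenizer path used above):

import numpy as np
import sentencepiece as spm

# Token ids are written raw with ndarray.tofile, so they read back with np.fromfile.
tokens = np.fromfile("data/datasets/fineweb10B_sp8192_casefold/fineweb_val_000000.bin", dtype=np.uint16)
sp = spm.SentencePieceProcessor(model_file="data/tokenizers/fineweb_8192_bpe_casefold.model")
print(tokens.shape, int(tokens.max()))        # max id should be < 8192
print(sp.decode(tokens[:64].tolist())[:200])  # first few tokens decoded back to casefolded text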
228 changes: 228 additions & 0 deletions casefold_retokenize.py
@@ -0,0 +1,228 @@
#!/usr/bin/env python3
"""
Casefold + retokenize FineWeb data for Parameter Golf.

Takes docs_selected.jsonl (raw text), applies NFKC normalization + lowercasing,
then tokenizes with the casefold SP8192 tokenizer and saves as .bin shards.

Usage:
# Step 1: Train casefold tokenizer on the data
python3 casefold_retokenize.py --train-tokenizer

# Step 2: Retokenize all documents
python3 casefold_retokenize.py --tokenize

# Or do both:
python3 casefold_retokenize.py --train-tokenizer --tokenize
"""

import argparse
import json
import os
import struct
import sys
import time
import unicodedata
from pathlib import Path

import numpy as np

DOCS_PATH = Path("data/docs_selected.jsonl")
TOKENIZER_DIR = Path("data/tokenizers")
DATASET_DIR = Path("data/datasets/fineweb10B_sp8192_casefold")
VOCAB_SIZE = 8192
NUM_VAL_DOCS = 50_000
SHARD_SIZE = 10**8 # tokens per shard
SP_MODEL_PATH = TOKENIZER_DIR / "fineweb_8192_bpe_casefold.model"


def casefold_text(text: str) -> str:
    """NFKC normalize + lowercase. Same transform as PR #1578/#1585."""
    text = unicodedata.normalize('NFKC', text)
    text = text.lower()
    return text
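
# Illustrative examples (not from the PR): NFKC folds compatibility forms before lowercasing,
# e.g. casefold_text("Ｔｈｅ ﬁx") == "the fix" (full-width letters and the ﬁ ligature are folded).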


def train_tokenizer():
    """Train a SentencePiece BPE tokenizer on casefolded text."""
    import sentencepiece as spm
    import tempfile

    print("=== Training casefold SP8192 tokenizer ===")
    print(f"Reading docs from {DOCS_PATH}...")

    # Write casefolded text to temp file for SP training
    tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8')
    n_docs = 0
    max_train_docs = 500_000  # Use subset for tokenizer training (faster)

    with open(DOCS_PATH) as f:
        for line in f:
            if n_docs >= max_train_docs:
                break
            doc = json.loads(line)
            text = doc.get("text", "")
            if not text:
                continue
            text = casefold_text(text)
            tmp.write(text + "\n")
            n_docs += 1
            if n_docs % 50_000 == 0:
                print(f" Processed {n_docs:,} docs for tokenizer training...")

    tmp.close()
    print(f" Total: {n_docs:,} docs written to temp file")

    # Train SentencePiece
    TOKENIZER_DIR.mkdir(parents=True, exist_ok=True)
    model_prefix = str(SP_MODEL_PATH).replace('.model', '')

    print(f" Training SentencePiece BPE (vocab={VOCAB_SIZE})...")
    spm.SentencePieceTrainer.train(
        input=tmp.name,
        model_prefix=model_prefix,
        vocab_size=VOCAB_SIZE,
        model_type='bpe',
        character_coverage=1.0,
        num_threads=os.cpu_count() or 4,
        train_extremely_large_corpus=True,
        max_sentence_length=16384,
        shuffle_input_sentence=True,
        byte_fallback=True,
    )

    os.unlink(tmp.name)
    print(f" Tokenizer saved: {SP_MODEL_PATH}")

    # Verify
    sp = spm.SentencePieceProcessor(model_file=str(SP_MODEL_PATH))
    print(f" Vocab size: {sp.get_piece_size()}")
    test = "the quick brown fox jumps over the lazy dog"
    tokens = sp.encode(test)
    print(f" Test: '{test}' → {len(tokens)} tokens")


def tokenize_docs():
    """Tokenize all documents with casefold + SP8192 and save as .bin shards."""
    import sentencepiece as spm

    print("=== Tokenizing with casefold SP8192 ===")

    if not SP_MODEL_PATH.exists():
        print(f"ERROR: Tokenizer not found at {SP_MODEL_PATH}")
        print("Run with --train-tokenizer first")
        sys.exit(1)

    sp = spm.SentencePieceProcessor(model_file=str(SP_MODEL_PATH))
    print(f" Tokenizer: {SP_MODEL_PATH} (vocab={sp.get_piece_size()})")

    DATASET_DIR.mkdir(parents=True, exist_ok=True)

    # Count docs first
    print(f" Counting docs in {DOCS_PATH}...")
    total_docs = sum(1 for _ in open(DOCS_PATH))
    print(f" Total docs: {total_docs:,}")

    # Tokenize all docs, split into val (first 50K) and train (rest).
    # Note: the per-token byte counts below are accumulated but never written out in this script.
    all_tokens_val = []
    all_tokens_train = []
    val_byte_counts = []
    train_byte_counts = []

    t0 = time.time()
    n_docs = 0
    total_tokens = 0

    with open(DOCS_PATH) as f:
        for line in f:
            doc = json.loads(line)
            text = doc.get("text", "")
            if not text:
                continue

            # Casefold
            cf_text = casefold_text(text)
            original_bytes = len(text.encode('utf-8'))

            # Tokenize
            tokens = sp.encode(cf_text)
            total_tokens += len(tokens)

            if n_docs < NUM_VAL_DOCS:
                all_tokens_val.extend(tokens)
                val_byte_counts.extend([original_bytes] * len(tokens))
            else:
                all_tokens_train.extend(tokens)
                train_byte_counts.extend([original_bytes] * len(tokens))

            n_docs += 1
            if n_docs % 100_000 == 0:
                elapsed = time.time() - t0
                rate = n_docs / elapsed
                print(f" {n_docs:,}/{total_docs:,} docs ({rate:.0f} docs/s, {total_tokens:,} tokens)")

    elapsed = time.time() - t0
    print(f" Done: {n_docs:,} docs, {total_tokens:,} tokens in {elapsed:.1f}s")
    print(f" Val tokens: {len(all_tokens_val):,}")
    print(f" Train tokens: {len(all_tokens_train):,}")
    # (A tokens/byte summary was disabled here; computing it would require a second pass over DOCS_PATH.)

    # Save val shard
    val_arr = np.array(all_tokens_val, dtype=np.uint16)
    val_path = DATASET_DIR / "fineweb_val_000000.bin"
    val_arr.tofile(str(val_path))
    print(f" Saved val: {val_path} ({val_arr.nbytes / 1e6:.1f} MB, {len(val_arr):,} tokens)")

    # Save train shards
    train_arr = np.array(all_tokens_train, dtype=np.uint16)
    n_shards = max(1, len(train_arr) // SHARD_SIZE + (1 if len(train_arr) % SHARD_SIZE else 0))

    for i in range(n_shards):
        start = i * SHARD_SIZE
        end = min((i + 1) * SHARD_SIZE, len(train_arr))
        shard = train_arr[start:end]
        shard_path = DATASET_DIR / f"fineweb_train_{i:06d}.bin"
        shard.tofile(str(shard_path))
        print(f" Saved train shard {i}: {shard_path} ({shard.nbytes / 1e6:.1f} MB, {len(shard):,} tokens)")

    # Copy tokenizer files to expected location
    vocab_path = str(SP_MODEL_PATH).replace('.model', '.vocab')
    if Path(vocab_path).exists():
        import shutil
        shutil.copy(vocab_path, TOKENIZER_DIR / "fineweb_8192_bpe_casefold.vocab")

    print(f"\n=== Done ===")
    print(f" Dataset: {DATASET_DIR}")
    print(f" Tokenizer: {SP_MODEL_PATH}")
    print(f" Val: 1 shard, {len(all_tokens_val):,} tokens")
    print(f" Train: {n_shards} shards, {len(all_tokens_train):,} tokens")
    print(f"\nTo train:")
    print(f" VOCAB_SIZE=8192 DATA_PATH={DATASET_DIR}/ TOKENIZER_PATH={SP_MODEL_PATH} USE_ANS=1 torchrun --standalone --nproc_per_node=8 train_gpt_ans.py")


def main():
    parser = argparse.ArgumentParser(description="Casefold + retokenize for Parameter Golf")
    parser.add_argument("--train-tokenizer", action="store_true", help="Train casefold SP8192 tokenizer")
    parser.add_argument("--tokenize", action="store_true", help="Tokenize all docs with casefold")

    args = parser.parse_args()

    if not args.train_tokenizer and not args.tokenize:
        parser.print_help()
        return

    if not DOCS_PATH.exists():
        print(f"ERROR: {DOCS_PATH} not found.")
        print("Download it first:")
        print(" python3 data/cached_challenge_fineweb.py --variant sp1024 --with-docs --train-shards 0")
        sys.exit(1)

    if args.train_tokenizer:
        train_tokenizer()

    if args.tokenize:
        tokenize_docs()


if __name__ == "__main__":
    main()
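
For reference, the bits-per-byte numbers cited in the commit messages (e.g. 0.6671 BPB) relate model cross-entropy back to the original, pre-casefold UTF-8 bytes; below is a minimal sketch of that bookkeeping under the standard definition (the competition's scoring harness may compute it differently):

import math

def bits_per_byte(total_nll_nats: float, total_utf8_bytes: int) -> float:
    # Total cross-entropy converted from nats to bits, divided by the UTF-8 byte
    # count of the original text being scored (not the casefolded token count).
    return (total_nll_nats / math.log(2)) / total_utf8_bytes

# Example: 0.6671 BPB over a 10 MB validation slice corresponds to roughly
# 0.6671 * 1e7 ≈ 6.7e6 bits (~0.83 MB) of total cross-entropy.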