diff --git a/records/track_10min_16mb/2026-04-16_SP4096_SequentialGPTQ_GroupwiseInt6/README.md b/records/track_10min_16mb/2026-04-16_SP4096_SequentialGPTQ_GroupwiseInt6/README.md
new file mode 100644
index 0000000000..bf8118d9ef
--- /dev/null
+++ b/records/track_10min_16mb/2026-04-16_SP4096_SequentialGPTQ_GroupwiseInt6/README.md
@@ -0,0 +1,88 @@
+# WIP: Sequential GPTQ with Groupwise Int6 Quantization
+
+**Track:** 10min / 8×H100 / 16MB artifact
+**Base:** PR #1218 (clarkkev — SP4096, MLP 4×, WD 0.085, brotli, XSA-all) — 1.098 BPB baseline
+**Status:** Implementation complete — requesting compute credits for 3-seed validation
+
+## Approach
+
+This submission improves the post-training quantization pipeline while keeping the training procedure identical to PR #1218. The core insight: #1218 loses ~0.012 BPB from quantization (pre-quant 1.1047 → post-quant 1.1162 non-sliding). Recovering even a fraction of that loss is free — zero training-time cost, zero throughput tax.
+
+### Changes from #1218
+
+#### 1. Sequential cross-layer GPTQ propagation (`collect_hessians_sequential`)
+
+Instead of collecting all Hessians in a single pass and quantizing each layer independently, we process layers one at a time: collect the Hessian for layer *i*, quantize it with GPTQ, **inject the quantized weights back into the model**, then collect the Hessian for layer *i+1*. This means later layers' Hessians reflect the actual quantized activations they'll see at eval time, capturing cross-layer error accumulation that per-layer GPTQ misses (see the first sketch below).
+
+Controlled by `GPTQ_SEQUENTIAL=1` (default on).
+
+#### 2. Groupwise int6 scales (`group_size=128`)
+
+Replace per-row scales with per-group scales (128 columns per group). Each group of weights gets its own fp16 scale factor, giving the quantizer finer control over heterogeneous weight distributions within each row. The scale storage overhead is small (~2% of weight bytes) but the reconstruction-error reduction is significant for layers with high weight variance (see the second sketch below).
+
+Controlled by `GPTQ_GROUP_SIZE=128` (default).
+
+#### 3. Hessian-weighted scale selection
+
+For per-row mode, instead of selecting scales by minimizing MSE `(W - Q)^2`, we minimize the Hessian-weighted error `sum(H_diag * (W - Q)^2)`, which directly optimizes for output reconstruction quality. Columns with a high Hessian diagonal (high activation variance) get proportionally more weight in the error metric (see the third sketch below).
+
+### Why this is distinct
+
+- **Not in PR #1218**: #1218 uses independent per-layer GPTQ with per-row scales and MSE-based selection.
+- **Not in PR #1019**: #1019 focuses on self-generated calibration data; this improves the quantization algorithm itself.
+- **Not in PR #1204 / #1209**: Those PRs focus on architecture changes (parallel residuals, depth recurrence, TTT).
+- **No SLOT / TTT**: Pure post-training compression improvement — clean causal submission.
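+
+### Sketches
+
+The three changes in miniature. These are illustrative sketches, not the code in `train_gpt.py`; the real implementations (`collect_hessians_sequential`, `_compute_groupwise_scales`, `gptq_quantize_weight`) additionally handle block-wise Cholesky error compensation, activation ordering, distributed calibration loading, and the time budget. First, the control flow of sequential propagation. Here `quantize_fn` is a stand-in for GPTQ followed by dequantization, and `model(x)` stands for a generic calibration forward pass:
+
+```python
+import torch
+from torch import nn
+
+@torch.no_grad()
+def sequential_gptq(model: nn.Module, layers: list[nn.Linear], calib_batches, quantize_fn):
+    """Front-to-back GPTQ: each layer's Hessian is collected only after all
+    earlier layers have been replaced by their quantized weights."""
+    for layer in layers:
+        H = torch.zeros(layer.in_features, layer.in_features)
+
+        def accumulate(_module, inp, _out):
+            x = inp[0].detach().float().reshape(-1, inp[0].shape[-1])
+            H.addmm_(x.T, x)  # accumulate X^T X from calibration activations
+
+        handle = layer.register_forward_hook(accumulate)
+        for x in calib_batches:  # one calibration sweep per layer
+            model(x)
+        handle.remove()
+
+        w_q = quantize_fn(layer.weight.float(), H)  # GPTQ -> dequantized fp weight
+        layer.weight.copy_(w_q.to(layer.weight.dtype))  # inject: error now propagates
+```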
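+
+Second, groupwise int6 in isolation. With `group_size=128`, each group of 128 int6 codes (6 bits each, 96 bytes of payload) carries one fp16 scale (2 bytes), so the scale overhead is 2/96 ≈ 2.1% of weight bytes, matching the ~2% figure above. A vectorized sketch of what `_compute_groupwise_scales` and `_dequant_groupwise` do group by group:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def quantize_groupwise_int6(w: torch.Tensor, group_size: int = 128, clip: int = 31):
+    """Per-(row, group) fp16 scales; int6 codes live in [-31, 31]."""
+    rows, cols = w.shape
+    pad = (-cols) % group_size  # pad so cols divides evenly into groups
+    g = F.pad(w.float(), (0, pad)).reshape(rows, -1, group_size)
+    scales = (g.abs().amax(dim=-1) / clip).clamp_min(1.0 / clip).to(torch.float16)
+    q = torch.clamp(torch.round(g / scales.float().unsqueeze(-1)), -clip, clip)
+    return q.reshape(rows, -1)[:, :cols].to(torch.int8), scales
+
+def dequantize_groupwise_int6(q: torch.Tensor, scales: torch.Tensor, group_size: int = 128):
+    rows, cols = q.shape
+    g = F.pad(q.float(), (0, (-cols) % group_size)).reshape(rows, -1, group_size)
+    return (g * scales.float().unsqueeze(-1)).reshape(rows, -1)[:, :cols]
+```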
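+
+Third, Hessian-weighted scale selection. The diagonal of the calibration Hessian `H = X^T X` is the per-column second moment of the activations, so `sum(H_diag * (W - Q)^2)` approximates the output reconstruction error `||X(W - Q)^T||^2` (ignoring cross terms). A sketch of the per-row clip-percentile search scored this way, using the same candidate percentiles as the implementation:
+
+```python
+import torch
+
+def pick_row_scales(w: torch.Tensor, h_diag: torch.Tensor, clip: int = 31) -> torch.Tensor:
+    """Clip-percentile search scored by the Hessian-weighted error, not plain MSE."""
+    best_s, best_err = None, float("inf")
+    for pct in (0.9990, 0.9995, 0.9999, 0.99999, 1.0):
+        row_clip = w.abs().amax(dim=1) if pct == 1.0 else torch.quantile(w.abs(), pct, dim=1)
+        s = (row_clip / clip).clamp_min(1.0 / clip)
+        q = torch.clamp(torch.round(w / s[:, None]), -clip, clip)
+        err = ((w - q * s[:, None]).pow(2) * h_diag[None, :]).sum().item()
+        if err < best_err:
+            best_s, best_err = s, err
+    return best_s
+```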
+
+### Expected improvement
+
+| Metric | Estimate |
+|---|---|
+| dBPB (vs #1218) | −0.004 to −0.008 |
+| Step-time cost | 0 ms (post-training only) |
+| Quant time overhead | +5–10s (sequential propagation) |
+| int6 compatible | Yes (this IS the int6 path) |
+| torch.compile compatible | Yes (post-training only) |
+
+## Implementation details
+
+New/modified functions:
+- `_compute_groupwise_scales()` — per-group fp16 scale computation
+- `_quantize_with_groupwise_scales()` — apply groupwise quantization
+- `_dequant_groupwise()` — reconstruct from groupwise int6
+- `collect_hessians_sequential()` — layer-by-layer Hessian collection with error propagation
+- `gptq_quantize_weight()` — extended with `group_size` param and Hessian-weighted error metric
+- `gptq_mixed_quantize_int6()` — passes `group_size`, stores group metadata
+- `dequantize_mixed_int6()` — handles 2D groupwise scale tensors
+
+New hyperparameters:
+- `GPTQ_GROUP_SIZE=128` — columns per quantization group (0 = per-row fallback)
+- `GPTQ_SEQUENTIAL=1` — enable sequential cross-layer propagation
+- `GPTQ_RESERVE_SECONDS=12` — increased from 10 to account for sequential overhead
+
+## Ablation Plan
+
+1. **Baseline**: Reproduce #1218 at 3 seeds (1337, 42, 2025)
+2. **+Sequential propagation only**: `GPTQ_SEQUENTIAL=1 GPTQ_GROUP_SIZE=0`
+3. **+Groupwise scales only**: `GPTQ_SEQUENTIAL=0 GPTQ_GROUP_SIZE=128`
+4. **+Hessian-weighted only**: per-row mode with H-weighted error
+5. **Full stack**: All three combined (default config)
+
+Acceptance criteria: paired t-test across 3 seeds, p < 0.01, BPB improvement of at least 0.003 (i.e. dBPB ≤ −0.003, consistent with the sign convention in the table above).
+
+## Run Command
+
+```bash
+RUN_ID=1337 SEED=1337 \
+torchrun --standalone --nproc_per_node=8 train_gpt.py
+```
+
+## Requirements
+
+Same as PR #1218. Flash Attention 3 (Hopper) required.
+ +```bash +pip install torch --index-url https://download.pytorch.org/whl/cu130 +pip install --no-cache-dir \ + "https://download.pytorch.org/whl/cu130/flash_attn_3-3.0.0-cp39-abi3-manylinux_2_28_x86_64.whl" +pip install -r requirements.txt +``` diff --git a/records/track_10min_16mb/2026-04-16_SP4096_SequentialGPTQ_GroupwiseInt6/requirements.txt b/records/track_10min_16mb/2026-04-16_SP4096_SequentialGPTQ_GroupwiseInt6/requirements.txt new file mode 100644 index 0000000000..8ecc743011 --- /dev/null +++ b/records/track_10min_16mb/2026-04-16_SP4096_SequentialGPTQ_GroupwiseInt6/requirements.txt @@ -0,0 +1,6 @@ +# torch and flash-attn-3 are installed separately in setup.sh +brotli +huggingface-hub +numpy +sentencepiece +tqdm \ No newline at end of file diff --git a/records/track_10min_16mb/2026-04-16_SP4096_SequentialGPTQ_GroupwiseInt6/train_gpt.py b/records/track_10min_16mb/2026-04-16_SP4096_SequentialGPTQ_GroupwiseInt6/train_gpt.py new file mode 100644 index 0000000000..bba7b89bf7 --- /dev/null +++ b/records/track_10min_16mb/2026-04-16_SP4096_SequentialGPTQ_GroupwiseInt6/train_gpt.py @@ -0,0 +1,1830 @@ +import copy +import glob +import io +import lzma +import math +import os +from pathlib import Path +import random +import subprocess +import sys +import time +import uuid + +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch.nn.parallel import DistributedDataParallel as DDP +from torch import Tensor, nn + +from flash_attn_interface import flash_attn_func as flash_attn_3_func + +# ---------------------------------------- +# Hyperparameters +# ---------------------------------------- + +class Hyperparameters(): + # Experiment settings + data_dir = os.environ.get('DATA_DIR', './data/') + seed = int(os.environ.get('SEED', 1337)) + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + + # Training length + iterations = int(os.environ.get('ITERATIONS', 20000)) + warmdown_frac = float(os.environ.get('WARMDOWN_FRAC', 0.667)) + warmup_steps = int(os.environ.get('WARMUP_STEPS', 20)) + train_batch_tokens = int(os.environ.get('TRAIN_BATCH_TOKENS', 2048 * 48 * 8)) + train_seq_len = int(os.environ.get('TRAIN_SEQ_LEN', 2048)) + eval_seq_len = int(os.environ.get('EVAL_SEQ_LEN', 2048)) + max_wallclock_seconds = float(os.environ.get('MAX_WALLCLOCK_SECONDS', 600.0)) + train_log_every = int(os.environ.get('TRAIN_LOG_EVERY', 500)) + + # Validation/Evals + val_batch_tokens = int(os.environ.get('VAL_BATCH_TOKENS', 2048 * 32 * 8)) + val_loss_every = int(os.environ.get('VAL_LOSS_EVERY', 4000)) + sliding_window_enabled = bool(int(os.environ.get('SLIDING_WINDOW_ENABLED', '1'))) + + # Model architecture + vocab_size = int(os.environ.get('VOCAB_SIZE', 4096)) + num_layers = int(os.environ.get('NUM_LAYERS', 11)) + xsa_last_n = int(os.environ.get('XSA_LAST_N', 11)) + num_kv_heads = int(os.environ.get('NUM_KV_HEADS', 4)) + model_dim = int(os.environ.get('MODEL_DIM', 512)) + embedding_dim = int(os.environ.get('EMBEDDING_DIM', 512)) + num_heads = int(os.environ.get('NUM_HEADS', 8)) + mlp_mult = float(os.environ.get('MLP_MULT', 4.0)) + skip_gates_enabled = bool(int(os.environ.get('SKIP_GATES_ENABLED', '1'))) + tie_embeddings = bool(int(os.environ.get('TIE_EMBEDDINGS', '1'))) + logit_softcap = float(os.environ.get('LOGIT_SOFTCAP', 30.0)) + rope_base = float(os.environ.get('ROPE_BASE', 10000.0)) + rope_dims = int(os.environ.get('ROPE_DIMS', 16)) + rope_train_seq_len = int(os.environ.get('ROPE_TRAIN_SEQ_LEN', 2048)) + ln_scale = 
bool(int(os.environ.get('LN_SCALE', '1')))
+    ve_enabled = bool(int(os.environ.get('VE_ENABLED', '1')))
+    ve_dim = int(os.environ.get('VE_DIM', 128))
+    ve_layers = os.environ.get('VE_LAYERS', '9,10')
+    qk_gain_init = float(os.environ.get('QK_GAIN_INIT', 4.0))
+
+    # Optimizer
+    min_lr = float(os.environ.get('MIN_LR', 0.0))
+    embed_lr = float(os.environ.get('EMBED_LR', 0.6))
+    head_lr = float(os.environ.get('HEAD_LR', 0.008))
+    tied_embed_lr = float(os.environ.get('TIED_EMBED_LR', 0.03))
+    tied_embed_init_std = float(os.environ.get('TIED_EMBED_INIT_STD', 0.005))
+    matrix_lr = float(os.environ.get('MATRIX_LR', 0.02))
+    scalar_lr = float(os.environ.get('SCALAR_LR', 0.02))
+    muon_momentum = float(os.environ.get('MUON_MOMENTUM', 0.99))
+    muon_backend_steps = int(os.environ.get('MUON_BACKEND_STEPS', 5))
+    muon_momentum_warmup_start = float(os.environ.get('MUON_MOMENTUM_WARMUP_START', 0.92))
+    muon_momentum_warmup_steps = int(os.environ.get('MUON_MOMENTUM_WARMUP_STEPS', 1500))
+    beta1 = float(os.environ.get('BETA1', 0.9))
+    beta2 = float(os.environ.get('BETA2', 0.95))
+    adam_eps = float(os.environ.get('ADAM_EPS', 1e-8))
+    grad_clip_norm = float(os.environ.get('GRAD_CLIP_NORM', 0.3))
+    eval_stride = int(os.environ.get('EVAL_STRIDE', 64))
+    muon_beta2 = float(os.environ.get('MUON_BETA2', 0.95))
+    adam_wd = float(os.environ.get('ADAM_WD', 0.02))
+    muon_wd = float(os.environ.get('MUON_WD', 0.085))
+    embed_wd = float(os.environ.get('EMBED_WD', 0.085))
+    ema_decay = float(os.environ.get('EMA_DECAY', 0.997))
+
+    # Compression
+    compressor = os.environ.get('COMPRESSOR', 'brotli')  # (lzma or brotli)
+    gptq_enabled = bool(int(os.environ.get('GPTQ_ENABLED', '1')))
+    gptq_calibration_batches = int(os.environ.get('GPTQ_CALIBRATION_BATCHES', 64))
+    gptq_reserve_seconds = float(os.environ.get('GPTQ_RESERVE_SECONDS', 12.0))
+    gptq_group_size = int(os.environ.get('GPTQ_GROUP_SIZE', 128))
+    gptq_sequential = bool(int(os.environ.get('GPTQ_SEQUENTIAL', '1')))
+
+    # Distributed setup
+    distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+    is_main_process = rank == 0
+    grad_accum_steps = 8 // world_size
+
+    # Data paths
+    datasets_dir = os.path.join(data_dir, 'datasets', f'fineweb10B_sp{vocab_size}')
+    train_files = os.path.join(datasets_dir, 'fineweb_train_*.bin')
+    val_files = os.path.join(datasets_dir, 'fineweb_val_*.bin')
+    tokenizer_path = os.path.join(data_dir, 'tokenizers', f'fineweb_{vocab_size}_bpe.model')
+
+    # Experiment files
+    logfile = f"logs/{run_id}.txt"
+    model_path = "final_model.pt"
+    quantized_model_path = "final_model.int6.ptz"
+
+# ----------------------------------------
+# Global Logging Function
+# ----------------------------------------
+
+_logger_hparams = None
+
+
+def set_logging_hparams(h: Hyperparameters) -> None:
+    global _logger_hparams
+    _logger_hparams = h
+
+
+def log(msg, console: bool = True) -> None:
+    if _logger_hparams is None:
+        # Logging not configured yet: print and bail out instead of
+        # dereferencing None below.
+        print(msg)
+        return
+    if _logger_hparams.is_main_process:
+        if console:
+            print(msg)
+        if _logger_hparams.logfile is not None:
+            with open(_logger_hparams.logfile, "a", encoding="utf-8") as f:
+                print(msg, file=f)
+
+# ----------------------------------------
+# Data Loading
+# ----------------------------------------
+
+class ValidationData:
+    def __init__(self, h: Hyperparameters, device: torch.device):
+        if not h.tokenizer_path.endswith(".model"):
+            raise ValueError(f"Script only setup for 
SentencePiece .model file: {h.tokenizer_path}") + self.sp = spm.SentencePieceProcessor(model_file=h.tokenizer_path) + if int(self.sp.vocab_size()) != h.vocab_size: + raise ValueError( + f"VOCAB_SIZE={h.vocab_size} does not match tokenizer vocab_size={int(self.sp.vocab_size())}" + ) + + self.val_tokens = load_validation_tokens(h.val_files, h.eval_seq_len) + self.base_bytes_lut, self.has_leading_space_lut, self.is_boundary_token_lut = ( + build_sentencepiece_luts(self.sp, h.vocab_size, device)) + + +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + # The BPB calculation assumes "▁" is its own token so that leading-space bytes + # are counted correctly. See https://github.com/openai/parameter-golf/issues/897 + assert sp.piece_to_id("\u2581") != sp.unk_id(), \ + "Tokenizer must have '▁' (space) as its own token for correct BPB byte counting" + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("\u2581"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) + + +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + # The export pipeline writes the fixed first-50k-doc validation set to fineweb_val_*. 
+    tokens = torch.cat([load_data_shard(file) for file in files]).contiguous()
+    usable = ((tokens.numel() - 1) // seq_len) * seq_len
+    if usable <= 0:
+        raise ValueError(f"Validation split is too short for EVAL_SEQ_LEN={seq_len}")
+    return tokens[: usable + 1]
+
+
+def load_data_shard(file: Path) -> Tensor:
+    header_bytes = 256 * np.dtype("<i4").itemsize
+    num_tokens = _read_num_tokens(file)
+    tokens = np.fromfile(file, dtype="<u2", offset=header_bytes, count=num_tokens)
+    return torch.from_numpy(tokens.astype(np.int32))
+
+
+_SHARD_NTOKENS_CACHE: dict[str, int] = {}
+_MMAP_CACHE: dict[str, np.memmap] = {}
+
+
+def _read_num_tokens(file: Path) -> int:
+    key = str(file)
+    cached = _SHARD_NTOKENS_CACHE.get(key)
+    if cached is not None:
+        return cached
+    header = np.fromfile(file, dtype="<i4", count=256)
+    num_tokens = int(header[2])  # header layout: magic, version, num_tokens, ...
+    _SHARD_NTOKENS_CACHE[key] = num_tokens
+    return num_tokens
+
+
+def _get_shard_memmap(file: Path) -> np.memmap:
+    key = str(file)
+    mm = _MMAP_CACHE.get(key)
+    if mm is not None:
+        return mm
+    n = _read_num_tokens(file)
+    mm = np.memmap(file, mode="r", dtype="<u2", offset=256 * np.dtype("<i4").itemsize, shape=(n,))
+    _MMAP_CACHE[key] = mm
+    return mm
+
+
+class DistributedTokenLoader:
+    def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device):
+        self.files = [Path(p) for p in sorted(glob.glob(pattern))]
+        if not self.files:
+            raise FileNotFoundError(f"No files found for pattern: {pattern}")
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        # Every rank must sample the identical global window sequence (each
+        # rank then keeps gw[rank::world_size]), so the seed is rank-independent.
+        self._rng = np.random.default_rng(0)
+        self._num_tokens = np.array([_read_num_tokens(f) for f in self.files], dtype=np.int64)
+        num_shards = len(self.files)
+        self._cursor_phase = np.zeros(num_shards, dtype=np.int64)
+        self._cursor_block_count = np.zeros(num_shards, dtype=np.int64)
+        self._cursor_next = np.zeros(num_shards, dtype=np.int64)
+        self._cursor_start = np.zeros(num_shards, dtype=np.int64)
+        self._cursor_stride = np.ones(num_shards, dtype=np.int64)
+        self._cursor_init = np.zeros(num_shards, dtype=bool)
+        self._cfg: tuple[int, int, int, int] | None = None
+        self._eligible_shards: np.ndarray | None = None
+        self._base_block_counts: np.ndarray | None = None
+        self._batches_built = 0
+
+    def _pick_coprime_stride(self, n: int) -> int:
+        if n <= 1:
+            return 1
+        while True:
+            s = int(self._rng.integers(1, n))
+            if math.gcd(s, n) == 1:
+                return s
+
+    def _reset_cursor(self, si: int, seq_len: int) -> None:
+        nt = int(self._num_tokens[si])
+        max_phase = min(seq_len - 1, max(0, nt - seq_len - 1))
+        phase = int(self._rng.integers(max_phase + 1)) if max_phase > 0 else 0
+        bc = (nt - 1 - phase) // seq_len
+        self._cursor_phase[si] = phase
+        self._cursor_block_count[si] = bc
+        self._cursor_next[si] = 0
+        self._cursor_start[si] = int(self._rng.integers(bc)) if bc > 1 else 0
+        self._cursor_stride[si] = self._pick_coprime_stride(bc)
+        self._cursor_init[si] = True
+
+    def _ensure_cursor(self, si: int, seq_len: int) -> None:
+        if not self._cursor_init[si] or self._cursor_next[si] >= self._cursor_block_count[si]:
+            self._reset_cursor(si, seq_len)
+
+    def _take_from_shard(self, si: int, seq_len: int, count: int, out: list[tuple[int, int]]) -> None:
+        rem = count
+        while rem > 0:
+            self._ensure_cursor(si, seq_len)
+            bc = int(self._cursor_block_count[si])
+            ni = int(self._cursor_next[si])
+            take = min(rem, bc - ni)
+            phase = int(self._cursor_phase[si])
+            start = int(self._cursor_start[si])
+            stride = int(self._cursor_stride[si])
+            for j in range(take):
+                bi = (start + (ni + j) * stride) % bc
+                out.append((si, phase + bi * seq_len))
+            self._cursor_next[si] = ni + take
+            rem -= take
+
+    def _init_pipeline(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> None:
+        local_tokens = global_tokens // (self.world_size * grad_accum_steps)
+        num_seqs = local_tokens // seq_len
+        global_num_seqs = num_seqs * self.world_size
+        self._cfg = (local_tokens, seq_len, num_seqs, global_num_seqs)
+        bbc = (self._num_tokens - 1) // seq_len
+        eligible = bbc > 0
+        self._eligible_shards = np.nonzero(eligible)[0].astype(np.int64)
+        self._base_block_counts = bbc[self._eligible_shards].astype(np.int64)
+
+    def _sample_global_windows(self) -> list[tuple[int, int]]:
+        assert self._cfg is not None and self._eligible_shards is not None
+        _, seq_len, _, gns = self._cfg
+        ec = int(self._eligible_shards.size)
+        progress = min(self._batches_built / 1800.0, 1.0)
+        remaining = np.empty(ec, dtype=np.float64)
+        for i, si in enumerate(self._eligible_shards.tolist()):
+            if self._cursor_init[si]:
+                r = int(self._cursor_block_count[si]) - int(self._cursor_next[si])
+                remaining[i] = float(max(r, 1))
+            else:
+                remaining[i] = float(self._base_block_counts[i])
+        alpha = 0.90 - 0.40 * progress
+        weights = np.power(remaining, alpha)
+        ws = float(weights.sum())
+        if not np.isfinite(ws) or ws <= 0.0:
+            weights = np.ones(ec, dtype=np.float64)
+            ws = float(weights.sum())
+        probs = weights / ws
+        low = min(max(8, self.world_size), ec, gns)
+        high = min(max(32, self.world_size * 8), ec, gns)
+        mix = max(1, min(int(round(low + progress * (high - low))), ec, gns))
+        cp = self._rng.choice(ec, size=mix, replace=False, p=probs)
+        cs = 
self._eligible_shards[cp] + cpr = probs[cp].copy() + cpr /= cpr.sum() + counts = np.ones(mix, dtype=np.int64) + extra = gns - mix + if extra > 0: + counts += self._rng.multinomial(extra, cpr).astype(np.int64) + perm = self._rng.permutation(mix) + cs, counts = cs[perm], counts[perm] + buckets: list[list[tuple[int, int]]] = [] + for si, cnt in zip(cs.tolist(), counts.tolist()): + b: list[tuple[int, int]] = [] + self._take_from_shard(int(si), seq_len, int(cnt), b) + if b: + if len(b) > 1: + bp = self._rng.permutation(len(b)) + b = [b[int(k)] for k in bp.tolist()] + buckets.append(b) + windows: list[tuple[int, int]] = [] + active = [i for i, bk in enumerate(buckets) if bk] + while active: + order = self._rng.permutation(len(active)) + new_active: list[int] = [] + for oi in order.tolist(): + bi = active[oi] + if buckets[bi]: + windows.append(buckets[bi].pop()) + if buckets[bi]: + new_active.append(bi) + active = new_active + return windows + + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: + if self._cfg is None: + self._init_pipeline(global_tokens, seq_len, grad_accum_steps) + _, _, num_seqs, _ = self._cfg + gw = self._sample_global_windows() + local_w = gw[self.rank::self.world_size] + x = torch.empty((num_seqs, seq_len), dtype=torch.int64) + y = torch.empty((num_seqs, seq_len), dtype=torch.int64) + for slot, (si, pos) in enumerate(local_w): + mm = _get_shard_memmap(self.files[si]) + window = torch.as_tensor(np.array(mm[pos:pos + seq_len + 1], dtype=np.int64)) + x[slot] = window[:-1] + y[slot] = window[1:] + self._batches_built += 1 + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) + +# ---------------------------------------- +# Model Architecture +# ---------------------------------------- + +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) + + +class CastedLinear(nn.Linear): + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) + + +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0, train_seq_len: int = 1024, rope_dims: int = 0): + super().__init__() + self.dim = dim + self.base = base + self.train_seq_len = train_seq_len + self.rope_dims = rope_dims if rope_dims > 0 else dim + inv_freq = 1.0 / (base ** (torch.arange(0, self.rope_dims, 2, dtype=torch.float32) / self.rope_dims)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + rd = self.rope_dims + if seq_len > self.train_seq_len: + scale = seq_len / self.train_seq_len + new_base = self.base * (scale ** (rd / (rd - 2))) + inv_freq = 1.0 / (new_base ** (torch.arange(0, rd, 2, dtype=torch.float32, device=device) / rd)) + else: + inv_freq = self.inv_freq.to(device) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.outer(t, inv_freq) + self._cos_cached = freqs.cos()[None, :, None, :] + self._sin_cached = freqs.sin()[None, :, None, :] + self._seq_len_cached = 
seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) + + +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor, rope_dims: int = 0) -> Tensor: + if rope_dims > 0 and rope_dims < x.size(-1): + x_rope, x_pass = x[..., :rope_dims], x[..., rope_dims:] + half = rope_dims // 2 + x1, x2 = x_rope[..., :half], x_rope[..., half:] + x_rope = torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + return torch.cat((x_rope, x_pass), dim=-1) + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, num_heads: int, num_kv_heads: int, + rope_base: float, qk_gain_init: float, train_seq_len: int): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rope_dims = 0 + self.rotary = Rotary(self.head_dim, base=rope_base, train_seq_len=train_seq_len) + self.use_xsa = False + + def _xsa_efficient(self, y: Tensor, v: Tensor) -> Tensor: + B, T, H, D = y.shape + Hkv = v.size(-2) + group = H // Hkv + y_g = y.reshape(B, T, Hkv, group, D) + vn = F.normalize(v, dim=-1).unsqueeze(-2) + proj = (y_g * vn).sum(dim=-1, keepdim=True) * vn + return (y_g - proj).reshape(B, T, H, D) + + def forward(self, x: Tensor, v_embed: Tensor | None = None) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim) + k = self.c_k(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + v = self.c_v(x) + if v_embed is not None: + v = v + v_embed + v = v.reshape(bsz, seqlen, self.num_kv_heads, self.head_dim) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin, self.rope_dims) + k = apply_rotary_emb(k, cos, sin, self.rope_dims) + q = q * self.q_gain.to(dtype=q.dtype)[None, None, :, None] + y = flash_attn_3_func(q, k, v, causal=True) + if self.use_xsa: + y = self._xsa_efficient(y, v) + y = y.reshape(bsz, seqlen, dim) + return self.proj(y) + + +class ValueEmbedding(nn.Module): + def __init__(self, vocab_size: int, ve_dim: int, model_dim: int): + super().__init__() + self.embed = nn.Embedding(vocab_size, ve_dim) + nn.init.normal_(self.embed.weight, std=0.01) + self.proj = CastedLinear(ve_dim, model_dim, bias=False) if ve_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.1, dtype=torch.float32)) + + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(token_ids) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) + + +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: int): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, 
hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + + def forward(self, x: Tensor) -> Tensor: + return self.proj(F.leaky_relu(self.fc(x), negative_slope=0.5).square()) + + +class Block(nn.Module): + def __init__(self, dim: int, num_heads: int, num_kv_heads: int, mlp_mult: int, + rope_base: float, qk_gain_init: float, train_seq_len: int, + layer_idx: int = 0, ln_scale: bool = False): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init, train_seq_len) + self.mlp = MLP(dim, mlp_mult) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + self.ln_scale_factor = 1.0 / math.sqrt(layer_idx + 1) if ln_scale else 1.0 + + def forward(self, x: Tensor, x0: Tensor, v_embed: Tensor | None = None) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x_in = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x_in) * self.ln_scale_factor, v_embed=v_embed) + x_out = x_in + self.attn_scale.to(dtype=x_in.dtype)[None, None, :] * attn_out + x_out = x_out + self.mlp_scale.to(dtype=x_out.dtype)[None, None, :] * self.mlp(self.mlp_norm(x_out) * self.ln_scale_factor) + return x_out + + +class GPT(nn.Module): + def __init__(self, h: Hyperparameters): + super().__init__() + self._ve_target_dim = h.num_kv_heads * (h.model_dim // h.num_heads) + if h.logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {h.logit_softcap}") + self.tie_embeddings = h.tie_embeddings + self.tied_embed_init_std = h.tied_embed_init_std + self.logit_softcap = h.logit_softcap + self.tok_emb = nn.Embedding(h.vocab_size, h.embedding_dim) + if h.embedding_dim != h.model_dim: + self.embed_proj = CastedLinear(h.embedding_dim, h.model_dim, bias=False) + self.head_proj = CastedLinear(h.model_dim, h.embedding_dim, bias=False) + else: + self.embed_proj = None + self.head_proj = None + self.num_encoder_layers = h.num_layers // 2 + self.num_decoder_layers = h.num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, h.model_dim, dtype=torch.float32)) + self.skip_gates = nn.Parameter(torch.zeros(self.num_skip_weights, h.model_dim, dtype=torch.float32)) if h.skip_gates_enabled else None + self.blocks = nn.ModuleList([ + Block(h.model_dim, h.num_heads, h.num_kv_heads, h.mlp_mult, h.rope_base, + h.qk_gain_init, h.train_seq_len, layer_idx=i, ln_scale=h.ln_scale) + for i in range(h.num_layers) + ]) + if h.rope_dims > 0: + head_dim = h.model_dim // h.num_heads + for block in self.blocks: + block.attn.rope_dims = h.rope_dims + block.attn.rotary = Rotary(head_dim, base=h.rope_base, train_seq_len=h.train_seq_len, rope_dims=h.rope_dims) + self.ve_layer_indices = [int(x) for x in h.ve_layers.split(",") if x.strip()] if h.ve_enabled else [] + kv_dim = self._ve_target_dim + if self.ve_layer_indices: + self.ve_shared = ValueEmbedding(h.vocab_size, h.ve_dim, kv_dim) + self.ve_layer_scales = nn.ParameterList( + [nn.Parameter(torch.ones(1, dtype=torch.float32)) for _ in self.ve_layer_indices] + ) + else: + self.ve_shared = None + self.ve_layer_scales = nn.ParameterList() + self.value_embeds = nn.ModuleList() + self.final_norm = RMSNorm() + 
self.lm_head = None if h.tie_embeddings else CastedLinear(h.embedding_dim, h.vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + if h.xsa_last_n > 0: + for i in range(max(0, h.num_layers - h.xsa_last_n), h.num_layers): + self.blocks[i].attn.use_xsa = True + self._init_weights() + + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + + def _get_ve(self, layer_idx: int, input_ids: Tensor, ve_cache: dict | None = None) -> Tensor | None: + if self.ve_shared is None or layer_idx not in self.ve_layer_indices: + return None + if ve_cache is not None and 've' not in ve_cache: + ve_cache['ve'] = self.ve_shared(input_ids) + ve_base = ve_cache['ve'] if ve_cache is not None else self.ve_shared(input_ids) + ve_idx = self.ve_layer_indices.index(layer_idx) + return ve_base * self.ve_layer_scales[ve_idx].to(dtype=ve_base.dtype) + + def forward_logits(self, input_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + if self.embed_proj is not None: + x = self.embed_proj(x) + x0 = x + skips: list[Tensor] = [] + ve_cache: dict = {} + for i in range(self.num_encoder_layers): + ve = self._get_ve(i, input_ids, ve_cache) + x = self.blocks[i](x, x0, v_embed=ve) + skips.append(x) + for i in range(self.num_decoder_layers): + bi = self.num_encoder_layers + i + if skips: + scaled_skip = self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + if self.skip_gates is not None: + g = torch.sigmoid(self.skip_gates[i].to(dtype=x.dtype))[None, None, :] + x = torch.lerp(scaled_skip, x, g) + else: + x = x + scaled_skip + ve = self._get_ve(bi, input_ids, ve_cache) + x = self.blocks[bi](x, x0, v_embed=ve) + x = self.final_norm(x) + if self.head_proj is not None: + x = self.head_proj(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + logits = self.forward_logits(input_ids) + return F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), target_ids.reshape(-1), reduction="mean") + + +def classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if ".mlp." in name: + return "mlp" + if ".attn." in name or (".proj." in name and ".mlp." 
not in name): + return "attn" + return "other" + +# ---------------------------------------- +# Optimization +# ---------------------------------------- + +@torch.compile +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X + + +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, + nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, + nesterov=nesterov, weight_decay=weight_decay), + ) + + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + if wd > 0.0: + p.data.mul_(1.0 - lr * wd) + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss + + +class Optimizers(): + def __init__(self, h: Hyperparameters, base_model: GPT): + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p + for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in + CONTROL_TENSOR_NAME_PATTERNS) + ] + scalar_params = [ + p + for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in + CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + if base_model.skip_gates is not None and base_model.skip_gates.numel() > 0: + scalar_params.append(base_model.skip_gates) + + token_lr = h.tied_embed_lr if h.tie_embeddings else h.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.ve_shared is not None: + tok_params.append({"params": [base_model.ve_shared.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.ve_shared.proj is not None: + matrix_params.append(base_model.ve_shared.proj.weight) + scalar_params.append(base_model.ve_shared.scale) + for s in base_model.ve_layer_scales: + scalar_params.append(s) + + self.optimizer_tok = 
torch.optim.AdamW( + tok_params, + betas=(h.beta1, h.beta2), + eps=h.adam_eps, + weight_decay=h.embed_wd, + fused=True, + ) + self.optimizer_muon = Muon( + matrix_params, + lr=h.matrix_lr, + momentum=h.muon_momentum, + backend_steps=h.muon_backend_steps, + weight_decay=h.muon_wd, + ) + for group in self.optimizer_muon.param_groups: + group["base_lr"] = h.matrix_lr + self.optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": h.scalar_lr, "base_lr": h.scalar_lr}], + betas=(h.beta1, h.beta2), + eps=h.adam_eps, + weight_decay=h.adam_wd, + fused=True, + ) + self.optimizers: list[torch.optim.Optimizer] = [self.optimizer_tok, self.optimizer_muon, self.optimizer_scalar] + if base_model.lm_head is not None: + self.optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": h.head_lr, "base_lr": h.head_lr}], + betas=(h.beta1, h.beta2), + eps=h.adam_eps, + fused=True, + ) + self.optimizers.insert(1, self.optimizer_head) + else: + self.optimizer_head = None + + def __iter__(self): + return iter(self.optimizers) + + def zero_grad_all(self) -> None: + for opt in self.optimizers: + opt.zero_grad(set_to_none=True) + + def step(self): + for opt in self.optimizers: + opt.step() + self.zero_grad_all() + +# ---------------------------------------- +# Quantization +# ---------------------------------------- + +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,skip_gates,ve_layer_scales,ve_shared.scale", + ).split(",") + if pattern +) +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 + + +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale + + +def restore_fp32_params(model: nn.Module) -> None: + """After .bfloat16(), restore CastedLinear weights and control params to FP32.""" + for module in model.modules(): + if isinstance(module, CastedLinear): + module.float() + for name, param in model.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() + + +def quantize_int6_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + best_q, best_s, best_err = None, None, float('inf') + for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]: + if pct < 1.0: + row_clip = torch.quantile(t32.abs(), pct, dim=1) + else: + row_clip = t32.abs().amax(dim=1) + s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + q = torch.clamp(torch.round(t32 / s.float()[:, None]), 
-clip_range, clip_range).to(torch.int8) + recon = q.float() * s.float()[:, None] + err = (t32 - recon).pow(2).mean().item() + if err < best_err: + best_q, best_s, best_err = q, s, err + return best_q, best_s + amax = t32.abs().max().item() + scale = torch.tensor(amax / clip_range if amax > 0 else 1.0, dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -clip_range, clip_range).to(torch.int8) + return q, scale + + +def collect_hessians( + model: nn.Module, + train_loader: DistributedTokenLoader, + hp: Hyperparameters, + device: torch.device, + n_calibration_batches: int = 64, +) -> dict[str, Tensor]: + """Run calibration batches and collect H = X^T X for each CastedLinear layer.""" + hessians: dict[str, Tensor] = {} + hooks = [] + + def make_hook(name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if name not in hessians: + hessians[name] = torch.zeros( + x.shape[1], x.shape[1], dtype=torch.float32, device=device + ) + hessians[name].addmm_(x.T, x) + return hook_fn + + for name, module in model.named_modules(): + if isinstance(module, CastedLinear) and module.weight.numel() > 65536: + cat = classify_param(name + ".weight") + if cat in ("mlp", "attn"): + hooks.append(module.register_forward_hook(make_hook(name + ".weight"))) + + model.eval() + with torch.no_grad(): + for i in range(n_calibration_batches): + x, y = train_loader.next_batch( + hp.train_batch_tokens, + hp.train_seq_len, hp.grad_accum_steps, + ) + model.forward_logits(x) + + for hk in hooks: + hk.remove() + + for name in hessians: + hessians[name] = hessians[name].cpu() / n_calibration_batches + + return hessians + + +def collect_hessians_sequential( + model: nn.Module, + train_loader: DistributedTokenLoader, + hp: Hyperparameters, + device: torch.device, + state_dict: dict[str, Tensor], + n_calibration_batches: int = 64, + group_size: int = 128, +) -> dict[str, Tensor]: + """Sequential Hessian collection: quantize each layer, inject the quantized + weights back into the model, then collect Hessians for later layers. 
+ This propagates quantization error forward so later layers' Hessians + reflect the actual (quantized) activations they'll see at eval time.""" + layer_order: list[tuple[str, CastedLinear]] = [] + for name, module in model.named_modules(): + if isinstance(module, CastedLinear) and module.weight.numel() > 65536: + cat = classify_param(name + ".weight") + if cat in ("mlp", "attn"): + layer_order.append((name + ".weight", module)) + + all_hessians: dict[str, Tensor] = {} + clip_range = 31 + + for layer_idx, (wname, layer_module) in enumerate(layer_order): + hessians: dict[str, Tensor] = {} + hook_handle = None + + def make_hook(tgt_name: str): + def hook_fn(module, inp, out): + x = inp[0].detach().float() + if x.ndim == 3: + x = x.reshape(-1, x.shape[-1]) + if tgt_name not in hessians: + hessians[tgt_name] = torch.zeros( + x.shape[1], x.shape[1], dtype=torch.float32, device=device + ) + hessians[tgt_name].addmm_(x.T, x) + return hook_fn + + hook_handle = layer_module.register_forward_hook(make_hook(wname)) + + model.eval() + with torch.no_grad(): + for i in range(n_calibration_batches): + x, y = train_loader.next_batch( + hp.train_batch_tokens, + hp.train_seq_len, hp.grad_accum_steps, + ) + model.forward_logits(x) + + hook_handle.remove() + + if wname in hessians: + H = hessians[wname].cpu() / n_calibration_batches + all_hessians[wname] = H + + w_cpu = state_dict[wname].detach().cpu() + if group_size > 0 and group_size < w_cpu.shape[1]: + q, s = gptq_quantize_weight(w_cpu, H, clip_range=clip_range, + group_size=group_size) + w_quant = _dequant_groupwise(q, s, group_size) + else: + q, s = gptq_quantize_weight(w_cpu, H, clip_range=clip_range) + w_quant = q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1))) + + with torch.no_grad(): + layer_module.weight.copy_(w_quant.to(layer_module.weight.dtype).to(device)) + + log(f"sequential_gptq: layer {layer_idx+1}/{len(layer_order)} {wname}") + + return all_hessians + + +def _compute_groupwise_scales( + W: Tensor, group_size: int, clip_range: int = 31 +) -> Tensor: + """Compute per-group fp16 scales for a 2D weight matrix.""" + rows, cols = W.shape + n_groups = (cols + group_size - 1) // group_size + scales = torch.zeros(rows, n_groups, dtype=torch.float16) + for gi in range(n_groups): + c0 = gi * group_size + c1 = min(c0 + group_size, cols) + group_amax = W[:, c0:c1].abs().amax(dim=1) + scales[:, gi] = (group_amax / clip_range).clamp_min(1.0 / clip_range).to(torch.float16) + return scales + + +def gptq_quantize_weight( + w: Tensor, + H: Tensor, + clip_range: int = 31, + block_size: int = 128, + group_size: int = 0, +) -> tuple[Tensor, Tensor]: + """GPTQ with Cholesky error compensation, actorder, groupwise scales, + and Hessian-weighted error metric (Frantar et al., ICLR 2023).""" + W_orig = w.float().clone() + rows, cols = W_orig.shape + H = H.float().clone() + use_groupwise = group_size > 0 and group_size < cols + + dead = torch.diag(H) == 0 + H[dead, dead] = 1 + damp = 0.01 * H.diag().mean() + H.diagonal().add_(damp) + + H_diag = H.diag().clone() + + perm = torch.argsort(H.diag(), descending=True) + invperm = torch.argsort(perm) + W_perm = W_orig[:, perm].clone() + W_perm[:, dead[perm]] = 0 + H = H[perm][:, perm] + + try: + Hinv = torch.cholesky_inverse(torch.linalg.cholesky(H)) + Hinv = torch.linalg.cholesky(Hinv, upper=True) + except torch.linalg.LinAlgError: + if use_groupwise: + scales = _compute_groupwise_scales(W_orig, group_size, clip_range) + q = _quantize_with_groupwise_scales(W_orig, scales, group_size, clip_range) + return q, 
scales
+        return quantize_int6_per_row(W_orig, clip_range)
+
+    def _run_gptq(sf_fn):
+        """Run GPTQ column sweep with a per-column scale function."""
+        Q = torch.zeros(rows, cols, dtype=torch.int8)
+        W_work = W_perm.clone()
+        for i1 in range(0, cols, block_size):
+            i2 = min(i1 + block_size, cols)
+            W_block = W_work[:, i1:i2].clone()
+            Hinv_block = Hinv[i1:i2, i1:i2]
+            Err = torch.zeros(rows, i2 - i1)
+            for j in range(i2 - i1):
+                w_col = W_block[:, j]
+                d = Hinv_block[j, j]
+                sf = sf_fn(i1 + j)
+                q_col = torch.clamp(torch.round(w_col / sf), -clip_range, clip_range)
+                Q[:, i1 + j] = q_col.to(torch.int8)
+                err = (w_col - q_col.float() * sf) / d
+                Err[:, j] = err
+                W_block[:, j:] -= err.unsqueeze(1) * Hinv_block[j, j:].unsqueeze(0)
+            if i2 < cols:
+                W_work[:, i2:] -= Err @ Hinv[i1:i2, i2:]
+        return Q
+
+    def _hessian_weighted_error(Q, sf_fn):
+        """Compute sum of H_diag-weighted squared errors (in permuted order)."""
+        recon = torch.zeros_like(W_perm)
+        for c in range(cols):
+            recon[:, c] = Q[:, c].float() * sf_fn(c)
+        diff = W_perm - recon
+        h_diag_perm = H_diag[perm]
+        return (diff.pow(2) * h_diag_perm[None, :]).sum().item()
+
+    if use_groupwise:
+        # Group scales are defined over the *permuted* columns during the sweep.
+        gw_scales_perm = _compute_groupwise_scales(W_perm, group_size, clip_range)
+
+        def gw_sf_fn(col_idx):
+            return gw_scales_perm[:, col_idx // group_size].float()
+
+        Q = _run_gptq(gw_sf_fn)
+        # The serialized format stores groupwise scales over the *original*
+        # column order, so un-permuting Q alone would pair each code with the
+        # wrong group's scale. Reconstruct in the permuted domain, un-permute
+        # the reconstruction, then requantize it against original-order scales.
+        recon_perm = torch.empty(rows, cols, dtype=torch.float32)
+        for c in range(cols):
+            recon_perm[:, c] = Q[:, c].float() * gw_sf_fn(c)
+        w_recon = recon_perm[:, invperm]
+        scales_orig = _compute_groupwise_scales(w_recon, group_size, clip_range)
+        Q_final = _quantize_with_groupwise_scales(w_recon, scales_orig, group_size, clip_range)
+        return Q_final, scales_orig
+    else:
+        best_q, best_scale, best_err = None, None, float('inf')
+        for pct in [0.9990, 0.9995, 0.9999, 0.99999, 1.0]:
+            if pct < 1.0:
+                row_clip = torch.quantile(W_orig.abs(), pct, dim=1)
+            else:
+                row_clip = W_orig.abs().amax(dim=1)
+            s = (row_clip / clip_range).clamp_min(1.0 / clip_range).to(torch.float16)
+
+            def row_sf_fn(col_idx, _s=s):
+                return _s.float()
+
+            Q = _run_gptq(row_sf_fn)
+            herr = _hessian_weighted_error(Q, row_sf_fn)
+            if herr < best_err:
+                best_q, best_scale, best_err = Q, s, herr
+
+        return best_q[:, invperm], best_scale
+
+
+def _quantize_with_groupwise_scales(
+    W: Tensor, scales: Tensor, group_size: int, clip_range: int = 31
+) -> Tensor:
+    rows, cols = W.shape
+    Q = torch.zeros(rows, cols, dtype=torch.int8)
+    n_groups = scales.shape[1]
+    for gi in range(n_groups):
+        c0 = gi * group_size
+        c1 = min(c0 + group_size, cols)
+        sf = scales[:, gi].float()
+        Q[:, c0:c1] = torch.clamp(
+            torch.round(W[:, c0:c1] / sf[:, None]), -clip_range, clip_range
+        ).to(torch.int8)
+    return 
Q + + +def _dequant_groupwise(Q: Tensor, scales: Tensor, group_size: int) -> Tensor: + rows, cols = Q.shape + W = torch.zeros(rows, cols, dtype=torch.float32) + n_groups = scales.shape[1] + for gi in range(n_groups): + c0 = gi * group_size + c1 = min(c0 + group_size, cols) + sf = scales[:, gi].float() + W[:, c0:c1] = Q[:, c0:c1].float() * sf[:, None] + return W + + +def gptq_mixed_quantize_int6( + state_dict: dict[str, Tensor], + int6_cats: set[str], + hessians: dict[str, Tensor], + group_size: int = 0, +) -> tuple[dict[str, Tensor], dict[str, object]]: + """Mixed quantization using full GPTQ with groupwise scales for layers + with Hessians, fallback to clip-search for others.""" + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + gptq_count = 0 + fallback_count = 0 + + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = classify_param(name) + + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + + if cat in int6_cats and t.ndim == 2: + if name in hessians: + q, s = gptq_quantize_weight(t, hessians[name], group_size=group_size) + gptq_count += 1 + is_gw = s.ndim == 2 + meta[name] = {"type": "int6", "method": "gptq_groupwise" if is_gw else "gptq", + "group_size": group_size if is_gw else 0} + else: + q, s = quantize_int6_per_row(t) + fallback_count += 1 + meta[name] = {"type": "int6", "method": "clip_search"} + result[name + ".q"] = q + result[name + ".scale"] = s + elif cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + + log(f"GPTQ quantization: {gptq_count} layers with GPTQ (group_size={group_size}), " + f"{fallback_count} fallback to clip-search") + return result, meta + + +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = classify_param(name) + if not t.is_floating_point() or t.numel() <= 65536: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if cat in int6_cats and t.ndim >= 1: + q, s = quantize_int6_per_row(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int6"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta + + +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta.get(name) + if info is None: + continue + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + 
".scale"] + gs = 0 + if isinstance(info, dict): + gs = info.get("group_size", 0) + if gs > 0 and s.ndim == 2: + out[name] = _dequant_groupwise(q, s, gs).to(orig_dtype) + elif s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out + + +_BSHF_MAGIC = b"BSHF" + + +def _byte_shuffle(data: bytes, stride: int = 2) -> bytes: + """Transpose byte stream by stride position for better compression.""" + if stride <= 1 or len(data) < stride: + return data + src = np.frombuffer(data, dtype=np.uint8) + n = len(src) + out = np.empty(n, dtype=np.uint8) + dest_off = 0 + for pos in range(stride): + chunk = src[pos::stride] + out[dest_off:dest_off + len(chunk)] = chunk + dest_off += len(chunk) + return _BSHF_MAGIC + bytes([stride]) + out.tobytes() + + +def _byte_unshuffle(data: bytes) -> bytes: + """Inverse of _byte_shuffle. Auto-detects BSHF magic header.""" + if len(data) < 5 or data[:4] != _BSHF_MAGIC: + return data + stride = data[4] + if stride < 2: + return data[5:] + payload = np.frombuffer(data, dtype=np.uint8, offset=5) + n = len(payload) + out = np.empty(n, dtype=np.uint8) + src_off = 0 + for pos in range(stride): + chunk_len = n // stride + (1 if pos < n % stride else 0) + out[pos::stride][:chunk_len] = payload[src_off:src_off + chunk_len] + src_off += chunk_len + return out.tobytes() + + +def _compress(data: bytes, compressor: str, byte_shuffle: bool = True) -> bytes: + if byte_shuffle: + data = _byte_shuffle(data) + if compressor == "lzma": + return lzma.compress(data, preset=6) + elif compressor == "brotli": + import brotli + return brotli.compress(data, quality=11) + raise ValueError(f"Unknown compressor: {compressor!r}") + + +def _decompress(data: bytes, compressor: str, byte_shuffle: bool = True) -> bytes: + if compressor == "lzma": + raw = lzma.decompress(data) + elif compressor == "brotli": + import brotli + raw = brotli.decompress(data) + if byte_shuffle: + raw = _byte_unshuffle(raw) + return raw + raise ValueError(f"Unknown compressor: {compressor!r}") + + +def serialize(h: Hyperparameters, base_model: torch.nn.Module, code: str) -> int: + model_bytes = None + code_bytes = len(code.encode("utf-8")) + if h.is_main_process: + torch.save(base_model.state_dict(), h.model_path) + model_bytes = os.path.getsize(h.model_path) + log(f"Serialized model: {model_bytes} bytes") + log(f"Code size: {code_bytes} bytes") + + sd_cpu = {k: v.detach().cpu() for k, v in base_model.state_dict().items()} + if h.gptq_enabled: + log("GPTQ:collecting Hessians from calibration data...") + t0 = time.perf_counter() + calib_loader = DistributedTokenLoader(h.train_files, h.rank, h.world_size, + torch.device("cuda", h.local_rank)) + if h.gptq_sequential: + log(f"GPTQ:using sequential layer-wise propagation (group_size={h.gptq_group_size})") + hessians = collect_hessians_sequential( + base_model, calib_loader, h, + torch.device("cuda", h.local_rank), + state_dict=sd_cpu, + n_calibration_batches=h.gptq_calibration_batches, + group_size=h.gptq_group_size, + ) + base_model.load_state_dict( + {k: v.to(base_model.state_dict()[k].dtype).to( + base_model.state_dict()[k].device) + for k, v in sd_cpu.items()}, + strict=True, + ) + else: + hessians = collect_hessians( + base_model, calib_loader, h, + torch.device("cuda", h.local_rank), + n_calibration_batches=h.gptq_calibration_batches, + ) + log(f"GPTQ:collected {len(hessians)} Hessians in {time.perf_counter() - t0:.1f}s") + quant_result, quant_meta = 
gptq_mixed_quantize_int6( + sd_cpu, {"mlp", "attn"}, hessians, group_size=h.gptq_group_size) + else: + quant_result, quant_meta = mixed_quantize_int6(sd_cpu, {"mlp", "attn"}) + + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + quant_blob = _compress(quant_raw, h.compressor) + quant_file_bytes = len(quant_blob) + bytes_total = quant_file_bytes + code_bytes + if h.is_main_process: + with open(h.quantized_model_path, "wb") as f: + f.write(quant_blob) + log(f"Serialized model int6+{h.compressor}: {quant_file_bytes} bytes") + log(f"Total submission size int6+{h.compressor}: {bytes_total} bytes") + + +def deserialize(h: Hyperparameters, device: torch.device) -> GPT: + eval_model = GPT(h).to(device).bfloat16() + restore_fp32_params(eval_model) + + sd_cpu = {k: v.detach().cpu() for k, v in eval_model.state_dict().items()} + + with open(h.quantized_model_path, "rb") as f: + quant_blob_disk = f.read() + quant_state = torch.load( + io.BytesIO(_decompress(quant_blob_disk, h.compressor)), + map_location="cpu", + ) + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + eval_model.load_state_dict(deq_state, strict=True) + + return eval_model + +# ---------------------------------------- +# Evaluation +# ---------------------------------------- + +def _loss_bpb(loss_sum, token_count, byte_count) -> tuple[float, float]: + val_loss = (loss_sum / token_count).item() + val_bpb = val_loss / math.log(2.0) * (token_count.item() / byte_count.item()) + return val_loss, val_bpb + + +def eval_val( + h: Hyperparameters, + device: torch.device, + val_data: ValidationData, + model: nn.Module +) -> tuple[float, float]: + seq_len = h.eval_seq_len + local_batch_tokens = h.val_batch_tokens // (h.world_size * h.grad_accum_steps) + if local_batch_tokens < seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={h.val_batch_tokens}, WORLD_SIZE={h.world_size}, " + f"GRAD_ACCUM_STEPS={h.grad_accum_steps}, seq_len={seq_len}" + ) + local_batch_seqs = local_batch_tokens // seq_len + total_seqs = (val_data.val_tokens.numel() - 1) // seq_len + seq_start = (total_seqs * h.rank) // h.world_size + seq_end = (total_seqs * (h.rank + 1)) // h.world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * seq_len + raw_end = batch_seq_end * seq_len + 1 + local = val_data.val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = val_data.base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (val_data.has_leading_space_lut[tgt_ids] & ~val_data.is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + + if dist.is_available() 
+    if dist.is_available() and dist.is_initialized():
+        dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM)
+        dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM)
+        dist.all_reduce(val_byte_count, op=dist.ReduceOp.SUM)
+
+    model.train()
+    return _loss_bpb(val_loss_sum, val_token_count, val_byte_count)
+
+
+def eval_val_sliding(
+    h: Hyperparameters,
+    device: torch.device,
+    val_data: ValidationData,
+    base_model: nn.Module,
+    batch_seqs: int = 32
+) -> tuple[float, float]:
+    """Sliding window evaluation: each token scored with maximum context."""
+    base_model.eval()
+    logits_fn = torch.compile(base_model.forward_logits, dynamic=False, fullgraph=True)
+
+    seq_len = h.eval_seq_len
+    context_size = seq_len - h.eval_stride
+    total_tokens = val_data.val_tokens.numel() - 1
+
+    window_starts = [ws for ws in range(0, total_tokens, h.eval_stride)
+                     if ws + context_size < total_tokens]
+
+    total_windows = len(window_starts)
+    my_s = (total_windows * h.rank) // h.world_size
+    my_e = (total_windows * (h.rank + 1)) // h.world_size
+    my_windows = window_starts[my_s:my_e]
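+
+    # Window layout sketch (illustrative numbers; real values come from h):
+    # with seq_len=4096 and eval_stride=1024, context_size=3072. Window k
+    # starts at k*1024; window 0 scores positions [0, 4096) while window k>0
+    # scores [k*1024 + 3072, k*1024 + 4096). The scored spans therefore tile
+    # the token stream exactly once, each token seeing maximal left context.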
+
+    loss_sum = torch.zeros((), device=device, dtype=torch.float64)
+    token_count = torch.zeros((), device=device, dtype=torch.float64)
+    byte_count = torch.zeros((), device=device, dtype=torch.float64)
+
+    with torch.inference_mode():
+        for bi in range(0, len(my_windows), batch_seqs):
+            batch_ws = my_windows[bi:bi + batch_seqs]
+            bsz = len(batch_ws)
+
+            x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device)
+            y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device)
+            wlens: list[int] = []
+
+            for i, ws in enumerate(batch_ws):
+                we = min(ws + seq_len, total_tokens)
+                wlen = we - ws
+                wlens.append(wlen)
+                chunk = val_data.val_tokens[ws:we + 1].to(dtype=torch.int64, device=device)
+                x_batch[i, :wlen] = chunk[:-1]
+                y_batch[i, :wlen] = chunk[1:]
+
+            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+                logits = logits_fn(x_batch)
+
+            nll = F.cross_entropy(
+                logits.reshape(-1, logits.size(-1)).float(),
+                y_batch.reshape(-1),
+                reduction="none",
+            ).reshape(bsz, seq_len)
+
+            for i, ws in enumerate(batch_ws):
+                wlen = wlens[i]
+                s = 0 if ws == 0 else context_size
+                scored_nll = nll[i, s:wlen].to(torch.float64)
+                loss_sum += scored_nll.sum()
+                token_count += float(wlen - s)
+                tgt = y_batch[i, s:wlen]
+                prev = x_batch[i, s:wlen]
+                tb = val_data.base_bytes_lut[tgt].to(torch.float64)
+                tb += (val_data.has_leading_space_lut[tgt] & ~val_data.is_boundary_token_lut[prev]).to(torch.float64)
+                byte_count += tb.sum()
+
+    if dist.is_available() and dist.is_initialized():
+        dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM)
+        dist.all_reduce(token_count, op=dist.ReduceOp.SUM)
+        dist.all_reduce(byte_count, op=dist.ReduceOp.SUM)
+
+    base_model.train()
+    return _loss_bpb(loss_sum, token_count, byte_count)
+
+
+def timed_eval(label: str, fn, *args, **kwargs) -> tuple[float, float]:
+    torch.cuda.synchronize()
+    t0 = time.perf_counter()
+    val_loss, val_bpb = fn(*args, **kwargs)
+    torch.cuda.synchronize()
+    elapsed_ms = 1000.0 * (time.perf_counter() - t0)
+    log(f"{label} val_loss:{val_loss:.8f} val_bpb:{val_bpb:.8f} eval_time:{elapsed_ms:.0f}ms")
+    return val_loss, val_bpb
+
+
+def run_evals(
+    h: Hyperparameters,
+    device: torch.device,
+    val_data: ValidationData,
+    eval_model: torch.nn.Module
+):
+    compiled_model = torch.compile(eval_model, dynamic=False, fullgraph=True)
+    timed_eval("final_int6_roundtrip", eval_val, h, device, val_data, compiled_model)
+    if h.sliding_window_enabled:
+        timed_eval("final_int6_sliding_window", eval_val_sliding, h,
+                   device, val_data, eval_model)
+
+# -----------------------------
+# Training
+# -----------------------------
+
+def train_model(h: Hyperparameters, device: torch.device, val_data: ValidationData) -> tuple[GPT, nn.Module]:
+    # Set up model
+    base_model = GPT(h).to(device).bfloat16()
+    restore_fp32_params(base_model)
+    compiled_model = torch.compile(base_model, dynamic=False, fullgraph=True)
+    if h.distributed:
+        model = DDP(compiled_model, device_ids=[h.local_rank], broadcast_buffers=False)
+    else:
+        model = compiled_model
+    log(f"model_params:{sum(p.numel() for p in base_model.parameters())}")
+
+    # Set up optimizer and load train data
+    optimizers = Optimizers(h, base_model)
+    train_loader = DistributedTokenLoader(h.train_files, h.rank, h.world_size, device)
+
+    # Helper functions for training
+    max_wallclock_ms = 1000.0 * h.max_wallclock_seconds if h.max_wallclock_seconds > 0 else None
+    if h.gptq_enabled and max_wallclock_ms is not None:
+        max_wallclock_ms -= h.gptq_reserve_seconds * 1000.0
+        log(f"gptq:reserving {h.gptq_reserve_seconds:.0f}s, effective={max_wallclock_ms:.0f}ms")
+
+    def training_frac(step: int, elapsed_ms: float) -> float:
+        """Fraction of training completed (0 to 1), using step or wallclock."""
+        if max_wallclock_ms is None:
+            return step / max(h.iterations, 1)
+        return elapsed_ms / max(max_wallclock_ms, 1e-9)
+
+    def lr_mul(frac: float) -> float:
+        if h.warmdown_frac <= 0:
+            return 1.0
+        if frac >= 1.0 - h.warmdown_frac:
+            return max((1.0 - frac) / h.warmdown_frac, h.min_lr)
+        return 1.0
+
+    def step_fn(step, lr_scale):
+        optimizers.zero_grad_all()
+        train_loss = torch.zeros((), device=device)
+        for micro_step in range(h.grad_accum_steps):
+            if h.distributed:
+                model.require_backward_grad_sync = micro_step == h.grad_accum_steps - 1
+            x, y = train_loader.next_batch(h.train_batch_tokens, h.train_seq_len, h.grad_accum_steps)
+            with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
+                loss = model(x, y)
+            train_loss += loss.detach()
+            (loss / h.grad_accum_steps).backward()
+        train_loss /= h.grad_accum_steps
+
+        frac = min(step / h.muon_momentum_warmup_steps, 1.0) if h.muon_momentum_warmup_steps > 0 else 1.0
+        muon_momentum = (1 - frac) * h.muon_momentum_warmup_start + frac * h.muon_momentum
+        for group in optimizers.optimizer_muon.param_groups:
+            group["momentum"] = muon_momentum
+
+        for opt in optimizers:
+            for group in opt.param_groups:
+                group["lr"] = group["base_lr"] * lr_scale
+
+        if h.grad_clip_norm > 0:
+            torch.nn.utils.clip_grad_norm_(base_model.parameters(), h.grad_clip_norm)
+
+        optimizers.step()
+        return train_loss
+
+    # Model warmup
+    if h.warmup_steps > 0:
+        initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()}
+        initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers]
+        model.train()
+        for warmup_step in range(h.warmup_steps):
+            step_fn(warmup_step, 1.0)
+            if warmup_step <= 5 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == h.warmup_steps:
+                log(f"warmup_step: {warmup_step + 1}/{h.warmup_steps}")
+        base_model.load_state_dict(initial_model_state, strict=True)
+        for opt, state in zip(optimizers, initial_optimizer_states, strict=True):
+            opt.load_state_dict(state)
+        optimizers.zero_grad_all()
+        if h.distributed:
+            model.require_backward_grad_sync = True
+        train_loader = DistributedTokenLoader(
+            h.train_files, h.rank, h.world_size, device)
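+
+    # Note on the warmup block above: it runs real optimizer steps, then
+    # rewinds weights, optimizer state and the data loader, so its only
+    # lasting effect is to pre-trigger torch.compile graph capture and CUDA
+    # allocator growth before the timed run begins.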
+
+    # Training loop
+    ema_state = {name: t.detach().float().clone() for name, t in base_model.state_dict().items()}
+    ema_decay = h.ema_decay
+
+    training_time_ms = 0.0
+    stop_after_step: int | None = None
+    torch.cuda.synchronize()
+    t0 = time.perf_counter()
+
+    step = 0
+    while True:
+        last_step = step == h.iterations or (stop_after_step is not None and step >= stop_after_step)
+
+        should_validate = last_step or (h.val_loss_every > 0 and step % h.val_loss_every == 0)
+        if should_validate:
+            torch.cuda.synchronize()
+            training_time_ms += 1000.0 * (time.perf_counter() - t0)
+            val_loss, val_bpb = eval_val(h, device, val_data, model)
+            log(f"{step}/{h.iterations} val_loss: {val_loss:.4f} val_bpb: {val_bpb:.4f}")
+            torch.cuda.synchronize()
+            t0 = time.perf_counter()
+
+        if last_step:
+            if stop_after_step is not None and step < h.iterations:
+                log(
+                    f"stopping_early: wallclock_cap train_time: {training_time_ms:.0f}ms "
+                    f"step: {step}/{h.iterations}"
+                )
+            break
+
+        elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0)
+        frac = training_frac(step, elapsed_ms)
+        scale = lr_mul(frac)
+        train_loss = step_fn(step, scale)
+
+        with torch.no_grad():
+            for name, t in base_model.state_dict().items():
+                ema_state[name].mul_(ema_decay).add_(t.detach().float(), alpha=1.0 - ema_decay)
+
+        step += 1
+        approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0)
+
+        should_log_train = (
+            h.train_log_every > 0
+            and (step <= 5 or step % h.train_log_every == 0 or stop_after_step is not None)
+        )
+        if should_log_train:
+            tok_per_sec = step * h.train_batch_tokens / (approx_training_time_ms / 1000.0)
+            log(
+                f"{step}/{h.iterations} train_loss: {train_loss.item():.4f} "
+                f"train_time: {approx_training_time_ms / 60000:.1f}m tok/s: {tok_per_sec:.0f}"
+            )
+
+        reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms
+        if h.distributed and max_wallclock_ms is not None:
+            reached_cap_tensor = torch.tensor(int(reached_cap), device=device)
+            dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX)
+            reached_cap = bool(reached_cap_tensor.item())
+        if stop_after_step is None and reached_cap:
+            stop_after_step = step
+
+    log(
+        f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB "
+        f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB"
+    )
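+
+    # EMA sketch: ema.mul_(d).add_(w, alpha=1 - d) each step averages over an
+    # effective window of roughly 1/(1 - d) recent steps (e.g. d = 0.999 gives
+    # ~1000 steps; illustrative number, the actual decay is h.ema_decay).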
ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + torch.set_float32_matmul_precision("high") + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + torch._dynamo.config.optimize_ddp = False + + h = Hyperparameters() + set_logging_hparams(h) + if h.is_main_process: + os.makedirs("logs", exist_ok=True) + log(100 * "=", console=False) + log("Hyperparameters:", console=True) + for k, v in sorted(vars(type(h)).items()): + if not k.startswith("_"): + log(f" {k}: {v}", console=True) + log(Path(__file__).read_text(encoding="utf-8"), console=False) + log("=" * 100, console=False) + log(f"Running Python {sys.version}", console=False) + log(f"Running PyTorch {torch.__version__}", console=False) + log( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log("=" * 100, console=False) + + train_and_eval(h, device) + + if distributed: + dist.destroy_process_group() + + +if __name__ == "__main__": + main() \ No newline at end of file