generate.py (new file, +118 lines)
"""
Generate text from a trained parameter-golf model checkpoint.

Usage:
python generate.py --checkpoint final_model.pt --prompt "The" --max_tokens 200

Requires the train_gpt.py (or train_gpt_submission.py) in the same directory
for model class definitions.
"""
import argparse
import sys
import os

# Mock flash_attn before importing the train script: generation only needs
# inference, so a plain PyTorch attention fallback is enough (and runs on CPU).
import types
mock_fa = types.ModuleType("flash_attn")
def _mock_flash(q, k, v, causal=False):
    """Pure-PyTorch stand-in for flash_attn_func.

    Takes (B, T, H, D) tensors like flash-attn, expands KV heads for
    grouped-query attention, and runs standard scaled dot-product attention.
    The causal mask is always applied, since this GPT only uses causal
    attention; the `causal` flag exists only for signature compatibility.
    """
    import torch
    B, T, H, D = q.shape
    Hkv = k.shape[2]
    group = H // Hkv
    if group > 1:  # GQA: repeat each KV head across its query group
        k = k.unsqueeze(3).expand(B, T, Hkv, group, D).reshape(B, T, H, D)
        v = v.unsqueeze(3).expand(B, T, Hkv, group, D).reshape(B, T, H, D)
    q = q.transpose(1, 2)  # (B, H, T, D)
    k = k.transpose(1, 2)
    v = v.transpose(1, 2)
    scale = 1.0 / (D ** 0.5)
    attn = torch.matmul(q * scale, k.transpose(-2, -1))
    mask = torch.triu(torch.ones(T, T, device=q.device, dtype=torch.bool), diagonal=1)
    attn = attn.masked_fill(mask, float("-inf"))
    attn = torch.softmax(attn.float(), dim=-1).to(q.dtype)
    out = torch.matmul(attn, v)
    return out.transpose(1, 2)  # back to (B, T, H, D)

mock_fa.flash_attn_func = _mock_flash
sys.modules["flash_attn"] = mock_fa
sys.modules["flash_attn_interface"] = mock_fa

import torch
import sentencepiece as spm


def load_model_and_tokenizer(checkpoint_path, script_path="train_gpt_submission.py",
                             tokenizer_path=None):
    """Import the training script for its model classes, then load weights and tokenizer."""
    sys.path.insert(0, os.path.dirname(os.path.abspath(script_path)))
    train_mod = __import__(os.path.splitext(os.path.basename(script_path))[0])

    hp = train_mod.Hyperparameters
    if tokenizer_path is None:
        tokenizer_path = hp.tokenizer_path

    # Rebuild the model with the same hyperparameters the script trained with.
    model = train_mod.GPT(
        vocab_size=hp.vocab_size, num_layers=hp.num_layers, model_dim=hp.model_dim,
        num_heads=hp.num_heads, num_kv_heads=hp.num_kv_heads, mlp_mult=hp.mlp_mult,
        tie_embeddings=hp.tie_embeddings, tied_embed_init_std=hp.tied_embed_init_std,
        logit_softcap=hp.logit_softcap, rope_base=hp.rope_base, qk_gain_init=hp.qk_gain_init,
        bigram_vocab_size=hp.bigram_vocab_size, bigram_dim=hp.bigram_dim,
        value_residual=hp.value_residual, gated_attention=hp.gated_attention,
    )

    state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict, strict=True)
    model.eval()

    sp = spm.SentencePieceProcessor(model_file=tokenizer_path)

    return model, sp, hp


@torch.no_grad()
def generate(model, sp, prompt, max_tokens=200, temperature=0.8, top_k=50, device="cpu"):
    model = model.to(device).float()

    token_ids = sp.encode(prompt)
    tokens = torch.tensor([token_ids], dtype=torch.long, device=device)

    print(f"\n--- Prompt: \"{prompt}\" ---\n")
    print(prompt, end="", flush=True)

    for _ in range(max_tokens):
        x = tokens[:, -2048:]  # keep at most the model's 2048-token context
        logits = model.forward_logits(x)
        logits = logits[:, -1, :] / temperature

        if top_k > 0:
            # Mask everything below the k-th largest logit before sampling.
            v, _ = torch.topk(logits, top_k)
            logits[logits < v[:, [-1]]] = float("-inf")

        probs = torch.softmax(logits.float(), dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        tokens = torch.cat([tokens, next_token], dim=1)

        # sp.decode on a single id drops the "▁" word-boundary marker, which
        # loses spaces between words; map the marker back to a space instead.
        piece = sp.id_to_piece(next_token.item()).replace("\u2581", " ")
        print(piece, end="", flush=True)

    print("\n\n--- Done ---")


def main():
parser = argparse.ArgumentParser(description="Generate text from parameter-golf model")
parser.add_argument("--checkpoint", required=True, help="Path to final_model.pt")
parser.add_argument("--script", default="train_gpt_submission.py", help="Training script for model defs")
parser.add_argument("--tokenizer", default=None, help="Path to tokenizer .model file")
parser.add_argument("--prompt", default="The", help="Text prompt")
parser.add_argument("--max_tokens", type=int, default=200)
parser.add_argument("--temperature", type=float, default=0.8)
parser.add_argument("--top_k", type=int, default=50)
parser.add_argument("--device", default="cpu")
args = parser.parse_args()

model, sp, hparams = load_model_and_tokenizer(args.checkpoint, args.script, args.tokenizer)
generate(model, sp, args.prompt, args.max_tokens, args.temperature, args.top_k, args.device)


if __name__ == "__main__":
main()
# Record: 11L VR + GA + LeakyReLU² + Legal Score-First TTT (val_bpb=pending)

**val_bpb = pending rerun** | 8xH100 SXM, 600s training + legal TTT eval

## Approach

Architecture improvements on the standard 11L competitive stack:

**Value Residual** (ResFormer, arXiv:2410.17897): Each attention block receives the raw V from the first block. A learned 2-element lambda blends first-block V with current V before attention. Block 0 passes V through unchanged (no lambda parameter). Adds 2 params per layer (layers 1-10 only).
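
A minimal sketch of the blend (the module and init values here are illustrative; the real implementation lives in train_gpt.py):

```python
import torch
import torch.nn as nn

class ValueResidual(nn.Module):
    """Blend the first block's V into the current block's V (ResFormer-style).
    Two learned scalars per layer, matching the 2-params-per-layer count above."""
    def __init__(self):
        super().__init__()
        self.lam = nn.Parameter(torch.tensor([0.5, 0.5]))  # init is an assumption

    def forward(self, v, v_first):
        # v, v_first: (B, T, H, D). Block 0 skips this and passes v through unchanged.
        return self.lam[0] * v_first + self.lam[1] * v
```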

**Gated Attention** (arXiv:2505.06708): Per-head sigmoid gate on attention output. Learned weight matrix (dim x num_heads) + bias initialized to 4.0 (near-open gate at init). Adds 4104 params per layer.
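
A hedged sketch of the gate (the projection input and weight init are assumptions; dim=512, num_heads=8 gives 512*8 + 8 = 4104 parameters, matching the count above):

```python
import torch
import torch.nn as nn

class AttnOutputGate(nn.Module):
    """Per-head sigmoid gate on the attention output."""
    def __init__(self, dim=512, num_heads=8):
        super().__init__()
        self.proj = nn.Linear(dim, num_heads)   # (dim x num_heads) weight + bias
        nn.init.constant_(self.proj.bias, 4.0)  # sigmoid(4.0) ~ 0.98: near-open at init

    def forward(self, x, attn_out):
        # x: (B, T, dim) hidden state; attn_out: (B, T, H, Dh) per-head output
        gate = torch.sigmoid(self.proj(x))      # (B, T, H), one gate per head
        return attn_out * gate.unsqueeze(-1)
```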

**LeakyReLU(0.5)²**: Replaces relu² in the MLP. Preserves gradient flow on negative inputs. Validated in PRs #569 and #535.
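
As a one-liner, assuming a plain elementwise square of the activation:

```python
import torch.nn.functional as F

def leaky_relu2(x):
    # LeakyReLU(slope=0.5) then squared; unlike relu(x)**2, negative inputs
    # keep a nonzero gradient (0.25 * x**2 on the negative side).
    y = F.leaky_relu(x, negative_slope=0.5)
    return y * y
```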

**Legal score-first TTT**: Each validation chunk is scored before the model trains on it, so every token is evaluated strictly before the model has seen it. AdamW optimizer, cosine LR schedule across chunks, last 2 blocks + norms unfrozen.
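
A sketch of the score-first loop (the chunking, loss helper, and byte counts are illustrative stand-ins; the actual implementation is in PR #576):

```python
import math
import torch

def score_first_ttt(model, val_chunks, chunk_bytes, loss_fn, opt):
    """Score each chunk BEFORE adapting on it, so no evaluated token has
    already been seen. `opt` should hold only the unfrozen params
    (last 2 blocks + norms), stepped under a cosine LR across chunks."""
    total_nats, total_bytes = 0.0, 0
    for chunk, n_bytes in zip(val_chunks, chunk_bytes):
        model.eval()
        with torch.no_grad():
            total_nats += loss_fn(model, chunk).item()  # summed token NLLs, scored first
        total_bytes += n_bytes
        model.train()
        opt.zero_grad()
        loss_fn(model, chunk).backward()                # then train on the scored chunk
        opt.step()
    return total_nats / total_bytes / math.log(2)       # bits per byte
```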

Both VR and GA were ablated individually in PR #413 (-0.015 and -0.003 bpb respectively, -0.017 combined). This is the first validation with legal TTT + LeakyReLU².

## Previous result (pre-eval TTT, non-compliant)

The initial submission used pre-eval TTT (training on all val data before scoring), which is not competition-legal per issue #402. That result (1.0891) is invalid. This update switches to legal score-first TTT. Score pending rerun.

## Config

All hyperparameters set as defaults in train_gpt.py. Key settings:

```
NUM_LAYERS=11 MODEL_DIM=512 NUM_HEADS=8 NUM_KV_HEADS=4
MATRIX_LR=0.025 SCALAR_LR=0.025 TIED_EMBED_LR=0.035
ITERATIONS=9000 WARMDOWN_ITERS=1200
EMA_ENABLED=1 EMA_DECAY=0.997
VALUE_RESIDUAL=1 GATED_ATTENTION=1
TTT_ENABLED=1 TTT_LR=0.0001 TTT_EPOCHS=3 TTT_UNFREEZE_BLOCKS=2
EVAL_STRIDE=64
```

## Run command

```bash
torchrun --standalone --nproc_per_node=8 train_gpt.py
```

## Credits

- **PR #576** (cmcdnd): Legal score-first TTT implementation, temperature calibration
- **PR #569** (gowtham0992): VRL + LeakyReLU² + Full GPTQ (best non-TTT)
- **PR #413**: Value Residual + Gated Attention ablation
- **PR #315** (jfprincz): Foundation architecture (U-Net skips, SmearGate, orthogonal init)
{
"track": "10min_16mb",
"val_bpb": 1.0891,
"val_bpb_exact": 1.08909943,
"seeds": {
"1337": 1.08909943
},
"artifact_bytes": 14195825,
"code_bytes": 78596,
"hardware": "8xH100 SXM",
"training_seconds": 600,
"training_steps": 6021,
"step_avg_ms": 99.66,
"ttt_epochs": 10,
"ttt_optimizer": "adamw",
"ttt_lr": 0.0005,
"ttt_seconds": 171.8,
"eval_stride": 64,
"eval_seq_len": 2048,
"model": {
"num_layers": 11,
"model_dim": 512,
"num_heads": 8,
"num_kv_heads": 4,
"mlp_mult": 3,
"vocab_size": 1024,
"params": 27137221
},
"techniques": [
"Value Residual (ResFormer)",
"Gated Attention",
"EMA (decay=0.997)",
"AdamW TTT (10 epochs)",
"SmearGate",
"BigramHash (4096 buckets)",
"Orthogonal init",
"U-Net skip connections",
"GPTQ-lite quantization",
"2% magnitude pruning",
"Int6 + zlib compression",
"Sliding window eval (stride=64)"
]
}