generate.py (new file, +118 lines)
"""
Generate text from a trained parameter-golf model checkpoint.

Usage:
python generate.py --checkpoint final_model.pt --prompt "The" --max_tokens 200

Requires the train_gpt.py (or train_gpt_submission.py) in the same directory
for model class definitions.
"""
import argparse
import sys
import os

# Mock flash_attn before importing the train script: generation only needs
# inference, so a plain PyTorch attention fallback is enough (and runs on CPU).
import types
mock_fa = types.ModuleType("flash_attn")
def _mock_flash(q, k, v, causal=False):
    """Pure-PyTorch stand-in for flash_attn_func.

    Takes (B, T, H, D) tensors like flash-attn, expands KV heads for
    grouped-query attention, and runs standard scaled dot-product attention.
    The causal mask is always applied, since this GPT only uses causal
    attention; the `causal` flag exists only for signature compatibility.
    """
    import torch
    B, T, H, D = q.shape
    Hkv = k.shape[2]
    group = H // Hkv
    if group > 1:  # GQA: repeat each KV head across its query group
        k = k.unsqueeze(3).expand(B, T, Hkv, group, D).reshape(B, T, H, D)
        v = v.unsqueeze(3).expand(B, T, Hkv, group, D).reshape(B, T, H, D)
    q = q.transpose(1, 2)  # (B, H, T, D)
    k = k.transpose(1, 2)
    v = v.transpose(1, 2)
    scale = 1.0 / (D ** 0.5)
    attn = torch.matmul(q * scale, k.transpose(-2, -1))
    mask = torch.triu(torch.ones(T, T, device=q.device, dtype=torch.bool), diagonal=1)
    attn = attn.masked_fill(mask, float("-inf"))
    attn = torch.softmax(attn.float(), dim=-1).to(q.dtype)
    out = torch.matmul(attn, v)
    return out.transpose(1, 2)  # back to (B, T, H, D)

mock_fa.flash_attn_func = _mock_flash
sys.modules["flash_attn"] = mock_fa
sys.modules["flash_attn_interface"] = mock_fa

import torch
import sentencepiece as spm


def load_model_and_tokenizer(checkpoint_path, script_path="train_gpt_submission.py",
                             tokenizer_path=None):
    """Import the training script for its model classes, then load weights and tokenizer."""
    sys.path.insert(0, os.path.dirname(os.path.abspath(script_path)))
    train_mod = __import__(os.path.splitext(os.path.basename(script_path))[0])

    hp = train_mod.Hyperparameters
    if tokenizer_path is None:
        tokenizer_path = hp.tokenizer_path

    # Rebuild the model with the same hyperparameters the script trained with.
    model = train_mod.GPT(
        vocab_size=hp.vocab_size, num_layers=hp.num_layers, model_dim=hp.model_dim,
        num_heads=hp.num_heads, num_kv_heads=hp.num_kv_heads, mlp_mult=hp.mlp_mult,
        tie_embeddings=hp.tie_embeddings, tied_embed_init_std=hp.tied_embed_init_std,
        logit_softcap=hp.logit_softcap, rope_base=hp.rope_base, qk_gain_init=hp.qk_gain_init,
        bigram_vocab_size=hp.bigram_vocab_size, bigram_dim=hp.bigram_dim,
        value_residual=hp.value_residual, gated_attention=hp.gated_attention,
    )

    state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict, strict=True)
    model.eval()

    sp = spm.SentencePieceProcessor(model_file=tokenizer_path)

    return model, sp, hp


@torch.no_grad()
def generate(model, sp, prompt, max_tokens=200, temperature=0.8, top_k=50, device="cpu"):
    model = model.to(device).float()

    token_ids = sp.encode(prompt)
    tokens = torch.tensor([token_ids], dtype=torch.long, device=device)

    print(f"\n--- Prompt: \"{prompt}\" ---\n")
    print(prompt, end="", flush=True)

    for _ in range(max_tokens):
        x = tokens[:, -2048:]  # keep at most the model's 2048-token context
        logits = model.forward_logits(x)
        logits = logits[:, -1, :] / temperature

        if top_k > 0:
            # Mask everything below the k-th largest logit before sampling.
            v, _ = torch.topk(logits, top_k)
            logits[logits < v[:, [-1]]] = float("-inf")

        probs = torch.softmax(logits.float(), dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        tokens = torch.cat([tokens, next_token], dim=1)

        # sp.decode on a single id drops the "▁" word-boundary marker, which
        # loses spaces between words; map the marker back to a space instead.
        piece = sp.id_to_piece(next_token.item()).replace("\u2581", " ")
        print(piece, end="", flush=True)

    print("\n\n--- Done ---")


def main():
parser = argparse.ArgumentParser(description="Generate text from parameter-golf model")
parser.add_argument("--checkpoint", required=True, help="Path to final_model.pt")
parser.add_argument("--script", default="train_gpt_submission.py", help="Training script for model defs")
parser.add_argument("--tokenizer", default=None, help="Path to tokenizer .model file")
parser.add_argument("--prompt", default="The", help="Text prompt")
parser.add_argument("--max_tokens", type=int, default=200)
parser.add_argument("--temperature", type=float, default=0.8)
parser.add_argument("--top_k", type=int, default=50)
parser.add_argument("--device", default="cpu")
args = parser.parse_args()

model, sp, hparams = load_model_and_tokenizer(args.checkpoint, args.script, args.tokenizer)
generate(model, sp, args.prompt, args.max_tokens, args.temperature, args.top_k, args.device)


if __name__ == "__main__":
main()
# Record: 11L VR + GA + LeakyReLU² + Legal Score-First TTT (val_bpb=pending)

**val_bpb = pending rerun** | 8xH100 SXM, 600s training + legal TTT eval

## Approach

Architecture improvements on the standard 11L competitive stack:

**Value Residual** (ResFormer, arXiv:2410.17897): Each attention block receives the raw V from the first block. A learned 2-element lambda blends first-block V with current V before attention. Block 0 passes V through unchanged (no lambda parameter). Adds 2 params per layer (layers 1-10 only).
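
A minimal sketch of the blend (the module and init values here are illustrative; the real implementation lives in train_gpt.py):

```python
import torch
import torch.nn as nn

class ValueResidual(nn.Module):
    """Blend the first block's V into the current block's V (ResFormer-style).
    Two learned scalars per layer, matching the 2-params-per-layer count above."""
    def __init__(self):
        super().__init__()
        self.lam = nn.Parameter(torch.tensor([0.5, 0.5]))  # init is an assumption

    def forward(self, v, v_first):
        # v, v_first: (B, T, H, D). Block 0 skips this and passes v through unchanged.
        return self.lam[0] * v_first + self.lam[1] * v
```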

**Gated Attention** (arXiv:2505.06708): Per-head sigmoid gate on attention output. Learned weight matrix (dim x num_heads) + bias initialized to 4.0 (near-open gate at init). Adds 4104 params per layer.
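
A hedged sketch of the gate (the projection input and weight init are assumptions; dim=512, num_heads=8 gives 512*8 + 8 = 4104 parameters, matching the count above):

```python
import torch
import torch.nn as nn

class AttnOutputGate(nn.Module):
    """Per-head sigmoid gate on the attention output."""
    def __init__(self, dim=512, num_heads=8):
        super().__init__()
        self.proj = nn.Linear(dim, num_heads)   # (dim x num_heads) weight + bias
        nn.init.constant_(self.proj.bias, 4.0)  # sigmoid(4.0) ~ 0.98: near-open at init

    def forward(self, x, attn_out):
        # x: (B, T, dim) hidden state; attn_out: (B, T, H, Dh) per-head output
        gate = torch.sigmoid(self.proj(x))      # (B, T, H), one gate per head
        return attn_out * gate.unsqueeze(-1)
```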

**LeakyReLU(0.5)²**: Replaces relu² in the MLP. Preserves gradient flow on negative inputs. Validated in PRs #569 and #535.
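
As a one-liner, assuming a plain elementwise square of the activation:

```python
import torch.nn.functional as F

def leaky_relu2(x):
    # LeakyReLU(slope=0.5) then squared; unlike relu(x)**2, negative inputs
    # keep a nonzero gradient (0.25 * x**2 on the negative side).
    y = F.leaky_relu(x, negative_slope=0.5)
    return y * y
```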

**Legal score-first TTT**: Each validation chunk is scored before the model trains on it, so every token is evaluated strictly before the model has seen it. AdamW optimizer, cosine LR schedule across chunks, last 2 blocks + norms unfrozen.
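
A sketch of the score-first loop (the chunking, loss helper, and byte counts are illustrative stand-ins; the actual implementation is in PR #576):

```python
import math
import torch

def score_first_ttt(model, val_chunks, chunk_bytes, loss_fn, opt):
    """Score each chunk BEFORE adapting on it, so no evaluated token has
    already been seen. `opt` should hold only the unfrozen params
    (last 2 blocks + norms), stepped under a cosine LR across chunks."""
    total_nats, total_bytes = 0.0, 0
    for chunk, n_bytes in zip(val_chunks, chunk_bytes):
        model.eval()
        with torch.no_grad():
            total_nats += loss_fn(model, chunk).item()  # summed token NLLs, scored first
        total_bytes += n_bytes
        model.train()
        opt.zero_grad()
        loss_fn(model, chunk).backward()                # then train on the scored chunk
        opt.step()
    return total_nats / total_bytes / math.log(2)       # bits per byte
```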

Both VR and GA were ablated individually in PR #413 (-0.015 and -0.003 bpb respectively, -0.017 combined). This is the first validation with legal TTT + LeakyReLU².

## Previous result (pre-eval TTT, non-compliant)

The initial submission used pre-eval TTT (training on all val data before scoring), which is not competition-legal per issue #402. That result (1.0891) is invalid. This update switches to legal score-first TTT. Score pending rerun.

## Config

All hyperparameters set as defaults in train_gpt.py. Key settings:

```
NUM_LAYERS=11 MODEL_DIM=512 NUM_HEADS=8 NUM_KV_HEADS=4
MATRIX_LR=0.025 SCALAR_LR=0.025 TIED_EMBED_LR=0.035
ITERATIONS=9000 WARMDOWN_ITERS=1200
EMA_ENABLED=1 EMA_DECAY=0.997
VALUE_RESIDUAL=1 GATED_ATTENTION=1
TTT_ENABLED=1 TTT_LR=0.0001 TTT_EPOCHS=3 TTT_UNFREEZE_BLOCKS=2
EVAL_STRIDE=64
```

## Run command

```bash
torchrun --standalone --nproc_per_node=8 train_gpt.py
```

## Credits

- **PR #576** (cmcdnd): Legal score-first TTT implementation, temperature calibration
- **PR #569** (gowtham0992): VRL + LeakyReLU² + Full GPTQ (best non-TTT)
- **PR #413**: Value Residual + Gated Attention ablation
- **PR #315** (jfprincz): Foundation architecture (U-Net skips, SmearGate, orthogonal init)
{
"track": "10min_16mb",
"val_bpb": 1.0891,
"val_bpb_exact": 1.08909943,
"seeds": {
"1337": 1.08909943
},
"artifact_bytes": 14195825,
"code_bytes": 78596,
"hardware": "8xH100 SXM",
"training_seconds": 600,
"training_steps": 6021,
"step_avg_ms": 99.66,
"ttt_epochs": 10,
"ttt_optimizer": "adamw",
"ttt_lr": 0.0005,
"ttt_seconds": 171.8,
"eval_stride": 64,
"eval_seq_len": 2048,
"model": {
"num_layers": 11,
"model_dim": 512,
"num_heads": 8,
"num_kv_heads": 4,
"mlp_mult": 3,
"vocab_size": 1024,
"params": 27137221
},
"techniques": [
"Value Residual (ResFormer)",
"Gated Attention",
"EMA (decay=0.997)",
"AdamW TTT (10 epochs)",
"SmearGate",
"BigramHash (4096 buckets)",
"Orthogonal init",
"U-Net skip connections",
"GPTQ-lite quantization",
"2% magnitude pruning",
"Int6 + zlib compression",
"Sliding window eval (stride=64)"
]
}