diff --git a/docs/superpowers/plans/2026-04-23-mtp-implementation.md b/docs/superpowers/plans/2026-04-23-mtp-implementation.md new file mode 100644 index 0000000000..641146536f --- /dev/null +++ b/docs/superpowers/plans/2026-04-23-mtp-implementation.md @@ -0,0 +1,399 @@ +# Multi-Token Prediction (MTP) Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add 2 auxiliary prediction heads (+2, +3 tokens) during training to improve representation quality, with lambda annealing to zero so aux params never enter the artifact. + +**Architecture:** Two `Linear(512,512)` transforms after final norm, each projecting through shared `tok_emb.weight` to predict future tokens. Combined aux loss weighted by λ=0.3, annealed to 0 over last 30% of training. Training-only — excluded from serialization. + +**Tech Stack:** PyTorch, single-file modification to `train_gpt.py` + +--- + +### Task 1: Add MTP hyperparameters + +**Files:** +- Modify: `train_gpt.py:39-88` (Hyperparameters class) + +- [ ] **Step 1: Add MTP env vars to Hyperparameters** + +Add these lines after line 88 (`grad_clip_norm`): + +```python + # Multi-Token Prediction + mtp_enabled = bool(int(os.environ.get("MTP_ENABLED", "0"))) + mtp_lambda = float(os.environ.get("MTP_LAMBDA", 0.3)) + mtp_anneal_start = float(os.environ.get("MTP_ANNEAL_START", 0.7)) + mtp_num_heads = int(os.environ.get("MTP_NUM_HEADS", 2)) +``` + +- [ ] **Step 2: Commit** + +```bash +git add train_gpt.py +git commit -m "feat: add MTP hyperparameters" +``` + +--- + +### Task 2: Add MTPHead module and integrate into GPT + +**Files:** +- Modify: `train_gpt.py:606-724` (after MLP class, modify GPT class) + +- [ ] **Step 1: Add MTPHead class after the MLP class (after line 618)** + +```python +class MTPHead(nn.Module): + def __init__(self, model_dim: 
int): + super().__init__() + self.transform = CastedLinear(model_dim, model_dim, bias=False) + self.transform._zero_init = True + + def forward(self, hidden: Tensor, tok_emb_weight: Tensor, softcap: float) -> Tensor: + h = self.transform(hidden) + hidden + logits = F.linear(h, tok_emb_weight) + return softcap * torch.tanh(logits / softcap) +``` + +Note: Uses `CastedLinear` (existing class at line 509) to match the codebase pattern. `_zero_init = True` is handled by the existing `_init_weights` loop at line 696-698. + +- [ ] **Step 2: Add mtp_heads to GPT.__init__** + +In `GPT.__init__`, add a `mtp_num_heads` parameter and create the heads. After line 690 (`self.lm_head._zero_init = True`), before `self._init_weights()`: + +```python + self.mtp_heads = nn.ModuleList( + [MTPHead(model_dim) for _ in range(mtp_num_heads)] + ) if mtp_num_heads > 0 else nn.ModuleList() +``` + +Update the `__init__` signature to accept `mtp_num_heads: int = 0`. + +- [ ] **Step 3: Modify GPT.forward to support MTP** + +Replace lines 700-724 with: + +```python + def forward(self, input_ids: Tensor, target_ids: Tensor, mtp_lambda: float = 0.0) -> Tensor: + x = self.tok_emb(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x0 = x + skips: list[Tensor] = [] + + for i in range(self.num_encoder_layers): + x = self.blocks[i](x, x0) + skips.append(x) + for i in range(self.num_decoder_layers): + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + x = self.blocks[self.num_encoder_layers + i](x, x0) + + x = self.final_norm(x) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x_flat, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x_flat) + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + + 
if mtp_lambda > 0.0 and self.training and len(self.mtp_heads) > 0: + aux_loss = torch.zeros((), device=main_loss.device, dtype=main_loss.dtype) + for k, head in enumerate(self.mtp_heads): + shift = k + 2 + h = x[:, :-shift, :].reshape(-1, x.size(-1)) + t = target_ids[:, shift:].reshape(-1) + aux_logits = head(h, self.tok_emb.weight, self.logit_softcap) + aux_loss = aux_loss + F.cross_entropy(aux_logits.float(), t, reduction="mean") + return main_loss + mtp_lambda * aux_loss / len(self.mtp_heads) + + return main_loss +``` + +- [ ] **Step 4: Commit** + +```bash +git add train_gpt.py +git commit -m "feat: add MTPHead module and integrate into GPT forward" +``` + +--- + +### Task 3: Wire MTP into training loop and optimizer + +**Files:** +- Modify: `train_gpt.py:826-838` (model construction) +- Modify: `train_gpt.py:846-893` (optimizer setup) +- Modify: `train_gpt.py:1007-1018` (training loop forward call) + +- [ ] **Step 1: Pass mtp_num_heads to GPT constructor** + +At line 826, add `mtp_num_heads` to the GPT call: + +```python + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + mtp_num_heads=args.mtp_num_heads if args.mtp_enabled else 0, + ).to(device).bfloat16() +``` + +- [ ] **Step 2: Add MTP head params to scalar optimizer** + +After line 863 (`scalar_params.append(base_model.skip_weights)`), add: + +```python + for head in base_model.mtp_heads: + for p in head.parameters(): + scalar_params.append(p) +``` + +This puts MTP head params in the Adam optimizer with `scalar_lr`. Since they use `CastedLinear` (2D), they'd normally go into Muon via `block_named_params`, but MTP heads aren't in `base_model.blocks` — they're at the top level. 
The `block_named_params` filter on line 851 only captures `base_model.blocks.named_parameters()`, so MTP params are already excluded from Muon. We just need to make sure they're picked up by some optimizer. + +Actually, MTP heads are `nn.ModuleList` at the GPT level, not inside `blocks`. The current code only collects params from `base_model.blocks` (line 851) and `base_model.skip_weights` (line 862-863). MTP params would be orphaned. We need to add them explicitly. + +- [ ] **Step 3: Compute mtp_lambda schedule and pass to forward** + +In the training loop, after line 1008 (`scale = lr_mul(step, elapsed_ms)`), add the lambda computation: + +```python + mtp_lambda = 0.0 + if args.mtp_enabled: + total_steps_approx = max(int(args.max_wallclock_seconds * 1000.0 / max(elapsed_ms / max(step, 1), 1.0)), step + 1) + progress = step / total_steps_approx + if progress >= args.mtp_anneal_start: + mtp_lambda = args.mtp_lambda * (1.0 - (progress - args.mtp_anneal_start) / (1.0 - args.mtp_anneal_start)) + else: + mtp_lambda = args.mtp_lambda +``` + +Then modify the forward call at line 1016: + +```python + loss = model(x, y, mtp_lambda=mtp_lambda) +``` + +Also update the warmup forward call at line 948: + +```python + warmup_loss = model(x, y, mtp_lambda=args.mtp_lambda if args.mtp_enabled else 0.0) +``` + +- [ ] **Step 4: Log MTP config** + +After line 910 (`log0(f"seed:{args.seed}")`), add: + +```python + if args.mtp_enabled: + mtp_params = sum(p.numel() for h in base_model.mtp_heads for p in h.parameters()) + log0(f"mtp:enabled heads:{args.mtp_num_heads} lambda:{args.mtp_lambda} anneal_start:{args.mtp_anneal_start} params:{mtp_params}") +``` + +- [ ] **Step 5: Commit** + +```bash +git add train_gpt.py +git commit -m "feat: wire MTP into training loop, optimizer, and lambda schedule" +``` + +--- + +### Task 4: Exclude MTP heads from serialization + +**Files:** +- Modify: `train_gpt.py:1068-1069` (model save) +- Modify: `train_gpt.py:1076` (quantization) + +- [ ] **Step 1: 
Filter MTP params from saved state dict** + +At line 1069, replace: +```python + torch.save(base_model.state_dict(), "final_model.pt") +``` +with: +```python + save_state = {k: v for k, v in base_model.state_dict().items() if not k.startswith("mtp_")} + torch.save(save_state, "final_model.pt") +``` + +At line 1076, replace: +```python + quant_obj, quant_stats = quantize_state_dict_int8(base_model.state_dict()) +``` +with: +```python + quant_state = {k: v for k, v in base_model.state_dict().items() if not k.startswith("mtp_")} + quant_obj, quant_stats = quantize_state_dict_int8(quant_state) +``` + +- [ ] **Step 2: Handle roundtrip load with strict=False for MTP params** + +At line 1099, the roundtrip validation loads the quantized state dict back. Since MTP params were excluded, we need `strict=False` or to only load matching keys: + +Replace: +```python + base_model.load_state_dict(dequantize_state_dict_int8(quant_state), strict=True) +``` +with: +```python + roundtrip_state = dequantize_state_dict_int8(quant_state) + base_model.load_state_dict(roundtrip_state, strict=False) +``` + +This is safe because eval mode doesn't use MTP heads (they're gated by `self.training` and `mtp_lambda > 0.0`). 
+ +- [ ] **Step 3: Commit** + +```bash +git add train_gpt.py +git commit -m "feat: exclude MTP heads from serialization and quantization" +``` + +--- + +### Task 5: Local smoke test + +**Files:** None (verification only) + +- [ ] **Step 1: Quick CPU smoke test** + +Run a minimal sanity check that the code parses and the model can be constructed with MTP enabled: + +```bash +python3 -c " +import os +os.environ['MTP_ENABLED'] = '1' +os.environ['MTP_NUM_HEADS'] = '2' +os.environ['VOCAB_SIZE'] = '1024' +os.environ['NUM_LAYERS'] = '4' +os.environ['MODEL_DIM'] = '128' +os.environ['NUM_HEADS'] = '4' +os.environ['NUM_KV_HEADS'] = '2' +os.environ['MLP_MULT'] = '2' + +# Import just the classes +import importlib.util, sys +spec = importlib.util.spec_from_file_location('tgp', 'train_gpt.py') +mod = importlib.util.module_from_spec(spec) +sys.modules['tgp'] = mod + +import torch, torch.nn as nn, torch.nn.functional as F +from torch import Tensor + +# We need to exec just the class definitions +exec(open('train_gpt.py').read().split('def main')[0]) + +model = GPT( + vocab_size=1024, num_layers=4, model_dim=128, + num_heads=4, num_kv_heads=2, mlp_mult=2, + tie_embeddings=True, tied_embed_init_std=0.005, + logit_softcap=30.0, rope_base=10000.0, + qk_gain_init=1.5, mtp_num_heads=2, +) +x = torch.randint(0, 1024, (1, 32)) +y = torch.randint(0, 1024, (1, 32)) + +# Test without MTP +loss_no_mtp = model(x, y, mtp_lambda=0.0) +print(f'Loss without MTP: {loss_no_mtp.item():.4f}') + +# Test with MTP +model.train() +loss_with_mtp = model(x, y, mtp_lambda=0.3) +print(f'Loss with MTP (lambda=0.3): {loss_with_mtp.item():.4f}') + +# Verify MTP adds to loss +assert loss_with_mtp.item() >= loss_no_mtp.item() - 0.1, 'MTP loss should be >= main loss' + +# Verify MTP params excluded from filtered state dict +full_keys = set(model.state_dict().keys()) +filtered_keys = {k for k in full_keys if not k.startswith('mtp_')} +mtp_keys = full_keys - filtered_keys +print(f'Total params keys: {len(full_keys)}, 
MTP keys: {len(mtp_keys)}, Saved keys: {len(filtered_keys)}') +assert len(mtp_keys) > 0, 'Should have MTP keys' +assert all(k.startswith('mtp_') for k in mtp_keys) + +print('All smoke tests passed!') +" +``` + +Expected: All assertions pass, prints loss values and key counts. + +- [ ] **Step 2: Commit (if any fixes needed)** + +```bash +git add train_gpt.py +git commit -m "fix: smoke test corrections for MTP" +``` + +--- + +### Task 6: GPU integration test + +**Files:** None (verification on RunPod) + +- [ ] **Step 1: Run a short MTP training test on GPU** + +On the RunPod server, run a quick 100-step test to verify MTP works end-to-end with distributed training, compilation, and serialization: + +```bash +MTP_ENABLED=1 MTP_LAMBDA=0.3 MTP_NUM_HEADS=2 \ +ITERATIONS=100 VAL_LOSS_EVERY=50 MAX_WALLCLOCK_SECONDS=120 \ +SEED=42 RUN_ID=mtp_smoke \ +torchrun --standalone --nproc_per_node=8 train_gpt.py +``` + +Verify in the log: +1. `mtp:enabled heads:2 lambda:0.3 anneal_start:0.7 params:...` appears +2. Training runs without errors +3. Serialization completes (no MTP keys in saved model) +4. Roundtrip validation produces a valid val_bpb + +- [ ] **Step 2: Compare step time with and without MTP** + +Run the same 100 steps without MTP for timing comparison: + +```bash +MTP_ENABLED=0 ITERATIONS=100 VAL_LOSS_EVERY=50 MAX_WALLCLOCK_SECONDS=120 \ +SEED=42 RUN_ID=baseline_smoke \ +torchrun --standalone --nproc_per_node=8 train_gpt.py +``` + +Compare `step_avg` between the two runs. MTP overhead should be <5ms. 
+ +- [ ] **Step 3: Commit any fixes** + +--- + +### Task 7: Full training run and evaluation + +**Files:** None (execution on RunPod) + +- [ ] **Step 1: Launch full 600s MTP training run** + +```bash +MTP_ENABLED=1 MTP_LAMBDA=0.3 MTP_NUM_HEADS=2 MTP_ANNEAL_START=0.7 \ +SEED=1337 RUN_ID=mtp_full_v1 \ +torchrun --standalone --nproc_per_node=8 train_gpt.py +``` + +- [ ] **Step 2: Compare val_bpb against baseline (1.0783)** + +If val_bpb improves by >=0.003 (target: <=1.0753), proceed to 3-seed runs. If not, tune MTP_LAMBDA and MTP_ANNEAL_START. + +- [ ] **Step 3: Record results** + +Create `records/track_10min_16mb/2026-04-23_MTP_AuxHeads/` with README.md, logs, and train_gpt.py. diff --git a/docs/superpowers/specs/2026-04-23-mtp-design.md b/docs/superpowers/specs/2026-04-23-mtp-design.md new file mode 100644 index 0000000000..5f6f5bfc0b --- /dev/null +++ b/docs/superpowers/specs/2026-04-23-mtp-design.md @@ -0,0 +1,130 @@ +# Multi-Token Prediction (MTP) for Parameter Golf + +## Goal + +Add multi-token prediction auxiliary heads to improve representation quality during training without increasing artifact size. Target: reduce val_bpb by 0.005-0.01 from our current 1.0783. 
+ +## Design Decisions + +- **Approach:** DeepSeek-V3 style — 2 aux heads predicting +2 and +3 tokens from final hidden states +- **Head architecture:** `Linear(512, 512)` zero-init transform → shared `tok_emb.weight` projection → logit softcap → CE loss +- **Loss weight:** λ=0.3 total (0.15 per head), annealed linearly to 0 over the last 30% of training +- **Artifact impact:** Zero — aux heads are training-only, dropped before quantization/serialization + +## Architecture + +### Aux Head Module + +```python +class MTPHead(nn.Module): + def __init__(self, model_dim: int): + super().__init__() + self.transform = nn.Linear(model_dim, model_dim, bias=False) + nn.init.zeros_(self.transform.weight) # identity-like at init + + def forward(self, hidden: Tensor, tok_emb_weight: Tensor, softcap: float) -> Tensor: + h = self.transform(hidden) + hidden # residual connection + logits = F.linear(h, tok_emb_weight) + return softcap * torch.tanh(logits / softcap) +``` + +Key details: +- Zero-init weight + residual connection means at init this is identical to the main head +- Shares `tok_emb.weight` for vocabulary projection (no extra vocab params) +- Same logit softcap as main head + +### Integration into GPT.forward() + +```python +def forward(self, input_ids, target_ids, mtp_lambda=0.0): + # ... existing trunk unchanged ... 
+ x = self.final_norm(x) + + # Main head (unchanged) + x_flat = x.reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + logits = self._project_logits(x_flat) + main_loss = F.cross_entropy(logits.float(), targets, reduction="mean") + + if mtp_lambda > 0.0 and self.training: + aux_loss = 0.0 + for k, head in enumerate(self.mtp_heads): + shift = k + 2 # predict +2, +3 + # Shift: use hidden[:-shift] to predict target[shift:] + h = x[:, :-shift, :].reshape(-1, x.size(-1)) + t = target_ids[:, shift:].reshape(-1) + aux_logits = head(h, self.tok_emb.weight, self.logit_softcap) + aux_loss += F.cross_entropy(aux_logits.float(), t, reduction="mean") + return main_loss + mtp_lambda * aux_loss / len(self.mtp_heads) + + return main_loss +``` + +### Lambda Schedule + +```python +mtp_lambda_start = 0.3 +mtp_anneal_start = 0.7 # start annealing at 70% of training +# At each step: +progress = step / total_steps +if progress >= mtp_anneal_start: + mtp_lambda = mtp_lambda_start * (1.0 - (progress - mtp_anneal_start) / (1.0 - mtp_anneal_start)) +else: + mtp_lambda = mtp_lambda_start +``` + +### Artifact Serialization + +MTP heads are excluded from the saved model: +```python +state_dict = {k: v for k, v in model.state_dict().items() if not k.startswith("mtp_")} +``` + +No changes needed to GPTQ, quantization, or eval code — they never see the aux heads. 
+ +## Parameter Budget + +| Component | Params | Artifact Impact | +|-----------|--------|-----------------| +| mtp_head_2 transform | 262,144 | None (training-only) | +| mtp_head_3 transform | 262,144 | None (training-only) | +| **Total** | **524,288** | **0 bytes** | + +## Timing Budget + +| Component | Estimated Cost | Impact | +|-----------|---------------|--------| +| 2x Linear(512,512) forward | ~1ms | | +| 2x tok_emb projection | ~1ms | | +| 2x CE loss + backward | ~2ms | | +| **Total per step** | **~4ms** | ~90ms → ~94ms/step | +| **Steps lost in 600s** | ~360 fewer | ~6740 → ~6380 steps | + +The representation improvement must compensate for ~360 lost steps. DeepSeek-V3 showed MTP gains are substantial even at smaller scales, so this is a favorable bet. + +## Env Vars + +| Var | Default | Description | +|-----|---------|-------------| +| `MTP_ENABLED` | `0` | Enable multi-token prediction aux heads | +| `MTP_LAMBDA` | `0.3` | Initial aux loss weight | +| `MTP_ANNEAL_START` | `0.7` | Fraction of training where lambda annealing begins | +| `MTP_NUM_HEADS` | `2` | Number of aux heads (predict +2 through +N+1) | + +## Optimizer Groups + +MTP head parameters go into the AdamW group (not Muon) since they're small linear layers: +- lr: same as `scalar_lr` or `head_lr` +- wd: same as `adam_wd` + +## Success Criteria + +- val_bpb improves by >=0.003 over baseline (same seed, same step count adjusted for overhead) +- No artifact size increase +- Training completes within 600s wallclock + +## Risks + +1. **Overhead too high:** If per-step cost exceeds ~5ms, the lost steps may not be worth it. Mitigation: MTP_ENABLED=0 fallback. +2. **Destabilizes training:** Aux gradients could interfere with main loss convergence. Mitigation: zero-init + residual means aux heads start as identity; λ annealing removes their influence before final convergence. +3. **No effect at small scale:** MTP gains were demonstrated at 100B+ scale. At 36M params the benefit may be smaller. 
This is the core research risk. diff --git a/records/track_10min_16mb/2026-04-12_SP8192_LegalSOTA_Clean/README.md b/records/track_10min_16mb/2026-04-12_SP8192_LegalSOTA_Clean/README.md new file mode 100644 index 0000000000..566016d96a --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_LegalSOTA_Clean/README.md @@ -0,0 +1,50 @@ +# Record: SP8192 + Triple Recurrence + Banking + Fused MLP + Muon 0.97 — val_bpb 1.0783 (3-seed mean) + +**val_bpb = 1.0783** (3-seed mean, std 0.0004) | **~15.99 MB** | 8xH100 SXM + +## 3-Seed Results + +| Seed | Pre-quant BPP | Sliding BPP | **TTT BPP** | Artifact | +|------|---------------|-------------|-------------|----------| +| 1337 | 1.0859 | 1.0798 | **1.0782** | 15,986,623 | +| 42 | 1.0856 | 1.0793 | **1.0781** | 15,983,529 | +| 2024 | 1.0862 | 1.0800 | **1.0788** | 15,986,767 | +| **Mean** | 1.0859 | 1.0797 | **1.0783** | | + +## Architecture + +``` +SP8192 tokenizer, 11 physical / 17 virtual layers +512 dim, MLP 4x (2048 hidden), GQA 8Q/4KV, head_dim=64 +Parallel residuals L7+, QK-Gain 5.0, XSA all 11 layers +LeakyReLU(0.5)², skip gates, logit softcap 30 +MuonEq-R (lr=0.022, wd=0.095, momentum=0.97) + AdamW +EMA 0.997, warmdown 66.7%, loop at 35% +SDClip GPTQ int6 (k=12.85) + int8 embed (k=20) + brotli +Score-first TTT: SGD lr=0.01, mom=0.9, 3ep, 32K chunks +Hash embedding: 16384x512, zero-init, trained in TTT +~36M params, ~15.99MB artifact +``` + +## Compliance (Track B — Score-First TTT) + +Per Issue #1017: +- **Condition 1:** Hash key uses prefix tokens only +- **Condition 2:** Full normalized softmax distribution +- **Condition 3:** Each chunk scored under no_grad() before TTT update +- **Condition 4:** Single left-to-right pass, no rescoring + +No SLOT, no pre-quant TTT, no n-gram caches, no Tap-In. 
+ +## Reproduction + +```bash +pip install brotli sentencepiece +MATCHED_FINEWEB_REPO_ID=kevclark/parameter-golf python3 data/cached_challenge_fineweb.py --variant sp8192 +SEED=1337 TTT_ENABLED=1 HASH_EMBED_ENABLED=1 TTT_LR=0.01 MUON_MOMENTUM=0.97 \ + torchrun --standalone --nproc_per_node=8 train_gpt.py +``` + +## Credits + +PR #1420 @abaybektursun (triple loop + fused kernels), PR #1394 @clarkkev (SP8192 + SDClip), PR #1471 @X-Abhishek-X (3-layer recurrence), PR #1477 @aryanbhosale (parallel residuals + score-first TTT), PR #1460 @resouer (eval-time hash embedding), PR #399 @abaybektursun (parameter banking concept), PR #1514 @dexhunter (Muon 0.97) diff --git a/records/track_10min_16mb/2026-04-12_SP8192_LegalSOTA_Clean/submission.json b/records/track_10min_16mb/2026-04-12_SP8192_LegalSOTA_Clean/submission.json new file mode 100644 index 0000000000..93725b9b89 --- /dev/null +++ b/records/track_10min_16mb/2026-04-12_SP8192_LegalSOTA_Clean/submission.json @@ -0,0 +1 @@ +{"author":"EthanYangTW","github_id":"EthanYangTW","name":"SP8192 + Triple Recurrence + Banking + Fused MLP + Muon 0.97 + Score-First TTT + Hash Embedding","date":"2026-04-12","track":"10min_16mb","val_bpb":1.07833,"val_bpb_std":0.00037,"seeds":[1337,42,2024],"seed_results":{"1337":{"val_bpb":1.07817,"artifact_bytes":15986623},"42":{"val_bpb":1.07807,"artifact_bytes":15983529},"2024":{"val_bpb":1.07876,"artifact_bytes":15986767}},"hardware":"8xH100 80GB SXM","pytorch_version":"2.9.1+cu128","technique_summary":"SP8192 + Triple Depth Recurrence (3,4,5 x3, 17 virtual) + Parameter Banking + Fused MLP Triton TMA + CUTLASS EVT + Muon 0.97 + Parallel Residuals (L7+) + QK-Gain 5.0 + Score-First TTT (3ep SGD lr=0.01) + Eval-Time Hash Embedding + SDClip GPTQ int6 + Brotli"} diff --git a/records/track_10min_16mb/2026-04-12_SP8192_LegalSOTA_Clean/train_gpt.py b/records/track_10min_16mb/2026-04-12_SP8192_LegalSOTA_Clean/train_gpt.py new file mode 100644 index 0000000000..e20acce09d --- /dev/null +++ 
b/records/track_10min_16mb/2026-04-12_SP8192_LegalSOTA_Clean/train_gpt.py @@ -0,0 +1,5 @@ +import lzma as L,base64 as B,linecache as C +S=L.decompress(B.b85decode('{Wp48S^xk9=GL@E0stWa8~^|S5YJf5;P<0H#a#e2n@VT6Qap3bt~@<3h>ok~)Km^%m4{rOaiQa2(_3zS%kEP3*8F9$Y#f-ZSHsOI0J>T;4{pdFFBjnfYv-o7{^Aj5A;P`z&C6&<=lqU+_4@b=R3%dnFVxv`=Mxc$Xs5zzo9Ric6{F|ATO1RDSva^{lbrs1l?`BLg7X=K(Q;6QVBe>>XcJ1BumAA2c%17A9=Ndxtz>P;Qj8GzvXz~UFKA&VZOufyRgkT(@mOE^`mG_Q^|@KlW_VERX8baH?N3EM-_&P%L_EFTm43EHf-VQXAZ>}m{0HbiJdo*$GnEANkBhMepqyPcx3Fe`cguZa~?mFyAO7xx@pyp9u#SgbW+7#3_|0HHzND*Q91f>^;{89_J{Mn(ifsT;O=nQfn#Ij{*MK%J|f`^>lsxUub!E!Sa7u`{MQ6F9EpM`M3LtLPR@0)h4vzK5MC|1-JXholX8)ey)dQyK9Qb%sbNCH*vNp0k<6<%6l+hRO0vpF^o+^Y#^WM9+l@#vx0FlzSmKyQ#D?TgIPQ3$mo~ESH~#){o=pZ>bx|DpULrF#8zQk&^auf9^{$UlamN3-&TUdJlF%9!C7`Nwr+MCh4tYsw(vrIp6F_fi?;BNr?Yjf9NFRsW@;Uw#>_q%@dXd)o8ZOKhsZqb@w3rbC(;tlItqrqozqGJ43StQ$S=7bGQ}@c?@-$H5}FVv__HFkSqWth_C&p>@S1ZdEnw#`Ivzv>R)$n%JiuE>_KhkJSnCNR9vU7cn1dmYI&yutlAMSAB?;wui}O*PPW~e>3lp#V7QU&$y-;gUOonM89&$WEQzh&mCb@i68QFyPK^{!LRHQ+FrD!`F{DXZ=m0t~iTHET-~L9XBJ*;X)3s}?=ekXQ0hmCMvAC*DJ2e96dnU$g`wC8hd3Q>?8f%UHBIu-kvYkhSO~w#I+(8+2*XwhsSs-c68aQ7Uz&{8$(tdt(TZ@eL!K@-B7CtjeSzA&fduE#HG%Z{57?UQAKy;vtESieFl$AUj2%TK9OtPQzB=(?)7_)y?;M{Oc4wQh?Facnd2$bLeQ?+B6N@M7ZII^8KXd{L*~gUX+qhJ@&~h@Bp*MCd;qK$AMj=gIaX@IMJ%uB&HG~x1fJ#qJ8%th&V;g6du(OHDF)V6YWE1wK1!Y6(xE(rH*fw$n34{~iEqIO3;&5*AZfo*|XS4;rSly}J>Uxm|M;dSWg!Fn(jbE%pbU97gNBCEYM!Ae1ql_~Ss(rQIyulSyL9!tHPz`$i$-V;+5jAvMb_IOcRnNe#bDQ9I6*;UDLUEqvcHud1l5EJ6<<7)PL84F>(+=B5ge{T?!Rw~TG@o##;!Hx=I&_#PIl%jD>mA@vT&H(lTr($YOQZYr77wwuN)tsqwxhj%0Pe;|cgFcKsc-bO_>gjEb#8r`F$i}#E)lN-rE<#^3rkarA|jyZN<_|zJ4;|FcsW>*b_Lr6rwRR6>?9h@(PwP#p3<6_aHLz1aH>UK1p1s&?L<4~f%6h-&5h?!t6=k(zHFF9WoA{g-_~#RLeG$}x$1ehsd+yFxR(8QKb>n-?4NIoQaaJu?p7cl<}I84VNUMqPe@b_?p|*SdRtutwvfLRCc|Ogj333!MxzE19;HY}HLSyFN?4ut$$ejjbHx9>_CEe^3*cqK1qg@AuPSgDi`!`t=E>F$&JP5$iyBA6YV-+|R%Y!K5wtCNO_F2ftgL2)-~G-Xs>V3dhXNCZ$+EI5_Oe&fI2(~zD9)G9|Ap=|(a>KbtvOyN84M0k?)NxRdnWvSrrjRbbTd3EA3m}^Vk?mp>z@;}mTmeVkf=(tIr8nJT?`6_<1H`!w|(Q`a_Dr|-wfn<^$Gl-teh1#
n6sLVI6)M__gl>EJMeV*9(E4TPEx;yZ;PtwnFC+y~4Ed~%LW}_J^shH5B%)bR+ww9*a#hot+gwA51q^lTWSthj=Q-d6l`&RO!XKbSa9UFyokF^ZUtx{P1))C;wFL38D)48%r!tYiR$nfsq*MWr|GvDHkOWjpgxem1M_uyjjvbvp-h((5qRRg~uFdF60OI*x=T*lObkX#1-fEfq=rqF+Gx`5&uLO!z+xoywM$q;;xFEH+{wj`#n=XF^L8$#;TO#z6{nxRom53Mus2Bk`+741_eKa>;+lX-p!QKA0F@y{xt$k33u>Y^f~qS0;|WIA!6JfpkY%g1UBqoV}5(d#q>9>w~0J67vVzN14-;Te!PjYiK##`mEkEGYf1dFNY+%s)Ha|p&3Cknhr=_a}Vg|i*T0yyS}EnvgBiB9w@^FswnRmXh>SjKWwA_8pGN4SDrfYU!O)BJ!ymfrCLFrN^Q=)RuEJ%F@}VVAQ%eIhByG`edH?DzcFfX%%9x9Ag@mZD+c)!X`09Y{N7#Rekgl^sNA@vbiS`dPZ!nGqT4F+6%^P~@0Q1c%Tlw|zOS&(&xI8k6Y3zk!$f#;r!Q8V=#=#b*^*n_qL^meig)Y=X6-7eaFj!^}P6Z8tep#64}3vMC!+>LZfewV|cu(EJ(8PcKx4-yrlmYqt;XxI-S0z-aL^7E#9gs`%F*$SBL2WX;j9e#bR!7iNi@!-S^<~et<MB$Vkmud`~;ZQSecg5c3T-op!)3-D1HYbOg9;yrD#HR7zFQJzdxD#e|MRG(H3Luy3+me_)|iUd!&8A2(M#lNW|%Ep^NpKG>;IRFcQ!)DXBHwUX_y|GER&v&i@SDw_HbltO;#an*nlqgeQX8B!k$sg{UVb9jInX|#nmP@XdDzTQoz`+KfR#40eh^iNQE|Kz|A^BOCuCu@8r??Q|66gseVtC=SDoS7s;ckM>|u9I+=O4)*3;+_pib2fF5xv78mEiCUE8gg1YrxxwzGIeZRhcWkr~tXoLO<@=vv^9)64+hSf-(+a7R>)85fN@aCe12!z3WhDQ}dw|v4AQ_tBXeN+P(L&kLkpVNrw5X6H8~PvZDF?J}?<4J5klC0mWzFb_))DjxkZ$UmE=RTBE@LZIGO*{nL_z`j2znt3n2?@kGt^W(ppnAF*@ira#*Dn!qnPd{`8w-EBGZRck&HN?Ei0h4QX)=m^z)CE@%|PzB{*wxIVm6EIWsP3dnHE}s&^=;;0;{!;aHS;9`}`VbyBW?A7nyh3`A9snN-O-z`ys*>$gNojA^GkABY(tkQMX*=eG-HZ7iI>K5tb$UwwpmUsilB(&pG)p^ht?%5F5GQCukHtE&Ee1L>q#?nyOpcYz&HeYd8JljiFQi2G(}B2%onY#VEvI|P{brF;ZqvpfC?5(C6SkOYYrh9@@WK0!-X(_8}Uggow^qm?8Ublv1J#)|>E0RS>oZT!s)5FdWt`5v8lX$y!3@agL{0}NyoB4qWRF<3H-eO#i)Z*9Si6l(OVTAy-o5cOuk95CHcKME5`SxONNK-R2o!pL@-n)5J{Tv-U@(eY(hLNE5bkkVVU$sUBK^R+)o}$w@>b?#n44RXFvLcCae<_8KuvQzLHwe5O1;iXf1)l5joSrC%j62n6Ez4IuJ5YtiQ>sSn8TE|Oj9$l<^qnfbJvgfoKeC3h4EmJZ0%Q$HZ@`?0n_`2kG5A|XX=)b(oo0kk9jq6}Og)3ClUG&Zd57Tg1c}nzxYzI@>A1743Hrr7md?%P!PaCb2^WjPvskCv^%>5mHzrgl$ti(8%T~YMfX6jTTyCt@aXU|v!aS@9B%`FJ$%9LXqa|c>8{5YG`!p%;XYEs?J{P}UgF^AhOA&kc7XL%RDfQmT6XqAhobvsA{F?WSpLL}8MruX_Ta-W4#grjbenj3^y3;$}A0gu+@qUG-@>yGlYu)vqSnHFToUB131FMN=(!fr9gn{k|zGe&m(M9~J6r&i%hQ_$u#2znP(^kQ|N+mul<8_VVXMh+}Ku*HJjMEves<1Q!8Kf;)ZKhv)#mS
{`|$^`Yqe05MFD@`Ag4*zijplNw(#0S&ht510q!cPV>HOb?29hsOlc~Fr>XWC};ef?F(4s2-Es`^-lpc>%58*$|>`3OP{)vk?BZB<{8SMmrDiZgOBr+=Qp&F7&_4_GO8*&*?{g62@$ty3(X2w4V)@OR?TN`5rvm{0NZ9>qSIDR--Fw_@m?>rQwk}X=OjKTw~i8Ow}ILwG=LQHvm<%IGRLJi3NmcS$Ri1dS*KlgC%Av?{%o;E+?uEb;hxr&*5nwI&`_1UXM87#p94PlJwM0Wf3sKcNTvxEqNbns~5Ac~oCn)^Q_vSC*+Bby57wwm4xd-8tX1;_(Rcd&C0+5AdQ`P32$zz8zqn7T7H*9U{~Xwj@-R*Jm|=3XB1s03dq7xWd1`CqblaczX^hd{fOtT2O-C(#jcLFiT!bP}6|Z6%Bb2%|+znm71ula_>>T=Hv?cpu2Q(%=6*&%i9lN|5cui1F827I@QmEwlTyWNs@yRS+_b#h#$L42e&!M%6hInT`;{mNIjygKh(ir)l>K=V>+~NJSM9yB0RX8#Qrb?$NS-k`j$X+S^q-gUKU-Vbl_4m`D%Rz%{m=pk$vu%E*EuHXpvxh^n#Wq-6cf!0{m|_76zy_=nnRAP6DU%!Fe0td%l(EnNJbR^4=$``1~gs>5#Hv^&==uX+Z;FJ!pQAu0j&oIo+P)Y?fMkxGOxg@xw_CXzg^iAopem8rQi9Xn${2B$qQO9rv4d9w2#iQ?IT?-0<_+CO7M|8eBe);Dk2|8er49g-xBjCJNz^jJ2fz>NDiZ)CW&=oY2+MaZ%?fPyir7uh`{A|CcIFR(l!9h-WMuP3(X|W#GscwCyRmwYkHP3I8DUv#asI7Ygr9=QQ`}l9F`#w%hp_a$c1c-UsdVWGc}7Kf^4nOAAH3=W9hEX9SL4^okP8-u~w7z3W2)u~s~vU3s*wff>84>Zzo=)Z#efiB09q}?zjVQM(h`x)MDgqovkkcC8qd`Bx=hGMe<6Up4aX0`pk@f`INeF47z4Q-`uC}@%K2H6e0fZt?u+)Z%^kmpH24H)}j4m77#o`bceWnS1k6R`s&UG>T~(7ZEZ25-ina2tOPbFsjBaIJXbz5C+9(x*LKb;+RD)t)buocoo!=;*z6{VkSPy!VV965F&x)uN%5d+w`ShlE3F_;QB28e}d2w{qW4}m}%~^#*s+}SseO1;)U>a08kPPC8)980qs&~jb;A#3_@W4s5n|7L@0h&*vQKG_6_PzZlUkw335#*29$y*jXw3hYjE_p>!l@+U?v*K8*0&uFZWY9eu7KZ73>-|e-6opk_DE;u;MFiLrH{rfQMpP%RwR)S}HVTRo&lv{zClVxie6`=bUk6wZt@cAtdPjZ?2atNj4}v$}w%fjB1{`c}PghUHW}W#{`S;U};mwV9@Z_Z$3~DPt%4u`VWM)-G_w?a{KEercf|hm##kLkdb=G%lKOc{?7l%wE}jFh!#I-JBSt$oO36g0xSzNgJ3S&KKL?1&?ehv#I+`78J)S#VsmgGWdZ1ZYr$>E_nClY7L;pwUORczXUh<8HrOssloA@<$A@o2XdcNQ@ggY6tiwk60_{KN-NpLy<2U-Hwo)egz698+{@etGT6Us`8255_$z^&lruu&A17&{5T^S9J#=V_LIFntW+x4EY)^xAf~M6ds8L;~pn-IwutE2*c|fN!0nx{j2{!xC0$#s%7qoGXcZ689r&OX}i!dHu9-og}ZN*nd5ollvk5}_(whSG2%AoAOk4rO<)Vw>a--8&o6I^$UCxNw)a9w|q?ghlyoRCt=YF^U;3Wa@W?ZWzLDZm4%cPWXB)$n=7{ltRGyyQC}=_I#Sn`#SjVnqOS9Pnbk{8T=<`d1aXy5C(ob6y1L>dIhb^d!z#maPNYTc1*qJk%e<$gudnf)GcBCS_ehgEd@rU+YSSORQ9vK>RgWW{<$Ci(Q3=GVE_)vQwJU3Y_p_4|X4`>R7SeoENj~Czj*yzd3^<)gC%GwK=nOCT+V1(CN}zP(@Fcv~Ayk!Gdh8$Iz_mRa;
?!(L4PKAw-qYwyl5kq;0gsx9-PPC}hV6b^(RG7IcWEU06sS&CP`|34g8{I4IpG~D5K+@eX9ti8X(DTfeJR2-+*?Y-y+uH2}IQzHW@wTh^)6>Gq7y&)g2i_f<5idU!J@JR*BR5X0A`W#oNsB-tqW#0JXdk59(hZD3ArgJGK2Zx-?V(DdJx*dZARqV+GNGAmKKiPkkq?OKxS1#VQEPhoDu2hro57Bu>6jx&3A}>b)W#m6c0iIIZCEEhDtKf)fH^2OjPWE(uh;+I>f}nj&*8YUN$;dKg8H^7x>g-n|nfii>tq}TxuGM??p#i_UTg2+}%WrRTZGt?#i1eXyi)G)=}z;`9;axkP9kp&?ocrY3H(|H~SNHnt3=g}(}Q%9m6xe%tfvpTS9u8sxsYI<7cdX}*AKV`HbCM%qw>ES=huXgdOB*ew^G2pu3VV66WUy40h>j$6rNUOz<&Bq+qiDTX9`RkH#{eECeO~!M|P00Bl=7b+ARM%}}U>$kQI7NK?W@~zc<%5}13mHmwiA|(%JXQ7f~4Y(qg_CLd{UM((M0)bu>{>syc{^5<|W!cR;U+Ibf2?UZvtaZyq(4`K@565hAX3r*-VBiW7$M9+4dv#Z8%F4$V1GoMh5S?$<$AF!3XdT{c8DUY_k59nBk3&3K`dM8om_mNVo>%hgPmq<%o?5(_>^KG&@v?Se|2*mTQ&N53Hxn2dJsbr5dj6A-f_+*nQI=c0Mdy&e8~j$5EPKKF+riw1CW4#hiLSUe!RCKgDXOV&H0}fw)eF@g)ROU{M}jUG2-CfQj5qLY>9E3)=9!Z(j%<}#S>J6rd~x-XF17^m@Gx;FVK2Usv{*RXl#>Ik9gYxYxye-A-tJKMVstLZ=L4ki{tz0S3AgZnG@=D}e=iM-d9Py*moF0E2mNgi)Jk|ZsL7SFcNLU)qi>RDANTddDqv(fr};cgGsxOL2@N4MK6n@k+g@YfS)AsM)EvxJ$q&Z(m@ElOVS}63QAQC0+@ZI6`J|uVTvB+Dut)RfcUrm(VXAiSM+GN~S8R{)1b;OdpG2*^sgI`dxXe-5RcSwk9eq0>otrH?Q|v$4p8Eup?6u@QYO32xW!sjr{eFA;F!l2Bl1F)>n(_((L9sxz=wNn|0oYT1K>LsZ8(q(wKmKp*hAs$90D#6>?f)TivJoN+fyEd~vNMe<83s1`IZYAKN-7;zY@inJH1I*zNxSJvvxr8WIgIn@C*U?;yEFzq(lY)K;WE<3PXkGizd4En(yXXuYmC59e?fq_*E)#4UeUlhbWmk$Jq3w$Ubv*{G90)2^TKsLX)xr#}?E;)F<}%AIKq(`xC!2Tr8o#oi-81iQ$sTdj7+?2GTw0zJy+N24bZxOb+-YD8ECF=gOhguH>=mvR=2mLpJFZt+y{T^zvMxftnyEdN5Q;i(Ffj{v5O7N4hl8%u4THlv1f4nmnmg*Kvj;i+4xdzi(H0PxrCb&?k>GbE!KGR^C|z0Qlp*Kd9)s&5yl(=EVk>Mbw|LAKrAO0X9y&h&*SZFpYHSW=xungNek3LZ?V(W_%*eNviEikOpJs3$5&~ylWt-_m)Y=J{fZEr#RSrSuvjY06~=iI1}-cPVgD1MGJDx~IYKTh`@;ZE(o%faa-IRDg8U7Ki1^c$38j9H<4e6;_tv)a2qE~dAaV$OZjnwuyQX*jCIN$|ldRoC2;sd`G2RE5UZ-~I&4Yyg$NVHRye2*;WCEHON}_p1+SkGT2gxIJgTUK$o>Ru$h3`=s-vN2S>*RTX0NgWUznT|o^fEbnYpTy|N!UqM!yW#`S^Jsy8%u&>S|_lVeAQ<26**K%2GO}RkmIiXxuMPCiYV%)C+5Od!y0Ir|ttLByQ^WjeavLzy1IP#+EH|4463p{~pw2o6`?>8BfxgatEkm&J@83fH8r!wS8$pA8`W6=wv&I>{vJ#JmJrc`g_In^_n6y4l_m(m@lOQL&r-?m3-y$StixtXe3;>9{*5g3r&E4W*=wn!Zkf)L_!9EoQP{=rT%pKZJcAp>?=W#+Wq{JvN(Jm1bG
*UiZ!qNZO{o&f_g_L2d+_kH4jM@iW~91I5lbx^nXYxhA*kYVG5}msrgBmMzlU*+cwbV9E}q(OG}eKUiHo_I_(Al8>A|_+HF6@IK>$n1cqRl01uAIrW{MtNaEmv!_>O@+oG6wTHr+zCSVLIy4Yb2E>F8Fg>M0%x?tTPJkgX2rym&mr+BqZdwe?i;E;2|1SnB9T3NJ&gsTtKMFt%#yr!_s%e(*b#@wr;?v;@{vXC(;+nB7cl0%yfpc99=U1rw+wg6d@8)!2UH{9?$x1@e<7Oysdppq>iZ}$tZ906C@X*=hG0Us^#xA8|FQwyP^!oFX!k!Gcw4Vhlgh@J{8QDNr`5uD5T0vwiy6Di5-~0Wi-(y;0ga!H2B6(+cHgwO{p6oI|69}UdRpCOfs&yueH4~uLE8wW;rB_Mv2;LsA{2+Z$qpg%47o@+shJ%R^B&QHpo>`XeOhXRI8xi4)Eb*eJAM2DD>wRJH8y^p+=(*Cqlo*{gLXSSJ6p)2l@@|J!%ZA(f>tUo-;8R*mLb5>-WA#gj1eXx^kk7sBAe#}rpxo=G^et|@x!tH-`zP|Pd8|%Zhtll#Xin62Zo8T3t}t=}!;A|bKu5J$E;Urs$?cTXv1}3zJ?JO=q>kh61+Q?)bnAhc3Uz-pn=aZ@9t-L8DAaz?8Lxwmr>}pCrl(1?SQ}s$-7L$bg?{;&G2@4qnjrp~s|23sZiUt1Kyc-g*|J>O9OvsPR8gYY8}HzszmT}z9z%^+SPBl9hA`X>pD7&NrD=fiXLW)zU4&)_5IpJ(n8ThB!67O~bkg4#jG*ST`P5H*1c1c&lNp<97v79VXYu!k@HjPd6I2WWqX^_3tLFpvGr+*p{mkV9$;p!pSlH&aEn*)t8I4M$K7sx?YAenDP8ybW?qH!2kMYLW=R5p$76Qp21n@5vfIj5?V`3i3nJu2llV+H`{G-o;+nk=jN`i8mGiBDU@p_a^?{7r#yg5bc=5~XUo-nIoZ`eNI<*h6T%q~U2{1_L4#6vilLUhW#;-but&`-I%w@F*-8kNIFiuuOnAkN~@6dbYR`I2S25(@uSrmua!?;qi;wJY@^X$PgGDDnkg8`PTZdDm0U9@r<0Y4AU7Xkw2*os;R>`oqIx0O3M(rKNk)x|q80g4)P$DnCQ5sK3BLEd;*um+V+TAexLY&iXJyTwr71fm*VV9*CmHqh`%9SPg)nB9SW2csiEvA!d-3k33qXpu=KTFoB;IdMoYWMBJFDdA%!*4lTKtJG!4f<*D|&f|2%v07RFC9DS&QpjF>|5N0A=7@r>9yco+_jjx(!%Oby|>FbMci~E>ecn@mh%7NnVTOZ7hN7%s!u8Wh{CPy#|)S;IbfC?UTS4^-SbJDiTu@}@Xo^?f+d=9>%lBy+`UKJ;w)B~PNY4j&U{~WQ36e>NXB;ss@RIA~<{ZV$o*1bv$&gIdgo)k*>3;V)++b?H`fnET}1L)7xkdY&M6ZYDt2*4boTm-Qlc|@wV)*^zK2U4VissCs=Ry}n*V*to&0PaLxzc`Qaj)XM(-0ZbJK{BQ{>$?K$f%$paP2rNuAXk!RZXqOWL2WU-fKjd)i62JiUpm^Uav|4I)U=W4L?Ll?AWS`C99(wI+wc4m0AV20#2J5du@3b6=7(D#2qj^kssR-iqR)+Umv%ME=T0qC(-*v%)5E)gDis(R^Og<6pu!OpA!BwhP{%Z;TOaL}z={+wB(pKj5@noYD5DL6(HM0i*#@ben{h37vN|UWx}+GSd>M%#j5VQ6xf43?~IXb}v5VLD)G7AUZyRFONTSV_;4WG40JAz&)M>7;lP$6QR@f^TkMA!3(-(e{%E+2Ph2dIVQ5gg#ly7tS~=HBwB`Q*>E}J?%X}i4s`>vd}J&bOSpPlhhzUMUyr5Z+t9Ob10mzMwj(rSF>k&1+fL-?l~nXRP^-YPS_cX)(62OJ>mU=A0o_g=txY+6wT^ZJV|F!&P7%Gn>vFpRu~-TgXhJ(N_O2bw^G#Z>EnH};iZU5y)x0Uf7Djx09ArjP$TUiZuUT1uN~sGtJ
IzA3IFz(c-qs(?KnHXl}u4?F{pn;gngc6B%zZy+egP&2BWheawgwT=pI*Pr$_P#$g)^{jjV-1l*$(sYkm-vSx@4<_hV|Bq-h{ng^P&ybese?HPtYtpb$9Om480%K_$O$-{|Jq(?>Q$;p0N!cS<(`LGB%SCzheD~BirS)c021`W}GVO$37VL)D(T^6_g)k?2LMY)bB0fzz9`fPlbw=+p(^{7D>)L(C)%Rf{6GIxoU$waeI@s3(D3qgN)xHfaN{RXz?=-Re}9D&|gt7+9=>1pjsRQu|c@~su3)T%_7b{HqNCE6HmKs!zC)t6Q6d)pr8F|>@z6)a&7NEfo+(3){aX<7s|`Ym=chnkN7dko!~tjmg#dQ0dU`8>3m`1a3Iaa1~;;*lCTWGs{g=qlGBmvS9ZSbc4%;OS>0ZEmI>n^ot9uDI`sQhvXe$#cFB)$dgV{kL~Z6K2=Ti&XTe@lBIfCXi+S*;U?oESa)PlAiBcs*|${>US^ybRound3fj|BpChj!B+RMmx#UHT%Ks^nRQd1Y>TM@#>+Ey^*W=X?9x@8;6-=VbbPrlCC1;NKHgPKeBt#Uw1pb627Sh)S5IrE#Er3au!RZ2O1*im;s0H0#lRE^`o0uAEsV66X5JQjkgPy-V)tMZN{hx=){5qY_%h^yVC=);ED&9Fo)V^LnSO$3+>WcW$$`F@f*%QPCxR?hyO=7mi8pc>u{V9-f1r_K;`I5TvpDEcu6q4VUmxbFOlsDl%n@t1LB&qaey=~G6yz#u{2YKp+8rGC?$R_}@**|pRKtHqRsxL3b)B8S&hB(aB*AWDQA3bqgJEg3Yl#QS2+f)VY=d66KscO=Ac3W@ZpxhcgpJEzlEtnQ3y1eQasXZToqn&@b0+GZ=!$5vnJwT*;3{6KRV=kzfBo|;*CIeM5xK^F*|IjMEbd~^|K$>K3dM+0ydse7ic%&^u;ye0boaFtj?P4T8nA*Up0Pj_oBPki*tg)e3>hFR|lH|NEL-_M9;-Pq>%p?sXF3SAi8)<=JN8yb;HI+G8)ORxaX=xsJ(sD*ai;O}Ws2yYR1@89ArtETT~tAX}6U#Kg|l^yu39J$kJifwQ^4nR>;sBPVSXu1Hd+ux`>BLTByq+(FwD$%N_oe()K-_70~ARClHhM*H9pXxuw_=Zo1wmU1@WyLO9ew3?TE9ZQe8JpBzAakXzR(F60iPZTdQZWS?LnY<8GoBdw4ZcK}c4_ydIeO2e_%(%)#U{{=_K`3(HLWqe}lhehYI}mqprOpGOCX@WgLXQB(e$Z9D5SPAIkRiS#F1O!#}B99VxwfuBj-l2UetzVL%B)?|Ql1Z9kMVMNR_&}o@l*p!?_75uINk}YUsIxs=T>8_Qdne5sd4{CjMJl)&z>@6IzG!g5^h<7p@e$fL#B0YSY+Hqz=#ad|9@j`BhdS3~<(Ww+&3p>XD;G7hR-LQAdtF88`Y;u}ypaZ@qDB+&rJcjj(f*XVc1PAZ(6;Q_Wjk!gDL~NM4S7JRiV*GT9we+-%NA40Nvzy{=`Q3C&LBS6ttj=yjW_L&Om>|fx>KfRdh_0KEZ;DX-|(g#k+2A6N>Aw8$x(K8buICPjyib>zL(Zjk_O|GO5ttBHUhE+0*ID8=mdNd4jOU1_7D#jd}%;O>!}Th+AzAPhtY8o{xOMuvj+RtzQsB*v|!4XWup85yS&(r0_o{WCWg9${pdwd_}r3jphpuP6UY~-6n3f&9H9$ki&vFp^}7e(I`yt#QHq-nRV+jZ;-eQIQK)OL;hakIC42k=zOvLNh9UbdJqSePJp6tRn}jRPMFq|c)%OxJ=ZXBIy;WkS3#n(DUEl|ERu=OAq`n82-I{QhEJCHy_7I~#ta|rAF~LP#Nl~D@dEUdXoG}Or5Ypq8uv{zngHhx`#YXWoFABe7!H} Tensor: + a, b, c = 3.4445, -4.7750, 2.0315 + X = G.bfloat16() + X /= X.norm() + eps + tr = G.size(0) > G.size(1) + if 
tr: X = X.T + for _ in range(steps): + A = X @ X.T + X = a * X + (b * A + c * A @ A) @ X + return X.T if tr else X + +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr, momentum, wd=0.0): + super().__init__(params, dict(lr=lr, momentum=momentum, wd=wd)) + @torch.no_grad() + def step(self): + ws = dist.get_world_size() if dist.is_initialized() else 1 + rk = dist.get_rank() if dist.is_initialized() else 0 + for group in self.param_groups: + ps = group["params"] + if not ps: continue + total = sum(p.numel() for p in ps) + flat = torch.zeros(total, device=ps[0].device, dtype=torch.bfloat16) + cur = 0 + for i, p in enumerate(ps): + if i % ws == rk and p.grad is not None: + g = p.grad + s = self.state[p] + if "buf" not in s: s["buf"] = torch.zeros_like(g) + s["buf"].mul_(group["momentum"]).add_(g) + u = g.add(s["buf"], alpha=group["momentum"]) + u = ns5(u) * max(1, u.size(0)/u.size(1))**0.5 + flat[cur:cur+p.numel()] = u.reshape(-1) + cur += p.numel() + if ws > 1: dist.all_reduce(flat) + cur = 0 + for p in ps: + u = flat[cur:cur+p.numel()].view_as(p).to(p.dtype) + if group["wd"] > 0: p.data.mul_(1 - group["lr"] * group["wd"]) + p.add_(u, alpha=-group["lr"]) + cur += p.numel() + +# ── Data ───────────────────────────────────────────── +def load_shard(f): + h = np.fromfile(f, dtype=" 0: + avail = self.tok.numel() - self.pos + if avail <= 0: + self.fi = (self.fi+1) % len(self.files) + self.tok = load_shard(self.files[self.fi]); self.pos = 0; continue + k = min(rem, avail) + chunks.append(self.tok[self.pos:self.pos+k]); self.pos += k; rem -= k + return chunks[0] if len(chunks)==1 else torch.cat(chunks) + +class DataLoader: + def __init__(self, pat, rank, ws, dev): + self.rank=rank; self.ws=ws; self.dev=dev; self.stream=TokenStream(pat) + def next(self, gtok, sl, gas): + lt = gtok // (self.ws * gas) + 1 + chunk = self.stream.take(lt * self.ws) + s = self.rank * lt + local = chunk[s:s+lt].to(torch.int64) + x, y = local[:-1].reshape(-1, sl), 
local[1:].reshape(-1, sl) + return x.to(self.dev, non_blocking=True), y.to(self.dev, non_blocking=True) + +def load_val(pat, sl): + files = sorted(glob.glob(pat)) + assert files + tok = torch.cat([load_shard(f) for f in files]) + u = ((tok.numel()-1)//sl)*sl + return tok[:u+1] + +# ── BPB metrics ────────────────────────────────────── +def build_sp_luts(sp, vs, dev): + svs = int(sp.vocab_size()); ts = max(svs, vs) + bb = np.zeros(ts, dtype=np.int16) + hs = np.zeros(ts, dtype=np.bool_) + ib = np.ones(ts, dtype=np.bool_) + for i in range(svs): + if sp.is_control(i) or sp.is_unknown(i) or sp.is_unused(i): continue + ib[i] = False + if sp.is_byte(i): bb[i]=1; continue + p = sp.id_to_piece(i) + if p.startswith("\u2581"): hs[i]=True; p=p[1:] + bb[i] = len(p.encode("utf-8")) + return (torch.tensor(bb, dtype=torch.int16, device=dev), + torch.tensor(hs, dtype=torch.bool, device=dev), + torch.tensor(ib, dtype=torch.bool, device=dev)) + +# ── Model (TinyLoop-compatible) ────────────────────── +# Weight names: embed.weight, pre.{i}.{attn_qkv,attn_out,mlp_gate,mlp_up,mlp_down}.weight +# pre.{i}.ln{1,2}.{weight,bias}, loop.*, ln_out.{weight,bias}, head.weight + +class Block(nn.Module): + """SwiGLU transformer block with LayerNorm+bias, fused QKV, MHA.""" + def __init__(self, dim, n_heads, ffn_dim): + super().__init__() + self.dim = dim; self.n_heads = n_heads; self.hd = dim // n_heads + self.ln1 = nn.LayerNorm(dim) + self.attn_qkv = nn.Linear(dim, 3*dim, bias=False) + self.attn_out = nn.Linear(dim, dim, bias=False) + self.ln2 = nn.LayerNorm(dim) + self.mlp_gate = nn.Linear(dim, ffn_dim, bias=False) + self.mlp_up = nn.Linear(dim, ffn_dim, bias=False) + self.mlp_down = nn.Linear(ffn_dim, dim, bias=False) + + def forward(self, x): + B, T, D = x.shape + h = self.ln1(x) + qkv = self.attn_qkv(h).reshape(B, T, 3, self.n_heads, self.hd) + q, k, v = qkv[:,:,0], qkv[:,:,1], qkv[:,:,2] + y = flash_attn(q, k, v, causal=True).reshape(B, T, D) + x = x + self.attn_out(y) + h = self.ln2(x) + x 
= x + self.mlp_down(F.silu(self.mlp_gate(h)) * self.mlp_up(h)) + return x + +class SharedGPT(nn.Module): + """2 pre + 1 loop × L. TinyLoop-compatible weight names.""" + def __init__(self, h): + super().__init__() + self.h = h + ffn = int(h.dim * h.ffn_mult) + self.embed = nn.Embedding(h.vocab_size, h.dim) + self.pre = nn.ModuleList([Block(h.dim, h.n_heads, ffn) for _ in range(h.n_pre)]) + self.loop = Block(h.dim, h.n_heads, ffn) + self.ln_out = nn.LayerNorm(h.dim) + self.head = nn.Linear(h.dim, h.vocab_size, bias=False) + if h.tie_weights: + self.head.weight = self.embed.weight + self._init() + + def _init(self): + n = self.h.n_pre + 1 + s = 1.0 / math.sqrt(2 * n) + for m in self.modules(): + if isinstance(m, nn.Linear) and m.weight.ndim == 2 and min(m.weight.shape) >= 64: + nn.init.orthogonal_(m.weight) + for b in list(self.pre) + [self.loop]: + b.attn_out.weight.data.mul_(s) + b.mlp_down.weight.data.mul_(s) + + def forward(self, x_ids, y_ids=None, L=None): + if L is None: L = max(self.h.L_min, min(self.h.L_max, int(np.random.poisson(self.h.L_mean)))) + x = self.embed(x_ids) + for b in self.pre: x = b(x) + for _ in range(L): x = self.loop(x) + x = self.ln_out(x) + logits = self.head(x) + if self.h.logit_softcap > 0: + logits = self.h.logit_softcap * torch.tanh(logits / self.h.logit_softcap) + if y_ids is None: return logits + return F.cross_entropy(logits.reshape(-1, self.h.vocab_size).float(), y_ids.reshape(-1)) + +# ── Eval ───────────────────────────────────────────── +def eval_val(h, model, rk, ws, dev, gas, vt, bb, hs, ib): + sl = h.train_seq_len; lbt = h.val_batch_size // (ws*gas) + lbs = lbt // sl; ts = (vt.numel()-1)//sl + ss, se = (ts*rk)//ws, (ts*(rk+1))//ws + ls = torch.zeros((), device=dev, dtype=torch.float64) + tc = torch.zeros((), device=dev, dtype=torch.float64) + bc = torch.zeros((), device=dev, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for bs in range(ss, se, lbs): + be = min(bs+lbs, se) + loc = 
vt[bs*sl:be*sl+1].to(dev, dtype=torch.int64) + x, y = loc[:-1].reshape(-1,sl), loc[1:].reshape(-1,sl) + with torch.autocast("cuda", torch.bfloat16): + loss = model(x, y, L=h.L_eval).detach() + n = float(y.numel()); ls += loss.to(torch.float64)*n; tc += n + tb = bb[y.reshape(-1)].to(torch.int16) + tb += (hs[y.reshape(-1)] & ~ib[x.reshape(-1)]).to(torch.int16) + bc += tb.to(torch.float64).sum() + if dist.is_initialized(): + for t in [ls,tc,bc]: dist.all_reduce(t) + vl = (ls/tc).item(); bpt = vl/math.log(2); tpb = tc.item()/bc.item() + model.train() + return vl, bpt*tpb + +def eval_sliding(h, model, rk, ws, dev, vt, bb, hs, ib, stride, bsz=32): + sl = h.train_seq_len; tot = vt.numel()-1 + wins = [w for w in range(0, tot, stride) if min(w+sl, tot)-w >= 1] + ms, me = (len(wins)*rk)//ws, (len(wins)*(rk+1))//ws + mw = wins[ms:me] + ls = torch.zeros((), device=dev, dtype=torch.float64) + tc = torch.zeros((), device=dev, dtype=torch.float64) + bc = torch.zeros((), device=dev, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for bi in range(0, len(mw), bsz): + bw = mw[bi:bi+bsz]; B = len(bw) + xb = torch.zeros(B, sl, dtype=torch.int64, device=dev) + yb = torch.zeros(B, sl, dtype=torch.int64, device=dev) + wl = [] + for i, w in enumerate(bw): + e = min(w+sl, tot); wn = e-w; wl.append(wn) + c = vt[w:e+1].to(torch.int64, device=dev) + xb[i,:wn] = c[:-1]; yb[i,:wn] = c[1:] + with torch.autocast("cuda", torch.bfloat16): + lg = model(xb, L=h.L_eval) + nll = F.cross_entropy(lg.reshape(-1,lg.size(-1)).float(), yb.reshape(-1), reduction="none").reshape(B, sl) + for i, w in enumerate(bw): + wn = wl[i]; s = 0 if w==0 else max(wn-stride, 0) + ls += nll[i,s:wn].to(torch.float64).sum() + tc += float(wn-s) + t,p = yb[i,s:wn], xb[i,s:wn] + tb = bb[t].to(torch.float64) + tb += (hs[t] & ~ib[p]).to(torch.float64) + bc += tb.sum() + if dist.is_initialized(): + for t in [ls,tc,bc]: dist.all_reduce(t) + vl = (ls/tc).item() + model.train() + return vl, 
vl/math.log(2)*(tc.item()/bc.item()) + +def eval_ttt(h, model, rk, ws, dev, vt, bb, hs, ib, stride, log0=print): + """Score-first TTT. TTT on loop block = TTT on ALL virtual layers.""" + sl = h.train_seq_len; tot = vt.numel()-1; chunk = h.ttt_chunk + wins = [w for w in range(0,tot,stride) if min(w+sl,tot)-w>=stride or w==0] + nc = (tot+chunk-1)//chunk + cw = [[] for _ in range(nc)] + for w in wins: + e = min(w+sl,tot); wn=e-w; s=0 if w==0 else max(wn-stride,0) + ci = min((w+s)//chunk, nc-1); cw[ci].append(w) + log0(f"ttt:start chunks={nc} lr={h.ttt_lr} epochs={h.ttt_epochs}") + ls = torch.zeros((),device=dev,dtype=torch.float64) + tc = torch.zeros((),device=dev,dtype=torch.float64) + bc = torch.zeros((),device=dev,dtype=torch.float64) + # Freeze all, unfreeze loop + ln_out + head + for p in model.parameters(): p.requires_grad_(False) + tp = [] + for p in model.loop.parameters(): p.requires_grad_(True); tp.append(p) + for n,p in model.named_parameters(): + if any(k in n for k in ("ln_out","head","embed")): + p.requires_grad_(True); tp.append(p) + opt = torch.optim.AdamW(tp, lr=h.ttt_lr, weight_decay=0.0) + t0 = time.perf_counter() + for ci in range(nc): + if not cw[ci]: continue + ms,me = (len(cw[ci])*rk)//ws, (len(cw[ci])*(rk+1))//ws + myw = cw[ci][ms:me] + model.eval() + with torch.inference_mode(): + for bi in range(0,len(myw),h.ttt_batch): + bw=myw[bi:bi+h.ttt_batch]; B=len(bw) + xb=torch.zeros(B,sl,dtype=torch.int64,device=dev) + yb=torch.zeros(B,sl,dtype=torch.int64,device=dev); wl=[] + for i,w in enumerate(bw): + e=min(w+sl,tot); wn=e-w; wl.append(wn) + c=vt[w:e+1].to(torch.int64,device=dev) + xb[i,:wn]=c[:-1]; yb[i,:wn]=c[1:] + with torch.autocast("cuda",torch.bfloat16): + lg=model(xb,L=h.L_eval) + nll=F.cross_entropy(lg.reshape(-1,lg.size(-1)).float(),yb.reshape(-1),reduction="none").reshape(B,sl) + for i,w in enumerate(bw): + wn=wl[i]; s=0 if w==0 else max(wn-stride,0) + ls+=nll[i,s:wn].to(torch.float64).sum(); tc+=float(wn-s) + t,p=yb[i,s:wn],xb[i,s:wn] + 
tb=bb[t].to(torch.float64); tb+=(hs[t]&~ib[p]).to(torch.float64); bc+=tb.sum() + if ci < nc-1 and h.ttt_epochs > 0: + cs,ce = ci*chunk, min((ci+1)*chunk, tot) + model.train(); ns=(ce-cs)//sl + if ns > 0: + clr = h.ttt_lr*0.5*(1+math.cos(math.pi*ci/max(nc-1,1))) + for pg in opt.param_groups: pg['lr']=clr + mss,mse = (ns*rk)//ws, (ns*(rk+1))//ws + for _ in range(h.ttt_epochs): + for bs in range(0, mse-mss, h.ttt_batch): + be=min(bs+h.ttt_batch, mse-mss) + st=cs+(mss+bs)*sl; et=cs+(mss+be)*sl+1 + if et>vt.numel(): continue + loc=vt[st:et].to(dev,torch.int64) + x,y=loc[:-1].reshape(-1,sl),loc[1:].reshape(-1,sl) + opt.zero_grad(set_to_none=True) + with torch.autocast("cuda",torch.bfloat16): + loss=model(x,y,L=h.L_eval) + loss.backward() + if ws>1: + for p in tp: + if p.grad is not None: dist.all_reduce(p.grad, op=dist.ReduceOp.AVG) + torch.nn.utils.clip_grad_norm_(tp, 1.0) + opt.step() + if rk==0 and (ci%10==0 or ci==nc-1): + rl=ls.item()/max(tc.item(),1) + rbpb=rl/math.log(2)*(tc.item()/max(bc.item(),1)) if tc.item()>0 else 0 + log0(f" ttt[{ci+1}/{nc}] bpb={rbpb:.6f} t={time.perf_counter()-t0:.0f}s") + if dist.is_initialized(): + for t in [ls,tc,bc]: dist.all_reduce(t) + vl=(ls/tc).item(); vbpb=vl/math.log(2)*(tc.item()/bc.item()) + for p in model.parameters(): p.requires_grad_(True) + log0(f"ttt:done bpb={vbpb:.6f} t={time.perf_counter()-t0:.0f}s") + return vl, vbpb + +# ── Export for TinyLoop ────────────────────────────── +def export_for_tinyloop(model, h, path, log0=print): + """Save checkpoint in format that convert_pytorch.py expects.""" + sd = {k: v.detach().cpu() for k, v in model.state_dict().items()} + # Rename: head.weight might be tied — save embed separately always + if h.tie_weights and "head.weight" not in sd: + sd["head.weight"] = sd["embed.weight"].clone() + torch.save(sd, path) + sz = os.path.getsize(path) + log0(f"Saved TinyLoop-compatible checkpoint: {sz/1e6:.1f}MB") + return sz + +def export_compressed(sd, code, h, log0=print): + """INT2 quantize + 
compress for 16MB artifact submission.""" + def q2(w): + N,K = w.shape; w = w.float() + pad = (4-K%4)%4 + if pad: w = F.pad(w, (0,pad)); K = w.shape[1] + mn = w.min(1,keepdim=True).values + mx = w.max(1,keepdim=True).values + sc = ((mx-mn)/3).clamp(min=1e-8) + q = ((w-mn)/sc).round().clamp(0,3).to(torch.uint8) + q4 = q.reshape(N,K//4,4) + pk = q4[:,:,0]|(q4[:,:,1]<<2)|(q4[:,:,2]<<4)|(q4[:,:,3]<<6) + return pk, sc.squeeze(1).half(), mn.squeeze(1).half() + + result = {} + for name, t in sd.items(): + t = t.detach().cpu() + if t.ndim == 2 and t.numel() > 65536 and t.shape[1] % 4 == 0: + pk, sc, zp = q2(t) + result[name+".packed"] = pk + result[name+".scale"] = sc + result[name+".zero"] = zp + else: + result[name] = t.half() if t.is_floating_point() else t + + buf = io.BytesIO() + torch.save({"w": result, "h": { + "dim": h.dim, "n_heads": h.n_heads, "ffn_dim": int(h.dim*h.ffn_mult), + "vocab_size": h.vocab_size, "n_pre": h.n_pre, "L_eval": h.L_eval, + "tie_weights": h.tie_weights, "factor_dim": h.factor_dim, + }}, buf) + raw = buf.getvalue() + if _COMPRESSOR == "zstd": + blob = zstandard.ZstdCompressor(level=22).compress(raw) + else: + blob = zlib.compress(raw, 9) + code_bytes = len(code.encode("utf-8")) + total = len(blob) + code_bytes + log0(f"Artifact: {len(blob)/1e6:.2f}MB + code {code_bytes/1e3:.0f}KB = {total/1e6:.2f}MB") + log0(f"{'PASS' if total <= 16_000_000 else 'FAIL — OVER 16MB'}") + return blob, total + +# ── Main ───────────────────────────────────────────── +def main(): + code = Path(__file__).read_text() + h = H() + ddp = "RANK" in os.environ + rk = int(os.environ.get("RANK", 0)) + ws = int(os.environ.get("WORLD_SIZE", 1)) + lr_k = int(os.environ.get("LOCAL_RANK", 0)) + gas = 8 // ws + dev = torch.device("cuda", lr_k) + torch.cuda.set_device(dev) + if ddp: dist.init_process_group("nccl", device_id=dev); dist.barrier() + r0 = rk == 0 + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + + logfile = None + if r0: + 
os.makedirs("logs", exist_ok=True) + logfile = f"logs/{h.run_id}.txt" + def log0(msg, con=True): + if not r0: return + if con: print(msg) + if logfile: + with open(logfile, "a") as f: print(msg, file=f) + log0(code, con=False) + + random.seed(h.seed); np.random.seed(h.seed) + torch.manual_seed(h.seed); torch.cuda.manual_seed_all(h.seed) + + sp = spm.SentencePieceProcessor(model_file=h.tokenizer_path) + assert int(sp.vocab_size()) == h.vocab_size, f"Vocab mismatch: {sp.vocab_size()} vs {h.vocab_size}" + vt = load_val(h.val_files, h.train_seq_len) + bb, hs, ib = build_sp_luts(sp, h.vocab_size, dev) + dl = DataLoader(h.train_files, rk, ws, dev) + + model = SharedGPT(h).to(dev).bfloat16() + # Keep linear weights in fp32 for optimizer quality + for m in model.modules(): + if isinstance(m, nn.Linear): m.float() + with torch.no_grad(): + for n, p in model.named_parameters(): + if p.ndim < 2 and p.dtype != torch.float32: p.data = p.data.float() + + np_total = sum(p.numel() for p in model.parameters()) + ffn = int(h.dim * h.ffn_mult) + log0(f"=== TinyLoop Weight-Shared GPT ===") + log0(f"dim={h.dim} heads={h.n_heads} ffn={ffn} pre={h.n_pre} L~Poi({h.L_mean}) L_eval={h.L_eval}") + log0(f"params={np_total/1e6:.1f}M vocab={h.vocab_size} tie={h.tie_weights}") + log0(f"est_int2_artifact={np_total*0.25*0.85/1e6:.1f}MB") + log0(f"ws={ws} gas={gas} FA3={'Y' if _HAS_FA3 else 'N'}") + + if ddp: + from torch.nn.parallel import DistributedDataParallel as DDP + model = DDP(model, device_ids=[lr_k]) + base = model.module if ddp else model + compiled = torch.compile(model, dynamic=False, fullgraph=True) + + # Optimizer split + mat_p, sca_p, emb_p = [], [], [] + for n, p in base.named_parameters(): + if "embed" in n or "head" in n: + emb_p.append(p) + elif p.ndim >= 2 and min(p.shape) >= 64: + mat_p.append(p) + else: + sca_p.append(p) + + opt_muon = Muon(mat_p, lr=h.lr, momentum=h.muon_mom, wd=h.wd) + for g in opt_muon.param_groups: g["base_lr"] = h.lr + opt_emb = 
torch.optim.AdamW([{"params": emb_p, "lr": h.embed_lr, "base_lr": h.embed_lr}], + betas=(0.9,0.95), weight_decay=h.wd, fused=True) + opt_sca = torch.optim.AdamW([{"params": sca_p, "lr": h.lr, "base_lr": h.lr}], + betas=(0.9,0.95), weight_decay=h.wd, fused=True) + opts = [opt_muon, opt_emb, opt_sca] + def zero(): + for o in opts: o.zero_grad(set_to_none=True) + + ema = {n: t.detach().float().clone() for n, t in base.state_dict().items()} + + max_ms = 1000*h.max_wallclock if h.max_wallclock > 0 else None + def lr_scale(step, ems): + if h.warmdown_iters <= 0: return 1.0 + if max_ms is None: + wds = max(h.iterations - h.warmdown_iters, 0) + return max((h.iterations-step)/max(h.warmdown_iters,1), 0.0) if step >= wds else 1.0 + sms = ems / max(step, 1) + wdms = h.warmdown_iters * sms + rem = max(max_ms - ems, 0.0) + return rem / max(wdms, 1e-9) if rem <= wdms else 1.0 + + # Warmup + if h.warmup_steps > 0: + init_sd = {n: t.cpu().clone() for n, t in base.state_dict().items()} + init_opt = [copy.deepcopy(o.state_dict()) for o in opts] + model.train() + for ws_i in range(h.warmup_steps): + zero() + for _ in range(gas): + x, y = dl.next(h.train_batch_tokens, h.train_seq_len, gas) + with torch.autocast("cuda", torch.bfloat16): + loss = compiled(x, y) + (loss / gas).backward() + for o in opts: o.step() + base.load_state_dict(init_sd) + for o, s in zip(opts, init_opt): o.load_state_dict(s) + zero() + dl = DataLoader(h.train_files, rk, ws, dev) + + tms = 0.0; stop = None + torch.cuda.synchronize(); t0 = time.perf_counter() + L_t = torch.tensor([0], device=dev, dtype=torch.int64) + step = 0 + + while True: + last = step == h.iterations or (stop is not None and step >= stop) + do_val = last or (h.val_loss_every > 0 and step % h.val_loss_every == 0) + if do_val: + torch.cuda.synchronize(); tms += 1000*(time.perf_counter()-t0) + vl, vbpb = eval_val(h, compiled if not ddp else base, rk, ws, dev, gas, vt, bb, hs, ib) + log0(f"step:{step}/{h.iterations} val_loss:{vl:.4f} 
val_bpb:{vbpb:.4f} t:{tms:.0f}ms avg:{tms/max(step,1):.1f}ms") + torch.cuda.synchronize(); t0 = time.perf_counter() + if last: break + + ems = tms + 1000*(time.perf_counter()-t0) + sc = lr_scale(step, ems) + + # Dynamic L broadcast + if rk == 0: + L_t[0] = max(h.L_min, min(h.L_max, int(np.random.poisson(h.L_mean)))) + if ddp: dist.broadcast(L_t, 0) + curL = int(L_t.item()) + + zero() + tl = torch.zeros((), device=dev) + for _ in range(gas): + x, y = dl.next(h.train_batch_tokens, h.train_seq_len, gas) + with torch.autocast("cuda", torch.bfloat16): + loss = compiled(x, y, L=curL) + tl += loss.detach() + (loss / gas).backward() + tl /= gas + + frac = min(step/h.muon_warmup, 1.0) if h.muon_warmup > 0 else 1.0 + mm = (1-frac)*h.muon_start + frac*h.muon_mom + for g in opt_muon.param_groups: g["momentum"] = mm; g["lr"] = g["base_lr"]*sc + for o in [opt_emb, opt_sca]: + for g in o.param_groups: g["lr"] = g["base_lr"]*sc + + if h.grad_clip > 0: + torch.nn.utils.clip_grad_norm_(base.parameters(), h.grad_clip) + for o in opts: o.step() + zero() + + with torch.no_grad(): + for n, t in base.state_dict().items(): + ema[n].mul_(h.ema_decay).add_(t.float(), alpha=1-h.ema_decay) + + step += 1 + ams = tms + 1000*(time.perf_counter()-t0) + if h.train_log_every > 0 and (step <= 10 or step % h.train_log_every == 0): + log0(f"step:{step} loss:{tl.item():.4f} L={curL} t:{ams:.0f}ms avg:{ams/step:.1f}ms") + + hit = max_ms is not None and ams >= max_ms + if ddp and max_ms: + ht = torch.tensor(int(hit), device=dev) + dist.all_reduce(ht, op=dist.ReduceOp.MAX) + hit = bool(ht.item()) + if stop is None and hit: stop = step + + # Apply EMA + log0("ema:apply") + cur = base.state_dict() + base.load_state_dict({n: t.to(cur[n].dtype) for n, t in ema.items()}) + + # Diagnostic + torch.cuda.synchronize(); td = time.perf_counter() + vl, vbpb = eval_val(h, base, rk, ws, dev, gas, vt, bb, hs, ib) + log0(f"post_ema val_bpb:{vbpb:.4f} t:{1000*(time.perf_counter()-td):.0f}ms") + + # Export 
TinyLoop-compatible checkpoint + if r0: + export_for_tinyloop(base, h, "final_model.pt", log0) + + # Compressed artifact + sd = {k: v.detach().cpu() for k, v in base.state_dict().items()} + blob, total = export_compressed(sd, code, h, log0) + if r0: + with open("artifact.ptz", "wb") as f: f.write(blob) + + # Sliding window eval + if h.eval_stride > 0 and h.eval_stride < h.train_seq_len: + torch.cuda.synchronize(); ts = time.perf_counter() + svl, sbpb = eval_sliding(h, base, rk, ws, dev, vt, bb, hs, ib, h.eval_stride) + log0(f"sliding val_bpb:{sbpb:.4f} stride:{h.eval_stride} t:{1000*(time.perf_counter()-ts):.0f}ms") + + # TTT + if h.ttt_enabled: + torch.cuda.synchronize() + tvl, tbpb = eval_ttt(h, base, rk, ws, dev, vt, bb, hs, ib, h.eval_stride, log0) + + if ddp: dist.destroy_process_group() + +if __name__ == "__main__": + main() diff --git a/records/track_10min_16mb/2026-04-26_V2_PE_MinLR_AttnGate/README.md b/records/track_10min_16mb/2026-04-26_V2_PE_MinLR_AttnGate/README.md new file mode 100644 index 0000000000..7f4880bd9f --- /dev/null +++ b/records/track_10min_16mb/2026-04-26_V2_PE_MinLR_AttnGate/README.md @@ -0,0 +1,65 @@ +# Record: SP8192 + PE + MIN_LR + SmearGate + AttnOutGate + 4ep TTT — val_bpb 1.0770 (3-seed mean) + +**val_bpb = 1.0770** (3-seed mean, std 0.0004) | **~15.98 MB** | 8xH100 SXM + +## 3-Seed Results + +| Seed | Steps | Sliding BPB | **TTT BPB** | Artifact (bytes) | +|------|-------|-------------|-------------|-------------------| +| 1337 | 4631 | 1.0785 | **1.0772** | 15,982,989 | +| 42 | 4637 | 1.0777 | **1.0765** | 15,984,317 | +| 2024 | 4633 | 1.0784 | **1.0772** | 15,985,404 | +| **Mean** | **4634** | **1.0782** | **1.0770** | **15,984,237** | +| **Std** | | 0.0004 | **0.0004** | | + +Delta vs previous SOTA (1.0783): **-0.0013 BPB** + +## Changes from previous SOTA (2026-04-12) + +### Training improvements +- **Polar Express NS coefficients** — 5 per-iteration minimax-optimal tuples + row normalization (was: fixed 3.4445/-4.775/2.0315) 
+- **MIN_LR=0.10** warmdown floor (was: 0.0 — LR dropped to zero) +- **QK_GAIN_INIT=5.25** (was: 5.0) +- **GPTQ_RESERVE_SECONDS=0.5** (was: 12.0) +- **VAL_LOSS_EVERY=0** — skip periodic val during training + +### Architecture additions +- **SmearGate** — causal content-gated residual, zero-init transparent +- **Attention Output Gate** — per-head sigmoid gate on attn output (width=12), zero-init + +### TTT improvement +- **4 epochs** (was: 3) of score-first SGD TTT + +## Architecture (unchanged from base) + +``` +SP8192 tokenizer, 11 physical / 17 virtual layers +512 dim, MLP 4x (2048 hidden), GQA 8Q/4KV, head_dim=64 +Parallel residuals L7+, QK-Gain 5.25, XSA all 11 layers +LeakyReLU(0.5)², skip gates, logit softcap 30 +MuonEq-R (lr=0.022, wd=0.095, momentum=0.97) + AdamW +EMA 0.997, warmdown 66.7%, loop at 35% +SDClip GPTQ int6 (k=12.85) + int8 embed (k=20) + brotli +Score-first TTT: SGD lr=0.01, mom=0.9, 4ep, 32K chunks +Hash embedding: 16384x512, zero-init, trained in TTT +~36M params, ~15.98MB artifact +``` + +## Compliance (Track B — Score-First TTT) + +Per Issue #1017: +- **Condition 1:** Hash key uses prefix tokens only +- **Condition 2:** Full normalized softmax distribution +- **Condition 3:** Each chunk scored under no_grad() before TTT update +- **Condition 4:** Single left-to-right pass, no rescoring + +No SLOT, no pre-quant TTT, no n-gram caches, no CaseOps, no global TTT, no multi-phase. 
+ +## Reproduction + +```bash +pip install brotli sentencepiece +MATCHED_FINEWEB_REPO_ID=kevclark/parameter-golf python3 data/cached_challenge_fineweb.py --variant sp8192 --train-shards 80 +SEED=1337 TTT_ENABLED=1 HASH_EMBED_ENABLED=1 TTT_LR=0.01 TTT_EPOCHS=4 TTT_OPTIMIZER=sgd MUON_MOMENTUM=0.97 GLOBAL_TTT_ENABLED=0 \ + torchrun --standalone --nproc_per_node=8 train_gpt.py +``` diff --git a/records/track_10min_16mb/2026-04-26_V2_PE_MinLR_AttnGate/train_gpt.py b/records/track_10min_16mb/2026-04-26_V2_PE_MinLR_AttnGate/train_gpt.py new file mode 100644 index 0000000000..ffe85a0239 --- /dev/null +++ b/records/track_10min_16mb/2026-04-26_V2_PE_MinLR_AttnGate/train_gpt.py @@ -0,0 +1,5 @@ +import lzma as L,base64 as B,linecache as C +S=L.decompress(B.b85decode('{Wp48S^xk9=GL@E0stWa8~^|S5YJf5;S-`obX@>6n@VT6Qap3bt~@<3h>ok~)Km^%m4{rOaiQa2(_3zS%kEP3*8F9$Y#f-ZSHsOI0J>T;4{pdFFBjnfYv-o7{^Aj5A;P`z&C6&<=lqU+_4@b=R3%dnFVxv`=Mxc$Xs5zzo9Ric6{F|ATO1RDSva^{lbrs1l?`BLg7X=K(Q;6QVBe>>XcJ1BumAA2c%17A9=Ndxtz>P;Qj8GzvXz~UFKA&VZOufyRgkT(@mOE^`mG_Q^|@KlW_VERX8baH?N3EM-_&P%L_EFTm43EHf-VQXAZ>}m{0HbiJdo*$GnEANkBhMepqyPcx3Fe`cguZa~?mFyAO7xx@pyp9u#SgbW+7#3_|0HHzND*Q91f>^;{89_J{Mn(ifsT;O=nQfn#Ij{*MK%J|f`^>lsxUub!E!Sa7u`{MQ6F9EpM`M3LtLPR@0)h4vzK5MC|1-JXholX8)ey)dQyK9Qb%sbNCH*vNp0k<6<%6l+hRO0vpF^o+^Y#^WM9+l@#vx0FlzSmKyQ#D?TgIPQ3$mo~ESH~#){o=pZ>bx|DpULrF#8zQk&^auf9^{$UlamN3-&TUdJlF%9!C7`Nwr+MCh4tYsw(vrIp6F_fi?;BNr?Yjf9NFRsW@;Uw#>_q%@dXd)o8ZOKhsZqb@w3rbC(;tlItqrqozqGJ43StQ$S=7bGQ}@c?@-$H5}FVv__HFkSqWth_C&p>@S1ZdEnw#`Ivzv>R)$n%JiuE>_KhkJSnCNR9vU7cn1dmYI&yutlAMSAB?;wu7Q>+c-|I&aO!1l%k8=cGLbfYzwN<8tU(zgQ=&~Eo^92?QzYSl@#x68g1v1y<>AQsxiR?Dwx|Ect3Fd3lX2OtJRoK)+G2QNeT45iNt?soi?z>TV5qPXXL0bf-Bd>!sDt@A{JmOoaQ<02tlEOxUT$8eJxmp#-9IQ)xwK$XbZCknWml?FkJPey3Oh!HCSJ0pw!f{dHJ35hMyGd-5p%Ao~TL(uh6&j*nPzpM~D_Pi#RGM7K`RWlrB_{zIhf|APS=nmH1I(nhX*0=L@J;uhAp_QlY(yb-dgE6_y|7Rp>q7c@kEqR))Je=2+0dGa?Gqlihbzt8Mf#1-`GocpGMXoTsyiuyp}p1S1ZC#)GKcY{`RKa9%Kks9j-!QcP1I7L{1*4?7`3=Lgy%LZ`>=J5I|wT&-gJU7c!PDqxcETv=*tcL0!C(6$R>1N};GOv)h{>3-E)8hW
WJT1S8u|z_^dAGufpZd2?JE0io@$D00GS>Pg@t_o^)Xf!Aw)C;MR~Erxq4vF`ao|Wi9v>O}IU;HN-0!x`=h5^(brJI7z-|h?kI%89xtO1G9{Yv6pmqo~IEY#vIr3prx!GCC{BR)3W;E=AZWMu}29J&I)=eyzT4Qj^<%E{Gjsz-IPk2zFXL}PlgK?lTLUDf2xJV%jd<}G5{7;s%MI*d-E}vc<81vTbKU8Ovi4wKJ0SzfHv|8m3(68{y_9ZjVmI|7;ikL^5G}Y4Ii>rA4uchD-)Ek!hh_wdv?q)C8oK)a=YNvR0xr{t7A?V0$BP#Zfikn5!vm$TeAiv?8owPt^#r520z%+TeDu~^C#~grn{U+wf>BG4ZXJG{1R0W2ti7mN9@B(vOt{AM87YIe6DF+n8Y@masb!tAtDx`<$i_QzZ9lz+GIW+&DZm>3_}~x-p!qoq|3tdw4jCRbj^^d!<^4HYG6m!20g4d!Qm4O9n4cKTRnqQX)p$2((L<0VHqMB1KM^d{3QKQokrMG=1{JkIcBTD91$##-fIb&?0)OxLoS2>7k04B^>Cuwxb;P@I0VvV3|+td?YCUXjzPD5lFq6H;FW0QQHb{;_5X2BM5(&rYNUWx7Cdu5<7l-yIkpV29PubxPWgB9NRObTecd5uckQsr6;9n$uR(32C}_sNadJ~G(Ol4Hqv!(?G1dF{1>AIm$1XUV->Y5=43)#lGeUR$t0&*lMd29&ir3I;Em#V80$&hINBn~fFem#_KOf||_s(qa1VHzDRS&B3XoLExjQ+#oUEKKLkqv4X3_9T%c`{X-0EHuf7HUUuttgNpaKj9t5f^!TC6nt+IR()-;sJh*D5xCs)=$iIcm3t*sE@-;`y4kH46o7g}`J^;=9sf@j0#ix;nM6aAL8(c|be|3TNW+redYDFW#6p?eHm{J3W!=*~tXl7n@^xYn)q5dp3a-Kp!q07I!zIbd=`H~>J%sid=eTB#Lp*DNnBJ{!jIg8YmBrbocH$Cy%fkOJH$(I=TFX?@K9Myy58ebw(?LRT>N^#;k-DjKcaRg9U|82@eTRP%TbKWg`8WlNR65cQ^;vtRAQ#c&S!FKqLP2P=FsT`SjEa!`bTf2lT~Xe2Xwl~*f4TOA&pTO^wXZy)=b#xYwZP1Ue}p2cb$(bhA26w-j;q80Q_u>a&K^GRQG!cPFm=O3Id!!6*7`)}SQ6MAWRTaaruIhp6DE_IM%u1#$`I~9}TH`J$<=JX~Idcyx%d^*-+&gSs(YVBXfLJVUuLY6fBO?y0F|Ey!IrW*oV&a3geL`tk-`~`G#0(qw+TQtoTmTPzIgm`nLVfKbFdB1Mxi0iZIZpSavM#E(29g{(GDf;Lyq$T-`ARg!%!@Z;a1{&U%mrh|`SHSrQeaK;fA)ya{NI*N_)2$y}!}%{O&WU2spd!28V0N(?*4BnL9ekgpUOjNVsX}s@J-nf$JJ`OEr3_?Df6O>GI+L=vboq{&(pg&&q$JY~?o-f5kD{^wW+0Yj`I<45OvZ%_u2P>{&<`2Vkut&!&*@>g>TgrO_A#qGvJ;7M)Vf#y_g)`5T4M<}2Hdi-P1Z6tgOR8^s6(7x+_5QMnTUPt#y&L)R>7+j*C0;9D2r_+`5QmaWjt$Dw&oUYrY&PCh*_DKou+EGmJAOFHm-EUi+TR=;i=1Wkd;H;Dz0;r{Tv;f@6OI+DO^MZfZ@G+KmKY3!Q^rJN%C#r&LUJ`0$*5Wn{Edlu_Uh?X>LGQ4F+jcn#cIKluRdQI0R*5k6_c4>!NvrYZ6>%8HDGqOuVc~_up)e3-6x0^o;neph>`M%1cU$Xcdw21iAtw%(naN#rKGQl;=R!dxQ^ackzl9jyMnNw=VS$-Xi*?%38fjx7qf7MyZO}~N-4vrqT2p6ZFXQN7^>ER;J}$fI=k7kRACoGKG6N$DR^MFHybT|f<8^o-U?nt=KM$Adt093W=--YRND)D{{$nWZE5SyvCs#a$AexsvrNImsax1c+U$g6V!*;qOR);xgk4n!VA%h1!J#=M{rz1_pM
K%P!SWj)z%$xD?_o83hoq6ePw?fA$7@Xgwek*49<^oGkB`1~4X}jBBzkPi!X`|5_qRdP|y~=Wyud=huvr6|B(pOZ!fI(om>rMb+0^?w@=@=R02B$(9QT69da?d=!3Lr`Ns}EFb6n5Of7H1V*lAUxUmxgk3m_?W?fDf^qaM)hf3|RD4nqCluetT?rYmNA$6k{7tW=BAPw0pO}!hdwVauWI2ms_{Znee3ti3SFeg<{}`jMRRd?P9gQ^AP}19D0F#lBy^v^BwghKCnywnIQTFsIkRcno(^RynD?#v13A2TVse|$w<<1vt9f8nglack^O50Y6-nEszHdg4|j;`TsC=irV2_435fEC%t^#dF4#;9H_bp%uLucU&Ih2q1Ra_XR6XxLpsk>Vl<75=wMkJ!-Q(x6KxZ9M!~VH5-e$Z)Xrhz|X;(u81g{(;OKbzG9`Gp{%@nXs_SOL-G@;(5jyt=fjjwoDW$YJLrOeiuV3;*hdBH*gk((T0{^+j*S=5ZUFbVS_tk?(>Cqof>O6#ZK`9B%-$ImZ#ORq~H~2+Wj5=FG-{y;1l8G{+Z~)V0{`o8j(AzY4>;7lwx>d+&%-aS2V^6e1<%c?lnbVs4m{wa7$y($TiInn*ZD|34|<&@cvVL#-X`e|XNj{Lvd0vf&yP)))JrQ;32MuHc==rQ)1pP&NvEvbWyP_hCv6S|F1OKIuu1Pp@F}qBs2jAdd<(b4DXU+cfib`!I1c>bcSaTRhzEoV2@7WgX#8zz=!HhNxMUwYv!xY5=d!)=T?`+NzGANIve)WKqYa;my@mCJ-Wl=X7oX(~~W=m9sfS&toSaibBOT{nt4UKhP}-+A1f*@jy@dkeXYE-}7t7;(g$ClB$H&MXm(>*|k{)i4jC0vxch#^P_eMrMFQ_jXle;gQ;`0FPYva*0iJ?uk=xmq2Q$YOTN^)uU-PasU4;}hXSLiJWlrj9u<_wKF4W91-;?mKP5_oecMCi4SUoP(tPUHTZ@n*pPnD{6_~ZshPI*qM7)nv)8wq+GfW7(=qMx>W3tbB!tXE_L;Aw61ElsQo<7CPulQly0qLia>z_bVxr7vvTFLhoxqSdb-7HY02rxz{N_C=M6aXDpI_6EW`C@3YICUF+es6Js`o53c%w^gCN>A1f>OD6dp@YmV)w8d#GNOTMa*#A7cSqY9u4GxAy)-KnBxYw)lQgj5g23Xy^B7JW0{Kq%Pe*;Y#QwXB0(f-u;MRv%=7)&p_|8Y5H&l>whN9Nd$t6U`#>!GbGKgjtLQcCROI&v+CV}P`?CKM(B`&zJ6Rb}Cqc0gPSsjtXtA!@$WqhS(6QRRvUNKvbi$~xYC6D#naoB{%T;8e4w8k_rULO>UsDAq1g@9}CYFi(RL33CjcMT6OF{X@AZ268RErBuvhmI+O9=Y%Xy>Y04byPizb(?J`vLJS%5%|TDI+}Qt|5ma*iY4UFP+8(8=0?wOpdFA(ki&S-6b$T;E^n2vTheBm8P7*kt)urRjPkZvRx+{F{)CU~u&(WnQ-w60_6k#`2Pp5rV>auV#Qgp{frr?^k@Eb3vLc_M7~kwaTC8os-A?do9MTfCbPzo}$Y*2Lp@kEQVca|e^&D^@I;y-Tdc$>1>mtdPLao`rfOy?_RE+=%sFrqc$&2|GB$O=j@?v&WFPu`~|ne7OnLSn!qwPRj3Q65a{jX_hvHXwhIKW{Ka1dy`tQalLZtQDP6|zfIJVqcok>gC5@mv>?U359}xZz(K?#OV!9o6Uvqn69?Gtxh^ja6>j6dZCJrbz#M7q?Is_f5$I4Ut5jpC$uS5cvKYaOJ?K5TyM?CZ=8#a4qy+xGkF*p7l18~8v>@WjN)Zh#Cr{=GYb8XB!SrqH#l}7>uQHI2=p$`LM3=^{+A)2sIiA#&F14oQVIf-e9qdM~Tl=rdWD)4oQe_a}aMi@d^Tw5xJj%eq;+oS>mX{^mV-aN!-tU*2*p@FJeD(3Wxfc++JU~9(KzL1Y41Ft<`@*pMs8@TSFhS`AJtbB?DWyq5r()D}qMbRiPd%yDCM=
stUDo;Y-6%6404s|#Brvthr(S)3WhK3A|?F^i1oIiGkIawR1x^=A%S+t+f^O!lJXl;uF1HfUP+rZtiT+tmAceoMD_?-3m@c*b!XH66+N5kn4JbSpOWyW~98(phmIml1$Rvb5>2`N&>t!WgUri#BV{S!-A-7;`#C!%;QQQNA!_KeL({b&){^b_9r|B!XIHAFnoeTHz6*z9jV8d^$XbvVTV^qWLcTv7oZ;X*^2VTI(;OX-8eLP2iW7lRRgAq1*%|rX#0;B^Uyf057<_gofjh#e(CrBva#uCaqLmR+_U*4(vv<&CfSO0i_M*StlEG^xS!+Ve;316{tUDvz#8#bkh-})3T~@IA++3WVH{uG)VKPymtbJ?YC|p(cW%BumAF6ZByp)&l6c&9E*%uCok$h8cR$qVdS`DE3n}>lN&}UtrHKt<^yK=;Dnyts-!Wv-D3+2zkCT+^-&yq-Vu=9sAb-D={NDdKYQi_PwemhVgeL_2F!S@9kzwVE+tS1K`AR1Z4?O6P0W15SGyX)rFM6}Rx=Pf+%<}kzP+F9)g2dF3Ok?n(ft1ed4M5W-#V5IdGsXvP`9EH3p_0mchk>B(o4F5R^y+*+zJ?=jI~#`zPT8Qzie{&+Z5?A8|11@CO6DSbMW^%L6n(O0?HBF%X;0C9r!c2d8K@3I!khRrc6on&#!>}4q{#EqY=;5mp$G1P#sInK3k!Q+X@-!WsOw=)ae{ma2f%vi-th&-dY+x4rqvtk?-1*=Yxeom^Ml+pFAt7t9RMm^L0DZOTay{BZyrKtAxlP;J4fk{vud8fwamB1ssT&+1o-#ca`_bZ*Ae*&6}qyo#@ozds=XvL67P=%L`AKI8K;UBhfz$&R%lOYABypUHT)5jf9nuSRCH;QGMUWzu#^b>~7wZES8T@V$Opbtd#XrR5u<3t9QEUVD8g)LBWg81N@^AT4c#W+T3nk>k()oChHTMq2zTYTai1*xxBm!eircF^8r4zXSNUypdwqb!j(|CN{@B8Mny|BUh$eTD!;JU&zVn{~#Z)-#n#Ilp!)A%$@GT8}A9s+?XOE`OpVnNVo&rdnA?F_C94)}$w!gAD{pzq!x$(jC~W`Rx1B#9xrEU`21yzCd>?eILVCpNqOaW^`8(+e2ICzmHr7Mz`a0DMNPoFm>alFu`twmZxv#0e%ozlwbe>}=<_9e*k^;0|_d?f(tTnJA==R#HEK^qbc+i6t)lLZ6Z?s1P3tMPiTs||cowN6ZVapIJ<);NKenFT6`q+gTTKoif9DcOyD)gzKzH3HM|5Ik5>n#oa2-40obAb95C_#B%Joo-1%-KW%zi+Ne5zNK@G4Btgs4=}^k%kk1ahLm$TW=2$}5N$;L2FTlWP(CGWQmLiYxU(B=uK597-=*`4;;b&=UNl+pURq`;}TUCc)h>z?#%F-UD*b))Opy>qagA3aAPAY3h(?-FFU;xl2Tn4ciuNe0FE=L5OW$y?rBpxtCNBJn8$%ZDrt;il_a$-$5bp_cp~@@rrsfwW-bAX+j~0B|Z`U3G%&;^!P7HJg;MC^#-}c`b1umRWELC)yaIG>6ek*H@y&VEm>s1#*aD|M){YWb>)*cGeB_CsNtvP5&NXd_D3t5YebZI+*Uc$ycb)s4%!Fwjp~9u_j2_ITAq8O*pm9pKz)JnQtF&<&E1kKQneT-4uc_qbGwK$28&mQVZ+&P9>TlEdH~D+_-CMQ%-r_JDqS*urx?Jeq8YR$6LM-j6A)Ya>lJ6&4KcLT3XOR4yC*e~d-)&~^dKs&!4~3U6X|=P6|H|pNwF)GVH;ePEh-jMg&vZVt)%23I(p>1NX{DM@8Dmi^s0tu2^&4@U`jd6`9ReSCnr=^7Il{u)vEtWY6jLK2J9lBvH@66yC$Glm$($~RSvI$oEL58(=v8f}WWT_hoKJwiQJtQ>GM)&UUxN`X@QeoKE+Vw^w|)Sek+9i%HOx^~V(kKU}11TI5bem35$>@InzPBDv3$v@2PrPX8rl?mW`eJMu>ss0aDs^6!M3Iuf5_aY8fz-cXsb&c7
l3-K#i(*SdWpS1bFHBjJdT#t9;jycH;AQ33h#hJ<9sOFBO2iKJdej2-}h-EgX2eh5ivYYn@<`Bode3CSFr=tzbI4y3zgW%0Sq(EbZ-Fh?NzNDv>SerHh>K-K1dsn$~-HAiQmC)G}Tt{J7yQWD1o9e$v{&~$Vxgo=ed>`reQM+0ePn#n}LL<{E1`ksMt1EJ0+RuEN8#$&C%6TvA?4|6vKDR>4;v4`_d48D;Axe5f!K{FLjvD0JdczXyCWUThW$cu?GgEdbl(PtHJ?1eQ1l~J%oW^4jzfx5X2(S{D6WMEuvM?&NUo}EXrJ3hZT8FLy+1b_?ZTLRSqREMNJMofY2iz-WC6if!GKkwzF?#&W%hjGPe0M%Qlc2H~)TG$!v8fCW^0S{D?NZvtx9r*mKYuVC!C;c#&NW(D{HUSk;JH2Xb>$8kvx<>xe|%B-(3-C|h|oAcqb{|}=cJw*+`_0t^2p`4qgw`0Yi^CiCcM}SseslliB_Cn%AB65VjwiBNhuUGoyvBu7O8PcRe;KM}#eA6A`O)EI)(WmUrDQs|w5l5J?>hJ@cmmM*6fyGTUuAV~yVPGmuLLRF+Up7zUs70@VtT_XMRZlb{u0oszxcPY;_DISu$wc9S;2g^h`W$0*`V08xF8(nF(WaE*;vbq@ncd~d96z!I#5xe}}5JdLHxm^6A7D{x(JKxu6#;+gxCp-yyLUJyjQ>054~p5+pF8yCdHXk}n^{}Jz-gm8=PXOT$Tu3{h^QtmE0DLdt_Y@i3-4En7SoZ;a?#%*n%VU5SxxH6WFktXX1k7R<+)~ky1LJG2-nT8?C#&tTmJW?|dvl6=0HK0M-n*J@+TuyKsnrSh{*f$k$MzeYqcwwh+&z{O&bY2m!IPj#g7OoqrxtT@V;tFKsAjjqOJOk2B|itumu2@=s;whe^X8yFZ#KHrn(wh?K)R#ZaO5J*K%(2}gzHJ+indKG_1m4G8NvGzD)ST-Ghbl4d`wxFTdWg)m@2s)8D`wT9PP16bTaXhf=s|rBgzX*c+wQ|i$FA3wcWT23=PQZw+r>qp)Eg6)&s#89OXga=+0B)vzG@1`oW9yw=Ou5rWEvkg3TMnP4Mz)RY&-1#=I0t*hh(?S7RpkiC=3Ko1Ab;3-81A;S=mJm>qHQw*7WzOqe`%`WUh!dPE^(d%wsg-P^4XV0Afm`6g6tI4z55ijv8HeC)85>mcNIfspX@JXD7|`KU`L9=Gr2tygM5{;Hkz4lTuSpH1`Aqw)YiwsD}CB!O}7Dwebo60U)?1OItOBjVACtlz(N6faw{&!ojDSkXE3C%T1n%bA=ccLv)3*h-&y0!O@Dt4ll>HzqqrN2?X1*(udv0j=)_BOqgf1jD&4UJbgxtF`XNpo}+);%nBlcTroI!c5X*7pNF+)7D@?)mXGKrIJtAO0p)vHak`nk(o>~bz8gJVIPtl;xky{sRcrLpEet(KihT74^P$1g3o?8cNbzjSd!2Qfe-qxsuRiFU3y9yTq2eLDP;M|Sv^3ON$oCWS_vi3Tf;?q0SsXxn0kP(7Q?js6vN?w(HYd|k;xH+$^(!s7ley)kpi&3R{6A8L4KM1x=-T2c#BsNyJxs@kqL*TPiwRVYWroJtSilt%bGpEA_Rohg$p?o2V+A3_iRsIGw8>$mGfTGOhRmWVIUQ`WA}1bHqOFwhZ?IDd(WE@#oG$Tih{R*Z;dDI)3B0dT1A=R+s)p5RU1eEbdSapP|3Ii9wUg-Ap=(TJ1{h4%Lc2C{ql%qA=pwc>yzA8+j03L8@TndD`?5lasSz^)E0qg2m){l<>&6JC0Z%5ei;YzX^~ms&QL5$ho%&ER5eix4-pj$m`7bTig@BJ3rtvK%bvICmcRI2|Gl(qJ>jUeI@&H?8^4iutcI^lvURaewu#})pVr7b}u@#-Gz$iTWqRvwAzVO-s`cyrwnwRSi~7+KA%voh)cXD;rEL2gQ(Gq$4YG2V$N+2<-qDS3SOL_}?OOT{hRMz1fthIS^^|CfeUB@w2{ZE4cQVGpfkHqbnq{@y1jVEgL
-7-~iDX1b**nLzc320SN^EhTz@79Bk}+TSDs+$KRBBTF#P0iHrWa-*xDHYy@T>AKA&j_Qr{!~!JEz6UY$oPUl1-+97?Aoe4q^_MJh37U3+bl+J%bE2DHROWqjo>wmm%u5eFSQ(z)Ue{BJRh0#Rr>XwoSFeS5C{S_DEc}h7G-$k8YJv$G{D`XRC-jbieiDDNo7A?);o9F^dfY{arC2Hjqu?pROE(}l`bW$wDJcPI{dgFDqhp&HRwB^Af}$(o+S{x22MZTK0I3Vx&P5T$jsP}*zc~Q)rc%*)={swV85o6q2u^b8&?QtFL%VPe?VZjGgD5-paw7@D%4>P7YZ&b)YIHrR;d*V3@l@7b=K$5q)jiwKv3KP<3^A!u!o*H9R)mr7MmIp$iA`EXuQ4ujY1XtaULJV#ABIUxBA?06Oag?!7^LuetjHLAe9WH!eR=fGOp?7%@3TD)1BJ^qg^uT*p`nZY|;!>l_aiBQ(WeJ+I$>gUHC`7eGJEJr@dAzHGH(ORRC$Ku_m^ORqWMI?!s-8xadGv}9h+x2FB?{&9z8V!&@vTHz*Z=k@7$Kt~0h2LzvT%RU=i=^y6W8)2wahE;?&M5*{G&pSF8Tc&j~niOx*`7)7_1|o%Ernwp`xZ^^2yOem^pE0}q$>ActeZHjic3j?g6Z~`oOdc?zYSSIJiq~SoovMJlcVgp+K{$=cFA(mzkiquKh;e(SIyget!@t&!iY^cS7d+QfHZ#~UH)2IA_iHmujVt>F$RGl@u#k`ouO*^()mZdvY%ciEdJ7-f(6v0m%Mv1f0qbT(!qJbksdMr^6_sskPgJOAR-BE!B2qXq+{3_5qV`wXWqlMcHa_ZksId)$9u`-lmss@9?^@lwB+qp2xDx9WIUw7HAkf1JD1};}pXW&9X6GmGeyamw5)aK8s|n#igYod;jCHOPhcy31BG@P`xJX8AJe^)kY{Bj1%Q)^Mv%N^y%++~xkE4h`u}I<%|7mGihA%Oa!Pz3Fq#Juauf4qF353FK9z3t1%p=ArJVn9i&&$lKmK2@N}CSylIpV3w`k~#0ZeA;%1@jCOx_Ey5E1Z!FC2WLbBBK2^a<)WYHbL~100SfAfEp@c=M`d@w=ikkXo=?tMDUL?7*QHJzwP@dms)u6b0(|S+>)hH5Gji(`Eu8ehhk`T7DmM2+B$F!pXF=CP-Q(EJCAxM~ek$7=+w&T2c{&apiJ|YMMX$JOhDoIWbsVynj;SI-2ADc~gV0iphFR1O6O$EZPE5s{xb`kHxpx4Wa|CpdBH}r(ojCZUzE*z*`Y{g_T(jx+pv_-e};6uyi_bR3Mp>#cSllzG{tudj@Q*rh`Py)5>}c$_VI=O7-$L((+}#Dmp?%=81?rn*Bpr!9WjCy!)vcgxSRMxef{QBUJLu2;EQ&8nTtEG_f%~)9E8aoF$Ol{4u!4^3rC)Db+)Bd`tVA%;i#mNdIlYNCFSK6(3oZ)sI%bg+g25mMb_%t+;N*^Rxit4w@_;@GXY3!o->b#(v3qN1qTLjq=vsIbkpSXOXOv5RZy+od2i0%08A|1T+xOctu+jcx=-O$HwCjIh;O}pa=yRN{rH~OeNacO(pwpnFzQE?kNpc#CAZSU+4z;kh9xIY548w-t5PezSUN~!U{Hbk88XY$VUGqIxnq_uz%Nj>5Y%Wh$m;M%$!Muw*W-!q=j(}2*poE3*~e^74OqUv1=5!ncxKC8>>Js)85KtAXmp3F(5i6Pq?6);(RLl4q0xOz!(7_Yi=;E);bBa#X}c?5eRa@DZur_yb+$@aGCDJvltz3F0pnkH%KusjLKihlqD>54#Bh8H$M8DYequv(6&z?3N1A+@(0=j<=-(aKy)*+?=I6YR+z0{>5HnPe%^;ih673)=W-ulG7&VK-jFX+6nl_>w`SPw>?rG`hi&z^gKSn^F+!W*)5e(0Kxy4HXy^L9o@ncSGjunB`iWa;`iZSO1$d@*;|Y?^IfD=!7~#;2-Fcxl_lf%rRaX+7ONw4lvsE+WJ>%KIRUFsh@aixVi=Qmwpcf9KiK
;9;Mc~8xLUsygW;=k#_NflzrZD)JOSnsgS4ZV9?!}y0J8-ulcDFEdX}V@O>#z@}7HiVw9Rq7kwK+tF8ahJ)UVqnfRQe22;REH3?)rCOu|O6{t!TbKr>Df$XwWZDDdyEZ(G5cqpj-NyVUdgZ-d`j=`c07$1q+fe*!;VDoNZBB+CoZDW1QC5!bS3_Fu2)q@7XnPSPJ8d)f*ul@JB$?AZVbB?Q&&e(}yM%|%JUDH6wGjr2U%52k4zYSI6K}>d+B`}#az=9n`Y)sErg`YtCXVxT*b2}9wAI8QoE2S2WRQ|$3#TzXyK;F@9U^9!?mr6Ll#EH3T!FA2!07PE-Na*y9LwmV7mWRKp=v9|Y+Rx~`iz2r*qCsnOLaBcqqsFngMjp&*D-uzS7}Gb)&I0{jsGAn#z9$894o^&M(DW)A0+BpC&?~{d$ME-;@+XOW1Qw7YLXN#aTzJ^0-_q+IGA`%wRpq?hBd*bI}pH^ImuLWj5E5;s#*I_Uy{J`EAOW))5ev2E2bL)n_oVP_ZK#Y1Y>7}Ze6)E6<$7z;YZ75g4RQyE1A*_YXHCl`gSpAvS-{`U+^&FjSa)~2qMsiJf-SdaY9j7dRNoR^aHdxTzbJ~Dlh7v(F5)cqrKf9!ap!ePW%taUmR;w6HDp=Ql=htO_~K<7|&2_QJ~&Nq!n+Qlm|6f+bS;QxaWNlL{E@Bt^6hpV5{7fL{XY+Vy<-k#OTH-S+m@zdKar~F|PR^rHMr$`Hi$ggTZ#!Cx{^Fd~Y-~$~NIF!+|Kf_ucKnN+7uKm6X$T$h1#kFJZ(C(MbeF-a@dl;dW2X)Br5Bssso)D`Ci&%9`mf<&?3mE2_pFjvXN*LAPn};|_GYoRs2)ATHP8weE^2n=D-kEtco&v3%+|Dn~W-W2J_n&$BjToyEB+&B(p$LB<-%SY6&2H%0cBZ#f=;1Kyyf+vhb&YejrcoPMlt*q280Vha3ej)>_@Xmv!XMM$Uy=p3zrFq~rty#q!jKyo3~j7XxfWl=2p(qSZPjGx9}{bPWz(cLT(Y1~P@c__E@PfN*e2$oElyDOL~H1=113|%IAxfSXMxDx_t8%jlLqi>xy)Flx&8B`#oeX+Ko&?EAB+^5&N%#a-v^nb;`?pl^MaG`TPgFO)#x_Q6E__|!Bb;R)On|*#wH2yE1gjjz-bPk8t?DQ0Gv#5}UEj8ZhIyDstJGt6iig@i@GLw(ye^x;zz-q*#jxAhBu;)H#afbZiZ*_x*L|dDF#u*NWa#$vV7n*q0h!Nr16+@Av$H&kvuu!$NrFV3c6ZxtM{Zpd3*21HPQW-__5#G?rz+QKinBZp>68ILyz3G5pVXNBS*$b=K-G&tnQh|rkiKyJ>L+N2bej_FE_?{~TMas~lcs%AuO+4v|FLkmXB%n0$+@RO3FOaVDv-fcw$ZmhVUq*;|)ugfTmA9(~CvPoap>_XMG2Uu-K}$PlmM+~82xbyzJDWZxrk^Sdj({jSW5Ou>AWGWh2!9&bugj$y$63%IS|8!R(9GbpuE(6hvr!Ro}0dq7g5}A@DTPnUtKuw?M=#X$)^=>H>}TTk{&eZ0;0aVXU2b{!$JS=&kO>CExw@&zzY&%pHJ=(9hD=uYBC5V9ql0Ljx^rAch{^1ghAAVGweOE&#nRrZp6b(A3W$9ya^sQBeY#;s~+cbVDDF^y*)e1%DKunuwh<8lh#f55EeQopioViN4)#+KSw!xQa*PFKR_0NkaaG1aGEs?srmu>&2vu9`&js5S7VwBACH>vrKsvFkl2`n$p&y()W%n3A|X_hfz3nv)HWlwzspg`zEir!$jtL5D3lZ#jf~v@G_0w5Ihc*FbkQ8N)TBA19K}uo1;)lajks&&lo>hkmn-