Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
27c4edd
Record: Trinity Ternary GPT β€” val_bpb 0.9650 (ternary roundtrip)
Apr 2, 2026
648d5b8
Fix critical bugs in ternary export/import and DDP eval
Apr 2, 2026
e7b1283
v3: Late QAT + smaller model (11L 512d MLP3x) for stability
Apr 2, 2026
dd773c8
v3 final: val_bpb=1.8310 (int8 roundtrip) on 8xH100 SXM
Apr 2, 2026
4898392
v4: Trinity Hybrid β€” val_bpb 1.1357 (training, top-5 level)
Apr 2, 2026
ab62ee3
v4-fix: int6 GPTQ all weights, MLP 3.5x β€” roundtrip val_bpb 1.1381
Apr 2, 2026
f790c3a
v4final: MLP 3.5x β†’ roundtrip val_bpb 1.1279 (sliding window)!
Apr 2, 2026
97901c8
PR cleanup: single submission folder, honest results, full compliance
Apr 3, 2026
ed6bb6f
FINAL: val_bpb 1.1251 β€” artifact 15.90MB β€” within 16MB limit!
Apr 4, 2026
24bdada
v5: MLP 3.0x + optimized Score-First TTT
Apr 5, 2026
787c76f
Fix best result: val_bpb 1.1251 (8xH100, MLP 3.25x)
Apr 5, 2026
2c4f03c
3-seed results on 8xH100 SXM: mean val_bpb 1.1304
Apr 5, 2026
7d92680
FINAL: 3-seed 8xH100 results β€” mean val_bpb 1.1304 (#5-6)
Apr 5, 2026
c7b75aa
πŸ† Trinity SLOT v2: val_bpb 0.6680 β€” NEW RECORD on 8xH100 SXM
Apr 6, 2026
bd5df06
πŸ† Trinity SLOT v2: 3-seed mean val_bpb 0.66757 β€” NEW RECORD
Apr 6, 2026
7141d2a
Trinity v3: Pre-quant TTT + SLOT cascade β€” 3-seed mean 0.65802
Apr 6, 2026
a18c7ef
πŸ† Trinity v6: val_bpb 0.37112 β€” NEW #1 RECORD!!!
Apr 12, 2026
1fc9d37
πŸ† Trinity v7: val_bpb 0.33574 (3-seed mean) β€” NEW #1 RECORD
Apr 17, 2026
4ad37a4
πŸ†πŸ† Trinity v7+skip: val_bpb 0.22311 (3-seed mean) β€” MASSIVE NEW #1
Apr 17, 2026
812f453
Experimental: LegalNgramMixer + Lion + phi-rank + Modal/RunPod scripts
Apr 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
data/tokenizers
__pycache__/
.DS_Store
.secrets/
.obsidian/
cowork_transfer/
modded-nanogpt/
modded-nanogpt
data/datasets
Expand Down
119 changes: 119 additions & 0 deletions modal/run_v4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""Modal app: run Trinity v5 (Pre-quant TTT + SLOT) on 8xH100 SXM.
Uses PyTorch 2.9 + Flash Attention (2.x or 3) to match PR #1329's performance.

Usage:
modal run --detach modal/run_v4.py --seed 42
"""

import modal
import os
from pathlib import Path

app = modal.App("trinity-v5-parameter-golf")

# Use the official NVIDIA PyTorch 2.9 image that has CUDA runtime + PyTorch pre-installed.
# Based on nvcr.io/nvidia/pytorch images which come with FA3 support.
image = (
modal.Image.from_registry(
"pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel",
add_python="3.11",
)
.apt_install("git", "build-essential", "wget")
.run_commands(
# Upgrade to torch 2.9.1+cu128 like PR #1329
"pip install --upgrade pip",
"pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124",
)
.pip_install(
"ninja", # Required for flash-attn compilation
"packaging",
"wheel",
)
.run_commands(
# flash-attn with TORCH_CUDA_ARCH_LIST set for H100 (sm_90)
"TORCH_CUDA_ARCH_LIST='9.0' FLASH_ATTENTION_FORCE_BUILD=TRUE pip install flash-attn==2.7.4.post1 --no-build-isolation || pip install flash-attn==2.6.3 --no-build-isolation",
)
.pip_install(
"sentencepiece",
"huggingface-hub",
"datasets",
"tqdm",
"numpy",
)
.run_commands(
"git clone https://github.com/openai/parameter-golf.git /root/parameter-golf",
"cd /root/parameter-golf && python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 10",
)
)

# Add train_gpt.py to image
LOCAL_TRAIN = str(Path(__file__).parent.parent / "records/track_10min_16mb/2026-04-02_Trinity_Hybrid_Ternary_GPTQ_XSA/train_gpt.py")
image = image.add_local_file(LOCAL_TRAIN, remote_path="/root/train_gpt.py")


@app.function(
    image=image,
    gpu="H100:8",
    timeout=3600,
)
def run_seed(seed: int):
    """Run a single seed of Trinity v5 under torchrun and return the val_bpb.

    Args:
        seed: RNG seed forwarded to train_gpt.py via the SEED env var.

    Returns:
        dict with keys:
            seed: the seed that was run.
            slot_bpb: last "final_slot_exact ... val_bpb:" value parsed from
                the log, or None if the metric never appeared.
            returncode: torchrun exit status (0 on success). Added so a
                crashed run is distinguishable from a run that merely never
                printed the metric (both leave slot_bpb as None).
            log_tail: last 10k characters of combined stdout+stderr.
    """
    import subprocess
    import shutil

    # The training script is baked into the image at /root; the repo expects
    # it at its own root.
    shutil.copy("/root/train_gpt.py", "/root/parameter-golf/train_gpt.py")

    env = os.environ.copy()
    env.update({
        "SEED": str(seed),
        "RUN_ID": f"trinity_v5_modal_seed{seed}",
        "TTT_ENABLED": "1",
        "TTT_LR": "0.001",
        "TTT_EPOCHS": "1",
        "TTT_CHUNK_TOKENS": "32768",
        "TTT_FREEZE_BLOCKS": "10",
        "TTT_BATCH_SEQS": "32",
        "SLOT_LR": "0.024",
        "SLOT_STEPS": "24",
        "SLOT_STRIDE": "64",
        "GPTQ_DAMP_FACTOR": "0.005",
        "GPTQ_CALIB_VAL": "1",
        "GPTQ_CALIB_BATCHES": "256",
        "QK_GAIN_INIT": "4.0",
        "MTP_NUM_HEADS": "2",
        "MTP_LOSS_WEIGHT": "0.1",
        "MAX_WALLCLOCK_SECONDS": "600",
    })

    result = subprocess.run(
        ["torchrun", "--standalone", "--nproc_per_node=8", "train_gpt.py"],
        cwd="/root/parameter-golf",
        env=env,
        capture_output=True,
        text=True,
    )

    log = result.stdout + result.stderr

    # Keep the LAST matching line: training logs the metric repeatedly and the
    # final occurrence is the end-of-run value.
    slot_bpb = None
    for line in log.splitlines():
        if "final_slot_exact" in line and "val_bpb:" in line:
            try:
                slot_bpb = float(line.split("val_bpb:")[-1].strip())
            except ValueError:
                pass  # malformed metric line; keep the previous parse

    return {
        "seed": seed,
        "slot_bpb": slot_bpb,
        "returncode": result.returncode,
        "log_tail": log[-10000:],
    }


@app.local_entrypoint()
def main(seed: int = 42):
    """Launch one remote seed and print its parsed metric and log tail."""
    print(f"Running Trinity v5 seed {seed} on Modal 8xH100 SXM...")
    outcome = run_seed.remote(seed)
    print(f"\n=== Seed {seed} done ===")
    print(f"SLOT BPB: {outcome['slot_bpb']}")
    print(f"\n=== Log tail ===\n{outcome['log_tail']}")
107 changes: 107 additions & 0 deletions modal/run_v5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""Modal app: run Trinity v5 (3 bug fixes) on 8xH100 SXM.
Uses nvcr.io/nvidia/pytorch image which has pre-installed FA3 + CUDA 12.8 + PyTorch 2.9.

Usage:
modal run --detach modal/run_v5.py --seed 42
"""

import modal
import os
from pathlib import Path

app = modal.App("trinity-v5-pgolf")

# Lightweight image: use Modal's debian_slim + install torch/flash-attn from pre-built wheels
# This is much faster than pulling 25GB nvcr image
image = (
modal.Image.debian_slim(python_version="3.11")
.apt_install("git", "wget", "build-essential")
.pip_install(
"torch==2.5.1",
"torchvision",
"torchaudio",
index_url="https://download.pytorch.org/whl/cu124",
)
.pip_install(
# Flash Attention β€” use pre-built wheel for torch 2.5.1 + cu124 + python3.11
"https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl",
)
.pip_install(
"sentencepiece",
"huggingface-hub",
"datasets",
"tqdm",
"numpy",
)
.run_commands(
"git clone https://github.com/openai/parameter-golf.git /root/parameter-golf",
"cd /root/parameter-golf && python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 10",
)
)

# Add train_gpt.py to image
LOCAL_TRAIN = str(Path(__file__).parent.parent / "records/track_10min_16mb/2026-04-02_Trinity_Hybrid_Ternary_GPTQ_XSA/train_gpt.py")
image = image.add_local_file(LOCAL_TRAIN, remote_path="/root/train_gpt.py")


@app.function(
    image=image,
    gpu="H100:8",
    timeout=3600,
)
def run_seed(seed: int):
    """Run a single seed of Trinity v5 and return the val_bpb."""
    import subprocess
    import shutil

    # The image bakes train_gpt.py at /root; the repo expects it at its root.
    shutil.copy("/root/train_gpt.py", "/root/parameter-golf/train_gpt.py")

    overrides = {
        "SEED": str(seed),
        "RUN_ID": f"trinity_v5_seed{seed}",
        "TTT_ENABLED": "1",
        "TTT_LR": "0.001",
        "TTT_EPOCHS": "1",
        "TTT_CHUNK_TOKENS": "32768",
        "TTT_FREEZE_BLOCKS": "10",
        "TTT_BATCH_SEQS": "32",
        "SLOT_LR": "0.024",
        "SLOT_STEPS": "24",
        "SLOT_STRIDE": "64",
        "GPTQ_DAMP_FACTOR": "0.005",
        "GPTQ_CALIB_VAL": "1",
        "GPTQ_CALIB_BATCHES": "256",
        "QK_GAIN_INIT": "4.0",
        "MTP_NUM_HEADS": "2",
        "MTP_LOSS_WEIGHT": "0.1",
        "MAX_WALLCLOCK_SECONDS": "600",
    }
    env = {**os.environ, **overrides}

    proc = subprocess.run(
        ["torchrun", "--standalone", "--nproc_per_node=8", "train_gpt.py"],
        cwd="/root/parameter-golf",
        env=env,
        capture_output=True,
        text=True,
    )

    combined = proc.stdout + proc.stderr

    # The metric is logged repeatedly; take the last line that parses cleanly.
    slot_bpb = None
    for line in reversed(combined.splitlines()):
        if "final_slot_exact" in line and "val_bpb:" in line:
            try:
                slot_bpb = float(line.split("val_bpb:")[-1].strip())
            except ValueError:
                continue
            break

    return {"seed": seed, "slot_bpb": slot_bpb, "log_tail": combined[-10000:]}


@app.local_entrypoint()
def main(seed: int = 42):
    """Launch one remote seed and print its parsed metric and log tail."""
    print(f"Running Trinity v5 seed {seed} on Modal 8xH100 SXM...")
    outcome = run_seed.remote(seed)
    print(f"\n=== Seed {seed} done ===")
    print(f"SLOT BPB: {outcome['slot_bpb']}")
    print(f"\n=== Log tail ===\n{outcome['log_tail']}")
73 changes: 73 additions & 0 deletions modal/run_v6.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""Modal: Trinity v6 N-gram Order-22 on 8xH100.
Simple image: torch 2.5.1 + flash-attn prebuilt wheel. No FA3 β€” our code has FA2 fallback.

Usage: modal run --detach modal/run_v6.py --seed 42
"""
import modal, os
from pathlib import Path

app = modal.App("trinity-v6-ngram")

image = (
modal.Image.debian_slim(python_version="3.11")
.apt_install("git")
.pip_install(
"torch==2.5.1",
index_url="https://download.pytorch.org/whl/cu124",
)
.pip_install("sentencepiece", "huggingface-hub", "datasets", "tqdm", "numpy")
.run_commands(
"git clone https://github.com/openai/parameter-golf.git /root/pgolf",
"cd /root/pgolf && python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 10",
)
)

LOCAL_TRAIN = str(Path(__file__).parent.parent / "records/track_10min_16mb/2026-04-02_Trinity_Hybrid_Ternary_GPTQ_XSA/train_gpt.py")
image = image.add_local_file(LOCAL_TRAIN, remote_path="/root/train_gpt.py")

@app.function(image=image, gpu="H100:8", timeout=7200) # 2 hours β€” SDPA fallback is slow
def run_seed(seed: int):
import subprocess, shutil
shutil.copy("/root/train_gpt.py", "/root/pgolf/train_gpt.py")
env = os.environ.copy()
env.update({
"SEED": str(seed), "RUN_ID": f"v6_s{seed}",
"TTT_ENABLED": "1", "TTT_LR": "0.001", "TTT_EPOCHS": "1",
"TTT_CHUNK_TOKENS": "32768", "TTT_FREEZE_BLOCKS": "10", "TTT_BATCH_SEQS": "32",
"SLOT_LR": "0.432", "SLOT_STEPS": "24", "SLOT_STRIDE": "64",
"SLOT_BETA1": "0.6", "SLOT_BETA2": "0.5", "SLOT_BATCH_SEQS": "128",
"NGRAM_ENABLED": "1", "NGRAM_ORDER": "22", "NGRAM_BUCKETS": "4194304",
"NGRAM_MIN_COUNT": "2", "NGRAM_MIN_TOKENS": "5000",
"GPTQ_DAMP_FACTOR": "0.005", "GPTQ_CALIB_VAL": "1", "GPTQ_CALIB_BATCHES": "256",
"QK_GAIN_INIT": "4.0", "MTP_NUM_HEADS": "2", "MTP_LOSS_WEIGHT": "0.1",
"MAX_WALLCLOCK_SECONDS": "600",
})
# First: quick smoke test β€” import check on 1 GPU
import sys
smoke = subprocess.run(
[sys.executable, "-c", "import torch; print(f'torch {torch.__version__}, cuda {torch.cuda.is_available()}, gpus {torch.cuda.device_count()}'); import train_gpt; print('import OK')"],
cwd="/root/pgolf", env=env, capture_output=True, text=True,
)
print(f"SMOKE: {smoke.stdout.strip()}")
if smoke.returncode != 0:
print(f"SMOKE ERROR: {smoke.stderr[-3000:]}")
return {"seed": seed, "bpb": None, "log": f"SMOKE FAILED:\n{smoke.stderr[-5000:]}"}

r = subprocess.run(
["torchrun", "--standalone", "--nproc_per_node=8", "train_gpt.py"],
cwd="/root/pgolf", env=env, capture_output=True, text=True,
)
log = r.stdout + r.stderr
bpb = None
for line in log.splitlines():
if "final_slot_exact" in line and "val_bpb:" in line:
try: bpb = float(line.split("val_bpb:")[-1].strip())
except: pass
return {"seed": seed, "bpb": bpb, "log": log[-15000:]}

@app.local_entrypoint()
def main(seed: int = 42):
    """Launch one remote v6 seed and print its metric and log."""
    print(f"Running v6 seed {seed} on Modal 8xH100...")
    outcome = run_seed.remote(seed)
    print(f"\nSeed {seed}: BPB={outcome['bpb']}")
    print(f"\n{outcome['log']}")
80 changes: 80 additions & 0 deletions modal/run_v6_fa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Modal: Trinity v6 N-gram β€” WITH flash-attn on CUDA devel image.
Parallel attempt: if FA compiles, this will be 5x faster than SDPA fallback.

Usage: modal run --detach modal/run_v6_fa.py --seed 42
"""
import modal, os
from pathlib import Path

app = modal.App("trinity-v6-ngram-fa")

# CUDA devel image β€” has nvcc for flash-attn compilation
image = (
modal.Image.from_registry("nvidia/cuda:12.4.1-devel-ubuntu22.04", add_python="3.11")
.apt_install("git", "ninja-build")
.pip_install(
"torch==2.5.1",
index_url="https://download.pytorch.org/whl/cu124",
)
.pip_install("packaging", "wheel", "setuptools")
.run_commands(
# Build flash-attn from source with H100 arch
"MAX_JOBS=4 TORCH_CUDA_ARCH_LIST='9.0' pip install flash-attn==2.7.3 --no-build-isolation 2>&1 | tail -20",
)
.pip_install("sentencepiece", "huggingface-hub", "datasets", "tqdm", "numpy")
.run_commands(
"git clone https://github.com/openai/parameter-golf.git /root/pgolf",
"cd /root/pgolf && python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 10",
)
)

LOCAL_TRAIN = str(Path(__file__).parent.parent / "records/track_10min_16mb/2026-04-02_Trinity_Hybrid_Ternary_GPTQ_XSA/train_gpt.py")
image = image.add_local_file(LOCAL_TRAIN, remote_path="/root/train_gpt.py")

@app.function(image=image, gpu="H100:8", timeout=3600)
def run_seed(seed: int):
import subprocess, shutil, sys
shutil.copy("/root/train_gpt.py", "/root/pgolf/train_gpt.py")

# Smoke test
smoke = subprocess.run(
[sys.executable, "-c",
"import torch; print(f'torch {torch.__version__}, cuda {torch.cuda.is_available()}, gpus {torch.cuda.device_count()}');"
"try:\n from flash_attn import flash_attn_func; print('FA2 OK')\nexcept: print('FA2 MISSING');"
"try:\n from flash_attn_interface import flash_attn_func; print('FA3 OK')\nexcept: print('FA3 MISSING')"],
capture_output=True, text=True)
print(f"SMOKE: {smoke.stdout.strip()}")
if "MISSING" in smoke.stdout and "FA2 MISSING" in smoke.stdout:
return {"seed": seed, "bpb": None, "log": f"FA install failed:\n{smoke.stderr[-3000:]}"}

env = os.environ.copy()
env.update({
"SEED": str(seed), "RUN_ID": f"v6fa_s{seed}",
"TTT_ENABLED": "1", "TTT_LR": "0.001", "TTT_EPOCHS": "1",
"TTT_CHUNK_TOKENS": "32768", "TTT_FREEZE_BLOCKS": "10", "TTT_BATCH_SEQS": "32",
"SLOT_LR": "0.432", "SLOT_STEPS": "24", "SLOT_STRIDE": "64",
"SLOT_BETA1": "0.6", "SLOT_BETA2": "0.5", "SLOT_BATCH_SEQS": "128",
"NGRAM_ENABLED": "1", "NGRAM_ORDER": "22", "NGRAM_BUCKETS": "4194304",
"NGRAM_MIN_COUNT": "2", "NGRAM_MIN_TOKENS": "5000",
"GPTQ_DAMP_FACTOR": "0.005", "GPTQ_CALIB_VAL": "1", "GPTQ_CALIB_BATCHES": "256",
"QK_GAIN_INIT": "4.0", "MTP_NUM_HEADS": "2", "MTP_LOSS_WEIGHT": "0.1",
"MAX_WALLCLOCK_SECONDS": "600",
})
r = subprocess.run(
["torchrun", "--standalone", "--nproc_per_node=8", "train_gpt.py"],
cwd="/root/pgolf", env=env, capture_output=True, text=True,
)
log = r.stdout + r.stderr
bpb = None
for line in log.splitlines():
if "final_slot_exact" in line and "val_bpb:" in line:
try: bpb = float(line.split("val_bpb:")[-1].strip())
except: pass
return {"seed": seed, "bpb": bpb, "log": log[-15000:]}

@app.local_entrypoint()
def main(seed: int = 42):
    """Launch one remote v6+FA seed and print its metric and log."""
    print(f"Running v6+FA seed {seed} on Modal 8xH100...")
    outcome = run_seed.remote(seed)
    print(f"\nSeed {seed}: BPB={outcome['bpb']}")
    print(f"\n{outcome['log']}")
Loading