Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
27c4edd
Record: Trinity Ternary GPT β€” val_bpb 0.9650 (ternary roundtrip)
Apr 2, 2026
648d5b8
Fix critical bugs in ternary export/import and DDP eval
Apr 2, 2026
e7b1283
v3: Late QAT + smaller model (11L 512d MLP3x) for stability
Apr 2, 2026
dd773c8
v3 final: val_bpb=1.8310 (int8 roundtrip) on 8xH100 SXM
Apr 2, 2026
4898392
v4: Trinity Hybrid β€” val_bpb 1.1357 (training, top-5 level)
Apr 2, 2026
ab62ee3
v4-fix: int6 GPTQ all weights, MLP 3.5x β€” roundtrip val_bpb 1.1381
Apr 2, 2026
f790c3a
v4final: MLP 3.5x β†’ roundtrip val_bpb 1.1279 (sliding window)!
Apr 2, 2026
97901c8
PR cleanup: single submission folder, honest results, full compliance
Apr 3, 2026
ed6bb6f
FINAL: val_bpb 1.1251 β€” artifact 15.90MB β€” within 16MB limit!
Apr 4, 2026
24bdada
v5: MLP 3.0x + optimized Score-First TTT
Apr 5, 2026
787c76f
Fix best result: val_bpb 1.1251 (8xH100, MLP 3.25x)
Apr 5, 2026
2c4f03c
3-seed results on 8xH100 SXM: mean val_bpb 1.1304
Apr 5, 2026
7d92680
FINAL: 3-seed 8xH100 results β€” mean val_bpb 1.1304 (#5-6)
Apr 5, 2026
c7b75aa
πŸ† Trinity SLOT v2: val_bpb 0.6680 β€” NEW RECORD on 8xH100 SXM
Apr 6, 2026
bd5df06
πŸ† Trinity SLOT v2: 3-seed mean val_bpb 0.66757 β€” NEW RECORD
Apr 6, 2026
7141d2a
Trinity v3: Pre-quant TTT + SLOT cascade β€” 3-seed mean 0.65802
Apr 6, 2026
a18c7ef
πŸ† Trinity v6: val_bpb 0.37112 β€” NEW #1 RECORD!!!
Apr 12, 2026
1fc9d37
πŸ† Trinity v7: val_bpb 0.33574 (3-seed mean) β€” NEW #1 RECORD
Apr 17, 2026
4ad37a4
πŸ†πŸ† Trinity v7+skip: val_bpb 0.22311 (3-seed mean) β€” MASSIVE NEW #1
Apr 17, 2026
812f453
Experimental: LegalNgramMixer + Lion + phi-rank + Modal/RunPod scripts
Apr 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
data/tokenizers
__pycache__/
.DS_Store
.secrets/
.obsidian/
cowork_transfer/
modded-nanogpt/
modded-nanogpt
data/datasets
Expand Down
119 changes: 119 additions & 0 deletions modal/run_v4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""Modal app: run Trinity v5 (Pre-quant TTT + SLOT) on 8xH100 SXM.
Uses PyTorch 2.9 + Flash Attention (2.x or 3) to match PR #1329's performance.

Usage:
modal run --detach modal/run_v4.py --seed 42
"""

import modal
import os
from pathlib import Path

app = modal.App("trinity-v5-parameter-golf")

# Use the official NVIDIA PyTorch 2.9 image that has CUDA runtime + PyTorch pre-installed.
# Based on nvcr.io/nvidia/pytorch images which come with FA3 support.
image = (
modal.Image.from_registry(
"pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel",
add_python="3.11",
)
.apt_install("git", "build-essential", "wget")
.run_commands(
# Upgrade to torch 2.9.1+cu128 like PR #1329
"pip install --upgrade pip",
"pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu124",
)
.pip_install(
"ninja", # Required for flash-attn compilation
"packaging",
"wheel",
)
.run_commands(
# flash-attn with TORCH_CUDA_ARCH_LIST set for H100 (sm_90)
"TORCH_CUDA_ARCH_LIST='9.0' FLASH_ATTENTION_FORCE_BUILD=TRUE pip install flash-attn==2.7.4.post1 --no-build-isolation || pip install flash-attn==2.6.3 --no-build-isolation",
)
.pip_install(
"sentencepiece",
"huggingface-hub",
"datasets",
"tqdm",
"numpy",
)
.run_commands(
"git clone https://github.com/openai/parameter-golf.git /root/parameter-golf",
"cd /root/parameter-golf && python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 10",
)
)

# Add train_gpt.py to image
LOCAL_TRAIN = str(Path(__file__).parent.parent / "records/track_10min_16mb/2026-04-02_Trinity_Hybrid_Ternary_GPTQ_XSA/train_gpt.py")
image = image.add_local_file(LOCAL_TRAIN, remote_path="/root/train_gpt.py")


@app.function(
    image=image,
    gpu="H100:8",
    timeout=3600,
)
def run_seed(seed: int):
    """Run a single seed of Trinity v5 under torchrun and return the val_bpb.

    Args:
        seed: RNG seed forwarded to train_gpt.py via the SEED env var.

    Returns:
        dict with keys:
            seed: the seed that was run.
            slot_bpb: last "final_slot_exact ... val_bpb:" value parsed from
                the log, or None if the metric never appeared.
            returncode: torchrun exit status (0 on success). Added so a
                crashed run is distinguishable from a run that merely never
                printed the metric (both leave slot_bpb as None).
            log_tail: last 10k characters of combined stdout+stderr.
    """
    import subprocess
    import shutil

    # The training script is baked into the image at /root; the repo expects
    # it at its own root.
    shutil.copy("/root/train_gpt.py", "/root/parameter-golf/train_gpt.py")

    env = os.environ.copy()
    env.update({
        "SEED": str(seed),
        "RUN_ID": f"trinity_v5_modal_seed{seed}",
        "TTT_ENABLED": "1",
        "TTT_LR": "0.001",
        "TTT_EPOCHS": "1",
        "TTT_CHUNK_TOKENS": "32768",
        "TTT_FREEZE_BLOCKS": "10",
        "TTT_BATCH_SEQS": "32",
        "SLOT_LR": "0.024",
        "SLOT_STEPS": "24",
        "SLOT_STRIDE": "64",
        "GPTQ_DAMP_FACTOR": "0.005",
        "GPTQ_CALIB_VAL": "1",
        "GPTQ_CALIB_BATCHES": "256",
        "QK_GAIN_INIT": "4.0",
        "MTP_NUM_HEADS": "2",
        "MTP_LOSS_WEIGHT": "0.1",
        "MAX_WALLCLOCK_SECONDS": "600",
    })

    result = subprocess.run(
        ["torchrun", "--standalone", "--nproc_per_node=8", "train_gpt.py"],
        cwd="/root/parameter-golf",
        env=env,
        capture_output=True,
        text=True,
    )

    log = result.stdout + result.stderr

    # Keep the LAST matching line: training logs the metric repeatedly and the
    # final occurrence is the end-of-run value.
    slot_bpb = None
    for line in log.splitlines():
        if "final_slot_exact" in line and "val_bpb:" in line:
            try:
                slot_bpb = float(line.split("val_bpb:")[-1].strip())
            except ValueError:
                pass  # malformed metric line; keep the previous parse

    return {
        "seed": seed,
        "slot_bpb": slot_bpb,
        "returncode": result.returncode,
        "log_tail": log[-10000:],
    }


@app.local_entrypoint()
def main(seed: int = 42):
    """Launch one remote seed and print its parsed metric and log tail."""
    print(f"Running Trinity v5 seed {seed} on Modal 8xH100 SXM...")
    outcome = run_seed.remote(seed)
    print(f"\n=== Seed {seed} done ===")
    print(f"SLOT BPB: {outcome['slot_bpb']}")
    print(f"\n=== Log tail ===\n{outcome['log_tail']}")
107 changes: 107 additions & 0 deletions modal/run_v5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""Modal app: run Trinity v5 (3 bug fixes) on 8xH100 SXM.
Uses nvcr.io/nvidia/pytorch image which has pre-installed FA3 + CUDA 12.8 + PyTorch 2.9.

Usage:
modal run --detach modal/run_v5.py --seed 42
"""

import modal
import os
from pathlib import Path

app = modal.App("trinity-v5-pgolf")

# Lightweight image: use Modal's debian_slim + install torch/flash-attn from pre-built wheels
# This is much faster than pulling 25GB nvcr image
image = (
modal.Image.debian_slim(python_version="3.11")
.apt_install("git", "wget", "build-essential")
.pip_install(
"torch==2.5.1",
"torchvision",
"torchaudio",
index_url="https://download.pytorch.org/whl/cu124",
)
.pip_install(
# Flash Attention β€” use pre-built wheel for torch 2.5.1 + cu124 + python3.11
"https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl",
)
.pip_install(
"sentencepiece",
"huggingface-hub",
"datasets",
"tqdm",
"numpy",
)
.run_commands(
"git clone https://github.com/openai/parameter-golf.git /root/parameter-golf",
"cd /root/parameter-golf && python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 10",
)
)

# Add train_gpt.py to image
LOCAL_TRAIN = str(Path(__file__).parent.parent / "records/track_10min_16mb/2026-04-02_Trinity_Hybrid_Ternary_GPTQ_XSA/train_gpt.py")
image = image.add_local_file(LOCAL_TRAIN, remote_path="/root/train_gpt.py")


@app.function(
    image=image,
    gpu="H100:8",
    timeout=3600,
)
def run_seed(seed: int):
    """Run a single seed of Trinity v5 and return the val_bpb."""
    import subprocess
    import shutil

    # The image bakes train_gpt.py at /root; the repo expects it at its root.
    shutil.copy("/root/train_gpt.py", "/root/parameter-golf/train_gpt.py")

    overrides = {
        "SEED": str(seed),
        "RUN_ID": f"trinity_v5_seed{seed}",
        "TTT_ENABLED": "1",
        "TTT_LR": "0.001",
        "TTT_EPOCHS": "1",
        "TTT_CHUNK_TOKENS": "32768",
        "TTT_FREEZE_BLOCKS": "10",
        "TTT_BATCH_SEQS": "32",
        "SLOT_LR": "0.024",
        "SLOT_STEPS": "24",
        "SLOT_STRIDE": "64",
        "GPTQ_DAMP_FACTOR": "0.005",
        "GPTQ_CALIB_VAL": "1",
        "GPTQ_CALIB_BATCHES": "256",
        "QK_GAIN_INIT": "4.0",
        "MTP_NUM_HEADS": "2",
        "MTP_LOSS_WEIGHT": "0.1",
        "MAX_WALLCLOCK_SECONDS": "600",
    }
    env = {**os.environ, **overrides}

    proc = subprocess.run(
        ["torchrun", "--standalone", "--nproc_per_node=8", "train_gpt.py"],
        cwd="/root/parameter-golf",
        env=env,
        capture_output=True,
        text=True,
    )

    combined = proc.stdout + proc.stderr

    # The metric is logged repeatedly; take the last line that parses cleanly.
    slot_bpb = None
    for line in reversed(combined.splitlines()):
        if "final_slot_exact" in line and "val_bpb:" in line:
            try:
                slot_bpb = float(line.split("val_bpb:")[-1].strip())
            except ValueError:
                continue
            break

    return {"seed": seed, "slot_bpb": slot_bpb, "log_tail": combined[-10000:]}


@app.local_entrypoint()
def main(seed: int = 42):
    """Launch one remote seed and print its parsed metric and log tail."""
    print(f"Running Trinity v5 seed {seed} on Modal 8xH100 SXM...")
    outcome = run_seed.remote(seed)
    print(f"\n=== Seed {seed} done ===")
    print(f"SLOT BPB: {outcome['slot_bpb']}")
    print(f"\n=== Log tail ===\n{outcome['log_tail']}")
73 changes: 73 additions & 0 deletions modal/run_v6.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""Modal: Trinity v6 N-gram Order-22 on 8xH100.
Simple image: torch 2.5.1 + flash-attn prebuilt wheel. No FA3 β€” our code has FA2 fallback.

Usage: modal run --detach modal/run_v6.py --seed 42
"""
import modal, os
from pathlib import Path

app = modal.App("trinity-v6-ngram")

image = (
modal.Image.debian_slim(python_version="3.11")
.apt_install("git")
.pip_install(
"torch==2.5.1",
index_url="https://download.pytorch.org/whl/cu124",
)
.pip_install("sentencepiece", "huggingface-hub", "datasets", "tqdm", "numpy")
.run_commands(
"git clone https://github.com/openai/parameter-golf.git /root/pgolf",
"cd /root/pgolf && python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 10",
)
)

LOCAL_TRAIN = str(Path(__file__).parent.parent / "records/track_10min_16mb/2026-04-02_Trinity_Hybrid_Ternary_GPTQ_XSA/train_gpt.py")
image = image.add_local_file(LOCAL_TRAIN, remote_path="/root/train_gpt.py")

@app.function(image=image, gpu="H100:8", timeout=7200) # 2 hours β€” SDPA fallback is slow
def run_seed(seed: int):
import subprocess, shutil
shutil.copy("/root/train_gpt.py", "/root/pgolf/train_gpt.py")
env = os.environ.copy()
env.update({
"SEED": str(seed), "RUN_ID": f"v6_s{seed}",
"TTT_ENABLED": "1", "TTT_LR": "0.001", "TTT_EPOCHS": "1",
"TTT_CHUNK_TOKENS": "32768", "TTT_FREEZE_BLOCKS": "10", "TTT_BATCH_SEQS": "32",
"SLOT_LR": "0.432", "SLOT_STEPS": "24", "SLOT_STRIDE": "64",
"SLOT_BETA1": "0.6", "SLOT_BETA2": "0.5", "SLOT_BATCH_SEQS": "128",
"NGRAM_ENABLED": "1", "NGRAM_ORDER": "22", "NGRAM_BUCKETS": "4194304",
"NGRAM_MIN_COUNT": "2", "NGRAM_MIN_TOKENS": "5000",
"GPTQ_DAMP_FACTOR": "0.005", "GPTQ_CALIB_VAL": "1", "GPTQ_CALIB_BATCHES": "256",
"QK_GAIN_INIT": "4.0", "MTP_NUM_HEADS": "2", "MTP_LOSS_WEIGHT": "0.1",
"MAX_WALLCLOCK_SECONDS": "600",
})
# First: quick smoke test β€” import check on 1 GPU
import sys
smoke = subprocess.run(
[sys.executable, "-c", "import torch; print(f'torch {torch.__version__}, cuda {torch.cuda.is_available()}, gpus {torch.cuda.device_count()}'); import train_gpt; print('import OK')"],
cwd="/root/pgolf", env=env, capture_output=True, text=True,
)
print(f"SMOKE: {smoke.stdout.strip()}")
if smoke.returncode != 0:
print(f"SMOKE ERROR: {smoke.stderr[-3000:]}")
return {"seed": seed, "bpb": None, "log": f"SMOKE FAILED:\n{smoke.stderr[-5000:]}"}

r = subprocess.run(
["torchrun", "--standalone", "--nproc_per_node=8", "train_gpt.py"],
cwd="/root/pgolf", env=env, capture_output=True, text=True,
)
log = r.stdout + r.stderr
bpb = None
for line in log.splitlines():
if "final_slot_exact" in line and "val_bpb:" in line:
try: bpb = float(line.split("val_bpb:")[-1].strip())
except: pass
return {"seed": seed, "bpb": bpb, "log": log[-15000:]}

@app.local_entrypoint()
def main(seed: int = 42):
    """Launch one remote v6 seed and print its metric and log."""
    print(f"Running v6 seed {seed} on Modal 8xH100...")
    outcome = run_seed.remote(seed)
    print(f"\nSeed {seed}: BPB={outcome['bpb']}")
    print(f"\n{outcome['log']}")
80 changes: 80 additions & 0 deletions modal/run_v6_fa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Modal: Trinity v6 N-gram β€” WITH flash-attn on CUDA devel image.
Parallel attempt: if FA compiles, this will be 5x faster than SDPA fallback.

Usage: modal run --detach modal/run_v6_fa.py --seed 42
"""
import modal, os
from pathlib import Path

app = modal.App("trinity-v6-ngram-fa")

# CUDA devel image β€” has nvcc for flash-attn compilation
image = (
modal.Image.from_registry("nvidia/cuda:12.4.1-devel-ubuntu22.04", add_python="3.11")
.apt_install("git", "ninja-build")
.pip_install(
"torch==2.5.1",
index_url="https://download.pytorch.org/whl/cu124",
)
.pip_install("packaging", "wheel", "setuptools")
.run_commands(
# Build flash-attn from source with H100 arch
"MAX_JOBS=4 TORCH_CUDA_ARCH_LIST='9.0' pip install flash-attn==2.7.3 --no-build-isolation 2>&1 | tail -20",
)
.pip_install("sentencepiece", "huggingface-hub", "datasets", "tqdm", "numpy")
.run_commands(
"git clone https://github.com/openai/parameter-golf.git /root/pgolf",
"cd /root/pgolf && python3 data/cached_challenge_fineweb.py --variant sp1024 --train-shards 10",
)
)

LOCAL_TRAIN = str(Path(__file__).parent.parent / "records/track_10min_16mb/2026-04-02_Trinity_Hybrid_Ternary_GPTQ_XSA/train_gpt.py")
image = image.add_local_file(LOCAL_TRAIN, remote_path="/root/train_gpt.py")

@app.function(image=image, gpu="H100:8", timeout=3600)
def run_seed(seed: int):
import subprocess, shutil, sys
shutil.copy("/root/train_gpt.py", "/root/pgolf/train_gpt.py")

# Smoke test
smoke = subprocess.run(
[sys.executable, "-c",
"import torch; print(f'torch {torch.__version__}, cuda {torch.cuda.is_available()}, gpus {torch.cuda.device_count()}');"
"try:\n from flash_attn import flash_attn_func; print('FA2 OK')\nexcept: print('FA2 MISSING');"
"try:\n from flash_attn_interface import flash_attn_func; print('FA3 OK')\nexcept: print('FA3 MISSING')"],
capture_output=True, text=True)
print(f"SMOKE: {smoke.stdout.strip()}")
if "MISSING" in smoke.stdout and "FA2 MISSING" in smoke.stdout:
return {"seed": seed, "bpb": None, "log": f"FA install failed:\n{smoke.stderr[-3000:]}"}

env = os.environ.copy()
env.update({
"SEED": str(seed), "RUN_ID": f"v6fa_s{seed}",
"TTT_ENABLED": "1", "TTT_LR": "0.001", "TTT_EPOCHS": "1",
"TTT_CHUNK_TOKENS": "32768", "TTT_FREEZE_BLOCKS": "10", "TTT_BATCH_SEQS": "32",
"SLOT_LR": "0.432", "SLOT_STEPS": "24", "SLOT_STRIDE": "64",
"SLOT_BETA1": "0.6", "SLOT_BETA2": "0.5", "SLOT_BATCH_SEQS": "128",
"NGRAM_ENABLED": "1", "NGRAM_ORDER": "22", "NGRAM_BUCKETS": "4194304",
"NGRAM_MIN_COUNT": "2", "NGRAM_MIN_TOKENS": "5000",
"GPTQ_DAMP_FACTOR": "0.005", "GPTQ_CALIB_VAL": "1", "GPTQ_CALIB_BATCHES": "256",
"QK_GAIN_INIT": "4.0", "MTP_NUM_HEADS": "2", "MTP_LOSS_WEIGHT": "0.1",
"MAX_WALLCLOCK_SECONDS": "600",
})
r = subprocess.run(
["torchrun", "--standalone", "--nproc_per_node=8", "train_gpt.py"],
cwd="/root/pgolf", env=env, capture_output=True, text=True,
)
log = r.stdout + r.stderr
bpb = None
for line in log.splitlines():
if "final_slot_exact" in line and "val_bpb:" in line:
try: bpb = float(line.split("val_bpb:")[-1].strip())
except: pass
return {"seed": seed, "bpb": bpb, "log": log[-15000:]}

@app.local_entrypoint()
def main(seed: int = 42):
    """Launch one remote v6+FA seed and print its metric and log."""
    print(f"Running v6+FA seed {seed} on Modal 8xH100...")
    outcome = run_seed.remote(seed)
    print(f"\nSeed {seed}: BPB={outcome['bpb']}")
    print(f"\n{outcome['log']}")
Loading