Large diffs are not rendered by default.

Binary file not shown.
@@ -0,0 +1,36 @@
{
  "track": "non_record_16mb",
  "val_bpb": 1.13353331,
  "val_bpb_std": 0.00095351,
  "val_bpb_seeds": {
    "42": 1.13472408,
    "99": 1.13387877,
    "1337": 1.13269366,
    "2025": 1.13283672
  },
  "model_file": "final_model.int6.ptz",
  "model_bytes": 15094152,
  "code_bytes": 104955,
  "total_submission_bytes": 15199107,
  "training_tokens_billions": 5.65,
  "training_script": "train_gpt.py",
  "hardware": "2×A100 PCIe 40GB",
  "training_time_hours": 2.23,
  "training_steps": 7185,
  "quantization": "int6+lzma-6",
  "architecture": "10L-512D-8H-4KV-3xMLP-BigramHash1536-E2E_TTT-FlowRefiner",
  "num_layers": 10,
  "model_dim": 512,
  "model_params": 25749398,
  "refiner_params": 1182530,
  "total_params": 26931928,
  "seed": 42,
  "seeds_completed": [42, 99, 1337, 2025],
  "date": "2026-03-30",
  "slurm_jobs": {
    "seed42": 55383562,
    "seed99": 55392385,
    "seed1337": 55392383,
    "seed2025": 55392384
  }
}
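
The figures above are internally consistent: val_bpb is the mean of the four per-seed values and val_bpb_std their sample standard deviation, model_bytes + code_bytes = total_submission_bytes, and training_tokens_billions follows from ITERATIONS × TRAIN_BATCH_TOKENS in the launch scripts below (7185 × 786432 ≈ 5.65B). A minimal bash sketch to recheck them, assuming the JSON is saved as results.json (the actual path is not shown in this diff), jq is installed, and the track cap is 16 MiB:

#!/bin/bash
set -euo pipefail

# Mean and sample (n-1) standard deviation of val_bpb over the four seeds.
jq -r '.val_bpb_seeds[]' results.json | awk '
  { n++; sum += $1; sumsq += $1 * $1 }
  END {
    mean = sum / n
    std  = sqrt((sumsq - n * mean * mean) / (n - 1))
    printf "val_bpb=%.8f val_bpb_std=%.8f\n", mean, std
  }'

# Byte accounting and the (assumed) 16 MiB submission cap.
jq -e '.model_bytes + .code_bytes == .total_submission_bytes
       and .total_submission_bytes < 16 * 1024 * 1024' results.json > /dev/null \
  && echo "byte accounting OK, under 16 MiB"

# Token budget: steps x tokens/step.
echo "tokens: $(( 7185 * 786432 ))"   # 5,650,513,920 ~= 5.65B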

Large diffs are not rendered by default.

@@ -0,0 +1,98 @@
#!/bin/bash
#SBATCH --job-name=varA_s42
#SBATCH --partition=gpu
#SBATCH --gres=gpu:a100:2
#SBATCH --nodes=1
#SBATCH --cpus-per-task=16
#SBATCH --mem=128G
#SBATCH --time=08:00:00
#SBATCH --nice=0
#SBATCH --output=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/experiments_16mb/varA_11L_longwarmdown/runs/seed42_%j/logs/train_%j.out
#SBATCH --error=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/experiments_16mb/varA_11L_longwarmdown/runs/seed42_%j/logs/train_%j.err
#SBATCH --account=medcam

# =============================================================================
# Variant A: 11L + BigramHash(1536) + Long Warmdown (60%) — Seed 42
# Goal: Improve weight compressibility via longer warmdown to fit under 16MB
# =============================================================================
set -euo pipefail

VARIANT_DIR=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/experiments_16mb/varA_11L_longwarmdown
RUN_DIR=${VARIANT_DIR}/runs/seed42_${SLURM_JOB_ID}
mkdir -p "${RUN_DIR}/logs" "${RUN_DIR}/checkpoints"

echo "=== Variant A: 11L + Long Warmdown — Seed 42 ==="
echo "Run dir: ${RUN_DIR}"
echo "Host: $(hostname), GPUs: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | tr '\n' ', ')"
date

source /hpfs/scratch/gpfs/mcclec07/code/parameter_golf/.venv/bin/activate
cd "${RUN_DIR}"

NGPU=2
export MASTER_PORT=$((10000 + RANDOM % 50000))
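# (Added note) Randomizing MASTER_PORT above avoids rendezvous-port collisions
# when several jobs share a node; with --standalone, torchrun normally manages
# its own local rendezvous, so this is a defensive default rather than a
# required setting.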

export RUN_ID="varA_seed42_${SLURM_JOB_ID}"
export SEED=42

export DATA_PATH=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/repo/data/datasets/fineweb10B_sp1024/
export TOKENIZER_PATH=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/repo/data/tokenizers/fineweb_1024_bpe.model
export VOCAB_SIZE=1024

export MAX_WALLCLOCK_SECONDS=0
export ITERATIONS=7185
export WARMDOWN_ITERS=4311
export WARMUP_STEPS=20
export VAL_LOSS_EVERY=500
export TRAIN_LOG_EVERY=200
export TRAIN_BATCH_TOKENS=786432
export TRAIN_SEQ_LEN=2048
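# (Added note) The "Long Warmdown (60%)" in the header is exactly
# WARMDOWN_ITERS / ITERATIONS = 4311 / 7185 = 0.60; the schedule itself
# (presumably warmup -> constant -> linear warmdown) lives in train_gpt.py.
# Log the derived fraction alongside the config echo below:
awk -v it="$ITERATIONS" -v wd="$WARMDOWN_ITERS" \
    'BEGIN { printf "warmdown fraction: %.1f%%\n", 100 * wd / it }'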

export NUM_LAYERS=11
export MODEL_DIM=512
export NUM_HEADS=8
export NUM_KV_HEADS=4
export MLP_MULT=3
export TIE_EMBEDDINGS=1
export BIGRAM_VOCAB_SIZE=1536
export BIGRAM_DIM=128
export XSA_LAST_N=4
export ROPE_DIMS=16
export LN_SCALE=1
export LOGIT_SOFTCAP=30.0
export VE_ENABLED=1
export VE_DIM=128
export VE_LAYERS="9,10"
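# (Added note) Presumably 0-indexed block indices: value embeddings sit on the
# last two of the 11 blocks here; the 10-layer Variant B uses VE_LAYERS="8,9"
# for the same final-two-blocks placement.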

# --- E2E TTT: ENABLED ---
export E2E_TTT_ENABLED=1
export E2E_TTT_NUM_HEADS=8
export E2E_TTT_MINI_BATCH=16
export E2E_TTT_BASE_LR=1.0

# --- FlowRefiner: ENABLED ---
export FLOW_ENABLED=1
export FLOW_LATENT_DIM=64
export FLOW_HIDDEN_DIM=256
export FLOW_INIT_SCALE=0.01
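# (Added note) A small FLOW_INIT_SCALE presumably starts the FlowRefiner as an
# approximate no-op, so early training matches the base model; the actual
# initialization is defined in train_gpt.py.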

# --- Checkpointing ---
export SAVE_EVERY=1000
export SAVE_DIR=${RUN_DIR}/checkpoints

export MATRIX_LR=0.025
export SCALAR_LR=0.025
export TIED_EMBED_LR=0.035
export MUON_WD=0.04
export ADAM_WD=0.04
export GRAD_CLIP_NORM=0.3
export EVAL_STRIDE=64
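# (Added note) The variable names suggest a Muon/Adam split: MATRIX_LR with
# MUON_WD presumably for 2D weight matrices under Muon, and SCALAR_LR /
# TIED_EMBED_LR with ADAM_WD for scalars and the tied embedding under Adam.
# The actual parameter grouping is an assumption; it lives in train_gpt.py.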

echo "Config: SEED=$SEED ITERATIONS=$ITERATIONS WARMDOWN=$WARMDOWN_ITERS BIGRAM=$BIGRAM_VOCAB_SIZE NUM_LAYERS=$NUM_LAYERS NGPU=$NGPU"

# Run torchrun outside `set -e` so a nonzero status can be captured; otherwise
# a failed pipeline aborts the script before the EXIT line ever prints.
set +e
torchrun --standalone --nproc_per_node=$NGPU \
    "${VARIANT_DIR}/train_gpt.py" 2>&1 | \
    tee "${RUN_DIR}/logs/train_${SLURM_JOB_ID}.txt"
STATUS=$?
set -e

echo "=== EXIT: ${STATUS} ==="
date
exit "${STATUS}"
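
A hypothetical submission, assuming the script above is saved as varA_seed42.sbatch (the actual filename is not shown in this diff):

sbatch varA_seed42.sbatch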
@@ -0,0 +1,98 @@
#!/bin/bash
#SBATCH --job-name=varB_s1337
#SBATCH --partition=gpu
#SBATCH --gres=gpu:a100:2
#SBATCH --nodes=1
#SBATCH --cpus-per-task=16
#SBATCH --mem=128G
#SBATCH --time=08:00:00
#SBATCH --nice=0
#SBATCH --output=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/experiments_16mb/varB_10L/runs/seed1337_%j/logs/train_%j.out
#SBATCH --error=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/experiments_16mb/varB_10L/runs/seed1337_%j/logs/train_%j.err
#SBATCH --account=medcam

# =============================================================================
# Variant B: 10L + BigramHash(1536) + Long Warmdown (60%) — Seed 1337
# 3-seed reproducibility run for PR supplementary data
# =============================================================================
set -euo pipefail

VARIANT_DIR=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/experiments_16mb/varB_10L
RUN_DIR=${VARIANT_DIR}/runs/seed1337_${SLURM_JOB_ID}
mkdir -p "${RUN_DIR}/logs" "${RUN_DIR}/checkpoints"

echo "=== Variant B: 10L — Seed 1337 ==="
echo "Run dir: ${RUN_DIR}"
echo "Host: $(hostname), GPUs: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | tr '\n' ', ')"
date

source /hpfs/scratch/gpfs/mcclec07/code/parameter_golf/.venv/bin/activate
cd "${RUN_DIR}"

NGPU=2
export MASTER_PORT=$((10000 + RANDOM % 50000))

export RUN_ID="varB_seed1337_${SLURM_JOB_ID}"
export SEED=1337

export DATA_PATH=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/repo/data/datasets/fineweb10B_sp1024/
export TOKENIZER_PATH=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/repo/data/tokenizers/fineweb_1024_bpe.model
export VOCAB_SIZE=1024

export MAX_WALLCLOCK_SECONDS=0
export ITERATIONS=7185
export WARMDOWN_ITERS=4311
export WARMUP_STEPS=20
export VAL_LOSS_EVERY=500
export TRAIN_LOG_EVERY=200
export TRAIN_BATCH_TOKENS=786432
export TRAIN_SEQ_LEN=2048

export NUM_LAYERS=10
export MODEL_DIM=512
export NUM_HEADS=8
export NUM_KV_HEADS=4
export MLP_MULT=3
export TIE_EMBEDDINGS=1
export BIGRAM_VOCAB_SIZE=1536
export BIGRAM_DIM=128
export XSA_LAST_N=4
export ROPE_DIMS=16
export LN_SCALE=1
export LOGIT_SOFTCAP=30.0
export VE_ENABLED=1
export VE_DIM=128
export VE_LAYERS="8,9"

# --- E2E TTT: ENABLED ---
export E2E_TTT_ENABLED=1
export E2E_TTT_NUM_HEADS=8
export E2E_TTT_MINI_BATCH=16
export E2E_TTT_BASE_LR=1.0

# --- FlowRefiner: ENABLED ---
export FLOW_ENABLED=1
export FLOW_LATENT_DIM=64
export FLOW_HIDDEN_DIM=256
export FLOW_INIT_SCALE=0.01

# --- Checkpointing ---
export SAVE_EVERY=1000
export SAVE_DIR=${RUN_DIR}/checkpoints

export MATRIX_LR=0.025
export SCALAR_LR=0.025
export TIED_EMBED_LR=0.035
export MUON_WD=0.04
export ADAM_WD=0.04
export GRAD_CLIP_NORM=0.3
export EVAL_STRIDE=64

echo "Config: SEED=$SEED ITERATIONS=$ITERATIONS WARMDOWN=$WARMDOWN_ITERS BIGRAM=$BIGRAM_VOCAB_SIZE NUM_LAYERS=$NUM_LAYERS NGPU=$NGPU"

# Run torchrun outside `set -e` so a nonzero status can be captured; otherwise
# a failed pipeline aborts the script before the EXIT line ever prints.
set +e
torchrun --standalone --nproc_per_node=$NGPU \
    "${VARIANT_DIR}/train_gpt.py" 2>&1 | \
    tee "${RUN_DIR}/logs/train_${SLURM_JOB_ID}.txt"
STATUS=$?
set -e

echo "=== EXIT: ${STATUS} ==="
date
exit "${STATUS}"
@@ -0,0 +1,98 @@
#!/bin/bash
#SBATCH --job-name=varB_s2025
#SBATCH --partition=gpu
#SBATCH --gres=gpu:a100:2
#SBATCH --nodes=1
#SBATCH --cpus-per-task=16
#SBATCH --mem=128G
#SBATCH --time=08:00:00
#SBATCH --nice=0
#SBATCH --output=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/experiments_16mb/varB_10L/runs/seed2025_%j/logs/train_%j.out
#SBATCH --error=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/experiments_16mb/varB_10L/runs/seed2025_%j/logs/train_%j.err
#SBATCH --account=medcam

# =============================================================================
# Variant B: 10L + BigramHash(1536) + Long Warmdown (60%) — Seed 2025
# 3-seed reproducibility run for PR supplementary data
# =============================================================================
set -euo pipefail

VARIANT_DIR=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/experiments_16mb/varB_10L
RUN_DIR=${VARIANT_DIR}/runs/seed2025_${SLURM_JOB_ID}
mkdir -p "${RUN_DIR}/logs" "${RUN_DIR}/checkpoints"

echo "=== Variant B: 10L — Seed 2025 ==="
echo "Run dir: ${RUN_DIR}"
echo "Host: $(hostname), GPUs: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | tr '\n' ', ')"
date

source /hpfs/scratch/gpfs/mcclec07/code/parameter_golf/.venv/bin/activate
cd "${RUN_DIR}"

NGPU=2
export MASTER_PORT=$((10000 + RANDOM % 50000))

export RUN_ID="varB_seed2025_${SLURM_JOB_ID}"
export SEED=2025

export DATA_PATH=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/repo/data/datasets/fineweb10B_sp1024/
export TOKENIZER_PATH=/hpfs/scratch/gpfs/mcclec07/code/parameter_golf/repo/data/tokenizers/fineweb_1024_bpe.model
export VOCAB_SIZE=1024

export MAX_WALLCLOCK_SECONDS=0
export ITERATIONS=7185
export WARMDOWN_ITERS=4311
export WARMUP_STEPS=20
export VAL_LOSS_EVERY=500
export TRAIN_LOG_EVERY=200
export TRAIN_BATCH_TOKENS=786432
export TRAIN_SEQ_LEN=2048

export NUM_LAYERS=10
export MODEL_DIM=512
export NUM_HEADS=8
export NUM_KV_HEADS=4
export MLP_MULT=3
export TIE_EMBEDDINGS=1
export BIGRAM_VOCAB_SIZE=1536
export BIGRAM_DIM=128
export XSA_LAST_N=4
export ROPE_DIMS=16
export LN_SCALE=1
export LOGIT_SOFTCAP=30.0
export VE_ENABLED=1
export VE_DIM=128
export VE_LAYERS="8,9"

# --- E2E TTT: ENABLED ---
export E2E_TTT_ENABLED=1
export E2E_TTT_NUM_HEADS=8
export E2E_TTT_MINI_BATCH=16
export E2E_TTT_BASE_LR=1.0

# --- FlowRefiner: ENABLED ---
export FLOW_ENABLED=1
export FLOW_LATENT_DIM=64
export FLOW_HIDDEN_DIM=256
export FLOW_INIT_SCALE=0.01

# --- Checkpointing ---
export SAVE_EVERY=1000
export SAVE_DIR=${RUN_DIR}/checkpoints

export MATRIX_LR=0.025
export SCALAR_LR=0.025
export TIED_EMBED_LR=0.035
export MUON_WD=0.04
export ADAM_WD=0.04
export GRAD_CLIP_NORM=0.3
export EVAL_STRIDE=64

echo "Config: SEED=$SEED ITERATIONS=$ITERATIONS WARMDOWN=$WARMDOWN_ITERS BIGRAM=$BIGRAM_VOCAB_SIZE NUM_LAYERS=$NUM_LAYERS NGPU=$NGPU"

# Run torchrun outside `set -e` so a nonzero status can be captured; otherwise
# a failed pipeline aborts the script before the EXIT line ever prints.
set +e
torchrun --standalone --nproc_per_node=$NGPU \
    "${VARIANT_DIR}/train_gpt.py" 2>&1 | \
    tee "${RUN_DIR}/logs/train_${SLURM_JOB_ID}.txt"
STATUS=$?
set -e

echo "=== EXIT: ${STATUS} ==="
date
exit "${STATUS}"