Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
4ce0d59
X-WING 3D Cubric: 0.4820 BPB (3-seed mean, std 0.0002)
Mar 26, 2026
6c49da3
B-wing lab: port PR #809 n-gram techniques onto X-WING base
Mar 26, 2026
bee0716
B-wing II: cubric ON + entropy shift + fast TTT
Mar 26, 2026
d6d281a
B-wing III: LoRA TTT from #809 + cubric ON + all n-gram fixes
Mar 26, 2026
137432f
Record bwing_full_port seed 1337: 0.4512 BPB
Mar 26, 2026
94bb107
Replace bwing_III with copy of SOTA bwing_full_port (0.4512 BPB)
Mar 26, 2026
2c0c0ee
B-wing IV + V: fix 7→9 hash primes (order 8-9 collision bug)
Mar 26, 2026
3ebaf38
Add B-wing pod setup script (FA3 + zstandard + sp1024)
Mar 26, 2026
5a21365
Add n-gram parameter grid sweep for bwing_V
Mar 26, 2026
75dbe40
A-Wing Green: INT5 GPTQ (clip_range=15) + 9-prime hash fix
Mar 26, 2026
22eae2a
A-Wing Green: strip TTT, cubric, F1 correction, distillation
Mar 26, 2026
d6cb709
Record results: A-Wing Green 0.4576, bwing_V 0.4601
Mar 26, 2026
c37a8ab
A-Wing Green_1: Oracle Alpha — use model_p vs ngram_p directly
Mar 26, 2026
08d6b7c
Green_1: cap training at 570s to fit GPTQ in 600s budget
Mar 26, 2026
d8b6022
Green_1: add preflight checks (zstd, FA3) + zstd import warning
Mar 26, 2026
b1d45b8
A-Wing Green_2: Oracle Alpha + LoRA TTT + 9-Prime
Mar 26, 2026
88ec4ca
Fix pod setup: use system Python, no conda/PYTHONPATH hacks
Mar 26, 2026
5876cf5
NEW SOTA 0.3200 BPB: A-Wing Green_1 Oracle Alpha + 9-Prime
Mar 26, 2026
da832ba
A-Wing Purple: Learned Mixer Head for legal n-gram ceiling
Mar 26, 2026
2b38218
Add pod_launch.sh: one command for clone + setup + run
Mar 26, 2026
a37d7c3
Fix pod_launch.sh: pull from private repo (fork1), not public
Mar 26, 2026
6004ac7
Purple: reduce prefill to 20 shards (~2B tokens), restore 570s cap
Mar 26, 2026
230dfc6
Clean up repo: single pod_setup.sh, archive stale dirs
Mar 26, 2026
db300a0
Fix pod_setup.sh: workspace path is /workspace/parameter-golf
Mar 26, 2026
2a92a77
F-Wing: Frugendorff + X-WING N-gram combined concept
Mar 26, 2026
473a4b7
Fix REPO_DIR depth in F_Wing run scripts (3 levels up, not 2)
Mar 26, 2026
5e8ec28
Add A-wing RED mixer variant with bounded distributed prefill
Mar 26, 2026
4a06a37
Add A-wing RED_G GPU monster mixer path and tune RED
Mar 26, 2026
3cedb3f
Fix DDP warmup by including mixer supervision in RED variants
Mar 26, 2026
005cdc5
records: add A-WING RED_G seed1337 run summary
Mar 26, 2026
4a4be33
F-Wing: rebase train_gpt.py onto A_wing/RED (add CrawlerGPT + mixer s…
Mar 26, 2026
f09a6e5
RED_G: fix ngram blend-mode conflicts and wire order-aware eval controls
Mar 26, 2026
abe72f0
F-Wing: fix CrawlerGPT torch.compile compatibility
Mar 26, 2026
a76dda4
Add A-Wing green_3: width bump to model_dim=640
Mar 26, 2026
5e27afc
Add A-Wing green_1A: legal alpha + PR#609 improvements
Mar 27, 2026
aa0a156
Optimize green_1A selective pruning: fast zstd-1 for binary search
Mar 27, 2026
411dea1
Add Cobra base-quality 10min harness plan and tooling
Mar 27, 2026
3b4b821
Add pod_setup_cobra bootstrap script
Mar 27, 2026
90741b4
Rat Rod Green: Parallel Muon base + GPTQ stripped for pure base model…
Mar 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ data/manifest.json
data/docs_selected.jsonl
.mypy_cache/
.venv
logs/
logs/
experiments/archive/checkpoints/
112 changes: 112 additions & 0 deletions experiments/A_wing/RED/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/bin/bash
set -euo pipefail
# A-WING RED_G: Mixer-first, startup-bounded variant.
# Keeps learned mixer head, but bounds prefill and uses distributed sync
# so setup doesn't dominate runtime.
#
# Usage: [VAR=value ...] ./run.sh
# Every tunable below is assigned with a ${VAR:-default} / ${VAR:=default}
# expansion, so a non-empty value in the caller's environment takes precedence.

# Resolve paths from the script's own location so the launcher works from any
# working directory; the repository root is three directories above it.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." && pwd)"
cd "${REPO_ROOT}"
# Prepend the in-repo flash-attention/hopper directory to the import path.
# The ':' separator is emitted only when PYTHONPATH already has a value: a
# trailing ':' would create an empty entry, which Python resolves to the
# current working directory.
export PYTHONPATH="${REPO_ROOT}/flash-attention/hopper${PYTHONPATH:+:${PYTHONPATH}}"

SEED="${SEED:-1337}"
NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
: "${MAX_WALLCLOCK_SECONDS:=570}"

# 10-minute eval budgeting (training and eval are separate challenge caps).
# The n-gram eval allowance is what remains of the eval budget after the fixed
# overhead and the safety margin, and is never allowed to drop below 60s.
: "${EVAL_BUDGET_SECONDS:=600}"
: "${EVAL_FIXED_OVERHEAD_SECONDS:=150}"
: "${EVAL_SAFETY_MARGIN_SECONDS:=45}"
DEFAULT_NGRAM_MAX_SECONDS=$((EVAL_BUDGET_SECONDS - EVAL_FIXED_OVERHEAD_SECONDS - EVAL_SAFETY_MARGIN_SECONDS))
if (( DEFAULT_NGRAM_MAX_SECONDS < 60 )); then
  DEFAULT_NGRAM_MAX_SECONDS=60
fi
: "${NGRAM_EVAL_MAX_SECONDS:=${DEFAULT_NGRAM_MAX_SECONDS}}"
: "${NGRAM_EVAL_BUCKETS:=16777216}"
: "${NGRAM_CHUNK_TOKENS:=1048576}"

# Mixer prefill controls (training-oracle build time).
: "${MIXER_BUCKETS:=2097152}"
: "${MIXER_N_ORDERS:=8}" # orders 2..9
: "${MIXER_PREFILL_MAX_SHARDS:=80}"
: "${MIXER_PREFILL_MAX_SECONDS:=90}"
: "${MIXER_PREFILL_MIN_SHARDS:=4}"
: "${MIXER_PREFILL_TOKENS_PER_SHARD:=50000000}"
: "${MIXER_GPU_MODE:=1}"
: "${MIXER_PREFILL_POS_CHUNK:=1000000}"

: "${COMPILE_FULLGRAPH:=0}"

# --- Pre-flight checks ---
# zstandard is mandatory: abort before launching anything if the import fails.
# stderr is discarded so only the one-line verdict is shown.
echo "[preflight] checking zstandard..."
python3 -c "import zstandard; print(f' zstandard {zstandard.__version__} OK')" 2>/dev/null \
  || { echo " FATAL: zstandard not found. pip install zstandard"; exit 1; }

# flash-attn is advisory only: a missing or non-3.x install prints a warning
# and the run continues. The probe first tries the flash_attn_interface module
# and falls back to inspecting flash_attn.__version__. The embedded Python
# must keep valid indentation, otherwise it raises IndentationError and the
# check always reports "no flash_attn found".
echo "[preflight] checking flash_attn..."
python3 -c "
try:
    import flash_attn_interface
    print(' FA3 (hopper) OK')
except ImportError:
    import flash_attn
    v = flash_attn.__version__
    if v.startswith('3'):
        print(f' FA3 v{v} OK')
    else:
        print(f' WARNING: FA{v[0]} detected — want FA3')
" 2>/dev/null || echo " WARNING: no flash_attn found"

# Print a one-screen summary of the effective configuration before launch.
# The mixer width and top order are both shown as MIXER_N_ORDERS + 1.
cat <<EOF
============================================
 A-WING RED_G — GPU Monster Mixer
 Seed: ${SEED}
 Mixer: Linear(512→$((MIXER_N_ORDERS + 1))) orders 2..$((MIXER_N_ORDERS + 1))
 Mixer prefill: <=${MIXER_PREFILL_MAX_SECONDS}s, min_shards=${MIXER_PREFILL_MIN_SHARDS}, max_shards=${MIXER_PREFILL_MAX_SHARDS}
 Mixer buckets: ${MIXER_BUCKETS}, tokens/shard cap: ${MIXER_PREFILL_TOKENS_PER_SHARD}, gpu_mode=${MIXER_GPU_MODE}
 Eval buckets: ${NGRAM_EVAL_BUCKETS}, ngram eval cap: ${NGRAM_EVAL_MAX_SECONDS}s
 Training cap: ${MAX_WALLCLOCK_SECONDS}s
============================================
EOF

# Launch train_gpt.py under torchrun with the full experiment configuration
# passed as per-command environment variables, mirroring all output to a
# timestamped log file.
#
# The log directory is git-ignored, so it does not exist on a fresh clone.
# Create it up front: otherwise tee cannot open the log file, the log is lost,
# and with pipefail the pipeline is reported as failed even when the run
# itself succeeded.
log_dir="logs"
mkdir -p -- "${log_dir}"
log_file="${log_dir}/awing_redg_gpu_mixer_s${SEED}_$(date +%Y%m%d_%H%M%S).log"

SEED="$SEED" \
F1_CORR_RANK=0 \
DISTILL_ENABLED=0 \
MLP_ACT=leaky_relu_sq \
MLP_LEAKY_SLOPE=0.5 \
XSA_LAST_N=4 \
BIGRAM_VOCAB_SIZE=1536 \
TTT_EVAL_ENABLED=0 \
ROPE_DIMS=24 \
VAL_LOSS_EVERY=20000 \
TRAIN_LOG_EVERY=1000 \
SWA_EVERY=100 \
COMPLEMENT_ALPHA=0.5 \
MIXER_ENABLED=1 \
MIXER_N_ORDERS="${MIXER_N_ORDERS}" \
MIXER_LOSS_WEIGHT=0.1 \
MIXER_NEURAL_FLOOR=0.05 \
MIXER_BUCKETS="${MIXER_BUCKETS}" \
MIXER_PREFILL_MAX_SHARDS="${MIXER_PREFILL_MAX_SHARDS}" \
MIXER_PREFILL_MAX_SECONDS="${MIXER_PREFILL_MAX_SECONDS}" \
MIXER_PREFILL_MIN_SHARDS="${MIXER_PREFILL_MIN_SHARDS}" \
MIXER_PREFILL_TOKENS_PER_SHARD="${MIXER_PREFILL_TOKENS_PER_SHARD}" \
MIXER_GPU_MODE="${MIXER_GPU_MODE}" \
MIXER_PREFILL_POS_CHUNK="${MIXER_PREFILL_POS_CHUNK}" \
NGRAM_EVAL_ORDER=9 \
NGRAM_EVAL_MIN_ORDER=2 \
NGRAM_EVAL_ADAPTIVE=1 \
NGRAM_EVAL_ALPHA=0.30 \
NGRAM_EVAL_ALPHA_MIN=0.05 \
NGRAM_EVAL_ALPHA_MAX=0.60 \
NGRAM_EVAL_ENTROPY_CENTER=3.0 \
NGRAM_EVAL_ENTROPY_SCALE=2.0 \
NGRAM_EVAL_MIN_COUNT=2 \
NGRAM_EVAL_BUCKETS="${NGRAM_EVAL_BUCKETS}" \
NGRAM_EVAL_MAX_SECONDS="${NGRAM_EVAL_MAX_SECONDS}" \
CUBRIC_CADENCE=0 \
NGRAM_ENTROPY_SHIFT=1 \
NGRAM_ORDER_MULTS="" \
NGRAM_CHUNK_TOKENS="${NGRAM_CHUNK_TOKENS}" \
MAX_WALLCLOCK_SECONDS="${MAX_WALLCLOCK_SECONDS}" \
COMPILE_FULLGRAPH="${COMPILE_FULLGRAPH}" \
torchrun --standalone --nproc_per_node="${NPROC_PER_NODE}" \
  "${SCRIPT_DIR}/train_gpt.py" \
  2>&1 | tee "${log_file}"

echo "============================================"
echo " DONE"
echo "============================================"
Loading