Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# SP8192 + LongCtx NoQV QK5.25 Prefix2750

This is a deliberately small follow-up candidate on PR #1953:

- Base: PR #1953, `PR #1945 base + 2560 long-context + no_qv TTT mask + TTT LR 0.75 + QK_GAIN 5.25`.
- Only intended change: `PHASED_TTT_PREFIX_DOCS=2750` instead of `2500`.
- No tokenizer change, no PPM, no n-gram, no SLOT, no logit bias, no pre-quant validation adaptation.
- Artifact size is effectively unchanged; the risk is eval time, not bytes.

## Why this change

PR #1953 reports max eval time `513.1s`, leaving roughly `87s` under the 600s eval cap. Its lineage already shows that increasing phased-TTT prefix docs from earlier values to `2500` was useful. This candidate spends part of the remaining eval budget on a slightly larger TTT prefix (`2750`) while keeping every other mechanism unchanged.

Single-seed testing shows this change is essentially neutral versus the #1953 seed 42 reference. It is included as a narrow phased-TTT prefix schedule experiment with the full seed 42 log.

## Result

| Run | Seed | Prefix docs | Final BPB | Eval time | Total bytes |
| --- | ---: | ---: | ---: | ---: | ---: |
| #1953 reference | 42 | 2500 | 1.05824720 | 430.0s | 15,988,861 |
| This experiment | 42 | 2750 | 1.05826976 | 495.0s | 15,978,173 |

The longer prefix increases eval time by about 65s and lands within `0.00003 BPB` of the #1953 seed 42 reference. The full log is included as `train_seed42.log`.

## Data

This script uses the CaseOps SP8192 dataset. Do not use the ordinary `sp8192` FineWeb download.

Expected layout after running `download_caseops_data.py`:

```text
/workspace/caseops_data/datasets/
tokenizers/fineweb_8192_bpe_lossless_caps_caseops_v1_reserved.model
datasets/fineweb10B_sp8192_lossless_caps_caseops_v1_reserved/
fineweb_train_*.bin
fineweb_val_*.bin
fineweb_val_bytes_*.bin
```

Download and validate:

```bash
python3 records/track_10min_16mb/2026-04-30_LongCtx_NoQV_QK525_Prefix2750/download_caseops_data.py \
--local-dir /workspace/caseops_data
```

## Dependencies

Python packages are listed in `requirements.txt`. FlashAttention 3 and `lrzip` are required:

```bash
apt-get update
apt-get install -y lrzip
pip3 install -r records/track_10min_16mb/2026-04-30_LongCtx_NoQV_QK525_Prefix2750/requirements.txt
pip3 install --no-deps flash_attn_3 --find-links https://windreamer.github.io/flash-attention3-wheels/cu128_torch291/
```

## Single-seed test

```bash
RUN_ID=1953_prefix2750_seed42 \
SEED=42 \
DATA_DIR=/workspace/caseops_data/datasets \
DATA_PATH=/workspace/caseops_data/datasets/datasets/fineweb10B_sp8192_lossless_caps_caseops_v1_reserved \
TOKENIZER_PATH=/workspace/caseops_data/datasets/tokenizers/fineweb_8192_bpe_lossless_caps_caseops_v1_reserved.model \
CASEOPS_ENABLED=1 \
VOCAB_SIZE=8192 \
ITERATIONS=20000 \
MAX_WALLCLOCK_SECONDS=600 \
VAL_LOSS_EVERY=0 \
WARMDOWN_FRAC=0.85 \
BETA2=0.99 \
MUON_MOMENTUM=0.97 \
MATRIX_LR=0.026 \
MIN_LR=0.1 \
EMBED_BITS=7 \
MATRIX_CLIP_SIGMAS=12.85 \
ATTN_CLIP_SIGMAS=13.0 \
MLP_CLIP_SIGMAS=11.5 \
EMBED_CLIP_SIGMAS=14.0 \
GRAD_CLIP_NORM=0.3 \
FUSED_CE_ENABLED=1 \
SMEAR_GATE_ENABLED=1 \
GATE_WINDOW=12 \
SPARSE_ATTN_GATE_ENABLED=1 \
SPARSE_ATTN_GATE_SCALE=0.5 \
SPARSE_ATTN_GATE_INIT_STD=0.0 \
GATED_ATTN_QUANT_GATE=1 \
LQER_ENABLED=1 \
LQER_RANK=4 \
LQER_TOP_K=3 \
LQER_GROUP_SIZE=64 \
LQER_FACTOR_BITS=4 \
LQER_ASYM_ENABLED=1 \
LQER_ASYM_GROUP=64 \
AWQ_LITE_ENABLED=1 \
ASYM_LOGIT_RESCALE=1 \
GPTQ_RESERVE_SECONDS=4.0 \
GPTQ_CALIBRATION_BATCHES=16 \
COMPRESSOR=pergroup \
TTT_ENABLED=1 \
PHASED_TTT_ENABLED=1 \
PHASED_TTT_NUM_PHASES=3 \
PHASED_TTT_PREFIX_DOCS=2750 \
TTT_LORA_RANK=80 \
TTT_MASK=no_qv \
TTT_Q_LORA=0 \
TTT_V_LORA=0 \
TTT_LOCAL_LR_MULT=0.75 \
TTT_BETA2=0.99 \
TTT_WEIGHT_DECAY=0.5 \
EVAL_SEQ_LEN=2560 \
TTT_EVAL_SEQ_LEN=2560 \
QK_GAIN_INIT=5.25 \
NCCL_NET=Socket \
torchrun --standalone --nproc_per_node=8 \
records/track_10min_16mb/2026-04-30_LongCtx_NoQV_QK525_Prefix2750/train_gpt.py
```

## Decision rule

Compare seed 42 against PR #1953 seed 42:

- PR #1953 seed 42 post-TTT: `1.05824720`.
- This experiment seed 42 post-TTT: `1.05826976`.
- This is a single-seed schedule experiment; the result is effectively tied with the #1953 seed 42 reference while using a larger phased-TTT prefix.
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env python3
from __future__ import annotations

import argparse
from pathlib import Path

from huggingface_hub import snapshot_download


DEFAULT_REPO_ID = "romeerp/parameter-golf-caseops-v1"
TOKENIZER = "datasets/tokenizers/fineweb_8192_bpe_lossless_caps_caseops_v1_reserved.model"
DATASET_DIR = "datasets/datasets/fineweb10B_sp8192_lossless_caps_caseops_v1_reserved"


def count_files(path: Path, pattern: str) -> int:
    """Return how many entries under *path* match the glob *pattern*."""
    # Stream the glob results instead of materializing them into a list.
    return sum(1 for _ in path.glob(pattern))


def validate(local_dir: Path, min_train_shards: int) -> None:
    """Verify the CaseOps tokenizer and dataset shards exist under *local_dir*.

    Args:
        local_dir: Directory passed to ``snapshot_download``; the repo layout
            is mirrored beneath ``local_dir / "datasets"``.
        min_train_shards: Minimum number of ``fineweb_train_*.bin`` shards
            required for the download to count as complete.

    Raises:
        FileNotFoundError: If the tokenizer model or dataset directory is missing.
        RuntimeError: If the train shard count is below ``min_train_shards``,
            or either validation shard family is absent.
    """
    root = local_dir / "datasets"
    tokenizer = root / "tokenizers" / "fineweb_8192_bpe_lossless_caps_caseops_v1_reserved.model"
    dataset = root / "datasets" / "fineweb10B_sp8192_lossless_caps_caseops_v1_reserved"
    missing = [p for p in (tokenizer, dataset) if not p.exists()]
    if missing:
        raise FileNotFoundError("Missing CaseOps files: " + ", ".join(str(p) for p in missing))

    train = len(list(dataset.glob("fineweb_train_*.bin")))
    val_bytes = len(list(dataset.glob("fineweb_val_bytes_*.bin")))
    # BUGFIX: "fineweb_val_*.bin" also matches the "fineweb_val_bytes_*.bin"
    # sidecars, which previously inflated the val count and let the "no val
    # shards" check pass when only byte sidecars were present. Subtract the
    # sidecar count so `val` reflects token shards only.
    val = len(list(dataset.glob("fineweb_val_*.bin"))) - val_bytes
    if train < min_train_shards:
        raise RuntimeError(f"Expected at least {min_train_shards} train shards, found {train}")
    if val == 0:
        raise RuntimeError("No fineweb_val_*.bin shards found")
    if val_bytes == 0:
        raise RuntimeError("No fineweb_val_bytes_*.bin sidecar shards found")
    print(f"CaseOps data ready: train_shards={train} val_shards={val} val_byte_shards={val_bytes}")
    print(f"DATA_DIR={root}")
    print(f"DATA_PATH={dataset}")
    print(f"TOKENIZER_PATH={tokenizer}")


def main() -> None:
    """CLI entry point: optionally fetch the CaseOps snapshot, then validate it."""
    parser = argparse.ArgumentParser(
        description="Download the CaseOps SP8192 dataset used by the #1953 lineage."
    )
    parser.add_argument("--repo-id", default=DEFAULT_REPO_ID)
    parser.add_argument("--local-dir", default="/workspace/caseops_data", type=Path)
    parser.add_argument("--min-train-shards", default=80, type=int)
    parser.add_argument("--validate-only", action="store_true")
    opts = parser.parse_args()

    # Guard clause: skip the download entirely when only validation was requested.
    if opts.validate_only:
        validate(opts.local_dir, opts.min_train_shards)
        return

    # Restrict the snapshot to exactly the files the training run reads.
    wanted = [
        TOKENIZER,
        f"{DATASET_DIR}/fineweb_train_*.bin",
        f"{DATASET_DIR}/fineweb_val_*.bin",
        f"{DATASET_DIR}/fineweb_val_bytes_*.bin",
    ]
    snapshot_download(
        repo_id=opts.repo_id,
        repo_type="dataset",
        local_dir=str(opts.local_dir),
        allow_patterns=wanted,
    )
    validate(opts.local_dir, opts.min_train_shards)


# Run the downloader only when executed as a script, so the module can also
# be imported (e.g. to reuse validate()) without triggering a download.
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
--extra-index-url https://download.pytorch.org/whl/cu128
torch==2.9.1+cu128
sentencepiece
brotli
huggingface_hub
numpy
python-minifier
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"author": "someone114514",
"github_id": "someone114514",
"name": "SP8192 + LongCtx NoQV QK5.25 Prefix2750, seed42 1.05827 BPB",
"date": "2026-04-30",
"track": "10min_16mb",
"status": "single_seed_result",
"base_pr": 1953,
"base_val_bpb": 1.0585537,
"base_seed42_val_bpb": 1.0582472,
"candidate_change": {
"PHASED_TTT_PREFIX_DOCS": 2750,
"base_PHASED_TTT_PREFIX_DOCS": 2500,
"everything_else": "intended to match PR #1953"
},
"val_bpb": 1.05826976,
"val_bpb_std": null,
"seeds": [42],
"seed_results": {
"42": {
"pre_quant_bpb": 1.06162201,
"quantized_bpb": 1.07001475,
"quantized_ttt_phased_bpb": 1.05826976,
"eval_time_s": 495.0,
"total_submission_bytes": 15978173
}
},
"compliance": {
"no_tokenizer_change": true,
"no_ppm": true,
"no_ngram": true,
"no_slot": true,
"no_logit_bias": true,
"score_before_update": "inherits PR #1953 phased TTT score-first implementation"
}
}
Loading