@@ -0,0 +1,37 @@
# AWQ 2xH100 Proxy, No-Compile Quantized Eval

Non-record candidate submission by Chandrasen Pandey (`Devchandrasen`).

This is a small-resource reproduction/variant of the PR #1908 / PR #1956 AWQ + GPTQ stack. It was run on 2x H100 instead of the official 8x H100 leaderboard configuration, so it is submitted as non-record evidence rather than a SOTA claim.

## Result

| Metric | Value |
|---|---:|
| Seed | 42 |
| Quantized validation BPB | 1.15828615 |
| Quantized validation loss | 2.53478079 |
| Pre-quant post-EMA BPB | 1.15335094 |
| Training stop | 1241 steps |
| Training time | 599917 ms |
| Quantized artifact bytes | 15964464 |
| Compressed code bytes | 33825 |
| Total counted bytes | 15998289 |
| GPUs | 2x H100 80GB |

The total counted bytes (quantized artifact plus compressed code) come in 1711 bytes under the decimal 16MB (16,000,000-byte) limit.
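
As a sanity check on the byte accounting (assuming, as the table implies, that the counted total is simply the quantized artifact plus the compressed code):

```python
artifact_bytes = 15_964_464        # final_model.int6.ptz
compressed_code_bytes = 33_825     # compressed source counted against the cap
total_counted = artifact_bytes + compressed_code_bytes

assert total_counted == 15_998_289
print(16_000_000 - total_counted)  # 1711 bytes of headroom under the decimal 16MB cap
```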

## Notes

- Based on the PR #1956 record folder, itself a compliant rerun of the PR #1908 activation-aware GPTQ/AWQ stack.
- This proxy run used `LQER_TOP_K=2` so the artifact fits under the 16MB cap with the 2-GPU configuration.
- Test-time training was disabled for this proxy run: `TTT_ENABLED=0`.
- The original full training run produced the under-cap artifact, then crashed during the compiled quantized eval path on the local PyTorch 2.8.0 + CUDA 12.8 HPC environment.
- `train_gpt.py` includes a tiny environment-gated bypass of `torch.compile` in the quantized eval path (a sketch of the pattern follows this list). The saved under-cap artifact was then reloaded and evaluated successfully with `PGOLF_DISABLE_QUANT_COMPILE=1`.
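
The bypass referenced above is the `PGOLF_DISABLE_QUANT_COMPILE=1` switch; a minimal sketch of such an environment gate, assuming a callable eval step (the helper name `maybe_compile` is hypothetical and not taken from `train_gpt.py`):

```python
import os

import torch


def maybe_compile(fn):
    """Return `fn` uncompiled when PGOLF_DISABLE_QUANT_COMPILE=1, else torch.compile it."""
    if os.environ.get("PGOLF_DISABLE_QUANT_COMPILE", "0") == "1":
        return fn  # eager fallback for environments where the compiled quantized eval crashes
    return torch.compile(fn)


# The quantized eval path would wrap its forward step once, e.g.:
# quant_eval_step = maybe_compile(quant_eval_step)
```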

## Logs

- `train_seed42_original_compile_crash.log` is the full training/serialization log from the 10-minute 2xH100 run. It includes the under-cap artifact size and the original post-decompression compile crash.
- `eval_seed42_existing_artifact_nocompile.log` reloads that same `final_model.int6.ptz` artifact and reports the clean quantized validation score above.

This is not a leaderboard-winning run. It is a packaged non-record/candidate result showing a valid under-cap artifact and clean quantized evaluation on available 2xH100 HPC resources.
@@ -0,0 +1,140 @@
Hyperparameters:
adam_eps: 1e-08
adam_wd: 0.02
artifact_dir: /home/chandrasen.pandey/pgolf_codex/awq_eval_existing_nocompile_20260430_151903/proxy_s42_eval
attn_clip_sigmas: 13.0
attn_out_gate_enabled: False
attn_out_gate_src: proj
awq_lite_bits: 8
awq_lite_enabled: True
awq_lite_group_size: 64
awq_lite_group_top_k: 2
beta1: 0.9
beta2: 0.99
caseops_enabled: True
compressor: pergroup
data_dir: ./data/
datasets_dir: /home/chandrasen.pandey/pgolf_codex/caseops_data/datasets/datasets/fineweb10B_sp8192_lossless_caps_caseops_v1_reserved
distributed: True
ema_decay: 0.9965
embed_bits: 7
embed_clip_sigmas: 14.0
embed_lr: 0.6
embed_wd: 0.085
enable_looping_at: 0.35
eval_seq_len: 2048
eval_stride: 64
fused_ce_enabled: True
gate_window: 12
gated_attn_enabled: False
gated_attn_init_std: 0.01
gated_attn_quant_gate: True
global_ttt_batch_seqs: 32
global_ttt_chunk_tokens: 32768
global_ttt_epochs: 1
global_ttt_grad_clip: 1.0
global_ttt_lr: 0.001
global_ttt_momentum: 0.9
global_ttt_respect_doc_boundaries: True
global_ttt_warmup_chunks: 0
global_ttt_warmup_start_lr: 0.0
gptq_calibration_batches: 16
gptq_reserve_seconds: 0.5
grad_accum_steps: 4
grad_clip_norm: 0.3
is_main_process: True
iterations: 20000
ln_scale: True
local_rank: 0
logfile: /home/chandrasen.pandey/pgolf_codex/awq_eval_existing_nocompile_20260430_151903/proxy_s42_eval/proxy_s42_eval.txt
logit_softcap: 30.0
loop_end: 5
loop_start: 3
lqer_asym_enabled: True
lqer_asym_group: 64
lqer_enabled: True
lqer_factor_bits: 4
lqer_gain_select: False
lqer_rank: 4
lqer_scope: all
lqer_top_k: 2
matrix_bits: 6
matrix_clip_sigmas: 12.85
matrix_lr: 0.026
max_wallclock_seconds: 600.0
min_lr: 0.1
mlp_clip_sigmas: 11.5
mlp_mult: 4.0
model_dim: 512
model_path: /home/chandrasen.pandey/pgolf_codex/awq_eval_existing_nocompile_20260430_151903/proxy_s42_eval/final_model.pt
muon_backend_steps: 5
muon_momentum: 0.97
muon_momentum_warmup_start: 0.92
muon_momentum_warmup_steps: 1500
muon_row_normalize: True
muon_wd: 0.095
num_heads: 8
num_kv_heads: 4
num_layers: 11
num_loops: 2
parallel_final_lane: mean
parallel_start_layer: 8
phased_ttt_num_phases: 3
phased_ttt_prefix_docs: 2500
qk_gain_init: 5.0
quantized_model_path: /home/chandrasen.pandey/pgolf_codex/awq_eval_existing_nocompile_20260430_151903/proxy_s42_eval/final_model.int6.ptz
rank: 0
rope_base: 10000.0
rope_dims: 16
rope_train_seq_len: 2048
rope_yarn: False
run_id: proxy_s42_eval
scalar_lr: 0.02
seed: 42
skip_gates_enabled: True
smear_gate_enabled: True
sparse_attn_gate_enabled: True
sparse_attn_gate_init_std: 0.0
sparse_attn_gate_scale: 0.5
tie_embeddings: True
tied_embed_init_std: 0.005
tied_embed_lr: 0.03
tokenizer_path: /home/chandrasen.pandey/pgolf_codex/caseops_data/datasets/tokenizers/fineweb_8192_bpe_lossless_caps_caseops_v1_reserved.model
train_batch_tokens: 786432
train_files: /home/chandrasen.pandey/pgolf_codex/caseops_data/datasets/datasets/fineweb10B_sp8192_lossless_caps_caseops_v1_reserved/fineweb_train_*.bin
train_log_every: 20
train_seq_len: 2048
ttt_batch_size: 64
ttt_beta1: 0.0
ttt_beta2: 0.99
ttt_chunk_size: 48
ttt_enabled: False
ttt_eval_batches:
ttt_eval_seq_len: 2048
ttt_grad_steps: 1
ttt_k_lora: True
ttt_lora_lr: 0.0001
ttt_lora_rank: 80
ttt_mlp_lora: True
ttt_o_lora: True
ttt_optimizer: adam
ttt_weight_decay: 0.5
val_batch_tokens: 262144
val_bytes_files: /home/chandrasen.pandey/pgolf_codex/caseops_data/datasets/datasets/fineweb10B_sp8192_lossless_caps_caseops_v1_reserved/fineweb_val_bytes_*.bin
val_doc_fraction: 1.0
val_files: /home/chandrasen.pandey/pgolf_codex/caseops_data/datasets/datasets/fineweb10B_sp8192_lossless_caps_caseops_v1_reserved/fineweb_val_*.bin
val_loss_every: 0
vocab_size: 8192
warmdown_frac: 0.85
warmup_steps: 8
world_size: 2
xsa_last_n: 11
train_shards: 80
val_tokens: 47851520
TTT_EVAL_ONLY=1 — skipping training + GPTQ, loading saved artifact for TTT eval
ttt_lora_alpha: 144.0
ttt_warm_start_a: True
ttt_weight_decay: 0.5
Deserialize: per-group lrzip decompression...
Deserialize: decompression done in 19.0s
diagnostic quantized val_loss:2.53478079 val_bpb:1.15828615 eval_time:44528ms
@@ -0,0 +1,10 @@
torch==2.9.1+cu128
numpy
brotli
sentencepiece
huggingface-hub
# FlashAttention 3 (install separately):
# pip install --no-deps flash_attn_3 \
# --find-links https://windreamer.github.io/flash-attention3-wheels/cu128_torch291/
# System binary required:
# apt-get install lrzip
@@ -0,0 +1,45 @@
{
  "track": "non_record_16mb",
  "date": "2026-04-30",
  "name": "AWQ 2xH100 Proxy, No-Compile Quantized Eval",
  "author": "Chandrasen Pandey",
  "github_id": "Devchandrasen",
  "val_loss": 2.53478079,
  "val_bpb": 1.15828615,
  "pre_quant_val_loss": 2.52398063,
  "pre_quant_val_bpb": 1.15335094,
  "seeds": [42],
  "seed_results": {
    "42": {
      "val_loss": 2.53478079,
      "val_bpb": 1.15828615,
      "pre_quant_val_loss": 2.52398063,
      "pre_quant_val_bpb": 1.15335094,
      "artifact_bytes": 15964464,
      "compressed_code_bytes": 33825,
      "total_counted_bytes": 15998289,
      "train_time_ms": 599917,
      "step": 1241,
      "eval_time_ms": 44528
    }
  },
  "compliance": {
    "artifact_limit_bytes": 16000000,
    "total_counted_bytes": 15998289,
    "under_16mb_artifact": true,
    "train_wallclock_cap_ms": 600000,
    "train_time_ms_observed": 599917,
    "under_600s_train_wallclock": true,
    "record_claim": false,
    "reason_non_record": "Single 2xH100 proxy run; not an 8xH100 SOTA claim."
  },
  "hardware": {
    "gpu": "2x NVIDIA H100 80GB",
    "cluster": "UPES HPC gpu01",
    "pytorch": "2.8.0+cu128"
  },
  "lineage": {
    "base": "PR #1956 / PR #1908 AWQ + GPTQ stack",
    "delta": "2xH100 proxy run with LQER_TOP_K=2 and TTT disabled; train_gpt.py adds an environment-gated no-compile quantized eval path for PyTorch 2.8.0 stability."
  }
}