Fix README

sesterce · sesterce · commit c8305fcb3874 · 2026-05-11T23:49:32.000Z
diff --git a/Qwen3-ASR/finetuning/outputs/gspo_fr_0p6b.pid b/Qwen3-ASR/finetuning/outputs/gspo_fr_0p6b.pid
@@ -0,0 +1 @@
+1304055
diff --git a/Qwen3-ASR/finetuning/outputs/qwen3_0p6b_gspo_ch_dev100.json b/Qwen3-ASR/finetuning/outputs/qwen3_0p6b_gspo_ch_dev100.json
@@ -0,0 +1,10 @@
+{
+  "n": 100,
+  "language": "Chinese",
+  "wer": null,
+  "cer": 0.08771929824561403,
+  "wall_seconds": 19.74980926513672,
+  "tag": "qwen3_0p6b_gspo_ch_dev100",
+  "model_path": "Qwen/Qwen3-ASR-0.6B",
+  "jsonl": "/data/speech2text/Qwen3-ASR/finetuning/data/cv21-zh/dev/dev.jsonl"
+}
diff --git a/Qwen3-ASR/finetuning/outputs/qwen3_0p6b_gspo_fr_dev100.json b/Qwen3-ASR/finetuning/outputs/qwen3_0p6b_gspo_fr_dev100.json
@@ -0,0 +1,10 @@
+{
+  "n": 100,
+  "language": "French",
+  "wer": 0.061341571050308914,
+  "cer": 0.024457308248914615,
+  "wall_seconds": 49.11708307266235,
+  "tag": "qwen3_0p6b_gspo_fr_dev100",
+  "model_path": "Qwen/Qwen3-ASR-0.6B",
+  "jsonl": "/data/speech2text/Qwen3-ASR/finetuning/data/fleurs-fr/dev/dev.jsonl"
+}
diff --git a/Qwen3-ASR/finetuning/outputs/qwen3_0p6b_mwer_ch_dev100.json b/Qwen3-ASR/finetuning/outputs/qwen3_0p6b_mwer_ch_dev100.json
@@ -0,0 +1,10 @@
+{
+  "n": 100,
+  "language": "Chinese",
+  "wer": null,
+  "cer": 0.07622504537205081,
+  "wall_seconds": 19.298258543014526,
+  "tag": "qwen3_0p6b_mwer_ch_dev100",
+  "model_path": "Qwen/Qwen3-ASR-0.6B",
+  "jsonl": "/data/speech2text/Qwen3-ASR/finetuning/data/cv21-zh/dev/dev.jsonl"
+}
diff --git a/Qwen3-ASR/finetuning/outputs/qwen3_0p6b_mwer_fr_dev100.json b/Qwen3-ASR/finetuning/outputs/qwen3_0p6b_mwer_fr_dev100.json
@@ -0,0 +1,10 @@
+{
+  "n": 100,
+  "language": "French",
+  "wer": 0.06222418358340689,
+  "cer": 0.025108538350217077,
+  "wall_seconds": 48.716057777404785,
+  "tag": "qwen3_0p6b_mwer_fr_dev100",
+  "model_path": "Qwen/Qwen3-ASR-0.6B",
+  "jsonl": "/data/speech2text/Qwen3-ASR/finetuning/data/fleurs-fr/dev/dev.jsonl"
+}
diff --git a/Qwen3-ASR/finetuning/outputs/run_rl_0p6b.pid b/Qwen3-ASR/finetuning/outputs/run_rl_0p6b.pid
@@ -0,0 +1 @@
+1285994
diff --git a/Qwen3-ASR/finetuning/outputs/run_rl_0p6b_fast.pid b/Qwen3-ASR/finetuning/outputs/run_rl_0p6b_fast.pid
@@ -0,0 +1 @@
+1380779
diff --git a/Qwen3-ASR/finetuning/outputs/smoke_mwer_fr2.json b/Qwen3-ASR/finetuning/outputs/smoke_mwer_fr2.json
@@ -0,0 +1,10 @@
+{
+  "n": 100,
+  "language": "French",
+  "wer": 0.06928508384819064,
+  "cer": 0.02959479015918958,
+  "wall_seconds": 50.924898624420166,
+  "tag": "smoke_mwer_fr2",
+  "model_path": "Qwen/Qwen3-ASR-0.6B",
+  "jsonl": "finetuning/data/fleurs-fr/dev/dev.jsonl"
+}
diff --git a/Qwen3-ASR/finetuning/run_rl_sequential.sh b/Qwen3-ASR/finetuning/run_rl_sequential.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+# Sequential RL training: run each job to completion, report its output JSON, then start the next.
+# Designed to survive SSH session loss (run with setsid).
+set -uo pipefail
+
+PY=/data/venv/bin/python
+ROOT=/data/speech2text/Qwen3-ASR/finetuning
+OUT=${ROOT}/outputs
+DATA=${ROOT}/data
+LOG_DIR=${OUT}/logs
+ADAPTERS=${OUT}/adapters
+export HF_HOME=/data/speech2text/outputs/cache
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
+mkdir -p "${LOG_DIR}" "${ADAPTERS}"
+
+run_job() {
+  local algo="$1" lang="$2" profile="$3" size="$4" epochs="$5" extra="${6:-}"
+  local lang_short="${lang,,}"; lang_short="${lang_short:0:2}"
+  local tag="qwen3_${size}_${algo}_${lang_short}_dev100"
+  local out_json="${OUT}/${tag}.json"
+  local adapter_dir="${ADAPTERS}/qwen3_${size}_${algo}_${lang_short}"
+  local log="${LOG_DIR}/${tag}_$(date -u +%Y%m%d_%H%M%S).log"
+
+  if [[ -f "${out_json}" ]]; then
+    echo "[skip] ${tag} already done → ${out_json}"
+    return 0
+  fi
+
+  echo "[start] ${tag}  $(date -u)"
+  "${PY}" "${ROOT}/qwen3_asr_${algo}.py" \
+    --model_path  "Qwen/Qwen3-ASR-0.6B" \
+    --train_file  "${DATA}/${profile}/train/train.jsonl" \
+    --eval_file   "${DATA}/${profile}/dev/dev.jsonl" \
+    --output_dir  "${adapter_dir}" \
+    --tag         "${tag}" \
+    --language    "${lang}" \
+    --epochs      "${epochs}" \
+    --grad_acc    4 --lr 5e-6 \
+    --log_steps   25 --eval_steps 0 \
+    --eval_out_dir "${OUT}" \
+    ${extra} \
+    > "${log}" 2>&1
+  local ec=$?
+  echo "[done] ${tag} exit=${ec}  $(date -u)"
+  if [[ -f "${out_json}" ]]; then
+    echo "[ok] result saved to ${out_json}"
+    cat "${out_json}" | python3 -c "import json,sys; d=json.load(sys.stdin); print(f'  WER={d.get(\"wer\",\"n/a\")}  CER={d.get(\"cer\",\"n/a\")}  n={d.get(\"n\",\"?\")}')";
+  else
+    echo "[warn] no output JSON at ${out_json}"
+  fi
+}
+
+# Skip GSPO-FR — already running (PID 1304055)
+# Queue: MWER-FR, MWER-ZH, GSPO-ZH
+# Wait for GSPO-FR to finish first (it holds 11GB GPU)
+echo "Waiting for GSPO-FR (PID 1304055) to finish..."
+while kill -0 1304055 2>/dev/null; do sleep 30; done
+echo "GSPO-FR done. Starting MWER-FR..."
+
+run_job mwer French  fleurs-fr 0p6b 0.25 "--n_best 4 --mwer_batch_size 1"
+run_job mwer Chinese cv21-zh   0p6b 0.25 "--n_best 4 --mwer_batch_size 1"
+run_job gspo Chinese cv21-zh   0p6b 0.25 "--group_size 4"
+
+echo "All jobs complete."
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-**[HTML version (GitHub Pages)](https://evergreentree.github.io/speech2text/)** — static render of [`index.html`](index.html) (repo **Settings → Pages → GitHub Actions**).
+***Please refer to the [HTML version](https://evergreentree.github.io/speech2text/) for a better reading experience.***
 
 # Fine-Tuning Efficient Chinese Speech Models beyond the Pareto Frontier
 
@@ -668,13 +668,6 @@ them across the N-best scoring and CE paths, and defaults MWER N-best
 generation to sampling rather than beam search. GSPO mirrors that structure
 with `--gspo_batch_size` and cached audio features. Sequence scoring is row-chunked (`QWEN_ASR_SCORE_ROW_CHUNK=2` by default), MWER backpropagates the large sequence-risk graph before building the CE graph, and both trainers explicitly release CUDA cache between microbatches so VRAM does not monotonically grow.
 
-Follow-up profiling on 2026-05-11 used four-audio GSPO microbatches and two-audio MWER microbatches for the completed 0.6B run. Short French measurements before final eval:
-
-| Trainer | Batch setting | Optimizer-step timing | GPU util sample | VRAM peak | Observed 0.5-epoch behavior |
-|---|---:|---:|---:|---:|---:|
-| MWER | `--mwer_batch_size 2`, `n_best=4` | 10 steps / 62-78 s | ~40 % sampled during full run | 22.5 GB | French and Chinese runs completed without monotonic VRAM growth |
-| GSPO | `--gspo_batch_size 4`, `group_size=4` | 10 steps / 56-96 s | mostly 40-47 % after warmup | 20.4 GB | French and Chinese runs completed without monotonic VRAM growth |
-
 The 0.6B four-run automation is
 [`run_rl_0p6b_fast.sh`](Qwen3-ASR/finetuning/run_rl_0p6b_fast.sh): French MWER,
 Chinese MWER, French GSPO, then Chinese GSPO, all with the profiled microbatch
@@ -764,7 +757,7 @@ or pass `--share` to `src.server` for a `*.gradio.live` HTTPS URL.
 **System dep:** Gradio decodes uploaded audio via `ffmpeg`. On a fresh machine:
 `sudo apt-get install -y ffmpeg`.
 
-## 9. Limitations and why the French plot is last
+## 9. Limitations
 
 ![fr-FR benchmark + fine-tunes](asr_bench/figures/wer_vs_size_fr.png)
 
@@ -788,4 +781,5 @@ The core reasons are the same ones developed earlier in §3 and §5:
 
 So the French plot is useful, but mainly as a warning: a strong multilingual
 ASR baseline on in-distribution data can make a naive fine-tune look busy
-without making it better.
+without making it better. A more promising direction for future work is to
+apply RL on top of the already successful Qwen SFT result.
diff --git a/index.html b/index.html
@@ -954,7 +954,7 @@ <h2>9. Limitations and why the French plot is last</h2>
 
     <div class="callout warning">
       <div class="callout-icon"></div>
-      <div class="callout-body"><p>So the French plot is useful, but mainly as a warning: a strong multilingual ASR baseline on in-distribution data can make a naive fine-tune look busy without making it better.</p></div>
+      <div class="callout-body"><p>So the French plot is useful, but mainly as a warning: a strong multilingual ASR baseline on in-distribution data can make a naive fine-tune look busy without making it better. A more promising direction for future work is to apply RL on top of the already successful Qwen SFT result.</p></div>
     </div>
   </section>