Skip to content

Commit c8305fc

Browse files
author
sesterce
committed
Fix README
1 parent 27528ed commit c8305fc

11 files changed

Lines changed: 123 additions & 11 deletions
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1304055
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"n": 100,
3+
"language": "Chinese",
4+
"wer": null,
5+
"cer": 0.08771929824561403,
6+
"wall_seconds": 19.74980926513672,
7+
"tag": "qwen3_0p6b_gspo_ch_dev100",
8+
"model_path": "Qwen/Qwen3-ASR-0.6B",
9+
"jsonl": "/data/speech2text/Qwen3-ASR/finetuning/data/cv21-zh/dev/dev.jsonl"
10+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"n": 100,
3+
"language": "French",
4+
"wer": 0.061341571050308914,
5+
"cer": 0.024457308248914615,
6+
"wall_seconds": 49.11708307266235,
7+
"tag": "qwen3_0p6b_gspo_fr_dev100",
8+
"model_path": "Qwen/Qwen3-ASR-0.6B",
9+
"jsonl": "/data/speech2text/Qwen3-ASR/finetuning/data/fleurs-fr/dev/dev.jsonl"
10+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"n": 100,
3+
"language": "Chinese",
4+
"wer": null,
5+
"cer": 0.07622504537205081,
6+
"wall_seconds": 19.298258543014526,
7+
"tag": "qwen3_0p6b_mwer_ch_dev100",
8+
"model_path": "Qwen/Qwen3-ASR-0.6B",
9+
"jsonl": "/data/speech2text/Qwen3-ASR/finetuning/data/cv21-zh/dev/dev.jsonl"
10+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"n": 100,
3+
"language": "French",
4+
"wer": 0.06222418358340689,
5+
"cer": 0.025108538350217077,
6+
"wall_seconds": 48.716057777404785,
7+
"tag": "qwen3_0p6b_mwer_fr_dev100",
8+
"model_path": "Qwen/Qwen3-ASR-0.6B",
9+
"jsonl": "/data/speech2text/Qwen3-ASR/finetuning/data/fleurs-fr/dev/dev.jsonl"
10+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1285994
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1380779
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"n": 100,
3+
"language": "French",
4+
"wer": 0.06928508384819064,
5+
"cer": 0.02959479015918958,
6+
"wall_seconds": 50.924898624420166,
7+
"tag": "smoke_mwer_fr2",
8+
"model_path": "Qwen/Qwen3-ASR-0.6B",
9+
"jsonl": "finetuning/data/fleurs-fr/dev/dev.jsonl"
10+
}
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#!/usr/bin/env bash
#
# Sequential RL training queue: each job runs to completion and its result
# JSON is checked before the next job is started.
# Intended to keep running after the SSH session is gone (launch via setsid).
#
# Shell options: unset variables are errors and a pipeline reports the failure
# of any stage. Note that -e is not enabled, so a failing command does not
# abort the script.
set -u
set -o pipefail

# Interpreter used to launch the trainers.
PY="/data/venv/bin/python"

# Directory layout; everything below hangs off the finetuning root.
ROOT="/data/speech2text/Qwen3-ASR/finetuning"
OUT="${ROOT}/outputs"
DATA="${ROOT}/data"
LOG_DIR="${OUT}/logs"
ADAPTERS="${OUT}/adapters"

# Environment handed down to the child processes.
HF_HOME="/data/speech2text/outputs/cache"
PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
export HF_HOME PYTORCH_CUDA_ALLOC_CONF

# Log and adapter directories must exist before the first job writes to them.
mkdir -p "${LOG_DIR}" "${ADAPTERS}"
16+
17+
#######################################
# Train one RL adapter and report the dev-set metrics it produced.
#
# The job is skipped when its result JSON already exists, so re-running the
# queue only executes the jobs that have not produced a result yet.
#
# Globals:
#   PY, ROOT, DATA, OUT, ADAPTERS, LOG_DIR (all read)
# Arguments:
#   $1 - algorithm; selects the trainer ${ROOT}/qwen3_asr_<algo>.py
#   $2 - language passed to --language; its first two lowercased letters
#        become the language part of the tag
#   $3 - dataset directory name under ${DATA}
#   $4 - model-size label; used only to build the tag and adapter directory
#   $5 - value passed to --epochs
#   $6 - optional extra trainer flags as one whitespace-separated string
# Outputs:
#   Progress lines on stdout; trainer stdout/stderr go to a timestamped file
#   in ${LOG_DIR}.
# Returns:
#   0 when the job is skipped, otherwise the trainer's exit status.
#######################################
run_job() {
  local algo="$1" lang="$2" profile="$3" size="$4" epochs="$5" extra="${6:-}"
  local lang_short="${lang,,}"
  lang_short="${lang_short:0:2}"
  local tag="qwen3_${size}_${algo}_${lang_short}_dev100"
  local out_json="${OUT}/${tag}.json"
  local adapter_dir="${ADAPTERS}/qwen3_${size}_${algo}_${lang_short}"
  # Declared separately so `local` does not mask the status of $(date).
  local log
  log="${LOG_DIR}/${tag}_$(date -u +%Y%m%d_%H%M%S).log"

  # Split the extra flags on whitespace into an array. Unlike an unquoted
  # ${extra}, this never pathname-expands a flag value such as '*'.
  local -a extra_args=()
  read -r -a extra_args <<< "${extra}"

  if [[ -f "${out_json}" ]]; then
    echo "[skip] ${tag} already done → ${out_json}"
    return 0
  fi

  echo "[start] ${tag} $(date -u)"
  # NOTE(review): --model_path is fixed to the 0.6B checkpoint regardless of
  # the size label in $4 — confirm before queueing other sizes.
  # The ${arr[@]+...} form keeps an empty array safe under `set -u` on older
  # bash releases.
  local ec=0
  "${PY}" "${ROOT}/qwen3_asr_${algo}.py" \
    --model_path "Qwen/Qwen3-ASR-0.6B" \
    --train_file "${DATA}/${profile}/train/train.jsonl" \
    --eval_file "${DATA}/${profile}/dev/dev.jsonl" \
    --output_dir "${adapter_dir}" \
    --tag "${tag}" \
    --language "${lang}" \
    --epochs "${epochs}" \
    --grad_acc 4 --lr 5e-6 \
    --log_steps 25 --eval_steps 0 \
    --eval_out_dir "${OUT}" \
    ${extra_args[@]+"${extra_args[@]}"} \
    > "${log}" 2>&1 || ec=$?
  echo "[done] ${tag} exit=${ec} $(date -u)"

  if [[ -f "${out_json}" ]]; then
    echo "[ok] result saved to ${out_json}"
    # Summarise with the same interpreter that ran the trainer; the JSON is
    # fed on stdin by redirection rather than through `cat`.
    "${PY}" -c "import json,sys; d=json.load(sys.stdin); print(f' WER={d.get(\"wer\",\"n/a\")} CER={d.get(\"cer\",\"n/a\")} n={d.get(\"n\",\"?\")}')" < "${out_json}"
  else
    echo "[warn] no output JSON at ${out_json}"
  fi

  # Hand the trainer's status to the caller instead of the status of the
  # last echo.
  return "${ec}"
}
53+
54+
# Queue: MWER-FR, MWER-ZH, GSPO-ZH. GSPO-FR is not queued here because it was
# already running when this script was written.
#
# Wait for that GSPO-FR process to finish first (it holds 11GB GPU). The PID
# defaults to 1304055 and can be overridden with GSPO_FR_PID. Set GSPO_FR_PID
# to the empty string to skip the wait, e.g. on a later re-run when the
# default PID may have been reused by an unrelated process.
gspo_fr_pid="${GSPO_FR_PID-1304055}"
if [[ -n "${gspo_fr_pid}" ]]; then
  echo "Waiting for GSPO-FR (PID ${gspo_fr_pid}) to finish..."
  # kill -0 delivers no signal; it only reports whether the PID can be
  # signalled, so the loop polls every 30 s until that stops being true.
  while kill -0 "${gspo_fr_pid}" 2>/dev/null; do sleep 30; done
fi
echo "GSPO-FR done. Starting MWER-FR..."

run_job mwer French fleurs-fr 0p6b 0.25 "--n_best 4 --mwer_batch_size 1"
run_job mwer Chinese cv21-zh 0p6b 0.25 "--n_best 4 --mwer_batch_size 1"
run_job gspo Chinese cv21-zh 0p6b 0.25 "--group_size 4"

echo "All jobs complete."

README.md

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
**[HTML version (GitHub Pages)](https://evergreentree.github.io/speech2text/)** — static render of [`index.html`](index.html) (repo **Settings → Pages → GitHub Actions**).
1+
***Please refer to the [HTML version](https://evergreentree.github.io/speech2text/) for a better reading experience.***
22

33
# Fine-Tuning Efficient Chinese Speech Models beyond the Pareto Frontier
44

@@ -668,13 +668,6 @@ them across the N-best scoring and CE paths, and defaults MWER N-best
668668
generation to sampling rather than beam search. GSPO mirrors that structure
669669
with `--gspo_batch_size` and cached audio features. Sequence scoring is row-chunked (`QWEN_ASR_SCORE_ROW_CHUNK=2` by default), MWER backpropagates the large sequence-risk graph before building the CE graph, and both trainers explicitly release CUDA cache between microbatches so VRAM does not monotonically grow.
670670

671-
Follow-up profiling on 2026-05-11 used four-audio GSPO microbatches and two-audio MWER microbatches for the completed 0.6B run. Short French measurements before final eval:
672-
673-
| Trainer | Batch setting | Optimizer-step timing | GPU util sample | VRAM peak | Observed 0.5-epoch behavior |
674-
|---|---:|---:|---:|---:|---:|
675-
| MWER | `--mwer_batch_size 2`, `n_best=4` | 10 steps / 62-78 s | ~40 % sampled during full run | 22.5 GB | French and Chinese runs completed without monotonic VRAM growth |
676-
| GSPO | `--gspo_batch_size 4`, `group_size=4` | 10 steps / 56-96 s | mostly 40-47 % after warmup | 20.4 GB | French and Chinese runs completed without monotonic VRAM growth |
677-
678671
The 0.6B four-run automation is
679672
[`run_rl_0p6b_fast.sh`](Qwen3-ASR/finetuning/run_rl_0p6b_fast.sh): French MWER,
680673
Chinese MWER, French GSPO, then Chinese GSPO, all with the profiled microbatch
@@ -764,7 +757,7 @@ or pass `--share` to `src.server` for a `*.gradio.live` HTTPS URL.
764757
**System dep:** Gradio decodes uploaded audio via `ffmpeg`. On a fresh machine:
765758
`sudo apt-get install -y ffmpeg`.
766759

767-
## 9. Limitations and why the French plot is last
760+
## 9. Limitations
768761

769762
![fr-FR benchmark + fine-tunes](asr_bench/figures/wer_vs_size_fr.png)
770763

@@ -788,4 +781,5 @@ The core reasons are the same ones developed earlier in §3 and §5:
788781

789782
So the French plot is useful, but mainly as a warning: a strong multilingual
790783
ASR baseline on in-distribution data can make a naive fine-tune look busy
791-
without making it better.
784+
without making it better. A promising direction for future work is to run RL
785+
on top of the already successful Qwen SFT result.

0 commit comments

Comments
 (0)