Add streaming HLG decoding for zipformer CTC. (#1557)

Note it supports only CPU.
k2-fsa · Mar 18, 2024 · 489263e · 489263e
1 parent 4917ac8
commit 489263e
Show file tree

Hide file tree

Showing 6 changed files with 492 additions and 8 deletions.
diff --git a/.github/scripts/docker/Dockerfile b/.github/scripts/docker/Dockerfile
@@ -36,7 +36,9 @@ RUN pip install --no-cache-dir \
       \
       git+https://github.com/lhotse-speech/lhotse \
       kaldifeat==${_KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cpu.html \
+      cython \
       dill \
+      espnet_tts_frontend \
       graphviz \
       kaldi-decoder \
       kaldi_native_io \
@@ -45,13 +47,15 @@ RUN pip install --no-cache-dir \
       kaldilm \
       matplotlib \
       multi_quantization \
+      numba \
       numpy \
       onnx \
       onnxmltools \
       onnxruntime \
+      piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html \
+      pypinyin==0.50.0 \
       pytest \
       sentencepiece>=0.1.96 \
-      pypinyin==0.50.0 \
       six \
       tensorboard \
       typeguard

diff --git a/.github/scripts/docker/generate_build_matrix.py b/.github/scripts/docker/generate_build_matrix.py
@@ -45,7 +45,7 @@ def get_torchaudio_version(torch_version):
 def get_matrix():
     k2_version = "1.24.4.dev20240223"
     kaldifeat_version = "1.25.4.dev20240223"
-    version = "20240223"
+    version = "20240318"
     python_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
     torch_version = ["1.13.0", "1.13.1", "2.0.0", "2.0.1", "2.1.0", "2.1.1", "2.1.2"]
     torch_version += ["2.2.0", "2.2.1"]

diff --git a/.github/scripts/librispeech/ASR/run.sh b/.github/scripts/librispeech/ASR/run.sh
@@ -64,6 +64,46 @@ function run_diagnostics() {
     --print-diagnostics 1
 }
 
+function test_streaming_zipformer_ctc_hlg() {
+  # CI smoke test: export the streaming zipformer CTC model to ONNX, then run streaming HLG decoding on the test wavs.
+  repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-streaming-zipformer-small-2024-03-18
+  log "Downloading pre-trained model from $repo_url"
+  git lfs install
+  git clone "$repo_url"
+  repo=$(basename "$repo_url")
+  # Remove any *.onnx files from the checkout; presumably so the model used below comes from the export step.
+  rm "$repo"/exp-ctc-rnnt-small/*.onnx
+  ls -lh "$repo/exp-ctc-rnnt-small"
+
+  # Export the model to ONNX; the epoch/avg/chunk/left-context values match the file name passed to --nn-model below.
+  ./zipformer/export-onnx-streaming-ctc.py \
+    --tokens "$repo/data/lang_bpe_500/tokens.txt" \
+    --epoch 30 \
+    --avg 3 \
+    --exp-dir "$repo/exp-ctc-rnnt-small" \
+    --causal 1 \
+    --use-ctc 1 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    \
+    --num-encoder-layers 2,2,2,2,2,2 \
+    --feedforward-dim 512,768,768,768,768,768 \
+    --encoder-dim 192,256,256,256,256,256 \
+    --encoder-unmasked-dim 192,192,192,192,192,192
+
+  ls -lh "$repo/exp-ctc-rnnt-small"
+  # Decode each test wav with the int8 ONNX model and the HLG graph from the downloaded repo.
+  for wav in 0.wav 1.wav 8k.wav; do
+    python3 ./zipformer/onnx_pretrained_ctc_HLG_streaming.py \
+      --nn-model "$repo/exp-ctc-rnnt-small/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx" \
+      --words "$repo/data/lang_bpe_500/words.txt" \
+      --HLG "$repo/data/lang_bpe_500/HLG.fst" \
+      "$repo/test_wavs/$wav"
+  done
+
+  rm -rf -- "${repo:?}"  # :? aborts instead of running rm -rf with an empty path
+}
+
 function test_pruned_transducer_stateless_2022_03_12() {
   repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12
 
@@ -1577,6 +1617,7 @@ function test_transducer_bpe_500_2021_12_23() {
 
 prepare_data
 run_diagnostics
+test_streaming_zipformer_ctc_hlg
 test_pruned_transducer_stateless_2022_03_12
 test_pruned_transducer_stateless2_2022_04_29
 test_pruned_transducer_stateless3_2022_04_29

diff --git a/egs/librispeech/ASR/zipformer/export-onnx-streaming-ctc.py b/egs/librispeech/ASR/zipformer/export-onnx-streaming-ctc.py
@@ -32,7 +32,7 @@
   --joiner-dim 512 \
   --causal True \
   --chunk-size 16 \
-  --left-context-frames 64 \
+  --left-context-frames 128 \
   --use-ctc 1
 
 The --chunk-size in training is "16,32,64,-1", so we select one of them
@@ -41,7 +41,7 @@
 
 It will generate the following file inside $repo/exp:
 
-  - ctc-epoch-99-avg-1-chunk-16-left-64.onnx
+  - ctc-epoch-99-avg-1-chunk-16-left-128.onnx
 
 See ./onnx_pretrained-streaming-ctc.py for how to use the exported ONNX models.
 """

diff --git a/egs/librispeech/ASR/zipformer/export-onnx-streaming.py b/egs/librispeech/ASR/zipformer/export-onnx-streaming.py
@@ -48,17 +48,17 @@
   --joiner-dim 512 \
   --causal True \
   --chunk-size 16 \
-  --left-context-frames 64
+  --left-context-frames 128
 
 The --chunk-size in training is "16,32,64,-1", so we select one of them
 (excluding -1) during streaming export. The same applies to `--left-context`,
 whose value is "64,128,256,-1".
 
 It will generate the following 3 files inside $repo/exp:
 
-  - encoder-epoch-99-avg-1-chunk-16-left-64.onnx
-  - decoder-epoch-99-avg-1-chunk-16-left-64.onnx
-  - joiner-epoch-99-avg-1-chunk-16-left-64.onnx
+  - encoder-epoch-99-avg-1-chunk-16-left-128.onnx
+  - decoder-epoch-99-avg-1-chunk-16-left-128.onnx
+  - joiner-epoch-99-avg-1-chunk-16-left-128.onnx
 
 See ./onnx_pretrained-streaming.py for how to use the exported ONNX models.
 """