
Commit

Merge branch 'k2-fsa:master' into k2ssl
yfyeung authored Mar 19, 2024
2 parents 482c24e + 413220d commit 898b0ce
Showing 103 changed files with 11,080 additions and 338 deletions.
6 changes: 5 additions & 1 deletion .github/scripts/docker/Dockerfile
@@ -36,7 +36,9 @@ RUN pip install --no-cache-dir \
\
git+https://github.com/lhotse-speech/lhotse \
kaldifeat==${_KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cpu.html \
cython \
dill \
espnet_tts_frontend \
graphviz \
kaldi-decoder \
kaldi_native_io \
@@ -45,13 +47,15 @@ RUN pip install --no-cache-dir \
kaldilm \
matplotlib \
multi_quantization \
numba \
numpy \
onnx \
onnxmltools \
onnxruntime \
piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html \
pypinyin==0.50.0 \
pytest \
sentencepiece>=0.1.96 \
pypinyin==0.50.0 \
six \
tensorboard \
typeguard
2 changes: 1 addition & 1 deletion .github/scripts/docker/generate_build_matrix.py
@@ -45,7 +45,7 @@ def get_torchaudio_version(torch_version):
def get_matrix():
k2_version = "1.24.4.dev20240223"
kaldifeat_version = "1.25.4.dev20240223"
version = "20240223"
version = "20240318"
python_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
torch_version = ["1.13.0", "1.13.1", "2.0.0", "2.0.1", "2.1.0", "2.1.1", "2.1.2"]
torch_version += ["2.2.0", "2.2.1"]
41 changes: 41 additions & 0 deletions .github/scripts/librispeech/ASR/run.sh
@@ -64,6 +64,46 @@ function run_diagnostics() {
--print-diagnostics 1
}

function test_streaming_zipformer_ctc_hlg() {
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-streaming-zipformer-small-2024-03-18

log "Downloading pre-trained model from $repo_url"
git lfs install
git clone $repo_url
repo=$(basename $repo_url)

rm $repo/exp-ctc-rnnt-small/*.onnx
ls -lh $repo/exp-ctc-rnnt-small

# export models to onnx
./zipformer/export-onnx-streaming-ctc.py \
--tokens $repo/data/lang_bpe_500/tokens.txt \
--epoch 30 \
--avg 3 \
--exp-dir $repo/exp-ctc-rnnt-small \
--causal 1 \
--use-ctc 1 \
--chunk-size 16 \
--left-context-frames 128 \
\
--num-encoder-layers 2,2,2,2,2,2 \
--feedforward-dim 512,768,768,768,768,768 \
--encoder-dim 192,256,256,256,256,256 \
--encoder-unmasked-dim 192,192,192,192,192,192

ls -lh $repo/exp-ctc-rnnt-small

for wav in 0.wav 1.wav 8k.wav; do
python3 ./zipformer/onnx_pretrained_ctc_HLG_streaming.py \
--nn-model $repo/exp-ctc-rnnt-small/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx \
--words $repo/data/lang_bpe_500/words.txt \
--HLG $repo/data/lang_bpe_500/HLG.fst \
$repo/test_wavs/$wav
done

rm -rf $repo
}

function test_pruned_transducer_stateless_2022_03_12() {
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12

@@ -1577,6 +1617,7 @@ function test_transducer_bpe_500_2021_12_23() {

prepare_data
run_diagnostics
test_streaming_zipformer_ctc_hlg
test_pruned_transducer_stateless_2022_03_12
test_pruned_transducer_stateless2_2022_04_29
test_pruned_transducer_stateless3_2022_04_29
225 changes: 225 additions & 0 deletions docs/source/recipes/Finetune/adapter/finetune_adapter.rst
@@ -0,0 +1,225 @@
Finetune from a pre-trained Zipformer model with adapters
=========================================================

This tutorial shows you how to fine-tune a pre-trained **Zipformer**
transducer model on a new dataset with adapters.
Adapters are compact and efficient module that can be integrated into a pre-trained model
to improve the model's performance on a new domain. Adapters are injected
between different modules in the well-trained neural network. During training, only the parameters
in the adapters will be updated. It achieves competitive performance
while requiring much less GPU memory than full fine-tuning. For more details about adapters,
please refer to the original `paper <https://arxiv.org/pdf/1902.00751.pdf#/>`_ for more details.
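
Below is a minimal sketch of such a bottleneck adapter with a residual connection, written in
PyTorch. It only illustrates the idea (the module and argument names are ours, not the recipe's);
the actual adapter used in ``zipformer_adapter`` may differ in details:

.. code-block:: python

    import torch
    import torch.nn as nn


    class Adapter(nn.Module):
        """A bottleneck adapter: project down, apply a non-linearity,
        project back up, and add the result to the input (residual)."""

        def __init__(self, embed_dim: int, adapter_dim: int = 8):
            super().__init__()
            self.down = nn.Linear(embed_dim, adapter_dim)  # down-projection to the bottleneck
            self.activation = nn.ReLU()
            self.up = nn.Linear(adapter_dim, embed_dim)  # up-projection back to the model dimension
            # Zero-init the up-projection so the adapter starts as an identity mapping
            # (an assumption of this sketch, not necessarily what the recipe does).
            nn.init.zeros_(self.up.weight)
            nn.init.zeros_(self.up.bias)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return x + self.up(self.activation(self.down(x)))

Because the adapter output is added residually, an adapter that outputs zeros leaves the
pre-trained network unchanged, which is also why the adapters can later be deactivated at
decoding time without hurting the original model's performance.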

.. HINT::

  We assume you have read the page :ref:`install icefall` and have set up
  the environment for ``icefall``.

.. HINT::

  We recommend using one or more GPUs to run this recipe.

For illustration purposes, we fine-tune the Zipformer transducer model
pre-trained on `LibriSpeech`_ on the small subset of `GigaSpeech`_. You can use your
own data for fine-tuning as long as you create a manifest for your new dataset.

Data preparation
----------------

Please follow the instructions in the `GigaSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR>`_
to prepare the fine-tuning data used in this tutorial. Only the small subset of GigaSpeech is needed.


Model preparation
-----------------

We use the Zipformer model trained on the full LibriSpeech dataset (960 hours) as the initialization. The
checkpoint of the model can be downloaded via the following commands:

.. code-block:: bash

    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
    $ cd icefall-asr-librispeech-zipformer-2023-05-15/exp
    $ git lfs pull --include "pretrained.pt"
    $ ln -s pretrained.pt epoch-99.pt
    $ cd ../data/lang_bpe_500
    $ git lfs pull --include bpe.model
    $ cd ../../..

Before fine-tuning, let's test the model's WER on the new domain. The following command performs
decoding on the GigaSpeech test sets:

.. code-block:: bash

    ./zipformer/decode_gigaspeech.py \
        --epoch 99 \
        --avg 1 \
        --exp-dir icefall-asr-librispeech-zipformer-2023-05-15/exp \
        --use-averaged-model 0 \
        --max-duration 1000 \
        --decoding-method greedy_search

You should see the following numbers:

.. code-block::

    For dev, WER of different settings are:
    greedy_search    20.06    best for dev
    For test, WER of different settings are:
    greedy_search    19.27    best for test

Fine-tune with adapter
----------------------

We insert 4 adapters with residual connections into each ``Zipformer2EncoderLayer``.
The original model parameters remain untouched during training; only the parameters of
the adapters are updated. The following command starts a fine-tuning experiment with adapters:

.. code-block:: bash

    $ do_finetune=1
    $ use_adapters=1
    $ adapter_dim=8
    $ ./zipformer_adapter/train.py \
        --world-size 2 \
        --num-epochs 20 \
        --start-epoch 1 \
        --exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
        --use-fp16 1 \
        --base-lr 0.045 \
        --use-adapters $use_adapters --adapter-dim $adapter_dim \
        --bpe-model data/lang_bpe_500/bpe.model \
        --do-finetune $do_finetune \
        --master-port 13022 \
        --finetune-ckpt icefall-asr-librispeech-zipformer-2023-05-15/exp/pretrained.pt \
        --max-duration 1000

The following arguments are related to fine-tuning:

- ``--do-finetune``
  If True, do fine-tuning by initializing the model from a pre-trained checkpoint.
  **Note that if you want to resume your fine-tuning experiment from a certain epoch, you
  need to set this to False.**

- ``--use-adapters``
  Whether adapters are used during fine-tuning.

- ``--adapter-dim``
  The bottleneck dimension of the adapter modules. Typically a small number.

You should notice that the total number of trainable parameters is shown in the training log:

.. code-block::

    2024-02-22 21:22:03,808 INFO [train.py:1277] A total of 761344 trainable parameters (1.148% of the whole model)

The trainable parameters make up only about 1.15% of the entire model, so training is much faster
and requires less memory than full fine-tuning.
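
If you are curious how such a fraction comes about, the sketch below freezes everything except the
adapter parameters and reports the trainable percentage. It assumes the adapter parameters can be
identified by the substring ``adapter`` in their names, which is only an illustrative convention,
not necessarily the one used in the recipe:

.. code-block:: python

    import torch.nn as nn


    def freeze_all_but_adapters(model: nn.Module) -> None:
        # Only parameters whose name contains "adapter" remain trainable.
        for name, param in model.named_parameters():
            param.requires_grad = "adapter" in name


    def trainable_percentage(model: nn.Module) -> float:
        # Percentage of parameters that will receive gradient updates.
        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total = sum(p.numel() for p in model.parameters())
        return 100.0 * trainable / total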


Decoding
--------

After training, let's test the WERs. To test the WERs on the GigaSpeech test sets,
you can execute the following command:

.. code-block:: bash

    $ epoch=20
    $ avg=10
    $ use_adapters=1
    $ adapter_dim=8
    $ ./zipformer/decode.py \
        --epoch $epoch \
        --avg $avg \
        --use-averaged-model 1 \
        --exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
        --max-duration 600 \
        --use-adapters $use_adapters \
        --adapter-dim $adapter_dim \
        --decoding-method greedy_search

You should see the following numbers:

.. code-block::

    For dev, WER of different settings are:
    greedy_search    15.44    best for dev
    For test, WER of different settings are:
    greedy_search    15.42    best for test

The WER on the test set improves from 19.27 to 15.42, demonstrating the effectiveness of the adapters.

The same model can also be used to decode the LibriSpeech test sets. You can deactivate the adapters
to recover the performance of the original model:

.. code-block:: bash

    $ epoch=20
    $ avg=1
    $ use_adapters=0
    $ adapter_dim=8
    $ ./zipformer/decode.py \
        --epoch $epoch \
        --avg $avg \
        --use-averaged-model 1 \
        --exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
        --max-duration 600 \
        --use-adapters $use_adapters \
        --adapter-dim $adapter_dim \
        --decoding-method greedy_search

You should see the following numbers:

.. code-block::

    For test-clean, WER of different settings are:
    greedy_search    2.23    best for test-clean
    For test-other, WER of different settings are:
    greedy_search    4.96    best for test-other

The numbers are the same as those reported in `icefall <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md#normal-scaled-model-number-of-model-parameters-65549011-ie-6555-m>`_.
Adapter-based fine-tuning is therefore very flexible: the same model can be used for decoding on both
the original and the target domain.


Export the model
----------------

After training, the model can easily be exported to ``onnx`` format using the following command:

.. code-block:: bash

    $ use_adapters=1
    $ adapter_dim=16
    $ ./zipformer_adapter/export-onnx.py \
        --tokens icefall-asr-librispeech-zipformer-2023-05-15/data/lang_bpe_500/tokens.txt \
        --use-averaged-model 1 \
        --epoch 20 \
        --avg 10 \
        --exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
        --use-adapters $use_adapters \
        --adapter-dim $adapter_dim \
        --num-encoder-layers "2,2,3,4,3,2" \
        --downsampling-factor "1,2,4,8,4,2" \
        --feedforward-dim "512,768,1024,1536,1024,768" \
        --num-heads "4,4,4,8,4,4" \
        --encoder-dim "192,256,384,512,384,256" \
        --query-head-dim 32 \
        --value-head-dim 12 \
        --pos-head-dim 4 \
        --pos-dim 48 \
        --encoder-unmasked-dim "192,192,256,256,256,192" \
        --cnn-module-kernel "31,31,15,15,15,31" \
        --decoder-dim 512 \
        --joiner-dim 512 \
        --causal False \
        --chunk-size "16,32,64,-1" \
        --left-context-frames "64,128,256,-1"
1 change: 1 addition & 0 deletions docs/source/recipes/Finetune/index.rst
@@ -13,3 +13,4 @@ data to improve the performance on new domains.
:caption: Table of Contents

from_supervised/finetune_zipformer
adapter/finetune_adapter
8 changes: 6 additions & 2 deletions egs/aishell/ASR/conformer_ctc/decode.py
@@ -419,7 +419,7 @@ def save_results(
for key, results in results_dict.items():
recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
results = sorted(results)
store_transcripts(filename=recog_path, texts=results)
store_transcripts(filename=recog_path, texts=results, char_level=True)
if enable_log:
logging.info(f"The transcripts are stored in {recog_path}")

@@ -432,7 +432,11 @@ def save_results(
results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
with open(errs_filename, "w") as f:
wer = write_error_stats(
f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
f,
f"{test_set_name}-{key}",
results_char,
enable_log=enable_log,
compute_CER=True,
)
test_set_wers[key] = wer

8 changes: 6 additions & 2 deletions egs/aishell/ASR/conformer_mmi/decode.py
@@ -431,7 +431,7 @@ def save_results(
for key, results in results_dict.items():
recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
results = sorted(results)
store_transcripts(filename=recog_path, texts=results)
store_transcripts(filename=recog_path, texts=results, char_level=True)
if enable_log:
logging.info(f"The transcripts are stored in {recog_path}")

@@ -444,7 +444,11 @@ def save_results(
results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
with open(errs_filename, "w") as f:
wer = write_error_stats(
f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
f,
f"{test_set_name}-{key}",
results_char,
enable_log=enable_log,
compute_CER=True,
)
test_set_wers[key] = wer

8 changes: 6 additions & 2 deletions egs/aishell/ASR/pruned_transducer_stateless2/decode.py
@@ -390,7 +390,7 @@ def save_results(
for key, results in results_dict.items():
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
results = sorted(results)
store_transcripts(filename=recog_path, texts=results)
store_transcripts(filename=recog_path, texts=results, char_level=True)
logging.info(f"The transcripts are stored in {recog_path}")

# The following prints out WERs, per-word error statistics and aligned
@@ -402,7 +402,11 @@ def save_results(
results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
with open(errs_filename, "w") as f:
wer = write_error_stats(
f, f"{test_set_name}-{key}", results_char, enable_log=True
f,
f"{test_set_name}-{key}",
results_char,
enable_log=True,
compute_CER=True,
)
test_set_wers[key] = wer

