Commit

Merge branch 'master' into einichi
baileyeet authored Jan 7, 2025
2 parents 564b632 + 8d60280 commit 5c142d4
Showing 297 changed files with 30,906 additions and 1,745 deletions.
167 changes: 167 additions & 0 deletions .github/scripts/baker_zh/TTS/run-matcha.sh
@@ -0,0 +1,167 @@
#!/usr/bin/env bash

set -ex

apt-get update
apt-get install -y sox

python3 -m pip install numba conformer==0.3.2 diffusers librosa
python3 -m pip install jieba


log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

cd egs/baker_zh/TTS
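
# Shrink the sizes hard-coded in prepare.sh so this test runs on a tiny
# subset of the data; the "git diff" below just records the patched values.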

sed -i.bak s/600/8/g ./prepare.sh
sed -i.bak s/"first 100"/"first 3"/g ./prepare.sh
sed -i.bak s/500/5/g ./prepare.sh
git diff

function prepare_data() {
# We have created a subset of the data for testing
#
mkdir -p download
pushd download
wget -q https://huggingface.co/csukuangfj/tmp-files/resolve/main/BZNSYP-samples.tar.bz2
tar xvf BZNSYP-samples.tar.bz2
mv BZNSYP-samples BZNSYP
rm BZNSYP-samples.tar.bz2
popd

./prepare.sh
tree .
}

function train() {
pushd ./matcha
sed -i.bak s/1500/3/g ./train.py
git diff .
popd
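
# Run a single, very short training epoch so the CI job finishes quickly;
# checkpoints are written to matcha/exp.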

./matcha/train.py \
--exp-dir matcha/exp \
--num-epochs 1 \
--save-every-n 1 \
--num-buckets 2 \
--tokens data/tokens.txt \
--max-duration 20

ls -lh matcha/exp
}

function infer() {
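# generator_v2 is a pre-trained HiFiGAN vocoder checkpoint; infer.py uses it
# to turn the predicted mel-spectrogram into a waveform.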
curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2

./matcha/infer.py \
--num-buckets 2 \
--epoch 1 \
--exp-dir ./matcha/exp \
--tokens data/tokens.txt \
--cmvn ./data/fbank/cmvn.json \
--vocoder ./generator_v2 \
--input-text "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。" \
--output-wav ./generated.wav

ls -lh *.wav
soxi ./generated.wav
rm -v ./generated.wav
rm -v generator_v2
}

function export_onnx() {
pushd matcha/exp
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-baker-matcha-zh-2024-12-27/resolve/main/epoch-2000.pt
popd

pushd data/fbank
rm -v *.json
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-baker-matcha-zh-2024-12-27/resolve/main/cmvn.json
popd

./matcha/export_onnx.py \
--exp-dir ./matcha/exp \
--epoch 2000 \
--tokens ./data/tokens.txt \
--cmvn ./data/fbank/cmvn.json
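
# export_onnx.py writes model-steps-*.onnx acoustic models (one file per
# number of sampling steps); onnx_pretrained.py below pairs them with a
# HiFiGAN vocoder.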

ls -lh *.onnx

if false; then
# The CI machine does not have enough memory to run it
#
curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2
curl -SL -O https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v3
python3 ./matcha/export_onnx_hifigan.py
else
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v1.onnx
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v2.onnx
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28/resolve/main/exp/hifigan_v3.onnx
fi

ls -lh *.onnx

python3 ./matcha/generate_lexicon.py

for v in v1 v2 v3; do
python3 ./matcha/onnx_pretrained.py \
--acoustic-model ./model-steps-6.onnx \
--vocoder ./hifigan_$v.onnx \
--tokens ./data/tokens.txt \
--lexicon ./lexicon.txt \
--input-text "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。" \
--output-wav /icefall/generated-matcha-tts-steps-6-$v.wav
done

ls -lh /icefall/*.wav
soxi /icefall/generated-matcha-tts-steps-6-*.wav
cp ./model-steps-*.onnx /icefall

d=matcha-icefall-zh-baker
mkdir $d
cp -v data/tokens.txt $d
cp -v lexicon.txt $d
cp model-steps-3.onnx $d
pushd $d
curl -SL -O https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2
tar xvf dict.tar.bz2
rm dict.tar.bz2

curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/date.fst
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/number.fst
curl -SL -O https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06/resolve/main/data/phone.fst

cat >README.md <<EOF
# Introduction
This model is trained using the dataset from
https://en.data-baker.com/datasets/freeDatasets/
The dataset contains 10,000 Chinese sentences recorded by a native female Chinese
speaker, amounting to about 12 hours of audio.
**Note**: The dataset is for non-commercial use only.
You can find the training code at
https://github.com/k2-fsa/icefall/tree/master/egs/baker_zh/TTS
EOF

ls -lh
popd
tar cvjf $d.tar.bz2 $d
mv $d.tar.bz2 /icefall
mv $d /icefall
}

prepare_data
train
infer
export_onnx

rm -rfv generator_v* matcha/exp
git checkout .
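
For reference, here is a minimal sketch of how this new CI script could be exercised outside of GitHub Actions. The script runs prepare_data, train, infer and export_onnx in sequence, expects to start from the root of an icefall checkout, and copies its artifacts into /icefall; the docker image tag and the mount point below are assumptions, since the workflow wiring is not part of the files shown here.

# Hypothetical image tag; any image providing torch, k2, kaldifeat and lhotse should work.
IMAGE=ghcr.io/k2-fsa/icefall:cpu-py3.12-torch2.5.1
docker run --rm -v "$PWD":/icefall -w /icefall "$IMAGE" \
  bash .github/scripts/baker_zh/TTS/run-matcha.sh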
10 changes: 7 additions & 3 deletions .github/scripts/docker/Dockerfile
@@ -31,12 +31,15 @@ LABEL github_repo="https://github.com/k2-fsa/icefall"

# Install dependencies
RUN pip install --no-cache-dir \
torch==${TORCH_VERSION} torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/cpu/torch_stable.html \
torch==${TORCH_VERSION}+cpu -f https://download.pytorch.org/whl/torch \
torchaudio==${TORCHAUDIO_VERSION}+cpu -f https://download.pytorch.org/whl/torchaudio \
k2==${_K2_VERSION} -f https://k2-fsa.github.io/k2/cpu.html \
\
git+https://github.com/lhotse-speech/lhotse \
kaldifeat==${_KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cpu.html \
conformer==0.3.2 \
cython \
diffusers \
dill \
espnet_tts_frontend \
graphviz \
@@ -45,10 +48,11 @@ RUN pip install --no-cache-dir \
kaldialign \
kaldifst \
kaldilm \
matplotlib \
librosa \
"matplotlib<=3.9.4" \
multi_quantization \
numba \
numpy \
"numpy<2.0" \
onnxoptimizer \
onnxsim \
onnx \
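
The Dockerfile consumes the torch, torchaudio, k2 and kaldifeat versions through build-time variables; their exact ARG names are not visible in this hunk, so the names below are only placeholders. Under that assumption, building an image for one entry of the matrix (versions taken from the matrix script that follows) might look roughly like:

docker build \
  --build-arg TORCH_VERSION=2.5.1 \
  --build-arg TORCHAUDIO_VERSION=2.5.1 \
  --build-arg K2_VERSION=1.24.4.dev20241122 \
  --build-arg KALDIFEAT_VERSION=1.25.5.dev20241126 \
  -t icefall-ci:torch2.5.1-cpu \
  -f .github/scripts/docker/Dockerfile .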
66 changes: 44 additions & 22 deletions .github/scripts/docker/generate_build_matrix.py
@@ -2,9 +2,19 @@
# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)


import argparse
import json


def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--min-torch-version",
help="Minimu torch version",
)
return parser.parse_args()


def version_gt(a, b):
a_major, a_minor = list(map(int, a.split(".")))[:2]
b_major, b_minor = list(map(int, b.split(".")))[:2]
@@ -42,22 +52,34 @@ def get_torchaudio_version(torch_version):
return torch_version


def get_matrix():
k2_version = "1.24.4.dev20240223"
kaldifeat_version = "1.25.4.dev20240223"
version = "20240725"

def get_matrix(min_torch_version):
k2_version = "1.24.4.dev20241029"
kaldifeat_version = "1.25.5.dev20241029"
version = "20241218"

# torchaudio 2.5.0 does not support python 3.13

python_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
torch_version = []
torch_version += ["1.13.0", "1.13.1"]
torch_version += ["2.0.0", "2.0.1"]
torch_version += ["2.1.0", "2.1.1", "2.1.2"]
torch_version += ["2.2.0", "2.2.1", "2.2.2"]
# torch_version += ["2.1.0", "2.1.1", "2.1.2"]
# torch_version += ["2.2.0", "2.2.1", "2.2.2"]
# Test only torch >= 2.3.0
torch_version += ["2.3.0", "2.3.1"]
torch_version += ["2.4.0"]

torch_version += ["2.4.1"]
torch_version += ["2.5.0"]
torch_version += ["2.5.1"]

matrix = []
for p in python_version:
for t in torch_version:
if min_torch_version and version_gt(min_torch_version, t):
continue

# torchaudio <= 1.13.x supports only python <= 3.10

if version_gt(p, "3.10") and not version_gt(t, "2.0"):
@@ -67,21 +89,20 @@ def get_matrix():
if version_gt(p, "3.11") and not version_gt(t, "2.1"):
continue

k2_version_2 = k2_version
kaldifeat_version_2 = kaldifeat_version

if t == "2.2.2":
k2_version_2 = "1.24.4.dev20240328"
kaldifeat_version_2 = "1.25.4.dev20240329"
elif t == "2.3.0":
k2_version_2 = "1.24.4.dev20240425"
kaldifeat_version_2 = "1.25.4.dev20240425"
elif t == "2.3.1":
k2_version_2 = "1.24.4.dev20240606"
kaldifeat_version_2 = "1.25.4.dev20240606"
elif t == "2.4.0":
k2_version_2 = "1.24.4.dev20240725"
kaldifeat_version_2 = "1.25.4.dev20240725"
if version_gt(p, "3.12") and not version_gt(t, "2.4"):
continue

if version_gt(t, "2.4") and version_gt("3.10", p):
# torch>=2.5 requires python >= 3.10
continue


if t == "2.5.1":
k2_version_2 = "1.24.4.dev20241122"
kaldifeat_version_2 = "1.25.5.dev20241126"
else:
k2_version_2 = k2_version
kaldifeat_version_2 = kaldifeat_version

matrix.append(
{
@@ -97,7 +118,8 @@


def main():
matrix = get_matrix()
args = get_args()
matrix = get_matrix(min_torch_version=args.min_torch_version)
print(json.dumps({"include": matrix}))


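
As a quick sanity check, the matrix script can be run locally. The new --min-torch-version flag drops every entry whose torch version is older than the given one, and the script prints a JSON object with an "include" list in the format expected by a GitHub Actions matrix:

# Keep only torch >= 2.3 entries and pretty-print the resulting matrix.
python3 .github/scripts/docker/generate_build_matrix.py --min-torch-version 2.3 \
  | python3 -m json.tool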
