F5-TTS Training Recipe for WenetSpeech4TTS (#1846)

* add f5 * add infer * add dit * add README * update pretrained checkpoint usage --------- Co-authored-by: yuekaiz <[email protected]> Co-authored-by: yuekaiz <[email protected]> Co-authored-by: yuekaiz <[email protected]> Co-authored-by: zr_jin <[email protected]>
k2-fsa · Jan 27, 2025 · dd5d7e3 · dd5d7e3
1 parent 39c466e
commit dd5d7e3
Show file tree

Hide file tree

Showing 20 changed files with 7,115 additions and 3 deletions.
diff --git a/egs/ljspeech/TTS/matcha/fbank.py b/egs/ljspeech/TTS/matcha/fbank.py
@@ -17,6 +17,7 @@ class MatchaFbankConfig:
     win_length: int
     f_min: float
     f_max: float
+    device: str = "cuda"
 
 
 @register_extractor
@@ -46,7 +47,7 @@ def extract(
             f"Mismatched sampling rate: extractor expects {expected_sr}, "
             f"got {sampling_rate}"
         )
-        samples = torch.from_numpy(samples)
+        samples = torch.from_numpy(samples).to(self.device)
         assert samples.ndim == 2, samples.shape
         assert samples.shape[0] == 1, samples.shape
 
@@ -81,7 +82,7 @@ def extract(
                 mel, (0, 0, 0, num_frames - mel.shape[1]), mode="replicate"
             ).squeeze(0)
 
-        return mel.numpy()
+        return mel.cpu().numpy()
 
     @property
     def frame_shift(self) -> Seconds:

diff --git a/egs/wenetspeech4tts/TTS/README.md b/egs/wenetspeech4tts/TTS/README.md
@@ -68,5 +68,69 @@ python3 valle/infer.py --output-dir demos_epoch_${epoch}_avg_${avg}_top_p_${top_
         --text-extractor pypinyin_initials_finals --top-p ${top_p}
 ```
 
+# [F5-TTS](https://arxiv.org/abs/2410.06885)
+
+./f5-tts contains the code for training the F5-TTS model.
+
+Generated samples and training logs for the WenetSpeech4TTS Basic subset (7k hours) can be found [here](https://huggingface.co/yuekai/f5-tts-small-wenetspeech4tts-basic/tensorboard).
+
+Preparation:
+
+```
+bash prepare.sh --stage 5 --stop_stage 6
+```
+(Note: To be compatible with the official F5-TTS checkpoint, we directly use `vocab.txt` from [here](https://github.com/SWivid/F5-TTS/blob/129014c5b43f135b0100d49a0c6804dd4cf673e1/data/Emilia_ZH_EN_pinyin/vocab.txt). To generate your own `vocab.txt`, you may refer to [the script](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/train/datasets/prepare_emilia.py).)
+
+The training command is given below:
+
+```
+# docker: ghcr.io/swivid/f5-tts:main
+# pip install k2==1.24.4.dev20241030+cuda12.4.torch2.4.0 -f https://k2-fsa.github.io/k2/cuda.html
+# pip install kaldialign lhotse tensorboard bigvganinference sentencepiece
+
+world_size=8
+exp_dir=exp/f5-tts-small
+python3 f5-tts/train.py --max-duration 700 --filter-min-duration 0.5 --filter-max-duration 20  \
+      --num-buckets 6 --dtype "bfloat16" --save-every-n 5000 --valid-interval 10000 \
+      --base-lr 7.5e-5 --warmup-steps 20000 --num-epochs 60  \
+      --num-decoder-layers 18 --nhead 12 --decoder-dim 768 \
+      --exp-dir ${exp_dir} --world-size ${world_size}
+```
+
+To run inference with the F5-Small model trained in icefall on WenetSpeech4TTS, use:
+```
+huggingface-cli login
+huggingface-cli download --local-dir seed_tts_eval yuekai/seed_tts_eval --repo-type dataset
+huggingface-cli download --local-dir ${exp_dir} yuekai/f5-tts-small-wenetspeech4tts-basic
+huggingface-cli download nvidia/bigvgan_v2_24khz_100band_256x --local-dir bigvgan_v2_24khz_100band_256x
+
+manifest=./seed_tts_eval/seedtts_testset/zh/meta.lst
+model_path=f5-tts-small-wenetspeech4tts-basic/epoch-56-avg-14.pt
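+# (Optional) Skip this step if you already have the averaged checkpoint, e.g. a downloaded epoch-56-avg-14.pt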
+python3 f5-tts/generate_averaged_model.py \
+    --epoch 56 \
+    --avg 14 --decoder-dim 768 --nhead 12 --num-decoder-layers 18 \
+    --exp-dir exp/f5_small
+
+
+accelerate launch f5-tts/infer.py --nfe 16 --model-path $model_path --manifest-file $manifest --output-dir $output_dir --decoder-dim 768 --nhead 12 --num-decoder-layers 18
+bash local/compute_wer.sh $output_dir $manifest
+```
+
+To run inference with the official F5-Base model trained on Emilia, use:
+```
+huggingface-cli login
+huggingface-cli download --local-dir seed_tts_eval yuekai/seed_tts_eval --repo-type dataset
+huggingface-cli download --local-dir F5-TTS SWivid/F5-TTS
+huggingface-cli download nvidia/bigvgan_v2_24khz_100band_256x --local-dir bigvgan_v2_24khz_100band_256x
+
+manifest=./seed_tts_eval/seedtts_testset/zh/meta.lst
+model_path=./F5-TTS/F5TTS_Base_bigvgan/model_1250000.pt
+
+accelerate launch f5-tts/infer.py --nfe 16 --model-path $model_path --manifest-file $manifest --output-dir $output_dir
+bash local/compute_wer.sh $output_dir $manifest
+```
+
 # Credits
-- [vall-e](https://github.com/lifeiteng/vall-e)
+- [VALL-E](https://github.com/lifeiteng/vall-e)
+- [F5-TTS](https://github.com/SWivid/F5-TTS)
diff --git a/egs/wenetspeech4tts/TTS/f5-tts/generate_averaged_model.py b/egs/wenetspeech4tts/TTS/f5-tts/generate_averaged_model.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021-2022 Xiaomi Corporation (Author: Yifan Yang)
+# Copyright 2024                                 Yuekai Zhang
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+(1) use the checkpoint exp_dir/epoch-xxx.pt
+python3 f5-tts/generate_averaged_model.py \
+    --epoch 40 \
+    --avg 5 \
+    --exp-dir ${exp_dir}
+
+It will generate a file `epoch-40-avg-5.pt` in the given `exp_dir`.
+You can later load it by `torch.load("epoch-40-avg-5.pt")`.
+"""
+
+
+import argparse
+from pathlib import Path
+
+import k2
+import torch
+from train import add_model_arguments, get_model
+
+from icefall.checkpoint import (
+    average_checkpoints,
+    average_checkpoints_with_averaged_model,
+    find_checkpoints,
+)
+from icefall.utils import AttributeDict
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=30,
+        help="""It specifies the checkpoint to use for decoding.
+        Note: Epoch counts from 1.
+        You can specify --avg to use more checkpoints for model averaging.""",
+    )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=9,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch' and '--iter'",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="zipformer/exp",
+        help="The experiment dir",
+    )
+    add_model_arguments(parser)
+    return parser
+
+
+@torch.no_grad()
+def main():
+    parser = get_parser()
+
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    params = AttributeDict()
+    params.update(vars(args))
+
+    if params.iter > 0:
+        params.suffix = f"checkpoint-{params.iter}-avg-{params.avg}"
+    else:
+        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+
+    print("Script started")
+
+    device = torch.device("cpu")
+    print(f"Device: {device}")
+
+    print("About to create model")
+    filename = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+    checkpoint = torch.load(filename, map_location=device)
+    args = AttributeDict(checkpoint)
+    model = get_model(args)
+
+    if params.iter > 0:
+        # TODO FIX ME
+        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+            : params.avg + 1
+        ]
+        if len(filenames) == 0:
+            raise ValueError(
+                f"No checkpoints found for --iter {params.iter}, --avg {params.avg}"
+            )
+        elif len(filenames) < params.avg + 1:
+            raise ValueError(
+                f"Not enough checkpoints ({len(filenames)}) found for"
+                f" --iter {params.iter}, --avg {params.avg}"
+            )
+        filename_start = filenames[-1]
+        filename_end = filenames[0]
+        print(
+            "Calculating the averaged model over iteration checkpoints"
+            f" from {filename_start} (excluded) to {filename_end}"
+        )
+        model.to(device)
+        model.load_state_dict(
+            average_checkpoints_with_averaged_model(
+                filename_start=filename_start,
+                filename_end=filename_end,
+                device=device,
+            )
+        )
+        filename = params.exp_dir / f"checkpoint-{params.iter}-avg-{params.avg}.pt"
+        torch.save({"model": model.state_dict()}, filename)
+    else:
+        assert params.avg > 0, params.avg
+        start = params.epoch - params.avg
+        assert start >= 1, start
+        filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+        filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+        print(
+            f"Calculating the averaged model over epoch range from "
+            f"{start} (excluded) to {params.epoch}"
+        )
+        filenames = [
+            f"{params.exp_dir}/epoch-{i}.pt" for i in range(start, params.epoch + 1)
+        ]
+        model.to(device)
+        model.load_state_dict(average_checkpoints(filenames, device=device))
+
+        filename = params.exp_dir / f"epoch-{params.epoch}-avg-{params.avg}.pt"
+        checkpoint["model"] = model.state_dict()
+        torch.save(checkpoint, filename)
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    print(f"Number of model parameters: {num_param}")
+
+    print("Done!")
+
+
+if __name__ == "__main__":
+    main()