NVIDIA · thomasdhc · May 1, 2024 · May 1, 2024 · May 2, 2024
diff --git a/launcher_scripts/conf/training/tacotron2/22050.yaml b/launcher_scripts/conf/training/tacotron2/22050.yaml
@@ -0,0 +1,189 @@
+run:
+  name: tacotron2  # reused by results_dir below and by exp_manager.name
+  results_dir: ${base_results_dir}/${.name}  # ${.name} is relative: the sibling key above
+  time_limit: '1-00:00:00'  # kept quoted so it always loads as a string
+  dependency: 'singleton'
+
+name: Tacotron2
+
+model:  # shared scalars first; the sub-sections read them as ${training.model.*}
+  pitch_fmin: 65.40639132514966  # forwarded to both datasets as pitch_fmin
+  pitch_fmax: 2093.004522404789  # forwarded to both datasets as pitch_fmax
+
+  sample_rate: 22050  # read by both datasets and by the preprocessor
+  n_mel_channels: 80  # also feeds preprocessor.nfilt and decoder/postnet n_mel_channels
+  n_window_size: 1024  # dataset win_length / preprocessor n_window_size
+  n_window_stride: 256  # dataset hop_length / preprocessor n_window_stride
+  n_fft: 1024  # read by both datasets and by the preprocessor
+  lowfreq: 0  # read by both datasets and by the preprocessor
+  highfreq: 8000  # read by both datasets and by the preprocessor
+  window: hann  # read by both datasets and by the preprocessor
+  pad_value: -11.52  # read by preprocessor.pad_value
+
+  text_normalizer:  # object described by the class path in _target_
+    _target_: 'nemo_text_processing.text_normalization.normalize.Normalizer'
+    input_case: 'cased'
+    lang: 'en'
+
+  text_normalizer_call_kwargs:  # presumably passed when the normalizer is called; verify in the consumer
+    punct_post_process: true
+    punct_pre_process: true
+    verbose: false
+
+  text_tokenizer:  # tokenizer class plus its switches, keys in alphabetical order
+    _target_: 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer'
+    apostrophe: true
+    chars: true
+    pad_with_space: true
+    punct: true
+    stresses: true
+    g2p:  # nested component with its own _target_
+      _target_: 'nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p'
+      heteronyms: 'scripts/tts_dataset_files/heteronyms-052722'  # relative path; base directory not visible here
+      phoneme_dict: 'scripts/tts_dataset_files/cmudict-0.7b_nv22.10'  # relative path; base directory not visible here
+
+  train_ds:  # training split: dataset description plus dataloader parameters
+    dataset:
+      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
+      manifest_filepath: ${data_dir}/train_manifest.json
+      sample_rate: ${training.model.sample_rate}  # interpolated keys in this mapping mirror the model-level values
+      sup_data_path: null
+      sup_data_types: null
+      n_fft: ${training.model.n_fft}
+      win_length: ${training.model.n_window_size}
+      hop_length: ${training.model.n_window_stride}
+      window: ${training.model.window}
+      n_mels: ${training.model.n_mel_channels}
+      lowfreq: ${training.model.lowfreq}
+      highfreq: ${training.model.highfreq}
+      max_duration: null
+      min_duration: 0.1
+      ignore_file: null
+      trim: false  # canonical lowercase boolean, consistent with the other flags in this file
+      pitch_fmin: ${training.model.pitch_fmin}
+      pitch_fmax: ${training.model.pitch_fmax}
+    dataloader_params:
+      drop_last: false
+      shuffle: true  # the validation split sets this to false
+      batch_size: 48
+      num_workers: 4
+      pin_memory: true
+
+  validation_ds:  # validation split: same dataset class as train_ds, separate manifest and loader settings
+    dataset:
+      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
+      manifest_filepath: ${data_dir}/test_manifest.json  # NOTE(review): validation reads test_manifest.json; confirm intended
+      sample_rate: ${training.model.sample_rate}  # interpolated keys in this mapping mirror the model-level values
+      sup_data_path: null
+      sup_data_types: null
+      n_fft: ${training.model.n_fft}
+      win_length: ${training.model.n_window_size}
+      hop_length: ${training.model.n_window_stride}
+      window: ${training.model.window}
+      n_mels: ${training.model.n_mel_channels}
+      lowfreq: ${training.model.lowfreq}
+      highfreq: ${training.model.highfreq}
+      max_duration: null
+      min_duration: 0.1
+      ignore_file: null
+      trim: false  # canonical lowercase boolean, consistent with the other flags in this file
+      pitch_fmin: ${training.model.pitch_fmin}
+      pitch_fmax: ${training.model.pitch_fmax}
+    dataloader_params:
+      drop_last: false
+      shuffle: false  # the training split sets this to true
+      batch_size: 24
+      num_workers: 8
+      pin_memory: true
+
+  preprocessor:  # feature extractor; interpolated keys mirror the model-level values
+    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
+    nfilt: ${training.model.n_mel_channels}
+    highfreq: ${training.model.highfreq}
+    log: true
+    log_zero_guard_type: clamp
+    log_zero_guard_value: 1.0e-05  # dotted mantissa so YAML 1.1 loaders also read a float, not a string
+    lowfreq: ${training.model.lowfreq}
+    n_fft: ${training.model.n_fft}
+    n_window_size: ${training.model.n_window_size}
+    n_window_stride: ${training.model.n_window_stride}
+    pad_to: 16
+    pad_value: ${training.model.pad_value}
+    sample_rate: ${training.model.sample_rate}
+    window: ${training.model.window}
+    normalize: null
+    preemph: null
+    dither: 0.0
+    frame_splicing: 1
+    stft_conv: false
+    nb_augmentation_prob: 0
+    mag_power: 1.0
+    exact_pad: true
+    use_grads: false
+
+  encoder:  # encoder module settings
+    _target_: 'nemo.collections.tts.modules.tacotron2.Encoder'
+    encoder_embedding_dim: 512  # also read by decoder.encoder_embedding_dim
+    encoder_n_convolutions: 3
+    encoder_kernel_size: 5
+
+  decoder:  # settings for the class named in _target_
+    _target_: nemo.collections.tts.modules.tacotron2.Decoder
+    decoder_rnn_dim: 1024
+    encoder_embedding_dim: ${training.model.encoder.encoder_embedding_dim}  # follows the encoder section
+    gate_threshold: 0.5
+    max_decoder_steps: 1000
+    n_frames_per_step: 1  # currently only 1 is supported
+    n_mel_channels: ${training.model.n_mel_channels}  # follows the model-level value
+    p_attention_dropout: 0.1
+    p_decoder_dropout: 0.1
+    prenet_dim: 256
+    prenet_p_dropout: 0.5
+    # Attention parameters
+    attention_dim: 128
+    attention_rnn_dim: 1024
+    # AttentionLocation layer parameters
+    attention_location_kernel_size: 31
+    attention_location_n_filters: 32
+    early_stopping: true
+
+  postnet:  # post-network module settings
+    _target_: 'nemo.collections.tts.modules.tacotron2.Postnet'
+    n_mel_channels: ${training.model.n_mel_channels}  # follows the model-level value
+    postnet_embedding_dim: 512
+    postnet_kernel_size: 5
+    postnet_n_convolutions: 5
+    p_dropout: 0.5
+
+  optim:  # optimizer plus nested LR scheduler; floats use a dotted mantissa so YAML 1.1 loaders read numbers
+    name: adam
+    lr: 1.0e-3
+    weight_decay: 1.0e-6
+
+    # scheduler setup
+    sched:
+      name: CosineAnnealing
+      min_lr: 1.0e-5
+
+trainer:  # trainer settings; checkpointing and logging are switched off here
+  devices: 1  # number of GPUs
+  max_epochs: 1000
+  num_nodes: 1
+  accelerator: gpu
+  strategy: ddp
+  accumulate_grad_batches: 1
+  enable_checkpointing: false  # Provided by exp_manager
+  logger: false  # Provided by exp_manager
+  gradient_clip_val: 1.0
+  log_every_n_steps: 60
+  check_val_every_n_epoch: 2
+  benchmark: false
+
+exp_manager:  # experiment bookkeeping: output location, logger and checkpoint callback
+  exp_dir: ${training.run.results_dir}  # same directory as run.results_dir
+  name: ${training.run.name}  # same value as run.name
+  create_tensorboard_logger: true
+  create_checkpoint_callback: true
+  checkpoint_callback_params:
+    monitor: val_loss  # metric name the checkpoint callback watches
+    mode: min  # paired with monitor: val_loss
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
@@ -45,6 +45,9 @@
     "starcoder2",
     "chatglm",
 ]
+__SPEECH_MODELS_LIST__ = [
+    "tacotron2",
+]
 __VISION_MODELS_LIST__ = ["vit"]
 __MULTIMODAL_MODELS_LIST__ = [
     "clip",
@@ -882,6 +885,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
             / "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
             "mixtral": self._nemo_code_path
             / "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
+            "tacotron2": self._nemo_code_path / "examples/tts/tacotron2.py",
         }
         return model_type_to_code_path[model_type]