Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add TTS and ASR model #306

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 189 additions & 0 deletions launcher_scripts/conf/training/tacotron2/22050.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# Launcher job settings for this training stage.
run:
  name: tacotron2
  # `${.name}` is a relative interpolation: it resolves to run.name above,
  # so results are written to <base_results_dir>/tacotron2.
  results_dir: ${base_results_dir}/${.name}
  # NOTE(review): looks like a scheduler-style D-HH:MM:SS limit (1 day) — confirm.
  time_limit: "1-00:00:00"
  dependency: "singleton"

# Top-level model name; a separate key from run.name.
name: Tacotron2

# Tacotron2 model definition.
# Every `${training.model.*}` reference below is an absolute interpolation, so
# this file is expected to be composed under a `training` key of the root
# config.
model:
  pitch_fmin: 65.40639132514966
  pitch_fmax: 2093.004522404789

  # Audio / mel-spectrogram parameters. The datasets, preprocessor, decoder
  # and postnet below read these through interpolation, so change them here.
  sample_rate: 22050
  n_mel_channels: 80
  n_window_size: 1024
  n_window_stride: 256
  n_fft: 1024
  lowfreq: 0
  highfreq: 8000
  window: hann
  # NOTE(review): -11.52 is approximately ln(1e-05), i.e. the log of
  # preprocessor.log_zero_guard_value — confirm the two are meant to agree.
  pad_value: -11.52

  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
    punct: true
    stresses: true
    chars: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.models.en_us_arpabet.EnglishG2p
      # NOTE(review): both paths are relative — confirm the working directory
      # they are resolved against at launch time.
      phoneme_dict: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
      heteronyms: "scripts/tts_dataset_files/heteronyms-052722"

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      manifest_filepath: ${data_dir}/train_manifest.json
      sample_rate: ${training.model.sample_rate}
      sup_data_path: null
      sup_data_types: null
      n_fft: ${training.model.n_fft}
      win_length: ${training.model.n_window_size}
      hop_length: ${training.model.n_window_stride}
      window: ${training.model.window}
      n_mels: ${training.model.n_mel_channels}
      lowfreq: ${training.model.lowfreq}
      highfreq: ${training.model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${training.model.pitch_fmin}
      pitch_fmax: ${training.model.pitch_fmax}
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 48
      num_workers: 4
      pin_memory: true

  # Same dataset settings as train_ds; differs in manifest, shuffle,
  # batch_size and num_workers.
  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.dataset.TTSDataset"
      # NOTE(review): validation reads test_manifest.json — confirm that using
      # the test split for validation is intentional.
      manifest_filepath: ${data_dir}/test_manifest.json
      sample_rate: ${training.model.sample_rate}
      sup_data_path: null
      sup_data_types: null
      n_fft: ${training.model.n_fft}
      win_length: ${training.model.n_window_size}
      hop_length: ${training.model.n_window_stride}
      window: ${training.model.window}
      n_mels: ${training.model.n_mel_channels}
      lowfreq: ${training.model.lowfreq}
      highfreq: ${training.model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${training.model.pitch_fmin}
      pitch_fmax: ${training.model.pitch_fmax}
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 24
      num_workers: 8
      pin_memory: true

  preprocessor:
    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
    nfilt: ${training.model.n_mel_channels}
    highfreq: ${training.model.highfreq}
    log: true
    log_zero_guard_type: clamp
    log_zero_guard_value: 1e-05
    lowfreq: ${training.model.lowfreq}
    n_fft: ${training.model.n_fft}
    n_window_size: ${training.model.n_window_size}
    n_window_stride: ${training.model.n_window_stride}
    pad_to: 16
    pad_value: ${training.model.pad_value}
    sample_rate: ${training.model.sample_rate}
    window: ${training.model.window}
    normalize: null
    preemph: null
    dither: 0.0
    frame_splicing: 1
    stft_conv: false
    nb_augmentation_prob: 0
    mag_power: 1.0
    exact_pad: true
    use_grads: false

  encoder:
    _target_: nemo.collections.tts.modules.tacotron2.Encoder
    encoder_kernel_size: 5
    encoder_n_convolutions: 3
    encoder_embedding_dim: 512

  decoder:
    _target_: nemo.collections.tts.modules.tacotron2.Decoder
    decoder_rnn_dim: 1024
    # Tied to the encoder's output width via interpolation.
    encoder_embedding_dim: ${training.model.encoder.encoder_embedding_dim}
    gate_threshold: 0.5
    max_decoder_steps: 1000
    n_frames_per_step: 1  # currently only 1 is supported
    n_mel_channels: ${training.model.n_mel_channels}
    p_attention_dropout: 0.1
    p_decoder_dropout: 0.1
    prenet_dim: 256
    prenet_p_dropout: 0.5
    # Attention parameters
    attention_dim: 128
    attention_rnn_dim: 1024
    # AttentionLocation Layer parameters
    attention_location_kernel_size: 31
    attention_location_n_filters: 32
    early_stopping: true

  postnet:
    _target_: nemo.collections.tts.modules.tacotron2.Postnet
    n_mel_channels: ${training.model.n_mel_channels}
    p_dropout: 0.5
    postnet_embedding_dim: 512
    postnet_kernel_size: 5
    postnet_n_convolutions: 5

  optim:
    name: adam
    lr: 1e-3
    weight_decay: 1e-6

    # scheduler setup
    sched:
      name: CosineAnnealing
      min_lr: 1e-5

# Trainer settings: single node, single GPU, DDP strategy.
trainer:
  devices: 1  # number of gpus
  max_epochs: 1000
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  # Checkpointing and logging are switched off here because exp_manager
  # supplies both.
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  gradient_clip_val: 1.0
  log_every_n_steps: 60
  check_val_every_n_epoch: 2
  benchmark: false

# Experiment manager: output directory and experiment name are taken from the
# `run` section through absolute `${training.run.*}` interpolations.
exp_manager:
  exp_dir: ${training.run.results_dir}
  name: ${training.run.name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  # Checkpoints are ranked by the lowest val_loss.
  checkpoint_callback_params:
    monitor: val_loss
    mode: min
mode: min
4 changes: 4 additions & 0 deletions launcher_scripts/nemo_launcher/core/stages.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@
"starcoder2",
"chatglm",
]
# Model names grouped as speech models; "tacotron2" is the only entry.
# NOTE(review): how this list is consumed is outside this view — confirm at
# the use site that membership routes the model to the intended stage logic.
__SPEECH_MODELS_LIST__ = [
"tacotron2",
]
__VISION_MODELS_LIST__ = ["vit"]
__MULTIMODAL_MODELS_LIST__ = [
"clip",
Expand Down Expand Up @@ -882,6 +885,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path:
/ "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
"mixtral": self._nemo_code_path
/ "examples/nlp/language_modeling/megatron_gpt_pretraining.py",
"tacotron2": self._nemo_code_path / "examples/tts/tacotron2.py",
}
return model_type_to_code_path[model_type]

Expand Down
Loading