# LJSpeech config with L2 reg, zoneout, prenet noise
################################
# Experiment Parameters        #
################################
experiment:
  name: "mike"  # name of experiment
  log_every: 10
  iters_per_checkpoint: 500
  seed: 1234
  fp16_run: False
  distributed_run: True
  dist_backend: "nccl"
  dist_url: "tcp://localhost:54321"
  cudnn_enabled: True
  cudnn_benchmark: False

training:
  epochs: 700
  batch_size: 48
  # expected for 2 gpus; for 1 gpu divide by 2, for 3 gpus multiply by 1.5
  # the learning rate ramps up from 0 and decays with a cosine schedule by default
  learning_rate: 0.0003
  disable_lr_decay: False
  mask_padding: True  # set model's padded outputs to padded values

  # Loss params
  loss_use_masking: True
  loss_bce_pos_weight: 5.0
  loss_guided_attn_weight: 0.3
  loss_guided_attn_weight_decay: 0.9995
  loss_guided_attn_min_value: 0.0001
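  # Sketch (assumption -- check the loss implementation): the guided-attention weight is
  # typically annealed per training step as
  #   w_t = max(loss_guided_attn_weight * loss_guided_attn_weight_decay ** t, loss_guided_attn_min_value)
  # i.e. 0.3 * 0.9995^t floored at 1e-4, while loss_bce_pos_weight (5.0) up-weights the
  # positive (stop) frames in the gate BCE loss.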

  ################################
  # Optimization Hyperparameters #
  ################################
  use_saved_learning_rate: False
  # n_epochs is divided into parts: the first two decay from lr to lr/10, the third from lr/10 to lr/1e2, and the last from lr/1e2 to 0
  n_warmup_steps: 500
  weight_decay: 0.000006
  grad_clip_thresh: 1.0
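  # Sketch (assumption, based on the comments above -- verify against the scheduler in the
  # trainer code): a linear warmup followed by cosine decay usually looks like
  #   lr_t = learning_rate * t / n_warmup_steps                               for t < n_warmup_steps
  #   lr_t = learning_rate * 0.5 * (1 + cos(pi * (t - n_warmup_steps) / (T - n_warmup_steps)))  afterwards
  # with T the total number of training steps.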

################################
# Data Parameters              #
################################
data:
  load_mel_from_disk: True
  training_files: '/home/frappuccino/data/filelists/mike_pauses/train.txt'
  validation_files: '/home/frappuccino/data/filelists/mike_pauses/val.txt'
  text_cleaners: ['english_cleaners']
  cmudict_path: "data/cmudict_dictionary"
  skip_heteronyms: False
  batch_group_size: 0.05
  # probability of a word being converted to phonemes
  p_arpabet: 0.99

  ################################
  # Audio Parameters             #
  ################################
  sampling_rate: 22050
  mel_fmin: 20.0
  mel_fmax: 11025.0
  n_mel_channels: 80
  max_wav_value: 32768.0
  filter_length: 1024
  win_length: 1024
  hop_length: 256
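  # For reference: 22050 / 256 ≈ 86.1 mel frames per second, a 1024-sample window ≈ 46.4 ms,
  # and filter_length 1024 gives 513 STFT bins reduced to 80 mel channels.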

  # Normalization parameters
  # DO NOT USE WITH A MULTI-SPEAKER MODEL. Scaler stats file computed by 'compute_statistics.py'.
  # If it is defined, mean-std normalization is used and the other normalization params are ignored.
  stats_path: "/home/frappuccino/data/filelists/mike_pauses/scale_stats.npy"
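  # Minimal sketch of how the stats are usually applied (assumption -- check the audio
  # processing code): mel_norm = (mel - mel_mean) / mel_std, with mel_mean and mel_std
  # loaded from scale_stats.npy.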

checkpoint:
  start_from_iteration: -1  # -1 to resume from the checkpoint's iteration, otherwise start from the given iteration

  # use ignore_layers when --warm_start is applied.
  # An empty list is also supported; in that case any new keys present in the model
  # but not in the checkpoint will also be loaded.
  # set to speaker_embedding.weight when doing speaker adaptation
  ignore_layers: [
    "encoder.embedding.weight",
    "decoder.prenet.*",
    "decoder.linear_projection.*",
    "decoder.gate_layer.linear_layer.weight",

#    "decoder.attention_rnn.weight_ih",
#    "decoder.attention.memory_layer.linear_layer.weight",
  ]
  # for speaker adaptation
  train_only_layers: [
    # 'speaker_embedding.weight'
    # 'embedding.weight'
    # "gst.*"

    # retrain gate_layer
    # "decoder.gate_layer.linear_layer.weight",
    # "decoder.gate_layer.linear_layer.bias",
  ]

################################
# Model Parameters             #
################################
model:
  n_symbols: 153  # len(symbols)

  # Encoder parameters
  encoder_kernel_size: 5
  encoder_n_convolutions: 3
  encoder_embedding_dim: 512
  # encoder output size; effectively doubled because the LSTM is bidirectional
  encoder_lstm_hidden_dim: 512

  ################################
  # Decoder parameters           #
  ################################
  n_frames_per_step: 2
  max_decoder_steps: 1500
  gate_threshold: 0.4
  decoder_rnn_dim: 1024
  prenet_dim: 256
  prenet_noise: 0.01
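  # For reference: 1500 decoder steps * 2 frames/step * 256 hop / 22050 Hz ≈ 34.8 s of audio
  # at most; decoding is assumed to stop earlier once the gate output exceeds
  # gate_threshold (0.4) -- confirm against the decoder implementation.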

  p_teacher_forcing: 0.99
  # decaying teacher forcing is quite a strong augmentation; check the total number of iterations in advance to set this properly
  p_teacher_forcing_decay_rate: 1.0
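  # Assumed schedule (verify in the trainer): p_t = p_teacher_forcing * p_teacher_forcing_decay_rate ** t;
  # with the decay rate at 1.0 the probability stays fixed at 0.99, i.e. the decay is
  # effectively disabled here.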

  ################################
  # Attention parameters         #
  ################################
  # start from 0.2 and decrease to 0.1 by the end of training (once attention has converged)
  p_attention_dropout: 0.1
  p_decoder_dropout: 0.1
  # zoneout increases memory consumption and is a fairly strong augmentation
  use_zoneout: False

  # 'location' or 'forward'
  attention_type: 'location'
  attention_rnn_dim: 1024
  attention_dim: 128
  # Location Layer parameters
  attention_location_n_filters: 32
  attention_location_kernel_size: 31
  # Windowing Attention Parameters (only at inference)
  windowing_attention: True
  win_attention_back: 15
  win_attention_front: 25
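  # Assumed behaviour (check the attention module): at inference the alignment is restricted
  # to a window around the previously attended encoder index, roughly
  # [prev_idx - win_attention_back, prev_idx + win_attention_front] = [prev - 15, prev + 25].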
  pre_alignment: False

  # Mel post-processing network parameters
  postnet_embedding_dim: 512
  postnet_kernel_size: 5
  postnet_n_convolutions: 5
  # sometimes fixes problems with exploding gradients
  postnet_bn_disable_running_stats: False

  ################################
  # GST params                   #
  ################################
  use_gst: True
  gst_fusion_type: 'sum'  # 'sum' or 'concat'
  # if gst_fusion_type is 'sum', gst_embedding_dim must equal the encoder output size (encoder_lstm_hidden_dim * 2 = 1024)
  gst_embedding_dim: 1024
  gst_reference_encoder_dim: 128
  gst_num_heads: 8
  gst_num_style_tokens: 10
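  # In typical GST implementations each style token has size
  # gst_embedding_dim / gst_num_heads = 1024 / 8 = 128 (assumption -- depends on the
  # multi-head attention used here).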

  # for TPSE-GST prediction path
  gst_tpse_num_layers: 2
  gst_tpse_gru_hidden_size: 256
  bert_embedding_dim: 768