
Commit 5fd6d1e

Ilya Shigabeev authored and committed
initial commit
1 parent 7f68686 commit 5fd6d1e


68 files changed: +154750, -0 lines changed

LICENSE (+29 lines)

@@ -0,0 +1,29 @@
BSD 3-Clause License

Copyright (c) 2018, NVIDIA Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

alignment.jpg (18.2 KB)

configs/config_ljspeech.yaml (+171 lines)

@@ -0,0 +1,171 @@
# LJSpeech config with L2 reg, zoneout, prenet noise
################################
# Experiment Parameters        #
################################
experiment:
  name: "ljspeech"  # name of the experiment
  log_every: 10
  iters_per_checkpoint: 500
  seed: 1234
  fp16_run: False
  distributed_run: True
  dist_backend: "nccl"
  dist_url: "tcp://localhost:54321"
  cudnn_enabled: True
  cudnn_benchmark: False

training:
  epochs: 400
  batch_size: 64
  # batch size is tuned for 2 GPUs: for 1 GPU divide by 2, for 3 GPUs multiply by 1.5
  # the learning rate ramps up from 0 and decays with a cosine schedule by default
  learning_rate: 0.0012
  disable_lr_decay: True
  mask_padding: True  # set the model's padded outputs to padded values

  # Loss params
  loss_use_masking: True
  loss_bce_pos_weight: 5.0
  loss_guided_attn_weight: 0.3
  loss_guided_attn_weight_decay: 0.9995
  loss_guided_attn_min_value: 0.0001

  ################################
  # Optimization Hyperparameters #
  ################################
  use_saved_learning_rate: False
  # n_epochs is split into 4 parts: the first 2 decay from lr to lr/10,
  # the third from lr/10 to lr/100, the last from lr/100 to 0
  n_warmup_steps: 1000
  weight_decay: 0.000006
  grad_clip_thresh: 1.0

################################
# Data Parameters              #
################################
data:
  load_mel_from_disk: True
  training_files: '/home/frappuccino/data/filelists/ljspeech_pauses/train.txt'
  validation_files: '/home/frappuccino/data/filelists/ljspeech_pauses/val.txt'
  text_cleaners: ['english_cleaners']
  cmudict_path: "data/cmudict_dictionary"
  skip_heteronyms: False
  batch_group_size: 0.05
  # probability of a word being converted to phonemes
  p_arpabet: 0.99

  ################################
  # Audio Parameters             #
  ################################
  sampling_rate: 22050
  mel_fmin: 20.0
  mel_fmax: 11025.0
  n_mel_channels: 80
  max_wav_value: 32768.0
  filter_length: 1024
  win_length: 1024
  hop_length: 256

  # Normalization parameters
  # DO NOT USE WITH THE MULTI-SPEAKER MODEL. Scaler stats file computed by 'compute_statistics.py'.
  # If it is set, mean-std normalization is used and the other normalization params are ignored.
  stats_path: "/home/frappuccino/data/filelists/ljspeech_pauses/scale_stats.npy"

checkpoint:
  start_from_iteration: -1  # -1 to resume from the checkpoint's iteration, else start from the given one

  # ignore_layers is used when --warm_start is applied.
  # An empty list is also supported; in that case any new keys in the model
  # that are missing from the checkpoint will also be loaded.
  # Set to speaker_embedding.weight when doing speaker adaptation.
  ignore_layers: [
    "embedding.weight",
    "decoder.prenet.layers.0.linear_layer.weight",
    "decoder.linear_projection.linear_layer.weight",
    "decoder.linear_projection.linear_layer.bias",

    "decoder.attention_rnn.weight_ih",
    "decoder.attention.memory_layer.linear_layer.weight",
    "decoder.gate_layer.linear_layer.weight",
  ]
  # for speaker adaptation
  train_only_layers: [
    # 'speaker_embedding.weight'
    # 'embedding.weight'
    # "gst.*"

    # retrain gate_layer
    # "decoder.gate_layer.linear_layer.weight",
    # "decoder.gate_layer.linear_layer.bias",
  ]

################################
# Model Parameters             #
################################
model:
  n_symbols: 153  # len(symbols)

  # Encoder parameters
  encoder_kernel_size: 5
  encoder_n_convolutions: 3
  encoder_embedding_dim: 512
  # encoder output size; doubled in practice because the LSTM is bidirectional
  encoder_lstm_hidden_dim: 512

  ################################
  # Decoder parameters           #
  ################################
  n_frames_per_step: 2
  max_decoder_steps: 1500
  gate_threshold: 0.5
  decoder_rnn_dim: 1024
  prenet_dim: 256
  prenet_noise: 0.01

  p_teacher_forcing: 0.99
  # quite a strong augmentation; check the number of iterations in advance to set this properly
  p_teacher_forcing_decay_rate: 0.999999

  ################################
  # Attention parameters         #
  ################################
  # start from 0.2 and decrease to 0.1 by the end of training (once attention has converged)
  p_attention_dropout: 0.1
  p_decoder_dropout: 0.1
  # increases memory consumption; a pretty strong augmentation
  use_zoneout: False

  # 'location' or 'forward'
  attention_type: 'location'
  attention_rnn_dim: 1024
  attention_dim: 128
  # Location layer parameters
  attention_location_n_filters: 32
  attention_location_kernel_size: 31
  # Windowing attention parameters (inference only)
  windowing_attention: False
  win_attention_back: 15
  win_attention_front: 25
  pre_alignment: False

  # Mel post-processing network parameters
  postnet_embedding_dim: 512
  postnet_kernel_size: 5
  postnet_n_convolutions: 5
  # sometimes fixes problems with exploding gradients
  postnet_bn_disable_running_stats: False

  ################################
  # GST params                   #
  ################################
  use_gst: True
  gst_fusion_type: 'sum'
  # if fusion_type is 'sum', the embedding dim must equal encoder_out * 2
  gst_embedding_dim: 1024
  gst_reference_encoder_dim: 128
  gst_num_heads: 8
  gst_num_style_tokens: 10

  # for the TPSE-GST prediction path
  gst_tpse_num_layers: 2
  gst_tpse_gru_hidden_size: 256
  bert_embedding_dim: 768
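
Since the configs are plain YAML, a quick way to sanity-check them is to load them with PyYAML. A minimal sketch, assuming nothing about this repo's own config code (the DotDict wrapper is a hypothetical convenience, not necessarily what the project ships):

# Minimal sketch of loading one of these configs with PyYAML.
# DotDict is a hypothetical attribute-access wrapper for readability.
import yaml

class DotDict(dict):
    """Attribute-style access for nested config dicts."""
    def __getattr__(self, key):
        value = self[key]
        return DotDict(value) if isinstance(value, dict) else value

with open("configs/config_ljspeech.yaml") as f:
    cfg = DotDict(yaml.safe_load(f))

print(cfg.experiment.name)          # "ljspeech"
print(cfg.training.batch_size)      # 64
print(cfg.model.gst_embedding_dim)  # 1024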

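The loss_guided_attn_* triplet suggests a guided-attention loss in the style of Tachibana et al. (2018), with a weight that decays multiplicatively each step toward a configured floor as attention converges. A minimal sketch under that assumption (the repo's exact formulation may differ):

# Sketch of a guided-attention loss with the decaying weight implied by
# loss_guided_attn_weight / _decay / _min_value. Assumes the soft diagonal
# penalty of Tachibana et al. (2018); not this repo's verbatim code.
import torch

def guided_attention_penalty(T_text, T_mel, g=0.2):
    n = torch.arange(T_text).unsqueeze(1) / T_text   # (T_text, 1)
    t = torch.arange(T_mel).unsqueeze(0) / T_mel     # (1, T_mel)
    return 1.0 - torch.exp(-((n - t) ** 2) / (2 * g * g))

class GuidedAttentionLoss:
    def __init__(self, weight=0.3, decay=0.9995, min_value=0.0001):
        self.weight, self.decay, self.min_value = weight, decay, min_value

    def __call__(self, alignments):
        # alignments: (batch, T_mel, T_text) attention weights
        _, T_mel, T_text = alignments.shape
        W = guided_attention_penalty(T_text, T_mel).T  # (T_mel, T_text)
        loss = self.weight * (alignments * W).mean()
        # decay the weight toward the configured floor on every call
        self.weight = max(self.weight * self.decay, self.min_value)
        return loss

With weight 0.3, decay 0.9995, and floor 0.0001 as configured above, the penalty dominates early training and fades to a negligible term after a few thousand iterations.
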
configs/config_mike.yaml (+170 lines)

@@ -0,0 +1,170 @@
# Mike config (adapted from the LJSpeech config) with L2 reg, zoneout, prenet noise
################################
# Experiment Parameters        #
################################
experiment:
  name: "mike"  # name of the experiment
  log_every: 10
  iters_per_checkpoint: 500
  seed: 1234
  fp16_run: False
  distributed_run: True
  dist_backend: "nccl"
  dist_url: "tcp://localhost:54321"
  cudnn_enabled: True
  cudnn_benchmark: False

training:
  epochs: 700
  batch_size: 48
  # batch size is tuned for 2 GPUs: for 1 GPU divide by 2, for 3 GPUs multiply by 1.5
  # the learning rate ramps up from 0 and decays with a cosine schedule by default
  learning_rate: 0.0003
  disable_lr_decay: False
  mask_padding: True  # set the model's padded outputs to padded values

  # Loss params
  loss_use_masking: True
  loss_bce_pos_weight: 5.0
  loss_guided_attn_weight: 0.3
  loss_guided_attn_weight_decay: 0.9995
  loss_guided_attn_min_value: 0.0001

  ################################
  # Optimization Hyperparameters #
  ################################
  use_saved_learning_rate: False
  # n_epochs is split into 4 parts: the first 2 decay from lr to lr/10,
  # the third from lr/10 to lr/100, the last from lr/100 to 0
  n_warmup_steps: 500
  weight_decay: 0.000006
  grad_clip_thresh: 1.0

################################
# Data Parameters              #
################################
data:
  load_mel_from_disk: True
  training_files: '/home/frappuccino/data/filelists/mike_pauses/train.txt'
  validation_files: '/home/frappuccino/data/filelists/mike_pauses/val.txt'
  text_cleaners: ['english_cleaners']
  cmudict_path: "data/cmudict_dictionary"
  skip_heteronyms: False
  batch_group_size: 0.05
  # probability of a word being converted to phonemes
  p_arpabet: 0.99

  ################################
  # Audio Parameters             #
  ################################
  sampling_rate: 22050
  mel_fmin: 20.0
  mel_fmax: 11025.0
  n_mel_channels: 80
  max_wav_value: 32768.0
  filter_length: 1024
  win_length: 1024
  hop_length: 256

  # Normalization parameters
  # DO NOT USE WITH THE MULTI-SPEAKER MODEL. Scaler stats file computed by 'compute_statistics.py'.
  # If it is set, mean-std normalization is used and the other normalization params are ignored.
  stats_path: "/home/frappuccino/data/filelists/mike_pauses/scale_stats.npy"

checkpoint:
  start_from_iteration: -1  # -1 to resume from the checkpoint's iteration, else start from the given one

  # ignore_layers is used when --warm_start is applied.
  # An empty list is also supported; in that case any new keys in the model
  # that are missing from the checkpoint will also be loaded.
  # Set to speaker_embedding.weight when doing speaker adaptation.
  ignore_layers: [
    "encoder.embedding.weight",
    "decoder.prenet.*",
    "decoder.linear_projection.*",
    "decoder.gate_layer.linear_layer.weight",

    # "decoder.attention_rnn.weight_ih",
    # "decoder.attention.memory_layer.linear_layer.weight",
  ]
  # for speaker adaptation
  train_only_layers: [
    # 'speaker_embedding.weight'
    # 'embedding.weight'
    # "gst.*"

    # retrain gate_layer
    # "decoder.gate_layer.linear_layer.weight",
    # "decoder.gate_layer.linear_layer.bias",
  ]

################################
# Model Parameters             #
################################
model:
  n_symbols: 153  # len(symbols)

  # Encoder parameters
  encoder_kernel_size: 5
  encoder_n_convolutions: 3
  encoder_embedding_dim: 512
  # encoder output size; doubled in practice because the LSTM is bidirectional
  encoder_lstm_hidden_dim: 512

  ################################
  # Decoder parameters           #
  ################################
  n_frames_per_step: 2
  max_decoder_steps: 1500
  gate_threshold: 0.4
  decoder_rnn_dim: 1024
  prenet_dim: 256
  prenet_noise: 0.01

  p_teacher_forcing: 0.99
  # quite a strong augmentation; check the number of iterations in advance to set this properly
  p_teacher_forcing_decay_rate: 1.0

  ################################
  # Attention parameters         #
  ################################
  # start from 0.2 and decrease to 0.1 by the end of training (once attention has converged)
  p_attention_dropout: 0.1
  p_decoder_dropout: 0.1
  # increases memory consumption; a pretty strong augmentation
  use_zoneout: False

  # 'location' or 'forward'
  attention_type: 'location'
  attention_rnn_dim: 1024
  attention_dim: 128
  # Location layer parameters
  attention_location_n_filters: 32
  attention_location_kernel_size: 31
  # Windowing attention parameters (inference only)
  windowing_attention: True
  win_attention_back: 15
  win_attention_front: 25
  pre_alignment: False

  # Mel post-processing network parameters
  postnet_embedding_dim: 512
  postnet_kernel_size: 5
  postnet_n_convolutions: 5
  # sometimes fixes problems with exploding gradients
  postnet_bn_disable_running_stats: False

  ################################
  # GST params                   #
  ################################
  use_gst: True
  gst_fusion_type: 'sum'  # 'sum' or 'concat'
  # if fusion_type is 'sum', the embedding dim must equal encoder_out * 2
  gst_embedding_dim: 1024
  gst_reference_encoder_dim: 128
  gst_num_heads: 8
  gst_num_style_tokens: 10

  # for the TPSE-GST prediction path
  gst_tpse_num_layers: 2
  gst_tpse_gru_hidden_size: 256
  bert_embedding_dim: 768
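
Unlike the LJSpeech config, this one uses wildcard patterns in ignore_layers ("decoder.prenet.*"), which suggests glob-style matching when filtering a checkpoint for --warm_start. A minimal sketch of how that could work (the "state_dict" key and the function name are assumptions, not this repo's verified API):

# Sketch of warm-starting a model while skipping ignore_layers entries,
# matching both exact names and glob patterns like "decoder.prenet.*".
from fnmatch import fnmatch
import torch

def warm_start_state_dict(model, checkpoint_path, ignore_layers):
    # "state_dict" is an assumed checkpoint key; adjust to the actual format
    ckpt = torch.load(checkpoint_path, map_location="cpu")["state_dict"]
    filtered = {
        k: v for k, v in ckpt.items()
        if not any(fnmatch(k, pat) for pat in ignore_layers)
    }
    # strict=False keeps freshly initialized weights for the ignored keys
    model.load_state_dict(filtered, strict=False)
    return model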

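This config also enables windowing_attention, presumably constraining each inference step's attention to a window around the previous step's peak, win_attention_back frames behind and win_attention_front frames ahead. A hedged sketch of that mechanism, not the repo's verbatim code:

# Sketch of inference-time attention windowing: energies outside a window
# around the previous attention peak are masked out before the softmax.
import torch

def window_attention_energies(energies, prev_alignment, back=15, front=25):
    # energies, prev_alignment: (batch, T_text)
    center = prev_alignment.argmax(dim=1, keepdim=True)  # (batch, 1)
    positions = torch.arange(energies.size(1), device=energies.device)
    mask = (positions < center - back) | (positions > center + front)
    return energies.masked_fill(mask, float("-inf"))

Masking with -inf zeroes those positions after the softmax, which prevents the attention from jumping backward or skipping far ahead once an alignment has formed.
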
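
Finally, both configs point stats_path at a scale_stats.npy produced by 'compute_statistics.py', which implies per-channel mean-std normalization of the mels. A sketch of what computing and applying those stats could look like; the key names and the (n_mels, T) mel layout are assumptions:

# Sketch of mean-std mel normalization via a precomputed stats file.
import numpy as np

def compute_statistics(mel_paths):
    # accumulate per-channel statistics over all training mels, each (n_mels, T)
    mels = np.concatenate([np.load(p) for p in mel_paths], axis=1)
    stats = {"mel_mean": mels.mean(axis=1), "mel_std": mels.std(axis=1)}
    np.save("scale_stats.npy", stats)  # the dict is pickled into a 0-d object array
    return stats

def normalize(mel, stats):
    return (mel - stats["mel_mean"][:, None]) / stats["mel_std"][:, None]

def denormalize(mel_norm, stats):
    return mel_norm * stats["mel_std"][:, None] + stats["mel_mean"][:, None]

# loading back requires allow_pickle, since the dict is stored via pickle:
# stats = np.load("scale_stats.npy", allow_pickle=True).item()

This matches the warning in the configs: a single global mean/std only makes sense for a single-speaker dataset, hence "DO NOT USE WITH THE MULTI-SPEAKER MODEL".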