preprocess_LJSpeech.py
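"""Preprocess the LJSpeech dataset: for each clip, compute a 22.05 kHz
mel-spectrogram, a wav2vec2-based emotion representation from the 16 kHz
waveform, and a phoneme sequence, then write the features plus a
metadata.json index to --outputdir."""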
import os
import argparse
import json
import numpy as np
import torch
from tqdm import tqdm
from pathlib import Path
import librosa
import soundfile as sf
from g2p_en import G2p
from wav2vec2.wrapper import MinimalClassifier
import sys
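# The TacotronSTFT implementation lives in the vendored waveglow/tacotron2
# directory, so it is added to the import path here.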
sys.path.insert(0, 'waveglow/tacotron2')
from layers import TacotronSTFT
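# Command-line options: dataset/output locations and STFT/mel parameters.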
parser = argparse.ArgumentParser()
parser.add_argument('--datadir', type=str, required=True)
parser.add_argument('--outputdir', type=str, required=True)
parser.add_argument('--emo_model_dir', type=str, default=None)
parser.add_argument('--filter_length', type=int, default=1024)
parser.add_argument('--hop_length', type=int, default=256)
parser.add_argument('--win_length', type=int, default=1024)
parser.add_argument('--mel_fmin', type=float, default=0)
parser.add_argument('--mel_fmax', type=float, default=8000)
args = parser.parse_args()
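# Mel-spectrogram extractor; LJSpeech audio is sampled at 22,050 Hz.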
stft = TacotronSTFT(filter_length=args.filter_length,
                    hop_length=args.hop_length,
                    win_length=args.win_length,
                    sampling_rate=22050,
                    mel_fmin=args.mel_fmin, mel_fmax=args.mel_fmax).cuda()
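# Emotion encoder: load a fine-tuned checkpoint if one is given,
# otherwise fall back to the base (pretrained) wav2vec2 classifier.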
if args.emo_model_dir:
    emo_model = MinimalClassifier.load_from_checkpoint(args.emo_model_dir,
                                                       strict=False).cuda()
else:
    emo_model = MinimalClassifier().cuda()
emo_model.freeze()
emo_model.eval()
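# Grapheme-to-phoneme converter for the transcripts.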
g2p = G2p()
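# Output directories for mels, emotion representations, and 16 kHz audio.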
mel_dir = os.path.join(args.outputdir, 'mels')
Path(mel_dir).mkdir(parents=True, exist_ok=True)
emo_reps_dir = os.path.join(args.outputdir, 'emo_reps')
Path(emo_reps_dir).mkdir(parents=True, exist_ok=True)
raw_dir = os.path.join(args.outputdir, '16k_wav')
Path(raw_dir).mkdir(parents=True, exist_ok=True)
metadata = dict()
wavdir = os.path.join(args.datadir, 'wavs')
textdir = os.path.join(args.datadir, 'metadata.csv')
textdict = dict()
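# Each metadata.csv row is "id|raw text|normalized text"; keep the normalized text.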
with open(textdir, 'r') as f:
    for line in f.readlines():
        name, _, text = line.strip().split('|')
        textdict[name] = text
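# Extract features clip by clip, skipping anything longer than 10 seconds.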
for wav in tqdm(os.listdir(wavdir)):
    audio_name = os.path.join(wavdir, wav)
    basename = os.path.splitext(wav)[0]
    audio, sr = librosa.load(audio_name, sr=None)
    length = float(len(audio)) / sr
    if length > 10.:
        continue
    # Mel-spectrogram from the waveform resampled to 22.05 kHz.
    audio_22k = librosa.resample(audio, orig_sr=sr, target_sr=22050)
    audio_22k = np.clip(audio_22k, -1, 1)
    melspec = torch.cuda.FloatTensor(audio_22k).unsqueeze(0)
    melspec = stft.mel_spectrogram(melspec).squeeze(0).cpu().numpy()
    # Emotion representation from the waveform resampled to 16 kHz.
    _wav = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    _wav = np.clip(_wav, -1, 1)
    emo_reps = torch.cuda.FloatTensor(_wav).unsqueeze(0)
    emo_reps = emo_model(emo_reps).squeeze(0).cpu().numpy()
    np.save(os.path.join(mel_dir, basename + '.npy'), melspec)
    sf.write(os.path.join(raw_dir, basename + '.wav'), _wav, 16000)
    np.save(os.path.join(emo_reps_dir, basename + '.npy'), emo_reps)
    phonemes = g2p(textdict[basename])
    metadata[basename] = {
        'length': length,
        'text': textdict[basename],
        'phonemes': phonemes
    }
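# Dump the collected per-clip metadata as a single JSON index.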
with open(os.path.join(args.outputdir, 'metadata.json'), 'w') as f:
    json.dump(metadata, f, indent=4)
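# Example invocation, run from the repo root so the waveglow/tacotron2 import
# path resolves (the data paths below are placeholders):
#   python preprocess_LJSpeech.py --datadir /data/LJSpeech-1.1 --outputdir /data/preprocessed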