import os
from torch.utils import data
import torch
import json
import numpy as np
import soundfile as sf
import random
from pathlib import Path
from librosa.util import normalize
from pyannote.audio import Inference

import torch.nn.functional as F

def random_crop(x, maxseqlen):
    # Randomly crop x along its first axis to at most maxseqlen frames;
    # return the crop together with the chosen offset.
    if x.shape[0] >= maxseqlen:
        offset = random.randrange(x.shape[0] - maxseqlen + 1)
        x = x[offset: offset + maxseqlen]
    else:
        offset = 0
    return x, offset

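# Hedged sketch (not part of the original file): a minimal check of what
# random_crop does. The shapes and the 200-frame budget are assumptions made
# for illustration only; the helper below is defined but never called.
def _check_random_crop():
    feats = np.random.randn(312, 80)        # e.g. (frames, feature_dim)
    crop, offset = random_crop(feats, 200)  # at most 200 frames are kept
    assert crop.shape == (200, 80) and 0 <= offset <= 312 - 200
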
def dynamic_range_compression(x, C=0.3, M=6.5, clip_val=1e-5):
    # Scaled, shifted log compression: clip to clip_val, take the log, shift by M, scale by C.
    return (np.log(np.clip(x, a_min=clip_val, a_max=None)) + M) * C

def dynamic_range_decompression(x, C=0.3, M=6.5):
    # Inverse of dynamic_range_compression (exact for inputs above clip_val).
    return np.exp(x / C - M)

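# Hedged sketch (not part of the original file): round-trip sanity check for
# the two helpers above. The sample values are illustrative; the helper is
# defined but never called.
def _check_compression_roundtrip():
    x = np.array([1e-3, 1e-2, 0.1, 1.0])
    assert np.allclose(dynamic_range_decompression(dynamic_range_compression(x)), x)
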
class QuantizeDataset(data.Dataset):
    def __init__(self, hp, metapath):
        self.hp = hp
        print(f'Loading metadata in {metapath}...')
        with open(metapath, 'r') as f:
            self.text = json.load(f)  # {filename: {'text': ..., 'phoneme': ..., ..., 'duration': ...}}
        self.datasetbase = [x for x in self.text.keys()]
        self.dataset = [os.path.join(self.hp.datadir, x) for x in self.datasetbase]
        self.phoneset = ['<pad>', 'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH', ',', '.']
        print(self.phoneset)
        if self.hp.speaker_embedding_dir is None:
            self.spkr_embedding = Inference("pyannote/embedding", window="whole")

        # Print dataset statistics
        l = len(self.dataset)
        print(f'Total {l} examples')

        self.lengths = [float(v['duration']) for v in self.text.values()]
        avglen = sum(self.lengths) / len(self.lengths)
        maxlen = max(self.lengths)
        minlen = min(self.lengths)
        print(f"Average duration of audio: {avglen} sec, Maximum duration: {maxlen} sec, Minimum duration: {minlen} sec")

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        dataname = self.dataset[i]
        _name = self.datasetbase[i]
        metadata = self.text[_name]
        # Phoneme sequence to synthesize, mapped to indices in self.phoneset
        phonemes = [self.phoneset.index(ph) for ph in metadata['phoneme'].split() if ph in self.phoneset]

        if self.hp.speaker_embedding_dir is None:
            audio, sampling_rate = sf.read(dataname)
            audio = normalize(audio) * 0.95
            speaker_embedding = self.spkr_embedding({'waveform': torch.FloatTensor(audio).unsqueeze(0), 'sample_rate': self.hp.sample_rate})
        else:
            speaker_embedding = os.path.join(self.hp.speaker_embedding_dir, os.path.splitext(_name)[0] + '.npy')
            speaker_embedding = np.load(speaker_embedding).astype(np.float32)

        # Ground truth for the TTS system
        quantization = np.array(metadata['quantization']).T  # (T, n_cluster_groups)
        # Add start token (n_codes + 1) and end token (n_codes)
        start, end = np.full((1, self.hp.n_cluster_groups), self.hp.n_codes + 1, dtype=np.int16), np.full((1, self.hp.n_cluster_groups), self.hp.n_codes, dtype=np.int16)
        quantization_s = np.concatenate([start, quantization.copy()], 0)
        # Replace repeated codes with the repetition token (n_codes + 2) in the ground-truth "label" if needed
        if self.hp.use_repetition_token:
            pad = np.full((1, self.hp.n_cluster_groups), -100, dtype=np.int16)
            np_mask = np.diff(quantization, axis=0, prepend=pad)
            quantization[np_mask == 0] = self.hp.n_codes + 2
        quantization_e = np.concatenate([quantization, end], 0)
        return speaker_embedding, quantization_s, quantization_e, phonemes, dataname

    def seqCollate(self, batch):
        output = {
            'speaker': [],
            'phone': [],
            'phone_mask': [],
            'tts_quantize_input': [],
            'tts_quantize_output': [],
            'quantize_mask': [],
        }
        # Get the max length of everything
        max_len_q, max_phonelen = 0, 0
        for spkr, q_s, q_e, ph, _ in batch:
            if len(q_s) > max_len_q:
                max_len_q = len(q_s)
            if len(ph) > max_phonelen:
                max_phonelen = len(ph)
            output['speaker'].append(spkr)
        # Pad each element, create mask
        for _, qs, qe, phone, _ in batch:
            # Deal with phonemes
            phone_mask = np.array([False] * len(phone) + [True] * (max_phonelen - len(phone)))
            phone = np.pad(phone, [0, max_phonelen - len(phone)])
            # Deal with quantizations
            q_mask = np.array([False] * len(qs) + [True] * (max_len_q - len(qs)))
            qs = np.pad(qs, [[0, max_len_q - len(qs)], [0, 0]], constant_values=self.hp.n_codes)
            qe = np.pad(qe, [[0, max_len_q - len(qe)], [0, 0]], constant_values=self.hp.n_codes)
            # Aggregate
            output['phone'].append(phone)
            output['phone_mask'].append(phone_mask)
            output['tts_quantize_input'].append(qs)
            output['tts_quantize_output'].append(qe)
            output['quantize_mask'].append(q_mask)
        for k in output.keys():
            output[k] = np.array(output[k])
            if 'mask' in k:
                output[k] = torch.BoolTensor(output[k])
            elif k in ['phone', 'tts_quantize_input', 'tts_quantize_output']:
                output[k] = torch.LongTensor(output[k])
            else:
                output[k] = torch.FloatTensor(output[k])
        return output

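# Hedged sketch (not part of the original file): what seqCollate produces for a
# toy batch. The embedding size (512), n_codes (1024) and the shapes below are
# assumptions for illustration; the helper is defined but never called.
def _demo_seq_collate():
    from types import SimpleNamespace
    fake_self = SimpleNamespace(hp=SimpleNamespace(n_codes=1024))
    spkr = np.zeros(512, dtype=np.float32)
    qs = np.zeros((5, 4), dtype=np.int64)   # (T + 1, n_cluster_groups)
    qe = np.zeros((5, 4), dtype=np.int64)
    batch = [(spkr, qs, qe, [1, 2, 3], 'a.wav'),
             (spkr, qs[:3], qe[:3], [1, 2], 'b.wav')]
    out = QuantizeDataset.seqCollate(fake_self, batch)
    # Shorter items are padded (quantize rows with n_codes, phones with 0)
    # and the *_mask tensors mark the padded positions with True.
    print({k: tuple(v.shape) for k, v in out.items()})
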
class QuantizeDatasetVal(QuantizeDataset):
    # Validation variant: additionally returns the raw waveform and converts everything to tensors.
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        speaker_embedding, quantization_s, quantization_e, phonemes, dataname = super().__getitem__(i)
        audio, sampling_rate = sf.read(dataname)
        audio = normalize(audio) * 0.95
        return (
            torch.FloatTensor(speaker_embedding),
            torch.LongTensor(quantization_s),
            torch.LongTensor(quantization_e),
            torch.LongTensor(phonemes),
            torch.FloatTensor(audio)
        )
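
# Hedged usage sketch (not part of the original file): how these datasets are
# typically wired into a DataLoader. All hyperparameter values and paths below
# are illustrative assumptions, not the project's real configuration.
if __name__ == '__main__':
    from types import SimpleNamespace
    hp = SimpleNamespace(
        datadir='data/wavs',                  # assumed audio directory
        speaker_embedding_dir='data/spkr',    # set to None to run pyannote on the fly
        sample_rate=16000,
        n_cluster_groups=4,
        n_codes=1024,
        use_repetition_token=True,
    )
    dataset = QuantizeDataset(hp, 'data/metadata.json')
    loader = data.DataLoader(dataset, batch_size=8, shuffle=True,
                             collate_fn=dataset.seqCollate, num_workers=4)
    for batch in loader:
        print({k: tuple(v.shape) for k, v in batch.items()})
        break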