Skip to content

Commit f9e05fe

Browse files
Alexandre Lissy authored and committed
Share argparser amongst importers
1 parent 29a2ac3 commit f9e05fe

14 files changed: +72 −53 lines

bin/import_aidatatang.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import sys
88
sys.path.insert(1, os.path.join(sys.path[0], '..'))
99

10-
import argparse
10+
from util.importers import get_importers_parser
1111
import glob
1212
import pandas
1313
import tarfile
@@ -81,7 +81,7 @@ def load_set(glob_path):
8181

8282
def main():
8383
# https://www.openslr.org/62/
84-
parser = argparse.ArgumentParser(description='Import aidatatang_200zh corpus')
84+
parser = get_importers_parser(description='Import aidatatang_200zh corpus')
8585
parser.add_argument('tgz_file', help='Path to aidatatang_200zh.tgz')
8686
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.')
8787
params = parser.parse_args()

bin/import_aishell.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import sys
88
sys.path.insert(1, os.path.join(sys.path[0], '..'))
99

10-
import argparse
10+
from util.importers import get_importers_parser
1111
import glob
1212
import tarfile
1313
import pandas
@@ -80,7 +80,7 @@ def load_set(glob_path):
8080

8181
def main():
8282
# http://www.openslr.org/33/
83-
parser = argparse.ArgumentParser(description='Import AISHELL corpus')
83+
parser = get_importers_parser(description='Import AISHELL corpus')
8484
parser.add_argument('aishell_tgz_file', help='Path to data_aishell.tgz')
8585
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.')
8686
params = parser.parse_args()

bin/import_cv2.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
import csv
1818
import sox
19-
import argparse
2019
import subprocess
2120
import progressbar
2221
import unicodedata
@@ -26,7 +25,8 @@
2625
from multiprocessing.dummy import Pool
2726
from multiprocessing import cpu_count
2827
from util.downloader import SIMPLE_BAR
29-
from util.text import Alphabet, validate_label
28+
from util.text import Alphabet
29+
from util.importers import get_importers_parser, validate_label_eng as validate_label
3030
from util.helpers import secs_to_hours
3131

3232

@@ -136,7 +136,7 @@ def _maybe_convert_wav(mp3_filename, wav_filename):
136136

137137

138138
if __name__ == "__main__":
139-
PARSER = argparse.ArgumentParser(description='Import CommonVoice v2.0 corpora')
139+
PARSER = get_importers_parser(description='Import CommonVoice v2.0 corpora')
140140
PARSER.add_argument('tsv_dir', help='Directory containing tsv files')
141141
PARSER.add_argument('--audio_dir', help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"')
142142
PARSER.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet')

bin/import_freestmandarin.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import sys
88
sys.path.insert(1, os.path.join(sys.path[0], '..'))
99

10-
import argparse
10+
from util.importers import get_importers_parser
1111
import glob
1212
import numpy as np
1313
import pandas
@@ -81,7 +81,7 @@ def load_set(glob_path):
8181

8282
def main():
8383
# https://www.openslr.org/38/
84-
parser = argparse.ArgumentParser(description='Import Free ST Chinese Mandarin corpus')
84+
parser = get_importers_parser(description='Import Free ST Chinese Mandarin corpus')
8585
parser.add_argument('tgz_file', help='Path to ST-CMDS-20170001_1-OS.tar.gz')
8686
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.')
8787
params = parser.parse_args()

bin/import_gram_vaani.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
#!/usr/bin/env python
22

3+
# Make sure we can import stuff from util/
4+
# This script needs to be run from the root of the DeepSpeech repository
35
import os
4-
import csv
56
import sys
7+
sys.path.insert(1, os.path.join(sys.path[0], '..'))
8+
9+
import csv
610
import math
711
import urllib
812
import logging
9-
import argparse
13+
from util.importers import get_importers_parser
1014
import subprocess
1115
from os import path
1216
from pathlib import Path
@@ -38,7 +42,7 @@ def parse_args(args):
3842
Returns:
3943
:obj:`argparse.Namespace`: command line parameters namespace
4044
"""
41-
parser = argparse.ArgumentParser(
45+
parser = get_importers_parser(
4246
description="Imports GramVaani data for Deep Speech"
4347
)
4448
parser.add_argument(

bin/import_lingua_libre.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,12 @@
33

44
# Make sure we can import stuff from util/
55
# This script needs to be run from the root of the DeepSpeech repository
6-
import argparse
76
import os
87
import sys
9-
10-
118
sys.path.insert(1, os.path.join(sys.path[0], '..'))
129

10+
from util.importers import get_importers_parser
11+
1312
import csv
1413
import re
1514
import sox
@@ -173,7 +172,7 @@ def _maybe_convert_wav(ogg_filename, wav_filename):
173172
print('SoX processing error', ex, ogg_filename, wav_filename)
174173

175174
def handle_args():
176-
parser = argparse.ArgumentParser(description='Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details.')
175+
parser = get_importers_parser(description='Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details.')
177176
parser.add_argument(dest='target_dir')
178177
parser.add_argument('--qId', type=int, required=True, help='LinguaLibre language qId')
179178
parser.add_argument('--iso639-3', type=str, required=True, help='ISO639-3 language code')

bin/import_m-ailabs.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44

55
# Make sure we can import stuff from util/
66
# This script needs to be run from the root of the DeepSpeech repository
7-
import argparse
87
import os
98
import sys
109

1110
sys.path.insert(1, os.path.join(sys.path[0], '..'))
1211

12+
from util.importers import get_importers_parser
13+
1314
import csv
1415
import subprocess
1516
import progressbar
@@ -168,7 +169,7 @@ def one_sample(sample):
168169

169170

170171
def handle_args():
171-
parser = argparse.ArgumentParser(description='Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/.')
172+
parser = get_importers_parser(description='Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/.')
172173
parser.add_argument(dest='target_dir')
173174
parser.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet')
174175
parser.add_argument('--normalize', action='store_true', help='Converts diacritic characters to their base ones')

bin/import_magicdata.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import sys
88
sys.path.insert(1, os.path.join(sys.path[0], '..'))
99

10-
import argparse
10+
from util.importers import get_importers_parser
1111
import glob
1212
import pandas
1313
import tarfile
@@ -99,7 +99,7 @@ def load_set(set_path):
9999

100100
def main():
101101
# https://openslr.org/68/
102-
parser = argparse.ArgumentParser(description='Import MAGICDATA corpus')
102+
parser = get_importers_parser(description='Import MAGICDATA corpus')
103103
parser.add_argument('folder_with_archives', help='Path to folder containing magicdata_{train,dev,test}.tar.gz')
104104
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to a folder called magicdata next to the archives')
105105
params = parser.parse_args()

bin/import_primewords.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import sys
88
sys.path.insert(1, os.path.join(sys.path[0], '..'))
99

10-
import argparse
10+
from util.importers import get_importers_parser
1111
import glob
1212
import json
1313
import numpy as np
@@ -93,7 +93,7 @@ def load_set(glob_path):
9393

9494
def main():
9595
# https://www.openslr.org/47/
96-
parser = argparse.ArgumentParser(description='Import Primewords Chinese corpus set 1')
96+
parser = get_importers_parser(description='Import Primewords Chinese corpus set 1')
9797
parser.add_argument('tgz_file', help='Path to primewords_md_2018_set1.tar.gz')
9898
parser.add_argument('--target_dir', default='', help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.')
9999
params = parser.parse_args()

bin/import_slr57.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,12 @@
33

44
# Make sure we can import stuff from util/
55
# This script needs to be run from the root of the DeepSpeech repository
6-
import argparse
76
import os
87
import sys
9-
10-
118
sys.path.insert(1, os.path.join(sys.path[0], '..'))
129

10+
from util.importers import get_importers_parser
11+
1312
import csv
1413
import re
1514
import sox
@@ -195,7 +194,7 @@ def one_sample(sample):
195194
print('Final amount of imported audio: %s.' % secs_to_hours(counter['total_time'] / SAMPLE_RATE))
196195

197196
def handle_args():
198-
parser = argparse.ArgumentParser(description='Importer for African Accented French dataset. More information on http://www.openslr.org/57/.')
197+
parser = get_importers_parser(description='Importer for African Accented French dataset. More information on http://www.openslr.org/57/.')
199198
parser.add_argument(dest='target_dir')
200199
parser.add_argument('--filter_alphabet', help='Exclude samples with characters not in provided alphabet')
201200
parser.add_argument('--normalize', action='store_true', help='Converts diacritic characters to their base ones')

bin/import_ts.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,13 @@
33

44
# Make sure we can import stuff from util/
55
# This script needs to be run from the root of the DeepSpeech repository
6-
import argparse
76
import os
87
import re
98
import sys
10-
11-
129
sys.path.insert(1, os.path.join(sys.path[0], '..'))
1310

11+
from util.importers import get_importers_parser
12+
1413
import csv
1514
import unidecode
1615
import zipfile
@@ -186,7 +185,7 @@ def cleanup_transcript(text, english_compatible=False):
186185

187186

188187
def handle_args():
189-
parser = argparse.ArgumentParser(description='Importer for TrainingSpeech dataset.')
188+
parser = get_importers_parser(description='Importer for TrainingSpeech dataset.')
190189
parser.add_argument(dest='target_dir')
191190
parser.add_argument('--english-compatible', action='store_true', dest='english_compatible', help='Remove diactrics and other non-ascii chars.')
192191
return parser.parse_args()

util/importers.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import argparse
2+
import re
3+
4+
def get_importers_parser(description):
5+
parser = argparse.ArgumentParser(description=description)
6+
return parser
7+
8+
# Validate and normalize transcriptions. Returns a cleaned version of the label
9+
# or None if it's invalid.
10+
def validate_label_eng(label):
11+
# For now we can only handle [a-z ']
12+
if re.search(r"[0-9]|[(<\[\]&*{]", label) is not None:
13+
return None
14+
15+
label = label.replace("-", " ")
16+
label = label.replace("_", " ")
17+
label = re.sub("[ ]{2,}", " ", label)
18+
label = label.replace(".", "")
19+
label = label.replace(",", "")
20+
label = label.replace(";", "")
21+
label = label.replace("?", "")
22+
label = label.replace("!", "")
23+
label = label.replace(":", "")
24+
label = label.replace("\"", "")
25+
label = label.strip()
26+
label = label.lower()
27+
28+
return label if label else None

util/test_importers.py

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import unittest
2+
3+
from .importers import validate_label_eng
4+
5+
class TestValidateLabelEng(unittest.TestCase):
6+
7+
def test_numbers(self):
8+
label = validate_label_eng("this is a 1 2 3 test")
9+
self.assertEqual(label, None)
10+
11+
if __name__ == '__main__':
12+
unittest.main()

util/text.py

-23
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from __future__ import absolute_import, division, print_function
22

33
import numpy as np
4-
import re
54
import struct
65

76
from six.moves import range
@@ -166,25 +165,3 @@ def levenshtein(a, b):
166165
current[j] = min(add, delete, change)
167166

168167
return current[n]
169-
170-
# Validate and normalize transcriptions. Returns a cleaned version of the label
171-
# or None if it's invalid.
172-
def validate_label(label):
173-
# For now we can only handle [a-z ']
174-
if re.search(r"[0-9]|[(<\[\]&*{]", label) is not None:
175-
return None
176-
177-
label = label.replace("-", " ")
178-
label = label.replace("_", " ")
179-
label = re.sub("[ ]{2,}", " ", label)
180-
label = label.replace(".", "")
181-
label = label.replace(",", "")
182-
label = label.replace(";", "")
183-
label = label.replace("?", "")
184-
label = label.replace("!", "")
185-
label = label.replace(":", "")
186-
label = label.replace("\"", "")
187-
label = label.strip()
188-
label = label.lower()
189-
190-
return label if label else None

Comments (0)