Skip to content

Commit

Permalink
Merge pull request #30 from opensource-spraakherkenning-nl/development
Browse files Browse the repository at this point in the history
Skip normalization options + add variations for BN-NL part of N-Best corpus
  • Loading branch information
greenw0lf authored Apr 5, 2024
2 parents 1a4745d + 0d23a05 commit 47dc7d1
Show file tree
Hide file tree
Showing 9 changed files with 436 additions and 89 deletions.
29 changes: 19 additions & 10 deletions ASR_NL_benchmark/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,39 @@
if __name__ == "__main__":
# Set parser
parser = argparse.ArgumentParser(description='normalize ref and hyp file')
parser.add_argument('-hyp', '--hypfile', nargs='+',
parser.add_argument('-hyp', '--hypfile', nargs=2,
metavar=('hypfile_name', 'extension'),
default=['ASR_NL_benchmark/data/test_hyp.ctm', 'ctm'], help='help: path to the hypothesis file and its extension')
parser.add_argument('-ref', '--reffile', nargs='+',
default=['ASR_NL_benchmark/data/test_hyp.ctm', 'ctm'],
help='path to the hypothesis file and its extension')
parser.add_argument('-ref', '--reffile', nargs=2,
metavar=('reffile_name', 'extension'),
default=['ASR_NL_benchmark/data/test_ref.stm', 'stm'],
help='help: path to the reference file and its extension')
help='path to the reference file and its extension')
parser.add_argument('-kind', '--kind',
metavar=('speechrecognizer'),
default='',
help='help: enter the name of your speech recognizer')
help='enter the name of your speech recognizer')
parser.add_argument('-interactive',
metavar='value',
default='',
help='help: True if you want to use the GUI')
action = 'store_true',
help='if you want to use the GUI')
parser.add_argument('-skip_ref_normalization',
action = 'store_true',
help = 'skip the normalization step for the reference file')
parser.add_argument('-skip_hyp_normalization',
action = 'store_true',
help = 'skip the normalization step for the hypothesis file')

args = parser.parse_args()

if bool(args.interactive):
if args.interactive:
print('Opening interface')
interface.main()
else:
print('Running benchmarking')
benchmarking = pipeline.Pipeline(args.hypfile[0], args.hypfile[1], args.reffile[0], args.reffile[1], kind=args.kind)
benchmarking = pipeline.Pipeline(args.hypfile[0], args.hypfile[1], \
args.reffile[0], args.reffile[1], \
kind=args.kind, \
skip_ref_norm=args.skip_ref_normalization, skip_hyp_norm=args.skip_hyp_normalization)
benchmarking.main()
pipeline.process_results(kind=args.kind)

4 changes: 3 additions & 1 deletion ASR_NL_benchmark/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ def upload_page():
hyp = os.path.join(os.path.sep,'input',request.form.get('hyp'))
ref = os.path.join(os.path.sep,'input',request.form.get('ref'))
kind = request.form.get('kind')
skip_ref_norm = request.form.get('skip-ref-norm')
skip_hyp_norm = request.form.get('skip-hyp-norm')
global benchmarking
benchmarking = pipeline.Pipeline(hyp, 'ctm', ref, 'stm', kind)
benchmarking = pipeline.Pipeline(hyp, 'ctm', ref, 'stm', kind, skip_ref_norm, skip_hyp_norm)
Thread(target=benchmarking.main).start()
return redirect(f'/progress?ref={ref}&hyp={hyp}')
return render_template('select_files.html')
Expand Down
6 changes: 5 additions & 1 deletion ASR_NL_benchmark/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def check_and_covert_interger(word):
float(word)
new_word = num2words(word, to='cardinal', lang='nl')
new_word = new_word.replace('komma', 'punt')
new_word = new_word.replace('duizend', 'duizend ')
logging.info(f'converted the number {word} to {new_word}')
return new_word
except:
Expand All @@ -29,6 +30,7 @@ def check_and_covert_interger(word):
try:
float(new_word)
new_word = num2words(new_word, to='cardinal', lang='nl')
new_word = new_word.replace('duizend', 'duizend ')
logging.info(f'converted number {word} to {new_word}')
return new_word
except:
Expand All @@ -44,9 +46,10 @@ def replace_numbers_and_symbols(text):
>>> replace_numbers_and_symbols('12,3%')
'twaalf komma drie procent'
"""
removed_punct = string.punctuation.replace("'", '').replace('-', '')
text_without_symbols = replace_symbols(text)
clean_text = replace_numbers(text_without_symbols)
clean_text = clean_text.translate(str.maketrans('', '', string.punctuation))
clean_text = clean_text.translate(str.maketrans('', '', removed_punct))
return clean_text

def replace_numbers(text):
Expand All @@ -65,6 +68,7 @@ def replace_numbers(text):
if word.isdigit():
number_of_numbers += 1
text_list[position] = num2words(word, to='cardinal', lang='nl')
text_list[position] = text_list[position].replace('duizend', 'duizend ')
elif check_and_covert_interger(word):
text_list[position] = check_and_covert_interger(word)
text_without_numbers = " ".join(text_list)
Expand Down
36 changes: 22 additions & 14 deletions ASR_NL_benchmark/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def set_logging(logpath):
return logging


def run_pipeline(hypfile, reffile):
def run_pipeline(hypfile, reffile, skip_ref_norm, skip_hyp_norm):
""" Validates and normalizes the hyp and ref files and runs them through sclite
Args:
hypfile: the hypothesis file
Expand All @@ -37,26 +37,30 @@ def run_pipeline(hypfile, reffile):
reffile.validate(great_expectations_validation)

# Normalize
reffile.clean_text(replace_numbers_and_symbols)
if not skip_ref_norm:
reffile.clean_text(replace_numbers_and_symbols)
reffile.export(os.path.join(os.path.sep,'input',f'{reffile.name}_normalized.{reffile.extension}'))
hypfile.clean_text(replace_numbers_and_symbols)
if not skip_hyp_norm:
hypfile.clean_text(replace_numbers_and_symbols)
hypfile.export(os.path.join(os.path.sep,'input',f'{hypfile.name}_normalized.{hypfile.extension}'))

#Create results folder if not exists:
if not os.path.exists(os.path.join(os.path.sep,'input','results')):
os.makedirs(os.path.join(os.path.sep,'input','results'))

# sclite command to be logged and executed
command = f"sclite -D -h {hypfile.variation_path} {hypfile.extension} -r {reffile.variation_path} {reffile.extension} \
-m hyp -O {os.path.join(os.path.sep,'input','results')} -o prf dtl spk"

# Run variation scripts
logging.info(
f"running: sclite -h {hypfile.normalized_path} {hypfile.extension} -r {reffile.normalized_path} {reffile.extension} -m hyp -O {os.path.join(os.path.sep,'input','results')} -o dtl spk")
run = os.system(
f"csrfilt.sh -s -i ctm {os.path.join('ASR_NL_benchmark','variations.glm')} < {hypfile.normalized_path} > {hypfile.variation_path}")
# Hypothesis
run = os.system(f"csrfilt.sh -s -i ctm {os.path.join('ASR_NL_benchmark','variations.glm')} < {hypfile.normalized_path} > {hypfile.variation_path}")
# Reference
run = os.system(f"csrfilt.sh -s -i stm {os.path.join('ASR_NL_benchmark','variations.glm')} < {reffile.normalized_path} > {reffile.variation_path}")

# Run sclite
run = os.system(
f"csrfilt.sh -s -i stm {os.path.join('ASR_NL_benchmark','variations.glm')} < {reffile.normalized_path} > {reffile.variation_path}")
run = os.system(
f"sclite -h {hypfile.variation_path} {hypfile.extension} -r {reffile.variation_path} {reffile.extension} -m hyp -O {os.path.join(os.path.sep,'input','results')} -o dtl spk")
# Log & run sclite
logging.info("running:" + command)
run = os.system(command)

def calculate_wer(df):
""" Calculates the word error rate and adds the column 'product' to the dataframe
Expand Down Expand Up @@ -210,19 +214,23 @@ def process_input(hypfile_arg, reffile_arg):


class Pipeline():
def __init__(self, hypfile_input_path, hypextension, reffile_input_path, refextension, kind):
def __init__(self, hypfile_input_path, hypextension, reffile_input_path, refextension, kind, skip_ref_norm, skip_hyp_norm):
self.progress = 0
self.failed = 0
self.hypfile_input_path = os.path.join(os.path.sep,'input',hypfile_input_path)
self.reffile_input_path = os.path.join(os.path.sep,'input',reffile_input_path)
self.hypextension = hypextension
self.refextension = refextension
self.kind = kind
self.skip_ref_norm = skip_ref_norm
self.skip_hyp_norm = skip_hyp_norm
self.logging = set_logging(logpath=os.path.join(os.path.sep,'input',f'{date.today()}_logging.log'))
self.logging.info(f"hypfile path from terminal: {hypfile_input_path}")
self.logging.info(f"reffile path from terminal: {reffile_input_path}")
self.logging.info(f"Pipeline class' hypfile path: {self.hypfile_input_path}")
self.logging.info(f"Pipeline class' reffile path: {self.reffile_input_path}")
self.logging.info(f"Skip reffile normalization: {self.skip_ref_norm}")
self.logging.info(f"Skip hypfile normalization: {self.skip_hyp_norm}")

def main(self):
hyp_list, ref_list = process_input(self.hypfile_input_path, self.reffile_input_path)
Expand All @@ -235,7 +243,7 @@ def main(self):
# Parse input
reffile = STM(reffile_path, self.refextension)
hypfile = CTM(hypfile_path, self.hypextension)
run_pipeline(hypfile, reffile)
run_pipeline(hypfile, reffile, self.skip_ref_norm, self.skip_hyp_norm)
done += 1
self.progress = done/total
except:
Expand Down
13 changes: 11 additions & 2 deletions ASR_NL_benchmark/templates/select_files.html
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,26 @@


<div class="container pt-3 m-3" width="80%">
<h1> Select Hypothese and Reference files or folders </h1>
<h1> Select Hypothesis and Reference files or folders </h1>
</div>
<div class="container pt-3 m-3" width="80%">
<div class="form-group">
<form method="POST">
<label>Name of speech recognizer</label>
<input type="text" class="form-control" id="kind" name="kind" placeholder="Name of speech recognizer">
<p>_______________________________</p>
<label>Path to hypothesis file or folder</label>
<input type="text" class="form-control" id="hyp" name="hyp" placeholder="Hyp File or folder">
<input type="checkbox" id="skip-hyp-norm" name="skip-hyp-norm">
<label for="skip-hyp-norm">Skip the normalization step for the hypothesis file(s)</label>
<br>
<p>_______________________________</p>
<label>Path to reference file or folder</label>
<input type="text" class="form-control" id="ref" name="ref" placeholder="Ref File or folder"><button type="submit" class="btn btn-primary" >Submit</button>
<input type="text" class="form-control" id="ref" name="ref" placeholder="Ref File or folder">
<input type="checkbox" id="skip-ref-norm" name="skip-ref-norm">
<label for="skip-ref-norm">Skip the normalization step for the reference file(s)</label>
<br>
<button type="submit" class="btn btn-primary" >Submit</button>
</form>
</div>
</div>
Expand Down
12 changes: 12 additions & 0 deletions ASR_NL_benchmark/variations.glm
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,15 @@ tewerk => te werk / [ ] __ [ ]
[marktonderzoekbureau] => [{ marktonderzoekbureau / marktonderzoeksbureau }] / [ ] __ [ ]
[Noordwestkust] => [{ Noordwestkust / Noord-Westkust }] / [ ] __ [ ]
[carnavalvierders] => [{ carnavalvierders / carnavalsvierders }] / [ ] __ [ ]

;; Whisper evaluation on N-Best
;; BN-NL
ie => hij / [ ] __ [ ]
da's => dat is / [ ] __ [ ]
[BNR-nieuwsradio] => [{ BNR-nieuwsradio / BNR nieuwsradio }] / [ ] __ [ ]
[Moszkowicz] => [{ Moszkowicz / Moskovic / Moskowitz }] / [ ] __ [ ]
[Kooi] => [{ Kooi / Kooij }] / [ ] __ [ ]
[Araújo] => [{ Araújo / Araujo }] / [ ] __ [ ]
[Bagdad] => [{ Bagdad / Baghdad }] / [ ] __ [ ]
[Holleeder] => [{ Holleeder / Holleder }] / [ ] __ [ ]
[Imac] => [{ Imac / Imaç }] / [ ] __ [ ]
Loading

0 comments on commit 47dc7d1

Please sign in to comment.