Merge pull request #30 from opensource-spraakherkenning-nl/development

Skip normalization options + add variations for BN-NL part of N-Best corpus
opensource-spraakherkenning-nl · Apr 5, 2024 · 47dc7d1
2 parents 1a4745d + 0d23a05
commit 47dc7d1
Show file tree

Hide file tree

Showing 9 changed files with 436 additions and 89 deletions.
diff --git a/ASR_NL_benchmark/__main__.py b/ASR_NL_benchmark/__main__.py
@@ -6,30 +6,39 @@
 if __name__ == "__main__":
     # Set parser
     parser = argparse.ArgumentParser(description='normalize ref and hyp file')
-    parser.add_argument('-hyp', '--hypfile', nargs='+',
+    parser.add_argument('-hyp', '--hypfile', nargs=2,
                         metavar=('hypfile_name', 'extension'),
-                        default=['ASR_NL_benchmark/data/test_hyp.ctm', 'ctm'], help='help: path to the hypothesis file and its extension')
-    parser.add_argument('-ref', '--reffile', nargs='+',
+                        default=['ASR_NL_benchmark/data/test_hyp.ctm', 'ctm'],
+                        help='path to the hypothesis file and its extension')
+    parser.add_argument('-ref', '--reffile', nargs=2,
                         metavar=('reffile_name', 'extension'),
                         default=['ASR_NL_benchmark/data/test_ref.stm', 'stm'],
-                        help='help: path to the reference file and its extension')
+                        help='path to the reference file and its extension')
     parser.add_argument('-kind', '--kind',
                         metavar=('speechrecognizer'),
                         default='',
-                        help='help: enter the name of your speech recognizer')
+                        help='enter the name of your speech recognizer')
     parser.add_argument('-interactive',
-                        metavar='value',
-                        default='',
-                        help='help: True if you want to use the GUI')
+                        action = 'store_true',
+                        help='if you want to use the GUI')
+    parser.add_argument('-skip_ref_normalization', 
+                        action = 'store_true',
+                        help = 'skip the normalization step for the reference file')
+    parser.add_argument('-skip_hyp_normalization', 
+                        action = 'store_true',
+                        help = 'skip the normalization step for the hypothesis file')
 
     args = parser.parse_args()
 
-    if bool(args.interactive):
+    if args.interactive:
         print('Opening interface')
         interface.main()
     else:
         print('Running benchmarking')
-        benchmarking = pipeline.Pipeline(args.hypfile[0], args.hypfile[1], args.reffile[0], args.reffile[1], kind=args.kind)
+        benchmarking = pipeline.Pipeline(args.hypfile[0], args.hypfile[1], \
+                                         args.reffile[0], args.reffile[1], \
+                                         kind=args.kind, \
+                                         skip_ref_norm=args.skip_ref_normalization, skip_hyp_norm=args.skip_hyp_normalization)
         benchmarking.main()
         pipeline.process_results(kind=args.kind)
 
diff --git a/ASR_NL_benchmark/interface.py b/ASR_NL_benchmark/interface.py
@@ -20,8 +20,10 @@ def upload_page():
         hyp = os.path.join(os.path.sep,'input',request.form.get('hyp'))
         ref = os.path.join(os.path.sep,'input',request.form.get('ref'))
         kind = request.form.get('kind')
+        skip_ref_norm = request.form.get('skip-ref-norm')
+        skip_hyp_norm = request.form.get('skip-hyp-norm')
         global benchmarking
-        benchmarking = pipeline.Pipeline(hyp, 'ctm', ref, 'stm', kind)
+        benchmarking = pipeline.Pipeline(hyp, 'ctm', ref, 'stm', kind, skip_ref_norm, skip_hyp_norm)
         Thread(target=benchmarking.main).start()
         return redirect(f'/progress?ref={ref}&hyp={hyp}')
     return render_template('select_files.html')

diff --git a/ASR_NL_benchmark/normalize.py b/ASR_NL_benchmark/normalize.py
@@ -20,6 +20,7 @@ def check_and_covert_interger(word):
             float(word)
             new_word = num2words(word, to='cardinal', lang='nl')
             new_word = new_word.replace('komma', 'punt')
+            new_word = new_word.replace('duizend', 'duizend ')
             logging.info(f'converted the number {word} to {new_word}')
             return new_word
         except:
@@ -29,6 +30,7 @@ def check_and_covert_interger(word):
         try:
             float(new_word)
             new_word = num2words(new_word, to='cardinal', lang='nl')
+            new_word = new_word.replace('duizend', 'duizend ')
             logging.info(f'converted number {word} to {new_word}')
             return new_word
         except:
@@ -44,9 +46,10 @@ def replace_numbers_and_symbols(text):
     >>> replace_numbers_and_symbols('12,3%')
     'twaalf komma drie procent'
     """
+    removed_punct = string.punctuation.replace("'", '').replace('-', '')
     text_without_symbols = replace_symbols(text)
     clean_text = replace_numbers(text_without_symbols)
-    clean_text = clean_text.translate(str.maketrans('', '', string.punctuation))
+    clean_text = clean_text.translate(str.maketrans('', '', removed_punct))
     return clean_text
 
 def replace_numbers(text):
@@ -65,6 +68,7 @@ def replace_numbers(text):
         if word.isdigit():
             number_of_numbers += 1
             text_list[position] = num2words(word, to='cardinal', lang='nl')
+            text_list[position] = text_list[position].replace('duizend', 'duizend ')
         elif check_and_covert_interger(word):
             text_list[position] = check_and_covert_interger(word)
     text_without_numbers = " ".join(text_list)

diff --git a/ASR_NL_benchmark/pipeline.py b/ASR_NL_benchmark/pipeline.py
@@ -26,7 +26,7 @@ def set_logging(logpath):
     return logging
 
 
-def run_pipeline(hypfile, reffile):
+def run_pipeline(hypfile, reffile, skip_ref_norm, skip_hyp_norm):
     """ Validates and Normalizes the hyp and ref file and runs them trough sclite
     Args:
         hypfile: the hypothesis file
@@ -37,26 +37,30 @@ def run_pipeline(hypfile, reffile):
     reffile.validate(great_expectations_validation)
 
     # Normalize
-    reffile.clean_text(replace_numbers_and_symbols)
+    if not skip_ref_norm:
+        reffile.clean_text(replace_numbers_and_symbols)
     reffile.export(os.path.join(os.path.sep,'input',f'{reffile.name}_normalized.{reffile.extension}'))
-    hypfile.clean_text(replace_numbers_and_symbols)
+    if not skip_hyp_norm:
+        hypfile.clean_text(replace_numbers_and_symbols)
     hypfile.export(os.path.join(os.path.sep,'input',f'{hypfile.name}_normalized.{hypfile.extension}'))
 
     #Create results folder if not exists:
     if not os.path.exists(os.path.join(os.path.sep,'input','results')):
         os.makedirs(os.path.join(os.path.sep,'input','results'))
+
+    # sclite command to be logged and executed
+    command = f"sclite -D -h {hypfile.variation_path} {hypfile.extension} -r {reffile.variation_path} {reffile.extension} \
+        -m hyp -O {os.path.join(os.path.sep,'input','results')} -o prf dtl spk"
 
     # Run variation scripts
-    logging.info(
-        f"running: sclite -h {hypfile.normalized_path} {hypfile.extension} -r {reffile.normalized_path} {reffile.extension} -m hyp -O {os.path.join(os.path.sep,'input','results')}  -o dtl spk")
-    run = os.system(
-        f"csrfilt.sh -s -i ctm {os.path.join('ASR_NL_benchmark','variations.glm')} < {hypfile.normalized_path} > {hypfile.variation_path}")
+    # Hypothesis
+    run = os.system(f"csrfilt.sh -s -i ctm {os.path.join('ASR_NL_benchmark','variations.glm')} < {hypfile.normalized_path} > {hypfile.variation_path}")
+    # Reference
+    run = os.system(f"csrfilt.sh -s -i stm {os.path.join('ASR_NL_benchmark','variations.glm')} < {reffile.normalized_path} > {reffile.variation_path}")
 
-    # Run sclite
-    run = os.system(
-        f"csrfilt.sh -s -i stm {os.path.join('ASR_NL_benchmark','variations.glm')} < {reffile.normalized_path} > {reffile.variation_path}")
-    run = os.system(
-        f"sclite -h {hypfile.variation_path} {hypfile.extension} -r {reffile.variation_path} {reffile.extension} -m hyp -O {os.path.join(os.path.sep,'input','results')} -o dtl spk")
+    # Log & run sclite
+    logging.info("running:" + command)
+    run = os.system(command)
 
 def calculate_wer(df):
     """ Calculates the word error rate and adds the collumn 'product' to the dataframe
@@ -210,19 +214,23 @@ def process_input(hypfile_arg, reffile_arg):
 
 
 class Pipeline():
-    def __init__(self, hypfile_input_path, hypextension, reffile_input_path, refextension, kind):
+    def __init__(self, hypfile_input_path, hypextension, reffile_input_path, refextension, kind, skip_ref_norm, skip_hyp_norm):
         self.progress = 0
         self.failed = 0
         self.hypfile_input_path = os.path.join(os.path.sep,'input',hypfile_input_path)
         self.reffile_input_path = os.path.join(os.path.sep,'input',reffile_input_path)
         self.hypextension = hypextension
         self.refextension = refextension
         self.kind = kind
+        self.skip_ref_norm = skip_ref_norm
+        self.skip_hyp_norm = skip_hyp_norm
         self.logging = set_logging(logpath=os.path.join(os.path.sep,'input',f'{date.today()}_logging.log'))
         self.logging.info(f"hypfile path from terminal: {hypfile_input_path}")
         self.logging.info(f"reffile path from terminal: {reffile_input_path}")
         self.logging.info(f"Pipeline class' hypfile path: {self.hypfile_input_path}")
         self.logging.info(f"Pipeline class' reffile path: {self.reffile_input_path}")
+        self.logging.info(f"Skip reffile normalization: {self.skip_ref_norm}")
+        self.logging.info(f"Skip hypfile normalization: {self.skip_hyp_norm}")
 
     def main(self):
         hyp_list, ref_list = process_input(self.hypfile_input_path, self.reffile_input_path)
@@ -235,7 +243,7 @@ def main(self):
                 # Parse input
                 reffile = STM(reffile_path, self.refextension)
                 hypfile = CTM(hypfile_path, self.hypextension)
-                run_pipeline(hypfile, reffile)
+                run_pipeline(hypfile, reffile, self.skip_ref_norm, self.skip_hyp_norm)
                 done += 1
                 self.progress = done/total
             except:

diff --git a/ASR_NL_benchmark/templates/select_files.html b/ASR_NL_benchmark/templates/select_files.html
@@ -21,17 +21,26 @@
 
 
   <div class="container pt-3 m-3" width="80%">
-  <h1> Select Hypothese and Reference files or folders </h1>
+  <h1> Select Hypothesis and Reference files or folders </h1>
   </div>
   <div class="container pt-3 m-3" width="80%">
   <div class="form-group">
       <form method="POST">
           <label>Name of speech recognizer</label>
           <input type="text" class="form-control" id="kind" name="kind" placeholder="Name of speech recognizer">
+          <p>_______________________________</p>
           <label>Path to hypothesis file or folder</label>
           <input type="text" class="form-control" id="hyp" name="hyp" placeholder="Hyp File or folder">
+          <input type="checkbox" id="skip-hyp-norm" name="skip-hyp-norm"> 
+          <label for="skip-hyp-norm">Skip the normalization step for the hypothesis file(s)</label>
+          <br>
+          <p>_______________________________</p>
           <label>Path to reference file or folder</label>
-          <input type="text" class="form-control" id="ref" name="ref" placeholder="Ref File or folder"><button type="submit" class="btn btn-primary" >Submit</button>
+          <input type="text" class="form-control" id="ref" name="ref" placeholder="Ref File or folder">
+          <input type="checkbox" id="skip-ref-norm" name="skip-ref-norm">
+          <label for="skip-ref-norm">Skip the normalization step for the reference file(s)</label>
+          <br>
+          <button type="submit" class="btn btn-primary" >Submit</button>
       </form>
   </div>
   </div>

diff --git a/ASR_NL_benchmark/variations.glm b/ASR_NL_benchmark/variations.glm
@@ -87,3 +87,15 @@ tewerk => te werk / [ ] __ [ ]
 [marktonderzoekbureau] => [{ marktonderzoekbureau / marktonderzoeksbureau }] / [ ] __ [ ]
 [Noordwestkust] => [{ Noordwestkust / Noord-Westkust }] / [ ] __ [ ]
 [carnavalvierders] => [{ carnavalvierders / carnavalsvierders }] / [ ] __ [ ]
+
+;; Whisper evaluation on N-Best
+;; BN-NL
+ie => hij / [ ] __ [ ]
+da's => dat is / [ ] __ [ ]
+[BNR-nieuwsradio] => [{ BNR-nieuwsradio / BNR nieuwsradio }] / [ ] __ [ ]
+[Moszkowicz] => [{ Moszkowicz / Moskovic / Moskowitz }] / [ ] __ [ ]
+[Kooi] => [{ Kooi / Kooij }] / [ ] __ [ ]
+[Araújo] => [{ Araújo / Araujo }] / [ ] __ [ ]
+[Bagdad] => [{ Bagdad / Baghdad }] / [ ] __ [ ]
+[Holleeder] => [{ Holleeder / Holleder }] / [ ] __ [ ]
+[Imac] => [{ Imac / Imaç }] / [ ] __ [ ]