opensource-spraakherkenning-nl · greenw0lf · Apr 5, 2024 · Oct 5, 2023 · Nov 6, 2023 · Nov 6, 2023
diff --git a/ASR_NL_benchmark/__main__.py b/ASR_NL_benchmark/__main__.py
@@ -21,6 +21,15 @@
                         metavar='value',
                         default='',
                         help='help: True if you want to use the GUI')
+    parser.add_argument('-skip_ref_normalization', 
+                        action = 'store_true',
+                        help = 'Skip the normalization step for the reference file')
+    parser.add_argument('-skip_hyp_normalization', 
+                        action = 'store_true',
+                        help = 'Skip the normalization step for the hypothesis file')
+    parser.add_argument('-skip-normalization',
+                        action = 'store_true',
+                        help = 'Skip the normalization step for both hypothesis and reference files')
 
     args = parser.parse_args()
 
@@ -29,7 +38,12 @@
         interface.main()
     else:
         print('Running benchmarking')
-        benchmarking = pipeline.Pipeline(args.hypfile[0], args.hypfile[1], args.reffile[0], args.reffile[1], kind=args.kind)
+        # -skip-normalization is shorthand for enabling both per-file skip flags.
+        skip_ref_norm = args.skip_normalization or args.skip_ref_normalization
+        skip_hyp_norm = args.skip_normalization or args.skip_hyp_normalization
+        benchmarking = pipeline.Pipeline(args.hypfile[0], args.hypfile[1], args.reffile[0], args.reffile[1],
+                                         kind=args.kind, skip_ref_norm=skip_ref_norm,
+                                         skip_hyp_norm=skip_hyp_norm)
         benchmarking.main()
         pipeline.process_results(kind=args.kind)
 
diff --git a/ASR_NL_benchmark/interface.py b/ASR_NL_benchmark/interface.py
@@ -20,8 +20,10 @@ def upload_page():
         hyp = os.path.join(os.path.sep,'input',request.form.get('hyp'))
         ref = os.path.join(os.path.sep,'input',request.form.get('ref'))
         kind = request.form.get('kind')
+        skip_ref_norm = bool(request.form.get('skip-ref-norm'))  # checkbox: a value when ticked, absent otherwise
+        skip_hyp_norm = bool(request.form.get('skip-hyp-norm'))  # normalised to bool, matching the CLI flags
         global benchmarking
-        benchmarking = pipeline.Pipeline(hyp, 'ctm', ref, 'stm', kind)
+        benchmarking = pipeline.Pipeline(hyp, 'ctm', ref, 'stm', kind, skip_ref_norm, skip_hyp_norm)
         Thread(target=benchmarking.main).start()
         return redirect(f'/progress?ref={ref}&hyp={hyp}')
     return render_template('select_files.html')

diff --git a/ASR_NL_benchmark/normalize.py b/ASR_NL_benchmark/normalize.py
@@ -44,9 +44,10 @@ def replace_numbers_and_symbols(text):
     >>> replace_numbers_and_symbols('12,3%')
     'twaalf komma drie procent'
     """
+    removed_punct = string.punctuation.replace("'", '').replace('-', '')
     text_without_symbols = replace_symbols(text)
     clean_text = replace_numbers(text_without_symbols)
-    clean_text = clean_text.translate(str.maketrans('', '', string.punctuation))
+    clean_text = clean_text.translate(str.maketrans('', '', removed_punct))
     return clean_text
 
 def replace_numbers(text):

diff --git a/ASR_NL_benchmark/pipeline.py b/ASR_NL_benchmark/pipeline.py
@@ -26,7 +26,7 @@ def set_logging(logpath):
     return logging
 
 
-def run_pipeline(hypfile, reffile):
+def run_pipeline(hypfile, reffile, skip_ref_norm=False, skip_hyp_norm=False):
     """ Validates and Normalizes the hyp and ref file and runs them trough sclite
     Args:
         hypfile: the hypothesis file
@@ -37,9 +37,11 @@ def run_pipeline(hypfile, reffile):
     reffile.validate(great_expectations_validation)
 
     # Normalize
-    reffile.clean_text(replace_numbers_and_symbols)
+    if not skip_ref_norm:
+        reffile.clean_text(replace_numbers_and_symbols)
     reffile.export(os.path.join(os.path.sep,'input',f'{reffile.name}_normalized.{reffile.extension}'))
-    hypfile.clean_text(replace_numbers_and_symbols)
+    if not skip_hyp_norm:
+        hypfile.clean_text(replace_numbers_and_symbols)
     hypfile.export(os.path.join(os.path.sep,'input',f'{hypfile.name}_normalized.{hypfile.extension}'))
 
     #Create results folder if not exists:
@@ -56,7 +58,7 @@ def run_pipeline(hypfile, reffile):
     run = os.system(
         f"csrfilt.sh -s -i stm {os.path.join('ASR_NL_benchmark','variations.glm')} < {reffile.normalized_path} > {reffile.variation_path}")
     run = os.system(
-        f"sclite -h {hypfile.variation_path} {hypfile.extension} -r {reffile.variation_path} {reffile.extension} -m hyp -O {os.path.join(os.path.sep,'input','results')} -o dtl spk")
+        f"sclite -D -h {hypfile.variation_path} {hypfile.extension} -r {reffile.variation_path} {reffile.extension} -m hyp -O {os.path.join(os.path.sep,'input','results')} -o prf dtl spk")
 
 def calculate_wer(df):
     """ Calculates the word error rate and adds the collumn 'product' to the dataframe
@@ -210,19 +212,23 @@ def process_input(hypfile_arg, reffile_arg):
 
 
 class Pipeline():
-    def __init__(self, hypfile_input_path, hypextension, reffile_input_path, refextension, kind):
+    def __init__(self, hypfile_input_path, hypextension, reffile_input_path, refextension, kind, skip_ref_norm=False, skip_hyp_norm=False):
         self.progress = 0
         self.failed = 0
         self.hypfile_input_path = os.path.join(os.path.sep,'input',hypfile_input_path)
         self.reffile_input_path = os.path.join(os.path.sep,'input',reffile_input_path)
         self.hypextension = hypextension
         self.refextension = refextension
         self.kind = kind
+        self.skip_ref_norm = skip_ref_norm
+        self.skip_hyp_norm = skip_hyp_norm
         self.logging = set_logging(logpath=os.path.join(os.path.sep,'input',f'{date.today()}_logging.log'))
         self.logging.info(f"hypfile path from terminal: {hypfile_input_path}")
         self.logging.info(f"reffile path from terminal: {reffile_input_path}")
         self.logging.info(f"Pipeline class' hypfile path: {self.hypfile_input_path}")
         self.logging.info(f"Pipeline class' reffile path: {self.reffile_input_path}")
+        self.logging.info(f"Skip reffile normalization: {self.skip_ref_norm}")
+        self.logging.info(f"Skip hypfile normalization: {self.skip_hyp_norm}")
 
     def main(self):
         hyp_list, ref_list = process_input(self.hypfile_input_path, self.reffile_input_path)
@@ -235,7 +241,7 @@ def main(self):
                 # Parse input
                 reffile = STM(reffile_path, self.refextension)
                 hypfile = CTM(hypfile_path, self.hypextension)
-                run_pipeline(hypfile, reffile)
+                run_pipeline(hypfile, reffile, self.skip_ref_norm, self.skip_hyp_norm)
                 done += 1
                 self.progress = done/total
             except:

diff --git a/ASR_NL_benchmark/templates/select_files.html b/ASR_NL_benchmark/templates/select_files.html
@@ -21,17 +21,26 @@
 
 
   <div class="container pt-3 m-3" width="80%">
-  <h1> Select Hypothese and Reference files or folders </h1>
+  <h1> Select Hypothesis and Reference files or folders </h1>
   </div>
   <div class="container pt-3 m-3" width="80%">
   <div class="form-group">
       <form method="POST">
           <label>Name of speech recognizer</label>
           <input type="text" class="form-control" id="kind" name="kind" placeholder="Name of speech recognizer">
+          <p>_______________________________</p>
           <label>Path to hypothesis file or folder</label>
           <input type="text" class="form-control" id="hyp" name="hyp" placeholder="Hyp File or folder">
+          <input type="checkbox" id="skip-hyp-norm" name="skip-hyp-norm"> 
+          <label for="skip-hyp-norm">Skip the normalization step for the hypothesis file(s)</label>
+          <br>
+          <p>_______________________________</p>
           <label>Path to reference file or folder</label>
-          <input type="text" class="form-control" id="ref" name="ref" placeholder="Ref File or folder"><button type="submit" class="btn btn-primary" >Submit</button>
+          <input type="text" class="form-control" id="ref" name="ref" placeholder="Ref File or folder">
+          <input type="checkbox" id="skip-ref-norm" name="skip-ref-norm">
+          <label for="skip-ref-norm">Skip the normalization step for the reference file(s)</label>
+          <br>
+          <button type="submit" class="btn btn-primary" >Submit</button>
       </form>
   </div>
   </div>

diff --git a/ASR_NL_benchmark/variations.glm b/ASR_NL_benchmark/variations.glm
@@ -87,3 +87,15 @@ tewerk => te werk / [ ] __ [ ]
 [marktonderzoekbureau] => [{ marktonderzoekbureau / marktonderzoeksbureau }] / [ ] __ [ ]
 [Noordwestkust] => [{ Noordwestkust / Noord-Westkust }] / [ ] __ [ ]
 [carnavalvierders] => [{ carnavalvierders / carnavalsvierders }] / [ ] __ [ ]
+
+;; Whisper evaluation on N-Best
+;; BN-NL
+ie => hij / [ ] __ [ ]
+da's => dat is / [ ] __ [ ]
+[BNR-nieuwsradio] => [{ BNR-nieuwsradio / BNR nieuwsradio }] / [ ] __ [ ]
+[Moszkowicz] => [{ Moszkowicz / Moskovic / Moskowitz }] / [ ] __ [ ]
+[Kooi] => [{ Kooi / Kooij }] / [ ] __ [ ]
+[Araújo] => [{ Araújo / Araujo }] / [ ] __ [ ]
+[Bagdad] => [{ Bagdad / Baghdad }] / [ ] __ [ ]
+[Holleeder] => [{ Holleeder / Holleder }] / [ ] __ [ ]
+[Imac] => [{ Imac / Imaç }] / [ ] __ [ ]