Skip to content

Commit

Permalink
Add small changes before adding the sc_args functionality
Browse files Browse the repository at this point in the history
Changes include:
- Changing nargs for hypfile and reffile args
- Small rewording of comments and help messages
- Removing skip_normalization as it was redundant
- Changing how the `-interactive` option behaves (it is now a flag that is True in the code when passed)
- Changing the normalization of numbers slightly (adding a space after "duizend", which is how it is written in Dutch)
- Reordering the sclite- and variation-related lines of code
- Updating the README with the newly added arguments
  • Loading branch information
greenw0lf committed Feb 22, 2024
1 parent 7b675ec commit 3074dee
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 30 deletions.
38 changes: 18 additions & 20 deletions ASR_NL_benchmark/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,44 +6,42 @@
if __name__ == "__main__":
# Set parser
parser = argparse.ArgumentParser(description='normalize ref and hyp file')
parser.add_argument('-hyp', '--hypfile', nargs='+',
parser.add_argument('-hyp', '--hypfile', nargs=2,
metavar=('hypfile_name', 'extension'),
default=['ASR_NL_benchmark/data/test_hyp.ctm', 'ctm'], help='help: path to the hypothesis file and its extension')
parser.add_argument('-ref', '--reffile', nargs='+',
default=['ASR_NL_benchmark/data/test_hyp.ctm', 'ctm'],
help='path to the hypothesis file and its extension')
parser.add_argument('-ref', '--reffile', nargs=2,
metavar=('reffile_name', 'extension'),
default=['ASR_NL_benchmark/data/test_ref.stm', 'stm'],
help='help: path to the reference file and its extension')
help='path to the reference file and its extension')
parser.add_argument('-kind', '--kind',
metavar=('speechrecognizer'),
default='',
help='help: enter the name of your speech recognizer')
help='enter the name of your speech recognizer')
parser.add_argument('-interactive',
metavar='value',
default='',
help='help: True if you want to use the GUI')
action = 'store_true',
help='if you want to use the GUI')
parser.add_argument('-skip_ref_normalization',
action = 'store_true',
help = 'Skip the normalization step for the reference file')
help = 'skip the normalization step for the reference file')
parser.add_argument('-skip_hyp_normalization',
action = 'store_true',
help = 'Skip the normalization step for the hypothesis file')
parser.add_argument('-skip-normalization',
action = 'store_true',
help = 'Skip the normalization step for both hypothesis and reference files')
help = 'skip the normalization step for the hypothesis file')
parser.add_argument('-sc_args', nargs='*',
default=[],
help='extra sclite arguments you want to use (without the -)')

args = parser.parse_args()

if bool(args.interactive):
if args.interactive:
print('Opening interface')
interface.main()
else:
print('Running benchmarking')
skip_ref_norm = args.skip_ref_normalization
skip_hyp_norm = args.skip_hyp_normalization
if args.skip_normalization:
skip_ref_norm = args.skip_ref_normalization
skip_hyp_norm = args.skip_hyp_normalization
benchmarking = pipeline.Pipeline(args.hypfile[0], args.hypfile[1], args.reffile[0], args.reffile[1], kind=args.kind, skip_ref_norm=skip_ref_norm, skip_hyp_norm=skip_hyp_norm)
benchmarking = pipeline.Pipeline(args.hypfile[0], args.hypfile[1], \
args.reffile[0], args.reffile[1], \
kind=args.kind, \
skip_ref_norm=args.skip_ref_normalization, skip_hyp_norm=args.skip_hyp_normalization)
benchmarking.main()
pipeline.process_results(kind=args.kind)

3 changes: 3 additions & 0 deletions ASR_NL_benchmark/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def check_and_covert_interger(word):
float(word)
new_word = num2words(word, to='cardinal', lang='nl')
new_word = new_word.replace('komma', 'punt')
new_word = new_word.replace('duizend', 'duizend ')
logging.info(f'converted the number {word} to {new_word}')
return new_word
except:
Expand All @@ -29,6 +30,7 @@ def check_and_covert_interger(word):
try:
float(new_word)
new_word = num2words(new_word, to='cardinal', lang='nl')
new_word = new_word.replace('duizend', 'duizend ')
logging.info(f'converted number {word} to {new_word}')
return new_word
except:
Expand Down Expand Up @@ -66,6 +68,7 @@ def replace_numbers(text):
if word.isdigit():
number_of_numbers += 1
text_list[position] = num2words(word, to='cardinal', lang='nl')
text_list[position] = text_list[position].replace('duizend', 'duizend ')
elif check_and_covert_interger(word):
text_list[position] = check_and_covert_interger(word)
text_without_numbers = " ".join(text_list)
Expand Down
22 changes: 12 additions & 10 deletions ASR_NL_benchmark/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,20 @@ def run_pipeline(hypfile, reffile, skip_ref_norm, skip_hyp_norm):
#Create results folder if not exists:
if not os.path.exists(os.path.join(os.path.sep,'input','results')):
os.makedirs(os.path.join(os.path.sep,'input','results'))

# sclite command to be logged and executed
command = f"sclite -D -h {hypfile.variation_path} {hypfile.extension} -r {reffile.variation_path} {reffile.extension} \
-m hyp -O {os.path.join(os.path.sep,'input','results')} -o prf dtl spk"

# Run variation scripts
logging.info(
f"running: sclite -h {hypfile.normalized_path} {hypfile.extension} -r {reffile.normalized_path} {reffile.extension} -m hyp -O {os.path.join(os.path.sep,'input','results')} -o dtl spk")
run = os.system(
f"csrfilt.sh -s -i ctm {os.path.join('ASR_NL_benchmark','variations.glm')} < {hypfile.normalized_path} > {hypfile.variation_path}")

# Run sclite
run = os.system(
f"csrfilt.sh -s -i stm {os.path.join('ASR_NL_benchmark','variations.glm')} < {reffile.normalized_path} > {reffile.variation_path}")
run = os.system(
f"sclite -D -h {hypfile.variation_path} {hypfile.extension} -r {reffile.variation_path} {reffile.extension} -m hyp -O {os.path.join(os.path.sep,'input','results')} -o prf dtl spk")
# Hypothesis
run = os.system(f"csrfilt.sh -s -i ctm {os.path.join('ASR_NL_benchmark','variations.glm')} < {hypfile.normalized_path} > {hypfile.variation_path}")
# Reference
run = os.system(f"csrfilt.sh -s -i stm {os.path.join('ASR_NL_benchmark','variations.glm')} < {reffile.normalized_path} > {reffile.variation_path}")

# Log & run sclite
logging.info("running:" + command)
run = os.system(command)

def calculate_wer(df):
""" Calculates the word error rate and adds the column 'product' to the dataframe
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ The final results are saved in .csv format inside a folder named `results` store
- .spk files - Report with scoring for a speaker as returned by sclite
- .csv files - Overall results of the benchmarking as shown in the interface

## Extra arguments
There are extra arguments that you can add to the command line:
- `-skip_hyp_normalization`: Skips the normalization step for the hypothesis file(s) (STILL APPLIES VARIATIONS)
- `-skip_ref_normalization`: Skips the normalization step for the reference file(s) (STILL APPLIES VARIATIONS)
- `-sc_args`: With this argument, you can add extra sclite-specific flags. For more information, check the [documentation of sclite](https://github.com/usnistgov/SCTK/blob/master/doc/sclite.htm) (to view it properly, we suggest locally downloading the entire `doc` folder of the SCTK repository).

## More about the pipeline
### Normalization
Expand Down

0 comments on commit 3074dee

Please sign in to comment.