misc. update

k2-fsa · Apr 8, 2024 · 05e48ca
1 parent b9d34fb
commit 05e48ca
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 7 deletions.
diff --git a/egs/commonvoice/ASR/RESULTS.md b/egs/commonvoice/ASR/RESULTS.md
@@ -14,7 +14,7 @@ The best CER, for CommonVoice 16.1 (cv-corpus-16.1-2023-12-06/zh-HK) is below:
 | modified_beam_search | 0.98  | 1.11 | --epoch 24 --avg 5 |
 |   fast_beam_search   | 1.08  | 1.27 | --epoch 24 --avg 5 |
 
-When doing the cross-corpus validation on MDCC (w/o blank penalty),
+When doing the cross-corpus validation on [MDCC](https://arxiv.org/abs/2201.02419) (w/o blank penalty),
 the best CER is below:
 
 |                      |  Dev  | Test |        Note        |
@@ -23,7 +23,7 @@ the best CER is below:
 | modified_beam_search | 39.73 | 39.19| --epoch 24 --avg 5 |
 |   fast_beam_search   | 42.14 | 41.98| --epoch 24 --avg 5 |
 
-When doing the cross-corpus validation on MDCC (with blank penalty set to 2.2),
+When doing the cross-corpus validation on [MDCC](https://arxiv.org/abs/2201.02419) (with blank penalty set to 2.2),
 the best CER is below:
 
 |                      |  Dev  | Test |                  Note                  |
@@ -68,14 +68,16 @@ Detailed experimental results and pre-trained model are available at:
 <https://huggingface.co/zrjin/icefall-asr-commonvoice-zh-HK-zipformer-2024-03-20>
 
 
-### GigaSpeech BPE training results (Pruned Stateless Transducer 7)
+### CommonVoice English (en) BPE training results (Pruned Stateless Transducer 7)
 
 #### [pruned_transducer_stateless7](./pruned_transducer_stateless7)
 
-See #997  for more details.
+See #997 for more details.
 
 Number of model parameters: 70369391, i.e., 70.37 M
 
+Note that the results below were obtained using a BPE model trained on GigaSpeech transcripts.
+
 The best WER, as of 2023-04-17, for Common Voice English 13.0 (cv-corpus-13.0-2023-03-09/en) is below:
 
 Results are:

diff --git a/egs/commonvoice/ASR/local/compute_fbank_commonvoice_splits.py b/egs/commonvoice/ASR/local/compute_fbank_commonvoice_splits.py
@@ -47,6 +47,7 @@ def get_args():
         "--subset",
         type=str,
         default="train",
+        choices=["train", "validated", "invalidated"],
         help="""Dataset parts to compute fbank. """,
     )
 

diff --git a/egs/commonvoice/ASR/local/word_segment_yue.py b/egs/commonvoice/ASR/local/word_segment_yue.py
@@ -130,11 +130,11 @@ def is_cs(line: str) -> bool:
     norm_lines = [normalize_text(line, lang) for line in lines]
 
     text_words_segments = get_word_segments(norm_lines)
-    with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f:
+    with open(output_dir / "transcript_words.txt", "w", encoding="utf-8") as f:
         f.writelines(text_words_segments)
 
     words = get_words(text_words_segments)[1:]  # remove "\n" from words
-    with open(output_dir / "words_no_ids.txt", "w+", encoding="utf-8") as f:
+    with open(output_dir / "words_no_ids.txt", "w", encoding="utf-8") as f:
         f.writelines([word + "\n" for word in sorted(words)])
 
     words = (
@@ -143,5 +143,5 @@ def is_cs(line: str) -> bool:
         + ["#0", "<s>", "<\s>"]
     )
 
-    with open(output_dir / "words.txt", "w+", encoding="utf-8") as f:
+    with open(output_dir / "words.txt", "w", encoding="utf-8") as f:
         f.writelines([f"{word} {i}\n" for i, word in enumerate(words)])