From 0e6992b1f0f2255b400dac4a9827a91b59a1e93f Mon Sep 17 00:00:00 2001
From: yifanyeung <v-yifanyang@microsoft.com>
Date: Mon, 21 Oct 2024 15:13:03 +0800
Subject: [PATCH] remove incomplete recipe

---
 README.md                                     |   2 -
 .../ASR/local/compute_fbank_musan.py          |   1 -
 .../compute_fbank_peoples_speech_splits.py    | 154 -----------
 ...compute_fbank_peoples_speech_valid_test.py |  93 -------
 egs/peoples_speech/ASR/local/filter_cuts.py   |   1 -
 .../ASR/local/prepare_lang_bpe.py             |   1 -
 .../ASR/local/preprocess_peoples_speech.py    | 123 ---------
 .../ASR/local/train_bpe_model.py              |   1 -
 .../ASR/local/validate_bpe_lexicon.py         |   1 -
 egs/peoples_speech/ASR/prepare.sh             | 247 ------------------
 egs/peoples_speech/ASR/shared                 |   1 -
 11 files changed, 625 deletions(-)
 delete mode 120000 egs/peoples_speech/ASR/local/compute_fbank_musan.py
 delete mode 100755 egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_splits.py
 delete mode 100755 egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_valid_test.py
 delete mode 120000 egs/peoples_speech/ASR/local/filter_cuts.py
 delete mode 120000 egs/peoples_speech/ASR/local/prepare_lang_bpe.py
 delete mode 100755 egs/peoples_speech/ASR/local/preprocess_peoples_speech.py
 delete mode 120000 egs/peoples_speech/ASR/local/train_bpe_model.py
 delete mode 120000 egs/peoples_speech/ASR/local/validate_bpe_lexicon.py
 delete mode 100755 egs/peoples_speech/ASR/prepare.sh
 delete mode 120000 egs/peoples_speech/ASR/shared

diff --git a/README.md b/README.md
index 81cfc03ce7..57db5eb8db 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,6 @@ for more details.
   - [LibriSpeech][librispeech]
   - [Libriheavy][libriheavy]
   - [Multi-Dialect Broadcast News Arabic Speech Recognition][mgb2]
-  - [PeopleSpeech][peoplespeech]
   - [SPGISpeech][spgispeech]
   - [Switchboard][swbd]
   - [TIMIT][timit]
@@ -375,7 +374,6 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
 [libricss]: egs/libricss/SURT
 [libriheavy]: egs/libriheavy/ASR
 [mgb2]: egs/mgb2/ASR
-[peoplespeech]: egs/peoples_speech/ASR
 [spgispeech]: egs/spgispeech/ASR
 [voxpopuli]: egs/voxpopuli/ASR
 [xbmu-amdo31]: egs/xbmu-amdo31/ASR
diff --git a/egs/peoples_speech/ASR/local/compute_fbank_musan.py b/egs/peoples_speech/ASR/local/compute_fbank_musan.py
deleted file mode 120000
index 5833f2484e..0000000000
--- a/egs/peoples_speech/ASR/local/compute_fbank_musan.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../librispeech/ASR/local/compute_fbank_musan.py
\ No newline at end of file
diff --git a/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_splits.py b/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_splits.py
deleted file mode 100755
index 6f05b9f8c2..0000000000
--- a/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_splits.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2023  Xiaomi Corp.             (Yifan Yang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-from datetime import datetime
-from pathlib import Path
-
-import torch
-from lhotse import (
-    CutSet,
-    KaldifeatFbank,
-    KaldifeatFbankConfig,
-    LilcomChunkyWriter,
-    set_audio_duration_mismatch_tolerance,
-    set_caching_enabled,
-)
-
-# Torch's multithreaded behavior needs to be disabled or
-# it wastes a lot of CPU and slow things down.
-# Do this outside of main() in case it needs to take effect
-# even when we are not invoking the main (e.g. when spawning subprocesses).
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--num-workers",
-        type=int,
-        default=20,
-        help="Number of dataloading workers used for reading the audio.",
-    )
-
-    parser.add_argument(
-        "--batch-duration",
-        type=float,
-        default=600.0,
-        help="The maximum number of audio seconds in a batch."
-        "Determines batch size dynamically.",
-    )
-
-    parser.add_argument(
-        "--num-splits",
-        type=int,
-        required=True,
-        help="The number of splits of the train subset",
-    )
-
-    parser.add_argument(
-        "--start",
-        type=int,
-        default=0,
-        help="Process pieces starting from this number (included).",
-    )
-
-    parser.add_argument(
-        "--stop",
-        type=int,
-        default=-1,
-        help="Stop processing pieces until this number (excluded).",
-    )
-
-    return parser.parse_args()
-
-
-def compute_fbank_peoples_speech_splits(args):
-    subsets = ("dirty", "dirty_sa", "clean", "clean_sa")
-    num_splits = args.num_splits
-    output_dir = f"data/fbank/peoples_speech_train_split"
-    output_dir = Path(output_dir)
-    assert output_dir.exists(), f"{output_dir} does not exist!"
-
-    num_digits = 8
-
-    start = args.start
-    stop = args.stop
-    if stop < start:
-        stop = num_splits
-
-    stop = min(stop, num_splits)
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
-    logging.info(f"device: {device}")
-
-    set_audio_duration_mismatch_tolerance(0.01)  # 10ms tolerance
-    set_caching_enabled(False)
-
-    for partition in subsets:
-        for i in range(start, stop):
-            idx = f"{i + 1}".zfill(num_digits)
-            logging.info(f"Processing {partition}: {idx}")
-
-            cuts_path = output_dir / f"peoples_speech_cuts_{partition}.{idx}.jsonl.gz"
-            if cuts_path.is_file():
-                logging.info(f"{cuts_path} exists - skipping")
-                continue
-
-            raw_cuts_path = (
-                output_dir / f"peoples_speech_cuts_{partition}_raw.{idx}.jsonl.gz"
-            )
-
-            logging.info(f"Loading {raw_cuts_path}")
-            cut_set = CutSet.from_file(raw_cuts_path)
-
-            logging.info("Splitting cuts into smaller chunks.")
-            cut_set = cut_set.trim_to_supervisions(
-                keep_overlapping=False, min_duration=None
-            )
-
-            logging.info("Computing features")
-            cut_set = cut_set.compute_and_store_features_batch(
-                extractor=extractor,
-                storage_path=f"{output_dir}/peoples_speech_feats_{partition}_{idx}",
-                num_workers=args.num_workers,
-                batch_duration=args.batch_duration,
-                storage_type=LilcomChunkyWriter,
-                overwrite=True,
-            )
-
-            logging.info(f"Saving to {cuts_path}")
-            cut_set.to_file(cuts_path)
-
-
-def main():
-    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-
-    logging.basicConfig(format=formatter, level=logging.INFO)
-    args = get_args()
-    logging.info(vars(args))
-    compute_fbank_peoples_speech_splits(args)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_valid_test.py b/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_valid_test.py
deleted file mode 100755
index 89f43a674b..0000000000
--- a/egs/peoples_speech/ASR/local/compute_fbank_peoples_speech_valid_test.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2023  Xiaomi Corp.        (authors: Yifan Yang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-"""
-This file computes fbank features of the People's Speech dataset.
-It looks for manifests in the directory data/manifests.
-
-The generated fbank features are saved in data/fbank.
-"""
-
-import argparse
-import logging
-import os
-from pathlib import Path
-from typing import Optional
-
-import torch
-from filter_cuts import filter_cuts
-from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig, LilcomChunkyWriter
-
-# Torch's multithreaded behavior needs to be disabled or
-# it wastes a lot of CPU and slow things down.
-# Do this outside of main() in case it needs to take effect
-# even when we are not invoking the main (e.g. when spawning subprocesses).
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-
-def compute_fbank_peoples_speech_valid_test():
-    src_dir = Path(f"data/manifests")
-    output_dir = Path(f"data/fbank")
-    num_workers = 42
-    batch_duration = 600
-
-    subsets = ("validation", "test")
-
-    device = torch.device("cpu")
-    if torch.cuda.is_available():
-        device = torch.device("cuda", 0)
-    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
-
-    logging.info(f"device: {device}")
-
-    for partition in subsets:
-        cuts_path = output_dir / f"peoples_speech_cuts_{partition}.jsonl.gz"
-        if cuts_path.is_file():
-            logging.info(f"{partition} already exists - skipping.")
-            continue
-
-        raw_cuts_path = output_dir / f"peoples_speech_cuts_{partition}_raw.jsonl.gz"
-
-        logging.info(f"Loading {raw_cuts_path}")
-        cut_set = CutSet.from_file(raw_cuts_path)
-
-        logging.info("Splitting cuts into smaller chunks")
-        cut_set = cut_set.trim_to_supervisions(
-            keep_overlapping=False, min_duration=None
-        )
-
-        logging.info("Computing features")
-        cut_set = cut_set.compute_and_store_features_batch(
-            extractor=extractor,
-            storage_path=f"{output_dir}/peoples_speech_feats_{partition}",
-            num_workers=num_workers,
-            batch_duration=batch_duration,
-            storage_type=LilcomChunkyWriter,
-            overwrite=True,
-        )
-
-        logging.info(f"Saving to {cuts_path}")
-        cut_set.to_file(cuts_path)
-
-
-if __name__ == "__main__":
-    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-
-    logging.basicConfig(format=formatter, level=logging.INFO)
-    compute_fbank_peoples_speech_valid_test()
diff --git a/egs/peoples_speech/ASR/local/filter_cuts.py b/egs/peoples_speech/ASR/local/filter_cuts.py
deleted file mode 120000
index 27aca17293..0000000000
--- a/egs/peoples_speech/ASR/local/filter_cuts.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../librispeech/ASR/local/filter_cuts.py
\ No newline at end of file
diff --git a/egs/peoples_speech/ASR/local/prepare_lang_bpe.py b/egs/peoples_speech/ASR/local/prepare_lang_bpe.py
deleted file mode 120000
index 36b40e7fc2..0000000000
--- a/egs/peoples_speech/ASR/local/prepare_lang_bpe.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../librispeech/ASR/local/prepare_lang_bpe.py
\ No newline at end of file
diff --git a/egs/peoples_speech/ASR/local/preprocess_peoples_speech.py b/egs/peoples_speech/ASR/local/preprocess_peoples_speech.py
deleted file mode 100755
index c5417049f5..0000000000
--- a/egs/peoples_speech/ASR/local/preprocess_peoples_speech.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2023  Xiaomi Corp.        (authors: Yifan Yang)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-import re
-from pathlib import Path
-from typing import Optional
-
-from lhotse import CutSet, SupervisionSegment
-from lhotse.recipes.utils import read_manifests_if_cached
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        help="""Dataset parts to compute fbank. If None, we will use all""",
-    )
-
-    return parser.parse_args()
-
-
-def normalize_text(utt: str) -> str:
-    utt = re.sub(r"[{0}]+".format("-"), " ", utt)
-    return re.sub(r"[^a-zA-Z\s]", "", utt).upper()
-
-
-def preprocess_peoples_speech(dataset: Optional[str] = None):
-    src_dir = Path(f"data/manifests")
-    output_dir = Path(f"data/fbank")
-    output_dir.mkdir(exist_ok=True)
-
-    if dataset is None:
-        dataset_parts = (
-            "validation",
-            "test",
-            "dirty",
-            "dirty_sa",
-            "clean",
-            "clean_sa",
-        )
-    else:
-        dataset_parts = dataset.split(" ", -1)
-
-    logging.info("Loading manifest, it may takes 8 minutes")
-    prefix = f"peoples_speech"
-    suffix = "jsonl.gz"
-    manifests = read_manifests_if_cached(
-        dataset_parts=dataset_parts,
-        output_dir=src_dir,
-        suffix=suffix,
-        prefix=prefix,
-    )
-    assert manifests is not None
-
-    assert len(manifests) == len(dataset_parts), (
-        len(manifests),
-        len(dataset_parts),
-        list(manifests.keys()),
-        dataset_parts,
-    )
-
-    for partition, m in manifests.items():
-        logging.info(f"Processing {partition}")
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
-        if raw_cuts_path.is_file():
-            logging.info(f"{partition} already exists - skipping")
-            continue
-
-        logging.info(f"Normalizing text in {partition}")
-        i = 0
-        for sup in m["supervisions"]:
-            text = str(sup.text)
-            orig_text = text
-            sup.text = normalize_text(sup.text)
-            text = str(sup.text)
-            if i < 10 and len(orig_text) != len(text):
-                logging.info(
-                    f"\nOriginal text vs normalized text:\n{orig_text}\n{text}"
-                )
-                i += 1
-
-        # Create long-recording cut manifests.
-        cut_set = CutSet.from_manifests(
-            recordings=m["recordings"],
-            supervisions=m["supervisions"],
-        ).resample(16000)
-
-        # Run data augmentation that needs to be done in the
-        # time domain.
-        logging.info(f"Saving to {raw_cuts_path}")
-        cut_set.to_file(raw_cuts_path)
-
-
-def main():
-    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-
-    logging.basicConfig(format=formatter, level=logging.INFO)
-    args = get_args()
-    logging.info(vars(args))
-    preprocess_peoples_speech(dataset=args.dataset)
-    logging.info("Done")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/egs/peoples_speech/ASR/local/train_bpe_model.py b/egs/peoples_speech/ASR/local/train_bpe_model.py
deleted file mode 120000
index 6fad36421e..0000000000
--- a/egs/peoples_speech/ASR/local/train_bpe_model.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../librispeech/ASR/local/train_bpe_model.py
\ No newline at end of file
diff --git a/egs/peoples_speech/ASR/local/validate_bpe_lexicon.py b/egs/peoples_speech/ASR/local/validate_bpe_lexicon.py
deleted file mode 120000
index 721bb48e7c..0000000000
--- a/egs/peoples_speech/ASR/local/validate_bpe_lexicon.py
+++ /dev/null
@@ -1 +0,0 @@
-../../../librispeech/ASR/local/validate_bpe_lexicon.py
\ No newline at end of file
diff --git a/egs/peoples_speech/ASR/prepare.sh b/egs/peoples_speech/ASR/prepare.sh
deleted file mode 100755
index 3787858d97..0000000000
--- a/egs/peoples_speech/ASR/prepare.sh
+++ /dev/null
@@ -1,247 +0,0 @@
-#!/usr/bin/env bash
-
-set -eou pipefail
-
-nj=32
-stage=-1
-stop_stage=100
-
-# Split data/set to a number of pieces
-# This is to avoid OOM during feature extraction.
-num_per_split=4000
-
-# We assume dl_dir (download dir) contains the following
-# directories and files. If not, they will be downloaded
-# by this script automatically.
-#
-#  - $dl_dir/peoples_speech
-#      This directory contains the following files downloaded from
-#       https://huggingface.co/datasets/MLCommons/peoples_speech
-# 
-#     - test
-#     - train
-#     - validation
-#
-#  - $dl_dir/musan
-#      This directory contains the following directories downloaded from
-#       http://www.openslr.org/17/
-#
-#     - music
-#     - noise
-#     - speech
-
-dl_dir=$PWD/download
-
-. shared/parse_options.sh || exit 1
-
-# vocab size for sentence piece models.
-# It will generate data/lang_bpe_xxx,
-# data/lang_bpe_yyy if the array contains xxx, yyy
-vocab_sizes=(
-  # 5000
-  # 2000
-  # 1000
-  500
-)
-
-# All files generated by this script are saved in "data".
-# You can safely remove "data" and rerun this script to regenerate it.
-mkdir -p data
-
-log() {
-  # This function is from espnet
-  local fname=${BASH_SOURCE[1]##*/}
-  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
-}
-
-log "dl_dir: $dl_dir"
-
-if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-  log "Stage 0: Download data"
-
-  # If you have pre-downloaded it to /path/to/peoples_speech,
-  # you can create a symlink
-  #
-  #   ln -sfv /path/to/peoples_speech $dl_dir/peoples_speech
-  #
-  if [ ! -d $dl_dir/peoples_speech/train ]; then
-    git lfs install
-    git clone https://huggingface.co/datasets/MLCommons/peoples_speech
-  fi
-
-  # If you have pre-downloaded it to /path/to/musan,
-  # you can create a symlink
-  #
-  #   ln -sfv /path/to/musan $dl_dir/
-  #
-  if [ ! -d $dl_dir/musan ]; then
-    lhotse download musan $dl_dir
-  fi
-fi
-
-if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Prepare People's Speech manifest"
-  # We assume that you have downloaded the People's Speech corpus
-  # to $dl_dir/peoples_speech
-  mkdir -p data/manifests
-  if [ ! -e data/manifests/.peoples_speech.done ]; then
-    lhotse prepare peoples-speech -j $nj $dl_dir/peoples_speech data/manifests
-    touch data/manifests/.peoples_speech.done
-  fi
-fi
-
-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Prepare musan manifest"
-  # We assume that you have downloaded the musan corpus
-  # to data/musan
-  mkdir -p data/manifests
-  if [ ! -e data/manifests/.musan.done ]; then
-    lhotse prepare musan $dl_dir/musan data/manifests
-    touch data/manifests/.musan.done
-  fi
-fi
-
-if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Preprocess People's Speech manifest"
-  mkdir -p data/fbank
-  if [ ! -e data/fbank/.preprocess_complete ]; then
-    ./local/preprocess_peoples_speech.py
-    touch data/fbank/.preprocess_complete
-  fi
-fi
-
-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: Compute fbank for valid and test subsets of People's Speech"
-  if [ ! -e data/fbank/.peoples_speech_valid_test.done ]; then
-    ./local/compute_fbank_peoples_speech_valid_test.py
-    touch data/fbank/.peoples_speech_valid_test.done
-  fi
-fi
-
-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Split train subset into pieces"
-  split_dir=data/fbank/peoples_speech_train_split
-  if [ ! -e $split_dir/.peoples_speech_dirty_split.done ]; then
-    lhotse split-lazy ./data/fbank/peoples_speech_cuts_dirty_raw.jsonl.gz $split_dir $num_per_split
-    touch $split_dir/.peoples_speech_dirty_split.done
-  fi
-
-  if [ ! -e $split_dir/.peoples_speech_dirty_sa_split.done ]; then
-    lhotse split-lazy ./data/fbank/peoples_speech_cuts_dirty_sa_raw.jsonl.gz $split_dir $num_per_split
-    touch $split_dir/.peoples_speech_dirty_sa_split.done
-  fi
-
-  if [ ! -e $split_dir/.peoples_speech_clean_split.done ]; then
-    lhotse split-lazy ./data/fbank/peoples_speech_cuts_clean_raw.jsonl.gz $split_dir $num_per_split
-    touch $split_dir/.peoples_speech_clean_split.done
-  fi
-
-  if [ ! -e $split_dir/.peoples_speech_clean_sa_split.done ]; then
-    lhotse split-lazy ./data/fbank/peoples_speech_cuts_clean_sa_raw.jsonl.gz $split_dir $num_per_split
-    touch $split_dir/.peoples_speech_clean_sa_split.done
-  fi
-fi
-
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Compute features for train subset of People's Speech"
-  if [ ! -e data/fbank/.peoples_speech_train.done ]; then
-    ./local/compute_fbank_peoples_speech_splits.py \
-      --num-workers $nj \
-      --batch-duration 600 \
-      --start 0 \
-      --num-splits 2000
-    touch data/fbank/.peoples_speech_train.done
-  fi
-fi
-
-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-  log "Stage 7: Compute fbank for musan"
-  mkdir -p data/fbank
-  if [ ! -e data/fbank/.musan.done ]; then
-    ./local/compute_fbank_musan.py
-    touch data/fbank/.musan.done
-  fi
-fi
-
-if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-  log "Stage 8: Prepare BPE based lang"
-
-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bpe_${vocab_size}
-    mkdir -p $lang_dir
-
-    if [ ! -f $lang_dir/transcript_words.txt ]; then
-      log "Generate data for BPE training"
-      file=$(
-        find "data/fbank/peoples_speech_cuts_dirty_raw.jsonl.gz"
-	find "data/fbank/peoples_speech_cuts_dirty_sa_raw.jsonl.gz"
-	find "data/fbank/peoples_speech_cuts_clean_raw.jsonl.gz"
-	find "data/fbank/peoples_speech_cuts_clean_sa_raw.jsonl.gz"
-      )
-      gunzip -c ${file} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt
-
-      # Ensure space only appears once
-      sed -i 's/\t/ /g' $lang_dir/transcript_words.txt
-      sed -i 's/ +/ /g' $lang_dir/transcript_words.txt
-    fi
- 
-    if [ ! -f $lang_dir/words.txt ]; then
-      cat $lang_dir/transcript_words.txt | sed 's/ /\n/g' \
-        | sort -u | sed '/^$/d' > $lang_dir/words.txt
-      (echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) |
-        cat - $lang_dir/words.txt | sort | uniq | awk '
-        BEGIN {
-          print "<eps> 0";
-        }
-        {
-          if ($1 == "<s>") {
-            print "<s> is in the vocabulary!" | "cat 1>&2"
-            exit 1;
-          }
-          if ($1 == "</s>") {
-            print "</s> is in the vocabulary!" | "cat 1>&2"
-            exit 1;
-          }
-          printf("%s %d\n", $1, NR);
-        }
-        END {
-          printf("#0 %d\n", NR+1);
-          printf("<s> %d\n", NR+2);
-          printf("</s> %d\n", NR+3);
-        }' > $lang_dir/words || exit 1;
-      mv $lang_dir/words $lang_dir/words.txt
-    fi
- 
-    if [ ! -f $lang_dir/bpe.model ]; then
-      ./local/train_bpe_model.py \
-        --lang-dir $lang_dir \
-        --vocab-size $vocab_size \
-        --transcript $lang_dir/transcript_words.txt
-    fi
-  
-    if [ ! -f $lang_dir/L_disambig.pt ]; then
-      ./local/prepare_lang_bpe.py --lang-dir $lang_dir
-
-      log "Validating $lang_dir/lexicon.txt"
-      ./local/validate_bpe_lexicon.py \
-        --lexicon $lang_dir/lexicon.txt \
-        --bpe-model $lang_dir/bpe.model
-    fi
-
-    if [ ! -f $lang_dir/L.fst ]; then
-      log "Converting L.pt to L.fst"
-      ./shared/convert-k2-to-openfst.py \
-        --olabels aux_labels \
-        $lang_dir/L.pt \
-        $lang_dir/L.fst
-    fi
-
-    if [ ! -f $lang_dir/L_disambig.fst ]; then
-      log "Converting L_disambig.pt to L_disambig.fst"
-      ./shared/convert-k2-to-openfst.py \
-        --olabels aux_labels \
-        $lang_dir/L_disambig.pt \
-        $lang_dir/L_disambig.fst
-    fi
-  done
-fi
diff --git a/egs/peoples_speech/ASR/shared b/egs/peoples_speech/ASR/shared
deleted file mode 120000
index 4c5e91438c..0000000000
--- a/egs/peoples_speech/ASR/shared
+++ /dev/null
@@ -1 +0,0 @@
-../../../icefall/shared/
\ No newline at end of file