From 77125064cb17ae9f65c151d6b219b057aa39de4e Mon Sep 17 00:00:00 2001 From: yifanyeung Date: Fri, 22 Dec 2023 23:01:47 +0800 Subject: [PATCH 1/4] Add SSL --- egs/librispeech/SSL/hubert/asr_datamodule.py | 260 ++ egs/librispeech/SSL/hubert/beam_search.py | 2942 ++++++++++++++++++ egs/librispeech/SSL/hubert/ctc_decode.py | 847 +++++ egs/librispeech/SSL/hubert/dataset.py | 154 + egs/librispeech/SSL/hubert/decode.py | 1032 ++++++ egs/librispeech/SSL/hubert/decoder.py | 134 + egs/librispeech/SSL/hubert/finetune.py | 1447 +++++++++ egs/librispeech/SSL/hubert/joiner.py | 67 + egs/librispeech/SSL/hubert/model.py | 343 ++ egs/librispeech/SSL/hubert/optim.py | 1244 ++++++++ egs/librispeech/SSL/hubert/scaling.py | 1908 ++++++++++++ egs/librispeech/SSL/hubert/ssl_datamodule.py | 262 ++ egs/librispeech/SSL/hubert/subsampling.py | 406 +++ egs/librispeech/SSL/shared | 1 + 14 files changed, 11047 insertions(+) create mode 100644 egs/librispeech/SSL/hubert/asr_datamodule.py create mode 100644 egs/librispeech/SSL/hubert/beam_search.py create mode 100644 egs/librispeech/SSL/hubert/ctc_decode.py create mode 100644 egs/librispeech/SSL/hubert/dataset.py create mode 100644 egs/librispeech/SSL/hubert/decode.py create mode 100644 egs/librispeech/SSL/hubert/decoder.py create mode 100644 egs/librispeech/SSL/hubert/finetune.py create mode 100644 egs/librispeech/SSL/hubert/joiner.py create mode 100644 egs/librispeech/SSL/hubert/model.py create mode 100644 egs/librispeech/SSL/hubert/optim.py create mode 100644 egs/librispeech/SSL/hubert/scaling.py create mode 100644 egs/librispeech/SSL/hubert/ssl_datamodule.py create mode 100644 egs/librispeech/SSL/hubert/subsampling.py create mode 120000 egs/librispeech/SSL/shared diff --git a/egs/librispeech/SSL/hubert/asr_datamodule.py b/egs/librispeech/SSL/hubert/asr_datamodule.py new file mode 100644 index 0000000000..eb2055fe52 --- /dev/null +++ b/egs/librispeech/SSL/hubert/asr_datamodule.py @@ -0,0 +1,260 @@ +# Copyright 2021 Piotr Żelasko +# Copyright 2023 Xiaomi Corporation (Author: Yifan Yang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse +import logging +from functools import lru_cache +from pathlib import Path +from typing import Any, Dict, Optional + +import torch +from dataset import HubertAsrDataset +from lhotse import CutSet, load_manifest_lazy +from lhotse.dataset import DynamicBucketingSampler, SimpleCutSampler +from lhotse.utils import fix_random_seed +from torch.utils.data import DataLoader + +from icefall.utils import str2bool + + +class _SeedWorkers: + def __init__(self, seed: int): + self.seed = seed + + def __call__(self, worker_id: int): + fix_random_seed(self.seed + worker_id) + + +class LibriSpeechAsrDataModule: + """ + DataModule for ASR experiments. + It assumes there is always one train and valid dataloader, + but there can be multiple test dataloaders (e.g. LibriSpeech test-clean + and test-other). 
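+
+    An illustrative (not prescriptive) way to wire this module up, using only
+    the methods defined below:
+
+        parser = argparse.ArgumentParser()
+        LibriSpeechAsrDataModule.add_arguments(parser)
+        args = parser.parse_args()
+
+        librispeech = LibriSpeechAsrDataModule(args)
+        train_cuts = librispeech.train_clean_100_cuts()
+        train_dl = librispeech.train_dataloaders(train_cuts)
+        valid_dl = librispeech.valid_dataloaders(librispeech.dev_clean_cuts())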
+ + It contains all the common data pipeline modules used in ASR + experiments, e.g.: + - dynamic batch size, + - bucketing samplers, + + This class should be derived for specific corpora used in ASR tasks. + """ + + def __init__(self, args: argparse.Namespace): + self.args = args + + @classmethod + def add_arguments(cls, parser: argparse.ArgumentParser): + group = parser.add_argument_group( + title="ASR data related options", + description="These options are used for the preparation of " + "PyTorch DataLoaders from Lhotse CutSet's -- they control the " + "effective batch sizes, sampling strategies.", + ) + group.add_argument( + "--full-libri", + type=str2bool, + default=True, + help="When enabled use 960h LibriSpeech. " "Otherwise, use 100h subset.", + ) + + group.add_argument( + "--manifest-dir", + type=Path, + default=Path("data/fbank"), + help="Path to directory with train/valid/test cuts.", + ) + group.add_argument( + "--max-duration", + type=int, + default=200.0, + help="Maximum pooled recordings duration (seconds) in a " + "single batch. You can reduce it if it causes CUDA OOM.", + ) + group.add_argument( + "--bucketing-sampler", + type=str2bool, + default=True, + help="When enabled, the batches will come from buckets of " + "similar duration (saves padding frames).", + ) + group.add_argument( + "--num-buckets", + type=int, + default=30, + help="The number of buckets for the DynamicBucketingSampler" + "(you might want to increase it for larger datasets).", + ) + group.add_argument( + "--shuffle", + type=str2bool, + default=True, + help="When enabled (=default), the examples will be " + "shuffled for each epoch.", + ) + group.add_argument( + "--drop-last", + type=str2bool, + default=True, + help="Whether to drop last batch. Used by sampler.", + ) + group.add_argument( + "--num-workers", + type=int, + default=2, + help="The number of training dataloader workers that " + "collect the batches.", + ) + + def train_dataloaders( + self, + cuts_train: CutSet, + sampler_state_dict: Optional[Dict[str, Any]] = None, + ) -> DataLoader: + """ + Args: + cuts_train: + CutSet for training. + sampler_state_dict: + The state dict for the training sampler. + """ + logging.info("About to create train dataset") + train = HubertAsrDataset() + + if self.args.bucketing_sampler: + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( + cuts_train, + max_duration=self.args.max_duration, + shuffle=self.args.shuffle, + num_buckets=self.args.num_buckets, + drop_last=self.args.drop_last, + ) + else: + logging.info("Using SimpleCutSampler.") + train_sampler = SimpleCutSampler( + cuts_train, + max_duration=self.args.max_duration, + shuffle=self.args.shuffle, + ) + logging.info("About to create train dataloader") + + if sampler_state_dict is not None: + logging.info("Loading sampler state dict") + train_sampler.load_state_dict(sampler_state_dict) + + # 'seed' is derived from the current random state, which will have + # previously been set in the main process. 
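+        # Each worker then calls fix_random_seed(seed + worker_id) via
+        # _SeedWorkers above, so every dataloader worker gets a distinct
+        # (but reproducibly derived) seed instead of inheriting the same
+        # random state.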
+ seed = torch.randint(0, 100000, ()).item() + worker_init_fn = _SeedWorkers(seed) + + train_dl = DataLoader( + train, + sampler=train_sampler, + batch_size=None, + num_workers=self.args.num_workers, + persistent_workers=False, + worker_init_fn=worker_init_fn, + ) + + return train_dl + + def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader: + logging.info("About to create dev dataset") + validate = HubertAsrDataset() + valid_sampler = DynamicBucketingSampler( + cuts_valid, + max_duration=self.args.max_duration, + shuffle=False, + ) + logging.info("About to create dev dataloader") + valid_dl = DataLoader( + validate, + sampler=valid_sampler, + batch_size=None, + num_workers=2, + persistent_workers=False, + ) + + return valid_dl + + def test_dataloaders(self, cuts: CutSet) -> DataLoader: + logging.debug("About to create test dataset") + test = HubertAsrDataset() + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, + ) + logging.debug("About to create test dataloader") + test_dl = DataLoader( + test, + batch_size=None, + sampler=sampler, + num_workers=self.args.num_workers, + ) + return test_dl + + @lru_cache() + def train_clean_100_cuts(self) -> CutSet: + logging.info("About to get train-clean-100 cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz" + ) + + @lru_cache() + def train_clean_360_cuts(self) -> CutSet: + logging.info("About to get train-clean-360 cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz" + ) + + @lru_cache() + def train_other_500_cuts(self) -> CutSet: + logging.info("About to get train-other-500 cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz" + ) + + @lru_cache() + def dev_clean_cuts(self) -> CutSet: + logging.info("About to get dev-clean cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz" + ) + + @lru_cache() + def dev_other_cuts(self) -> CutSet: + logging.info("About to get dev-other cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz" + ) + + @lru_cache() + def test_clean_cuts(self) -> CutSet: + logging.info("About to get test-clean cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz" + ) + + @lru_cache() + def test_other_cuts(self) -> CutSet: + logging.info("About to get test-other cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz" + ) diff --git a/egs/librispeech/SSL/hubert/beam_search.py b/egs/librispeech/SSL/hubert/beam_search.py new file mode 100644 index 0000000000..7fcd242fcd --- /dev/null +++ b/egs/librispeech/SSL/hubert/beam_search.py @@ -0,0 +1,2942 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang +# Xiaoyu Yang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
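+
+# This file collects the transducer decoding methods of this recipe:
+# greedy_search / greedy_search_batch, beam_search, modified_beam_search
+# (plus its neural-LM / LODR rescoring variants), and the fast_beam_search
+# family built on k2.RnntDecodingStreams.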
+ +import warnings +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple, Union + +import k2 +import sentencepiece as spm +import torch +from torch import nn + +from icefall import ContextGraph, ContextState, NgramLm, NgramLmStateCost +from icefall.decode import Nbest, one_best_decoding +from icefall.lm_wrapper import LmScorer +from icefall.rnn_lm.model import RnnLmModel +from icefall.transformer_lm.model import TransformerLM +from icefall.utils import ( + DecodingResults, + add_eos, + add_sos, + get_texts, + get_texts_with_timestamp, +) + + +def fast_beam_search_one_best( + model: nn.Module, + decoding_graph: k2.Fsa, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: float, + max_states: int, + max_contexts: int, + temperature: float = 1.0, + ilme_scale: float = 0.0, + blank_penalty: float = 0.0, + return_timestamps: bool = False, + allow_partial: bool = False, +) -> Union[List[List[int]], DecodingResults]: + """It limits the maximum number of symbols per frame to 1. + + A lattice is first obtained using fast beam search, and then + the shortest path within the lattice is used as the final output. + + Args: + model: + An instance of `Transducer`. + decoding_graph: + Decoding graph used for decoding, may be a TrivialGraph or a LG. + encoder_out: + A tensor of shape (N, T, C) from the encoder. + encoder_out_lens: + A tensor of shape (N,) containing the number of frames in `encoder_out` + before padding. + beam: + Beam value, similar to the beam used in Kaldi.. + max_states: + Max states per stream per frame. + max_contexts: + Max contexts pre stream per frame. + temperature: + Softmax temperature. + return_timestamps: + Whether to return timestamps. + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. + """ + lattice = fast_beam_search( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=beam, + max_states=max_states, + max_contexts=max_contexts, + temperature=temperature, + ilme_scale=ilme_scale, + allow_partial=allow_partial, + blank_penalty=blank_penalty, + ) + + best_path = one_best_decoding(lattice) + + if not return_timestamps: + return get_texts(best_path) + else: + return get_texts_with_timestamp(best_path) + + +def fast_beam_search_nbest_LG( + model: nn.Module, + decoding_graph: k2.Fsa, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: float, + max_states: int, + max_contexts: int, + num_paths: int, + nbest_scale: float = 0.5, + use_double_scores: bool = True, + temperature: float = 1.0, + blank_penalty: float = 0.0, + ilme_scale: float = 0.0, + return_timestamps: bool = False, + allow_partial: bool = False, +) -> Union[List[List[int]], DecodingResults]: + """It limits the maximum number of symbols per frame to 1. + + The process to get the results is: + - (1) Use fast beam search to get a lattice + - (2) Select `num_paths` paths from the lattice using k2.random_paths() + - (3) Unique the selected paths + - (4) Intersect the selected paths with the lattice and compute the + shortest path from the intersection result + - (5) The path with the largest score is used as the decoding output. + + Args: + model: + An instance of `Transducer`. + decoding_graph: + Decoding graph used for decoding, may be a TrivialGraph or a LG. + encoder_out: + A tensor of shape (N, T, C) from the encoder. 
+ encoder_out_lens: + A tensor of shape (N,) containing the number of frames in `encoder_out` + before padding. + beam: + Beam value, similar to the beam used in Kaldi.. + max_states: + Max states per stream per frame. + max_contexts: + Max contexts pre stream per frame. + num_paths: + Number of paths to extract from the decoded lattice. + nbest_scale: + It's the scale applied to the lattice.scores. A smaller value + yields more unique paths. + use_double_scores: + True to use double precision for computation. False to use + single precision. + temperature: + Softmax temperature. + return_timestamps: + Whether to return timestamps. + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. + """ + lattice = fast_beam_search( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=beam, + max_states=max_states, + max_contexts=max_contexts, + temperature=temperature, + allow_partial=allow_partial, + blank_penalty=blank_penalty, + ilme_scale=ilme_scale, + ) + + nbest = Nbest.from_lattice( + lattice=lattice, + num_paths=num_paths, + use_double_scores=use_double_scores, + nbest_scale=nbest_scale, + ) + + # The following code is modified from nbest.intersect() + word_fsa = k2.invert(nbest.fsa) + if hasattr(lattice, "aux_labels"): + # delete token IDs as it is not needed + del word_fsa.aux_labels + word_fsa.scores.zero_() + word_fsa_with_epsilon_loops = k2.linear_fsa_with_self_loops(word_fsa) + path_to_utt_map = nbest.shape.row_ids(1) + + if hasattr(lattice, "aux_labels"): + # lattice has token IDs as labels and word IDs as aux_labels. + # inv_lattice has word IDs as labels and token IDs as aux_labels + inv_lattice = k2.invert(lattice) + inv_lattice = k2.arc_sort(inv_lattice) + else: + inv_lattice = k2.arc_sort(lattice) + + if inv_lattice.shape[0] == 1: + path_lattice = k2.intersect_device( + inv_lattice, + word_fsa_with_epsilon_loops, + b_to_a_map=torch.zeros_like(path_to_utt_map), + sorted_match_a=True, + ) + else: + path_lattice = k2.intersect_device( + inv_lattice, + word_fsa_with_epsilon_loops, + b_to_a_map=path_to_utt_map, + sorted_match_a=True, + ) + + # path_lattice has word IDs as labels and token IDs as aux_labels + path_lattice = k2.top_sort(k2.connect(path_lattice)) + tot_scores = path_lattice.get_tot_scores( + use_double_scores=use_double_scores, + log_semiring=True, # Note: we always use True + ) + # See https://github.com/k2-fsa/icefall/pull/420 for why + # we always use log_semiring=True + + ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) + best_hyp_indexes = ragged_tot_scores.argmax() + best_path = k2.index_fsa(nbest.fsa, best_hyp_indexes) + + if not return_timestamps: + return get_texts(best_path) + else: + return get_texts_with_timestamp(best_path) + + +def fast_beam_search_nbest( + model: nn.Module, + decoding_graph: k2.Fsa, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: float, + max_states: int, + max_contexts: int, + num_paths: int, + nbest_scale: float = 0.5, + use_double_scores: bool = True, + temperature: float = 1.0, + blank_penalty: float = 0.0, + return_timestamps: bool = False, + allow_partial: bool = False, +) -> Union[List[List[int]], DecodingResults]: + """It limits the maximum number of symbols per frame to 1. 
+ + The process to get the results is: + - (1) Use fast beam search to get a lattice + - (2) Select `num_paths` paths from the lattice using k2.random_paths() + - (3) Unique the selected paths + - (4) Intersect the selected paths with the lattice and compute the + shortest path from the intersection result + - (5) The path with the largest score is used as the decoding output. + + Args: + model: + An instance of `Transducer`. + decoding_graph: + Decoding graph used for decoding, may be a TrivialGraph or a LG. + encoder_out: + A tensor of shape (N, T, C) from the encoder. + encoder_out_lens: + A tensor of shape (N,) containing the number of frames in `encoder_out` + before padding. + beam: + Beam value, similar to the beam used in Kaldi.. + max_states: + Max states per stream per frame. + max_contexts: + Max contexts pre stream per frame. + num_paths: + Number of paths to extract from the decoded lattice. + nbest_scale: + It's the scale applied to the lattice.scores. A smaller value + yields more unique paths. + use_double_scores: + True to use double precision for computation. False to use + single precision. + temperature: + Softmax temperature. + return_timestamps: + Whether to return timestamps. + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. + """ + lattice = fast_beam_search( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=beam, + max_states=max_states, + max_contexts=max_contexts, + blank_penalty=blank_penalty, + temperature=temperature, + allow_partial=allow_partial, + ) + + nbest = Nbest.from_lattice( + lattice=lattice, + num_paths=num_paths, + use_double_scores=use_double_scores, + nbest_scale=nbest_scale, + ) + + # at this point, nbest.fsa.scores are all zeros. + + nbest = nbest.intersect(lattice) + # Now nbest.fsa.scores contains acoustic scores + + max_indexes = nbest.tot_scores().argmax() + + best_path = k2.index_fsa(nbest.fsa, max_indexes) + + if not return_timestamps: + return get_texts(best_path) + else: + return get_texts_with_timestamp(best_path) + + +def fast_beam_search_nbest_oracle( + model: nn.Module, + decoding_graph: k2.Fsa, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: float, + max_states: int, + max_contexts: int, + num_paths: int, + ref_texts: List[List[int]], + use_double_scores: bool = True, + nbest_scale: float = 0.5, + temperature: float = 1.0, + blank_penalty: float = 0.0, + return_timestamps: bool = False, + allow_partial: bool = False, +) -> Union[List[List[int]], DecodingResults]: + """It limits the maximum number of symbols per frame to 1. + + A lattice is first obtained using fast beam search, and then + we select `num_paths` linear paths from the lattice. The path + that has the minimum edit distance with the given reference transcript + is used as the output. + + This is the best result we can achieve for any nbest based rescoring + methods. + + Args: + model: + An instance of `Transducer`. + decoding_graph: + Decoding graph used for decoding, may be a TrivialGraph or a LG. + encoder_out: + A tensor of shape (N, T, C) from the encoder. + encoder_out_lens: + A tensor of shape (N,) containing the number of frames in `encoder_out` + before padding. + beam: + Beam value, similar to the beam used in Kaldi.. + max_states: + Max states per stream per frame. + max_contexts: + Max contexts pre stream per frame. 
+ num_paths: + Number of paths to extract from the decoded lattice. + ref_texts: + A list-of-list of integers containing the reference transcripts. + If the decoding_graph is a trivial_graph, the integer ID is the + BPE token ID. + use_double_scores: + True to use double precision for computation. False to use + single precision. + nbest_scale: + It's the scale applied to the lattice.scores. A smaller value + yields more unique paths. + temperature: + Softmax temperature. + return_timestamps: + Whether to return timestamps. + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. + """ + lattice = fast_beam_search( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=beam, + max_states=max_states, + max_contexts=max_contexts, + temperature=temperature, + allow_partial=allow_partial, + blank_penalty=blank_penalty, + ) + + nbest = Nbest.from_lattice( + lattice=lattice, + num_paths=num_paths, + use_double_scores=use_double_scores, + nbest_scale=nbest_scale, + ) + + hyps = nbest.build_levenshtein_graphs() + refs = k2.levenshtein_graph(ref_texts, device=hyps.device) + + levenshtein_alignment = k2.levenshtein_alignment( + refs=refs, + hyps=hyps, + hyp_to_ref_map=nbest.shape.row_ids(1), + sorted_match_ref=True, + ) + + tot_scores = levenshtein_alignment.get_tot_scores( + use_double_scores=False, log_semiring=False + ) + ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) + + max_indexes = ragged_tot_scores.argmax() + + best_path = k2.index_fsa(nbest.fsa, max_indexes) + + if not return_timestamps: + return get_texts(best_path) + else: + return get_texts_with_timestamp(best_path) + + +def fast_beam_search( + model: nn.Module, + decoding_graph: k2.Fsa, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: float, + max_states: int, + max_contexts: int, + temperature: float = 1.0, + subtract_ilme: bool = False, + ilme_scale: float = 0.1, + allow_partial: bool = False, + blank_penalty: float = 0.0, +) -> k2.Fsa: + """It limits the maximum number of symbols per frame to 1. + + Args: + model: + An instance of `Transducer`. + decoding_graph: + Decoding graph used for decoding, may be a TrivialGraph or a LG. + encoder_out: + A tensor of shape (N, T, C) from the encoder. + encoder_out_lens: + A tensor of shape (N,) containing the number of frames in `encoder_out` + before padding. + beam: + Beam value, similar to the beam used in Kaldi.. + max_states: + Max states per stream per frame. + max_contexts: + Max contexts pre stream per frame. + temperature: + Softmax temperature. + Returns: + Return an FsaVec with axes [utt][state][arc] containing the decoded + lattice. Note: When the input graph is a TrivialGraph, the returned + lattice is actually an acceptor. 
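+
+    A minimal usage sketch (beam settings here are purely illustrative),
+    mirroring `fast_beam_search_one_best` above:
+
+        lattice = fast_beam_search(
+            model, decoding_graph, encoder_out, encoder_out_lens,
+            beam=20.0, max_states=64, max_contexts=8,
+        )
+        best_path = one_best_decoding(lattice)
+        token_ids = get_texts(best_path)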
+ """ + assert encoder_out.ndim == 3 + + context_size = model.decoder.context_size + vocab_size = model.decoder.vocab_size + + B, T, C = encoder_out.shape + + config = k2.RnntDecodingConfig( + vocab_size=vocab_size, + decoder_history_len=context_size, + beam=beam, + max_contexts=max_contexts, + max_states=max_states, + ) + individual_streams = [] + for i in range(B): + individual_streams.append(k2.RnntDecodingStream(decoding_graph)) + decoding_streams = k2.RnntDecodingStreams(individual_streams, config) + + encoder_out = model.joiner.encoder_proj(encoder_out) + + for t in range(T): + # shape is a RaggedShape of shape (B, context) + # contexts is a Tensor of shape (shape.NumElements(), context_size) + shape, contexts = decoding_streams.get_contexts() + # `nn.Embedding()` in torch below v1.7.1 supports only torch.int64 + contexts = contexts.to(torch.int64) + # decoder_out is of shape (shape.NumElements(), 1, decoder_out_dim) + decoder_out = model.decoder(contexts, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + # current_encoder_out is of shape + # (shape.NumElements(), 1, joiner_dim) + # fmt: off + current_encoder_out = torch.index_select( + encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64) + ) + # fmt: on + logits = model.joiner( + current_encoder_out.unsqueeze(2), + decoder_out.unsqueeze(1), + project_input=False, + ) + logits = logits.squeeze(1).squeeze(1) + + if blank_penalty != 0: + logits[:, 0] -= blank_penalty + + log_probs = (logits / temperature).log_softmax(dim=-1) + + if ilme_scale != 0: + ilme_logits = model.joiner( + torch.zeros_like( + current_encoder_out, device=current_encoder_out.device + ).unsqueeze(2), + decoder_out.unsqueeze(1), + project_input=False, + ) + ilme_logits = ilme_logits.squeeze(1).squeeze(1) + if blank_penalty != 0: + ilme_logits[:, 0] -= blank_penalty + ilme_log_probs = (ilme_logits / temperature).log_softmax(dim=-1) + log_probs -= ilme_scale * ilme_log_probs + + decoding_streams.advance(log_probs) + decoding_streams.terminate_and_flush_to_streams() + lattice = decoding_streams.format_output( + encoder_out_lens.tolist(), allow_partial=allow_partial + ) + + return lattice + + +def greedy_search( + model: nn.Module, + encoder_out: torch.Tensor, + max_sym_per_frame: int, + blank_penalty: float = 0.0, + return_timestamps: bool = False, +) -> Union[List[int], DecodingResults]: + """Greedy search for a single utterance. + Args: + model: + An instance of `Transducer`. + encoder_out: + A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. + max_sym_per_frame: + Maximum number of symbols per frame. If it is set to 0, the WER + would be 100%. + return_timestamps: + Whether to return timestamps. + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. 
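+
+    Note: in addition to `max_sym_per_frame`, the loop below also stops once
+    1000 symbols have been emitted for the utterance, which bounds the total
+    amount of work per utterance.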
+ """ + assert encoder_out.ndim == 3 + + # support only batch_size == 1 for now + assert encoder_out.size(0) == 1, encoder_out.size(0) + + blank_id = model.decoder.blank_id + context_size = model.decoder.context_size + unk_id = getattr(model, "unk_id", blank_id) + + device = next(model.parameters()).device + + decoder_input = torch.tensor( + [-1] * (context_size - 1) + [blank_id], device=device, dtype=torch.int64 + ).reshape(1, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + + encoder_out = model.joiner.encoder_proj(encoder_out) + + T = encoder_out.size(1) + t = 0 + hyp = [blank_id] * context_size + + # timestamp[i] is the frame index after subsampling + # on which hyp[i] is decoded + timestamp = [] + + # Maximum symbols per utterance. + max_sym_per_utt = 1000 + + # symbols per frame + sym_per_frame = 0 + + # symbols per utterance decoded so far + sym_per_utt = 0 + + while t < T and sym_per_utt < max_sym_per_utt: + if sym_per_frame >= max_sym_per_frame: + sym_per_frame = 0 + t += 1 + continue + + # fmt: off + current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2) + # fmt: on + logits = model.joiner( + current_encoder_out, decoder_out.unsqueeze(1), project_input=False + ) + # logits is (1, 1, 1, vocab_size) + + if blank_penalty != 0: + logits[:, :, :, 0] -= blank_penalty + + y = logits.argmax().item() + if y not in (blank_id, unk_id): + hyp.append(y) + timestamp.append(t) + decoder_input = torch.tensor([hyp[-context_size:]], device=device).reshape( + 1, context_size + ) + + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + + sym_per_utt += 1 + sym_per_frame += 1 + else: + sym_per_frame = 0 + t += 1 + hyp = hyp[context_size:] # remove blanks + + if not return_timestamps: + return hyp + else: + return DecodingResults( + hyps=[hyp], + timestamps=[timestamp], + ) + + +def greedy_search_batch( + model: nn.Module, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + blank_penalty: float = 0, + return_timestamps: bool = False, +) -> Union[List[List[int]], DecodingResults]: + """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1. + Args: + model: + The transducer model. + encoder_out: + Output from the encoder. Its shape is (N, T, C), where N >= 1. + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. + return_timestamps: + Whether to return timestamps. + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. 
+ """ + assert encoder_out.ndim == 3 + assert encoder_out.size(0) >= 1, encoder_out.size(0) + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) + + device = next(model.parameters()).device + + blank_id = model.decoder.blank_id + unk_id = getattr(model, "unk_id", blank_id) + context_size = model.decoder.context_size + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + hyps = [[-1] * (context_size - 1) + [blank_id] for _ in range(N)] + + # timestamp[n][i] is the frame index after subsampling + # on which hyp[n][i] is decoded + timestamps = [[] for _ in range(N)] + # scores[n][i] is the logits on which hyp[n][i] is decoded + scores = [[] for _ in range(N)] + + decoder_input = torch.tensor( + hyps, + device=device, + dtype=torch.int64, + ) # (N, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + # decoder_out: (N, 1, decoder_out_dim) + + encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) + + offset = 0 + for t, batch_size in enumerate(batch_size_list): + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) + # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim) + offset = end + + decoder_out = decoder_out[:batch_size] + + logits = model.joiner( + current_encoder_out, decoder_out.unsqueeze(1), project_input=False + ) + # logits'shape (batch_size, 1, 1, vocab_size) + + logits = logits.squeeze(1).squeeze(1) # (batch_size, vocab_size) + assert logits.ndim == 2, logits.shape + + if blank_penalty != 0: + logits[:, 0] -= blank_penalty + + y = logits.argmax(dim=1).tolist() + emitted = False + for i, v in enumerate(y): + if v not in (blank_id, unk_id): + hyps[i].append(v) + timestamps[i].append(t) + scores[i].append(logits[i, v].item()) + emitted = True + if emitted: + # update decoder output + decoder_input = [h[-context_size:] for h in hyps[:batch_size]] + decoder_input = torch.tensor( + decoder_input, + device=device, + dtype=torch.int64, + ) + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + + sorted_ans = [h[context_size:] for h in hyps] + ans = [] + ans_timestamps = [] + ans_scores = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) + ans_timestamps.append(timestamps[unsorted_indices[i]]) + ans_scores.append(scores[unsorted_indices[i]]) + + if not return_timestamps: + return ans + else: + return DecodingResults( + hyps=ans, + timestamps=ans_timestamps, + scores=ans_scores, + ) + + +@dataclass +class Hypothesis: + # The predicted tokens so far. + # Newly predicted tokens are appended to `ys`. + ys: List[int] + + # The log prob of ys. + # It contains only one entry. 
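+    # Keeping it as a 1-element tensor (rather than a Python float) lets
+    # HypothesisList.add() merge duplicate hypotheses in place via
+    # torch.logaddexp(..., out=...).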
+ log_prob: torch.Tensor + + # timestamp[i] is the frame index after subsampling + # on which ys[i] is decoded + timestamp: List[int] = field(default_factory=list) + + # the lm score for next token given the current ys + lm_score: Optional[torch.Tensor] = None + + # the RNNLM states (h and c in LSTM) + state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + + # N-gram LM state + state_cost: Optional[NgramLmStateCost] = None + + # Context graph state + context_state: Optional[ContextState] = None + + @property + def key(self) -> str: + """Return a string representation of self.ys""" + return "_".join(map(str, self.ys)) + + +class HypothesisList(object): + def __init__(self, data: Optional[Dict[str, Hypothesis]] = None) -> None: + """ + Args: + data: + A dict of Hypotheses. Its key is its `value.key`. + """ + if data is None: + self._data = {} + else: + self._data = data + + @property + def data(self) -> Dict[str, Hypothesis]: + return self._data + + def add(self, hyp: Hypothesis) -> None: + """Add a Hypothesis to `self`. + + If `hyp` already exists in `self`, its probability is updated using + `log-sum-exp` with the existed one. + + Args: + hyp: + The hypothesis to be added. + """ + key = hyp.key + if key in self: + old_hyp = self._data[key] # shallow copy + torch.logaddexp(old_hyp.log_prob, hyp.log_prob, out=old_hyp.log_prob) + else: + self._data[key] = hyp + + def get_most_probable(self, length_norm: bool = False) -> Hypothesis: + """Get the most probable hypothesis, i.e., the one with + the largest `log_prob`. + + Args: + length_norm: + If True, the `log_prob` of a hypothesis is normalized by the + number of tokens in it. + Returns: + Return the hypothesis that has the largest `log_prob`. + """ + if length_norm: + return max(self._data.values(), key=lambda hyp: hyp.log_prob / len(hyp.ys)) + else: + return max(self._data.values(), key=lambda hyp: hyp.log_prob) + + def remove(self, hyp: Hypothesis) -> None: + """Remove a given hypothesis. + + Caution: + `self` is modified **in-place**. + + Args: + hyp: + The hypothesis to be removed from `self`. + Note: It must be contained in `self`. Otherwise, + an exception is raised. + """ + key = hyp.key + assert key in self, f"{key} does not exist" + del self._data[key] + + def filter(self, threshold: torch.Tensor) -> "HypothesisList": + """Remove all Hypotheses whose log_prob is less than threshold. + + Caution: + `self` is not modified. Instead, a new HypothesisList is returned. + + Returns: + Return a new HypothesisList containing all hypotheses from `self` + with `log_prob` being greater than the given `threshold`. + """ + ans = HypothesisList() + for _, hyp in self._data.items(): + if hyp.log_prob > threshold: + ans.add(hyp) # shallow copy + return ans + + def topk(self, k: int, length_norm: bool = False) -> "HypothesisList": + """Return the top-k hypothesis. + + Args: + length_norm: + If True, the `log_prob` of a hypothesis is normalized by the + number of tokens in it. 
+ """ + hyps = list(self._data.items()) + + if length_norm: + hyps = sorted( + hyps, key=lambda h: h[1].log_prob / len(h[1].ys), reverse=True + )[:k] + else: + hyps = sorted(hyps, key=lambda h: h[1].log_prob, reverse=True)[:k] + + ans = HypothesisList(dict(hyps)) + return ans + + def __contains__(self, key: str): + return key in self._data + + def __iter__(self): + return iter(self._data.values()) + + def __len__(self) -> int: + return len(self._data) + + def __str__(self) -> str: + s = [] + for key in self: + s.append(key) + return ", ".join(s) + + +def get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape: + """Return a ragged shape with axes [utt][num_hyps]. + + Args: + hyps: + len(hyps) == batch_size. It contains the current hypothesis for + each utterance in the batch. + Returns: + Return a ragged shape with 2 axes [utt][num_hyps]. Note that + the shape is on CPU. + """ + num_hyps = [len(h) for h in hyps] + + # torch.cumsum() is inclusive sum, so we put a 0 at the beginning + # to get exclusive sum later. + num_hyps.insert(0, 0) + + num_hyps = torch.tensor(num_hyps) + row_splits = torch.cumsum(num_hyps, dim=0, dtype=torch.int32) + ans = k2.ragged.create_ragged_shape2( + row_splits=row_splits, cached_tot_size=row_splits[-1].item() + ) + return ans + + +def modified_beam_search( + model: nn.Module, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + context_graph: Optional[ContextGraph] = None, + beam: int = 4, + temperature: float = 1.0, + blank_penalty: float = 0.0, + return_timestamps: bool = False, +) -> Union[List[List[int]], DecodingResults]: + """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded. + + Args: + model: + The transducer model. + encoder_out: + Output from the encoder. Its shape is (N, T, C). + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. + beam: + Number of active paths during the beam search. + temperature: + Softmax temperature. + return_timestamps: + Whether to return timestamps. + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. 
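+
+    Note: when `context_graph` is given, matched context phrases add a bonus
+    to a hypothesis' score during the search (via
+    `context_graph.forward_one_step`), and unfinished matches are settled
+    with `context_graph.finalize` at the end.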
+ """ + assert encoder_out.ndim == 3, encoder_out.shape + assert encoder_out.size(0) >= 1, encoder_out.size(0) + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) + + blank_id = model.decoder.blank_id + unk_id = getattr(model, "unk_id", blank_id) + context_size = model.decoder.context_size + device = next(model.parameters()).device + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + B = [HypothesisList() for _ in range(N)] + for i in range(N): + B[i].add( + Hypothesis( + ys=[-1] * (context_size - 1) + [blank_id], + log_prob=torch.zeros(1, dtype=torch.float32, device=device), + context_state=None if context_graph is None else context_graph.root, + timestamp=[], + ) + ) + + encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) + + offset = 0 + finalized_B = [] + for t, batch_size in enumerate(batch_size_list): + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) + # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) + offset = end + + finalized_B = B[batch_size:] + finalized_B + B = B[:batch_size] + + hyps_shape = get_hyps_shape(B).to(device) + + A = [list(b) for b in B] + + B = [HypothesisList() for _ in range(batch_size)] + + ys_log_probs = torch.cat( + [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps] + ) # (num_hyps, 1) + + decoder_input = torch.tensor( + [hyp.ys[-context_size:] for hyps in A for hyp in hyps], + device=device, + dtype=torch.int64, + ) # (num_hyps, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) + decoder_out = model.joiner.decoder_proj(decoder_out) + # decoder_out is of shape (num_hyps, 1, 1, joiner_dim) + + # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor + # as index, so we use `to(torch.int64)` below. 
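+        # index_select() repeats each utterance's encoder frame once per
+        # active hypothesis, so that row k of `current_encoder_out` lines up
+        # with row k of `decoder_out`.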
+ current_encoder_out = torch.index_select( + current_encoder_out, + dim=0, + index=hyps_shape.row_ids(1).to(torch.int64), + ) # (num_hyps, 1, 1, encoder_out_dim) + + logits = model.joiner( + current_encoder_out, + decoder_out, + project_input=False, + ) # (num_hyps, 1, 1, vocab_size) + + logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) + + if blank_penalty != 0: + logits[:, 0] -= blank_penalty + + log_probs = (logits / temperature).log_softmax(dim=-1) # (num_hyps, vocab_size) + + log_probs.add_(ys_log_probs) + + vocab_size = log_probs.size(-1) + + log_probs = log_probs.reshape(-1) + + row_splits = hyps_shape.row_splits(1) * vocab_size + log_probs_shape = k2.ragged.create_ragged_shape2( + row_splits=row_splits, cached_tot_size=log_probs.numel() + ) + ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) + + for i in range(batch_size): + topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + topk_hyp_indexes = (topk_indexes // vocab_size).tolist() + topk_token_indexes = (topk_indexes % vocab_size).tolist() + + for k in range(len(topk_hyp_indexes)): + hyp_idx = topk_hyp_indexes[k] + hyp = A[i][hyp_idx] + new_ys = hyp.ys[:] + new_token = topk_token_indexes[k] + new_timestamp = hyp.timestamp[:] + context_score = 0 + new_context_state = None if context_graph is None else hyp.context_state + if new_token not in (blank_id, unk_id): + new_ys.append(new_token) + new_timestamp.append(t) + if context_graph is not None: + ( + context_score, + new_context_state, + ) = context_graph.forward_one_step(hyp.context_state, new_token) + + new_log_prob = topk_log_probs[k] + context_score + + new_hyp = Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + timestamp=new_timestamp, + context_state=new_context_state, + ) + B[i].add(new_hyp) + + B = B + finalized_B + + # finalize context_state, if the matched contexts do not reach final state + # we need to add the score on the corresponding backoff arc + if context_graph is not None: + finalized_B = [HypothesisList() for _ in range(len(B))] + for i, hyps in enumerate(B): + for hyp in list(hyps): + context_score, new_context_state = context_graph.finalize( + hyp.context_state + ) + finalized_B[i].add( + Hypothesis( + ys=hyp.ys, + log_prob=hyp.log_prob + context_score, + timestamp=hyp.timestamp, + context_state=new_context_state, + ) + ) + B = finalized_B + + best_hyps = [b.get_most_probable(length_norm=True) for b in B] + + sorted_ans = [h.ys[context_size:] for h in best_hyps] + sorted_timestamps = [h.timestamp for h in best_hyps] + ans = [] + ans_timestamps = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) + ans_timestamps.append(sorted_timestamps[unsorted_indices[i]]) + + if not return_timestamps: + return ans + else: + return DecodingResults( + hyps=ans, + timestamps=ans_timestamps, + ) + + +def modified_beam_search_lm_rescore( + model: nn.Module, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + LM: LmScorer, + lm_scale_list: List[int], + beam: int = 4, + temperature: float = 1.0, + return_timestamps: bool = False, +) -> Union[List[List[int]], DecodingResults]: + """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded. + Rescore the final results with RNNLM and return the one with the highest score + + Args: + model: + The transducer model. + encoder_out: + Output from the encoder. Its shape is (N, T, C). 
+ encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. + beam: + Number of active paths during the beam search. + temperature: + Softmax temperature. + LM: + A neural network language model + return_timestamps: + Whether to return timestamps. + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. + """ + assert encoder_out.ndim == 3, encoder_out.shape + assert encoder_out.size(0) >= 1, encoder_out.size(0) + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) + + blank_id = model.decoder.blank_id + unk_id = getattr(model, "unk_id", blank_id) + context_size = model.decoder.context_size + device = next(model.parameters()).device + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + B = [HypothesisList() for _ in range(N)] + for i in range(N): + B[i].add( + Hypothesis( + ys=[-1] * (context_size - 1) + [blank_id], + log_prob=torch.zeros(1, dtype=torch.float32, device=device), + timestamp=[], + ) + ) + + encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) + + offset = 0 + finalized_B = [] + for t, batch_size in enumerate(batch_size_list): + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) + # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) + offset = end + + finalized_B = B[batch_size:] + finalized_B + B = B[:batch_size] + + hyps_shape = get_hyps_shape(B).to(device) + + A = [list(b) for b in B] + B = [HypothesisList() for _ in range(batch_size)] + + ys_log_probs = torch.cat( + [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps] + ) # (num_hyps, 1) + + decoder_input = torch.tensor( + [hyp.ys[-context_size:] for hyps in A for hyp in hyps], + device=device, + dtype=torch.int64, + ) # (num_hyps, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) + decoder_out = model.joiner.decoder_proj(decoder_out) + # decoder_out is of shape (num_hyps, 1, 1, joiner_dim) + + # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor + # as index, so we use `to(torch.int64)` below. 
+ current_encoder_out = torch.index_select( + current_encoder_out, + dim=0, + index=hyps_shape.row_ids(1).to(torch.int64), + ) # (num_hyps, 1, 1, encoder_out_dim) + + logits = model.joiner( + current_encoder_out, + decoder_out, + project_input=False, + ) # (num_hyps, 1, 1, vocab_size) + + logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) + + log_probs = (logits / temperature).log_softmax(dim=-1) # (num_hyps, vocab_size) + + log_probs.add_(ys_log_probs) + + vocab_size = log_probs.size(-1) + + log_probs = log_probs.reshape(-1) + + row_splits = hyps_shape.row_splits(1) * vocab_size + log_probs_shape = k2.ragged.create_ragged_shape2( + row_splits=row_splits, cached_tot_size=log_probs.numel() + ) + ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) + + for i in range(batch_size): + topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + topk_hyp_indexes = (topk_indexes // vocab_size).tolist() + topk_token_indexes = (topk_indexes % vocab_size).tolist() + + for k in range(len(topk_hyp_indexes)): + hyp_idx = topk_hyp_indexes[k] + hyp = A[i][hyp_idx] + + new_ys = hyp.ys[:] + new_token = topk_token_indexes[k] + new_timestamp = hyp.timestamp[:] + if new_token not in (blank_id, unk_id): + new_ys.append(new_token) + new_timestamp.append(t) + + new_log_prob = topk_log_probs[k] + new_hyp = Hypothesis( + ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp + ) + B[i].add(new_hyp) + + B = B + finalized_B + + # get the am_scores for n-best list + hyps_shape = get_hyps_shape(B) + am_scores = torch.tensor([hyp.log_prob.item() for b in B for hyp in b]) + am_scores = k2.RaggedTensor(value=am_scores, shape=hyps_shape).to(device) + + # now LM rescore + # prepare input data to LM + candidate_seqs = [hyp.ys[context_size:] for b in B for hyp in b] + possible_seqs = k2.RaggedTensor(candidate_seqs) + row_splits = possible_seqs.shape.row_splits(1) + sentence_token_lengths = row_splits[1:] - row_splits[:-1] + possible_seqs_with_sos = add_sos(possible_seqs, sos_id=1) + possible_seqs_with_eos = add_eos(possible_seqs, eos_id=1) + sentence_token_lengths += 1 + + x = possible_seqs_with_sos.pad(mode="constant", padding_value=blank_id) + y = possible_seqs_with_eos.pad(mode="constant", padding_value=blank_id) + x = x.to(device).to(torch.int64) + y = y.to(device).to(torch.int64) + sentence_token_lengths = sentence_token_lengths.to(device).to(torch.int64) + + lm_scores = LM.lm(x=x, y=y, lengths=sentence_token_lengths) + assert lm_scores.ndim == 2 + lm_scores = -1 * lm_scores.sum(dim=1) + + ans = {} + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + + # get the best hyp with different lm_scale + for lm_scale in lm_scale_list: + key = f"nnlm_scale_{lm_scale:.2f}" + tot_scores = am_scores.values + lm_scores * lm_scale + ragged_tot_scores = k2.RaggedTensor(shape=am_scores.shape, value=tot_scores) + max_indexes = ragged_tot_scores.argmax().tolist() + unsorted_hyps = [candidate_seqs[idx] for idx in max_indexes] + hyps = [] + for idx in unsorted_indices: + hyps.append(unsorted_hyps[idx]) + + ans[key] = hyps + return ans + + +def modified_beam_search_lm_rescore_LODR( + model: nn.Module, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + LM: LmScorer, + LODR_lm: NgramLm, + sp: spm.SentencePieceProcessor, + lm_scale_list: List[int], + beam: int = 4, + temperature: float = 1.0, + return_timestamps: bool = False, +) -> Union[List[List[int]], DecodingResults]: + """Beam search in batch mode 
with --max-sym-per-frame=1 being hardcoded. + Rescore the final results with RNNLM and return the one with the highest score + + Args: + model: + The transducer model. + encoder_out: + Output from the encoder. Its shape is (N, T, C). + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. + beam: + Number of active paths during the beam search. + temperature: + Softmax temperature. + LM: + A neural network language model + return_timestamps: + Whether to return timestamps. + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. + """ + assert encoder_out.ndim == 3, encoder_out.shape + assert encoder_out.size(0) >= 1, encoder_out.size(0) + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) + + blank_id = model.decoder.blank_id + unk_id = getattr(model, "unk_id", blank_id) + context_size = model.decoder.context_size + device = next(model.parameters()).device + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + B = [HypothesisList() for _ in range(N)] + for i in range(N): + B[i].add( + Hypothesis( + ys=[-1] * (context_size - 1) + [blank_id], + log_prob=torch.zeros(1, dtype=torch.float32, device=device), + timestamp=[], + ) + ) + + encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) + + offset = 0 + finalized_B = [] + for t, batch_size in enumerate(batch_size_list): + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) + # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) + offset = end + + finalized_B = B[batch_size:] + finalized_B + B = B[:batch_size] + + hyps_shape = get_hyps_shape(B).to(device) + + A = [list(b) for b in B] + B = [HypothesisList() for _ in range(batch_size)] + + ys_log_probs = torch.cat( + [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps] + ) # (num_hyps, 1) + + decoder_input = torch.tensor( + [hyp.ys[-context_size:] for hyps in A for hyp in hyps], + device=device, + dtype=torch.int64, + ) # (num_hyps, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) + decoder_out = model.joiner.decoder_proj(decoder_out) + # decoder_out is of shape (num_hyps, 1, 1, joiner_dim) + + # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor + # as index, so we use `to(torch.int64)` below. 
+ current_encoder_out = torch.index_select( + current_encoder_out, + dim=0, + index=hyps_shape.row_ids(1).to(torch.int64), + ) # (num_hyps, 1, 1, encoder_out_dim) + + logits = model.joiner( + current_encoder_out, + decoder_out, + project_input=False, + ) # (num_hyps, 1, 1, vocab_size) + + logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) + + log_probs = (logits / temperature).log_softmax(dim=-1) # (num_hyps, vocab_size) + + log_probs.add_(ys_log_probs) + + vocab_size = log_probs.size(-1) + + log_probs = log_probs.reshape(-1) + + row_splits = hyps_shape.row_splits(1) * vocab_size + log_probs_shape = k2.ragged.create_ragged_shape2( + row_splits=row_splits, cached_tot_size=log_probs.numel() + ) + ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) + + for i in range(batch_size): + topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + topk_hyp_indexes = (topk_indexes // vocab_size).tolist() + topk_token_indexes = (topk_indexes % vocab_size).tolist() + + for k in range(len(topk_hyp_indexes)): + hyp_idx = topk_hyp_indexes[k] + hyp = A[i][hyp_idx] + + new_ys = hyp.ys[:] + new_token = topk_token_indexes[k] + new_timestamp = hyp.timestamp[:] + if new_token not in (blank_id, unk_id): + new_ys.append(new_token) + new_timestamp.append(t) + + new_log_prob = topk_log_probs[k] + new_hyp = Hypothesis( + ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp + ) + B[i].add(new_hyp) + + B = B + finalized_B + + # get the am_scores for n-best list + hyps_shape = get_hyps_shape(B) + am_scores = torch.tensor([hyp.log_prob.item() for b in B for hyp in b]) + am_scores = k2.RaggedTensor(value=am_scores, shape=hyps_shape).to(device) + + # now LM rescore + # prepare input data to LM + candidate_seqs = [hyp.ys[context_size:] for b in B for hyp in b] + possible_seqs = k2.RaggedTensor(candidate_seqs) + row_splits = possible_seqs.shape.row_splits(1) + sentence_token_lengths = row_splits[1:] - row_splits[:-1] + possible_seqs_with_sos = add_sos(possible_seqs, sos_id=1) + possible_seqs_with_eos = add_eos(possible_seqs, eos_id=1) + sentence_token_lengths += 1 + + x = possible_seqs_with_sos.pad(mode="constant", padding_value=blank_id) + y = possible_seqs_with_eos.pad(mode="constant", padding_value=blank_id) + x = x.to(device).to(torch.int64) + y = y.to(device).to(torch.int64) + sentence_token_lengths = sentence_token_lengths.to(device).to(torch.int64) + + lm_scores = LM.lm(x=x, y=y, lengths=sentence_token_lengths) + assert lm_scores.ndim == 2 + lm_scores = -1 * lm_scores.sum(dim=1) + + # now LODR scores + import math + + LODR_scores = [] + for seq in candidate_seqs: + tokens = " ".join(sp.id_to_piece(seq)) + LODR_scores.append(LODR_lm.score(tokens)) + LODR_scores = torch.tensor(LODR_scores).to(device) * math.log( + 10 + ) # arpa scores are 10-based + assert lm_scores.shape == LODR_scores.shape + + ans = {} + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + + LODR_scale_list = [0.05 * i for i in range(1, 20)] + # get the best hyp with different lm_scale and lodr_scale + for lm_scale in lm_scale_list: + for lodr_scale in LODR_scale_list: + key = f"nnlm_scale_{lm_scale:.2f}_lodr_scale_{lodr_scale:.2f}" + tot_scores = ( + am_scores.values / lm_scale + lm_scores - LODR_scores * lodr_scale + ) + ragged_tot_scores = k2.RaggedTensor(shape=am_scores.shape, value=tot_scores) + max_indexes = ragged_tot_scores.argmax().tolist() + unsorted_hyps = [candidate_seqs[idx] for idx in max_indexes] + hyps = [] + 
for idx in unsorted_indices: + hyps.append(unsorted_hyps[idx]) + + ans[key] = hyps + return ans + + +def _deprecated_modified_beam_search( + model: nn.Module, + encoder_out: torch.Tensor, + beam: int = 4, + return_timestamps: bool = False, +) -> Union[List[int], DecodingResults]: + """It limits the maximum number of symbols per frame to 1. + + It decodes only one utterance at a time. We keep it only for reference. + The function :func:`modified_beam_search` should be preferred as it + supports batch decoding. + + + Args: + model: + An instance of `Transducer`. + encoder_out: + A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. + beam: + Beam size. + return_timestamps: + Whether to return timestamps. + + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. + """ + + assert encoder_out.ndim == 3 + + # support only batch_size == 1 for now + assert encoder_out.size(0) == 1, encoder_out.size(0) + blank_id = model.decoder.blank_id + unk_id = getattr(model, "unk_id", blank_id) + context_size = model.decoder.context_size + + device = next(model.parameters()).device + + T = encoder_out.size(1) + + B = HypothesisList() + B.add( + Hypothesis( + ys=[-1] * (context_size - 1) + [blank_id], + log_prob=torch.zeros(1, dtype=torch.float32, device=device), + timestamp=[], + ) + ) + encoder_out = model.joiner.encoder_proj(encoder_out) + + for t in range(T): + # fmt: off + current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2) + # current_encoder_out is of shape (1, 1, 1, encoder_out_dim) + # fmt: on + A = list(B) + B = HypothesisList() + + ys_log_probs = torch.cat([hyp.log_prob.reshape(1, 1) for hyp in A]) + # ys_log_probs is of shape (num_hyps, 1) + + decoder_input = torch.tensor( + [hyp.ys[-context_size:] for hyp in A], + device=device, + dtype=torch.int64, + ) + # decoder_input is of shape (num_hyps, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) + decoder_out = model.joiner.decoder_proj(decoder_out) + # decoder_output is of shape (num_hyps, 1, 1, joiner_dim) + + current_encoder_out = current_encoder_out.expand( + decoder_out.size(0), 1, 1, -1 + ) # (num_hyps, 1, 1, encoder_out_dim) + + logits = model.joiner( + current_encoder_out, + decoder_out, + project_input=False, + ) + # logits is of shape (num_hyps, 1, 1, vocab_size) + logits = logits.squeeze(1).squeeze(1) + + # now logits is of shape (num_hyps, vocab_size) + log_probs = logits.log_softmax(dim=-1) + + log_probs.add_(ys_log_probs) + + log_probs = log_probs.reshape(-1) + topk_log_probs, topk_indexes = log_probs.topk(beam) + + # topk_hyp_indexes are indexes into `A` + topk_hyp_indexes = topk_indexes // logits.size(-1) + topk_token_indexes = topk_indexes % logits.size(-1) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + topk_hyp_indexes = topk_hyp_indexes.tolist() + topk_token_indexes = topk_token_indexes.tolist() + + for i in range(len(topk_hyp_indexes)): + hyp = A[topk_hyp_indexes[i]] + new_ys = hyp.ys[:] + new_timestamp = hyp.timestamp[:] + new_token = topk_token_indexes[i] + if new_token not in (blank_id, unk_id): + new_ys.append(new_token) + new_timestamp.append(t) + new_log_prob = topk_log_probs[i] + new_hyp = Hypothesis( + ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp + ) + B.add(new_hyp) + + best_hyp = B.get_most_probable(length_norm=True) + ys = best_hyp.ys[context_size:] # [context_size:] to remove blanks + + if not 
return_timestamps: + return ys + else: + return DecodingResults(hyps=[ys], timestamps=[best_hyp.timestamp]) + + +def beam_search( + model: nn.Module, + encoder_out: torch.Tensor, + beam: int = 4, + temperature: float = 1.0, + blank_penalty: float = 0.0, + return_timestamps: bool = False, +) -> Union[List[int], DecodingResults]: + """ + It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf + + espnet/nets/beam_search_transducer.py#L247 is used as a reference. + + Args: + model: + An instance of `Transducer`. + encoder_out: + A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. + beam: + Beam size. + temperature: + Softmax temperature. + return_timestamps: + Whether to return timestamps. + + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. + """ + assert encoder_out.ndim == 3 + + # support only batch_size == 1 for now + assert encoder_out.size(0) == 1, encoder_out.size(0) + blank_id = model.decoder.blank_id + unk_id = getattr(model, "unk_id", blank_id) + context_size = model.decoder.context_size + + device = next(model.parameters()).device + + decoder_input = torch.tensor( + [blank_id] * context_size, + device=device, + dtype=torch.int64, + ).reshape(1, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + + encoder_out = model.joiner.encoder_proj(encoder_out) + + T = encoder_out.size(1) + t = 0 + + B = HypothesisList() + B.add( + Hypothesis( + ys=[-1] * (context_size - 1) + [blank_id], log_prob=0.0, timestamp=[] + ) + ) + + max_sym_per_utt = 20000 + + sym_per_utt = 0 + + decoder_cache: Dict[str, torch.Tensor] = {} + + while t < T and sym_per_utt < max_sym_per_utt: + # fmt: off + current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2) + # fmt: on + A = B + B = HypothesisList() + + joint_cache: Dict[str, torch.Tensor] = {} + + # TODO(fangjun): Implement prefix search to update the `log_prob` + # of hypotheses in A + + while True: + y_star = A.get_most_probable() + A.remove(y_star) + + cached_key = y_star.key + + if cached_key not in decoder_cache: + decoder_input = torch.tensor( + [y_star.ys[-context_size:]], + device=device, + dtype=torch.int64, + ).reshape(1, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + decoder_cache[cached_key] = decoder_out + else: + decoder_out = decoder_cache[cached_key] + + cached_key += f"-t-{t}" + if cached_key not in joint_cache: + logits = model.joiner( + current_encoder_out, + decoder_out.unsqueeze(1), + project_input=False, + ) + + if blank_penalty != 0: + logits[:, :, :, 0] -= blank_penalty + + # TODO(fangjun): Scale the blank posterior + log_prob = (logits / temperature).log_softmax(dim=-1) + # log_prob is (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() + # Now log_prob is (vocab_size,) + joint_cache[cached_key] = log_prob + else: + log_prob = joint_cache[cached_key] + + # First, process the blank symbol + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob + + # ys[:] returns a copy of ys + B.add( + Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + timestamp=y_star.timestamp[:], + ) + ) + + # Second, process other non-blank labels + values, indices = log_prob.topk(beam + 1) + for i, v in zip(indices.tolist(), values.tolist()): + if i in (blank_id, unk_id): + continue + new_ys = 
y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_timestamp = y_star.timestamp + [t] + A.add( + Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + timestamp=new_timestamp, + ) + ) + + # Check whether B contains more than "beam" elements more probable + # than the most probable in A + A_most_probable = A.get_most_probable() + + kept_B = B.filter(A_most_probable.log_prob) + + if len(kept_B) >= beam: + B = kept_B.topk(beam) + break + + t += 1 + + best_hyp = B.get_most_probable(length_norm=True) + ys = best_hyp.ys[context_size:] # [context_size:] to remove blanks + + if not return_timestamps: + return ys + else: + return DecodingResults(hyps=[ys], timestamps=[best_hyp.timestamp]) + + +def fast_beam_search_with_nbest_rescoring( + model: nn.Module, + decoding_graph: k2.Fsa, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: float, + max_states: int, + max_contexts: int, + ngram_lm_scale_list: List[float], + num_paths: int, + G: k2.Fsa, + sp: spm.SentencePieceProcessor, + word_table: k2.SymbolTable, + oov_word: str = "", + use_double_scores: bool = True, + nbest_scale: float = 0.5, + temperature: float = 1.0, + return_timestamps: bool = False, +) -> Dict[str, Union[List[List[int]], DecodingResults]]: + """It limits the maximum number of symbols per frame to 1. + A lattice is first obtained using fast beam search, num_path are selected + and rescored using a given language model. The shortest path within the + lattice is used as the final output. + + Args: + model: + An instance of `Transducer`. + decoding_graph: + Decoding graph used for decoding, may be a TrivialGraph or a LG. + encoder_out: + A tensor of shape (N, T, C) from the encoder. + encoder_out_lens: + A tensor of shape (N,) containing the number of frames in `encoder_out` + before padding. + beam: + Beam value, similar to the beam used in Kaldi. + max_states: + Max states per stream per frame. + max_contexts: + Max contexts pre stream per frame. + ngram_lm_scale_list: + A list of floats representing LM score scales. + num_paths: + Number of paths to extract from the decoded lattice. + G: + An FsaVec containing only a single FSA. It is an n-gram LM. + sp: + The BPE model. + word_table: + The word symbol table. + oov_word: + OOV words are replaced with this word. + use_double_scores: + True to use double precision for computation. False to use + single precision. + nbest_scale: + It's the scale applied to the lattice.scores. A smaller value + yields more unique paths. + temperature: + Softmax temperature. + return_timestamps: + Whether to return timestamps. + Returns: + Return the decoded result in a dict, where the key has the form + 'ngram_lm_scale_xx' and the value is the decoded results + optionally with timestamps. `xx` is the ngram LM scale value + used during decoding, i.e., 0.1. + """ + lattice = fast_beam_search( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=beam, + max_states=max_states, + max_contexts=max_contexts, + temperature=temperature, + ) + + nbest = Nbest.from_lattice( + lattice=lattice, + num_paths=num_paths, + use_double_scores=use_double_scores, + nbest_scale=nbest_scale, + ) + # at this point, nbest.fsa.scores are all zeros. + + nbest = nbest.intersect(lattice) + # Now nbest.fsa.scores contains acoustic scores + + am_scores = nbest.tot_scores() + + # Now we need to compute the LM scores of each path. + # (1) Get the token IDs of each Path. 
We assume the decoding_graph + # is an acceptor, i.e., lattice is also an acceptor + tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) # [path][arc] + + tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.labels.contiguous()) + tokens = tokens.remove_values_leq(0) # remove -1 and 0 + + token_list: List[List[int]] = tokens.tolist() + word_list: List[List[str]] = sp.decode(token_list) + + assert isinstance(oov_word, str), oov_word + assert oov_word in word_table, oov_word + oov_word_id = word_table[oov_word] + + word_ids_list: List[List[int]] = [] + + for words in word_list: + this_word_ids = [] + for w in words.split(): + if w in word_table: + this_word_ids.append(word_table[w]) + else: + this_word_ids.append(oov_word_id) + word_ids_list.append(this_word_ids) + + word_fsas = k2.linear_fsa(word_ids_list, device=lattice.device) + word_fsas_with_self_loops = k2.add_epsilon_self_loops(word_fsas) + + num_unique_paths = len(word_ids_list) + + b_to_a_map = torch.zeros( + num_unique_paths, + dtype=torch.int32, + device=lattice.device, + ) + + rescored_word_fsas = k2.intersect_device( + a_fsas=G, + b_fsas=word_fsas_with_self_loops, + b_to_a_map=b_to_a_map, + sorted_match_a=True, + ret_arc_maps=False, + ) + + rescored_word_fsas = k2.remove_epsilon_self_loops(rescored_word_fsas) + rescored_word_fsas = k2.top_sort(k2.connect(rescored_word_fsas)) + ngram_lm_scores = rescored_word_fsas.get_tot_scores( + use_double_scores=True, + log_semiring=False, + ) + + ans: Dict[str, Union[List[List[int]], DecodingResults]] = {} + for s in ngram_lm_scale_list: + key = f"ngram_lm_scale_{s}" + tot_scores = am_scores.values + s * ngram_lm_scores + ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) + max_indexes = ragged_tot_scores.argmax() + best_path = k2.index_fsa(nbest.fsa, max_indexes) + + if not return_timestamps: + ans[key] = get_texts(best_path) + else: + ans[key] = get_texts_with_timestamp(best_path) + + return ans + + +def fast_beam_search_with_nbest_rnn_rescoring( + model: nn.Module, + decoding_graph: k2.Fsa, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: float, + max_states: int, + max_contexts: int, + ngram_lm_scale_list: List[float], + num_paths: int, + G: k2.Fsa, + sp: spm.SentencePieceProcessor, + word_table: k2.SymbolTable, + rnn_lm_model: torch.nn.Module, + rnn_lm_scale_list: List[float], + oov_word: str = "", + use_double_scores: bool = True, + nbest_scale: float = 0.5, + temperature: float = 1.0, + return_timestamps: bool = False, +) -> Dict[str, Union[List[List[int]], DecodingResults]]: + """It limits the maximum number of symbols per frame to 1. + A lattice is first obtained using fast beam search, num_path are selected + and rescored using a given language model and a rnn-lm. + The shortest path within the lattice is used as the final output. + + Args: + model: + An instance of `Transducer`. + decoding_graph: + Decoding graph used for decoding, may be a TrivialGraph or a LG. + encoder_out: + A tensor of shape (N, T, C) from the encoder. + encoder_out_lens: + A tensor of shape (N,) containing the number of frames in `encoder_out` + before padding. + beam: + Beam value, similar to the beam used in Kaldi. + max_states: + Max states per stream per frame. + max_contexts: + Max contexts pre stream per frame. + ngram_lm_scale_list: + A list of floats representing LM score scales. + num_paths: + Number of paths to extract from the decoded lattice. + G: + An FsaVec containing only a single FSA. It is an n-gram LM. + sp: + The BPE model. 
+ word_table: + The word symbol table. + rnn_lm_model: + A rnn-lm model used for LM rescoring + rnn_lm_scale_list: + A list of floats representing RNN score scales. + oov_word: + OOV words are replaced with this word. + use_double_scores: + True to use double precision for computation. False to use + single precision. + nbest_scale: + It's the scale applied to the lattice.scores. A smaller value + yields more unique paths. + temperature: + Softmax temperature. + return_timestamps: + Whether to return timestamps. + Returns: + Return the decoded result in a dict, where the key has the form + 'ngram_lm_scale_xx' and the value is the decoded results + optionally with timestamps. `xx` is the ngram LM scale value + used during decoding, i.e., 0.1. + """ + lattice = fast_beam_search( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=beam, + max_states=max_states, + max_contexts=max_contexts, + temperature=temperature, + ) + + nbest = Nbest.from_lattice( + lattice=lattice, + num_paths=num_paths, + use_double_scores=use_double_scores, + nbest_scale=nbest_scale, + ) + # at this point, nbest.fsa.scores are all zeros. + + nbest = nbest.intersect(lattice) + # Now nbest.fsa.scores contains acoustic scores + + am_scores = nbest.tot_scores() + + # Now we need to compute the LM scores of each path. + # (1) Get the token IDs of each Path. We assume the decoding_graph + # is an acceptor, i.e., lattice is also an acceptor + tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) # [path][arc] + + tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.labels.contiguous()) + tokens = tokens.remove_values_leq(0) # remove -1 and 0 + + token_list: List[List[int]] = tokens.tolist() + word_list: List[List[str]] = sp.decode(token_list) + + assert isinstance(oov_word, str), oov_word + assert oov_word in word_table, oov_word + oov_word_id = word_table[oov_word] + + word_ids_list: List[List[int]] = [] + + for words in word_list: + this_word_ids = [] + for w in words.split(): + if w in word_table: + this_word_ids.append(word_table[w]) + else: + this_word_ids.append(oov_word_id) + word_ids_list.append(this_word_ids) + + word_fsas = k2.linear_fsa(word_ids_list, device=lattice.device) + word_fsas_with_self_loops = k2.add_epsilon_self_loops(word_fsas) + + num_unique_paths = len(word_ids_list) + + b_to_a_map = torch.zeros( + num_unique_paths, + dtype=torch.int32, + device=lattice.device, + ) + + rescored_word_fsas = k2.intersect_device( + a_fsas=G, + b_fsas=word_fsas_with_self_loops, + b_to_a_map=b_to_a_map, + sorted_match_a=True, + ret_arc_maps=False, + ) + + rescored_word_fsas = k2.remove_epsilon_self_loops(rescored_word_fsas) + rescored_word_fsas = k2.top_sort(k2.connect(rescored_word_fsas)) + ngram_lm_scores = rescored_word_fsas.get_tot_scores( + use_double_scores=True, + log_semiring=False, + ) + + # Now RNN-LM + blank_id = model.decoder.blank_id + sos_id = sp.piece_to_id("sos_id") + eos_id = sp.piece_to_id("eos_id") + + sos_tokens = add_sos(tokens, sos_id) + tokens_eos = add_eos(tokens, eos_id) + sos_tokens_row_splits = sos_tokens.shape.row_splits(1) + sentence_lengths = sos_tokens_row_splits[1:] - sos_tokens_row_splits[:-1] + + x_tokens = sos_tokens.pad(mode="constant", padding_value=blank_id) + y_tokens = tokens_eos.pad(mode="constant", padding_value=blank_id) + + x_tokens = x_tokens.to(torch.int64) + y_tokens = y_tokens.to(torch.int64) + sentence_lengths = sentence_lengths.to(torch.int64) + + rnn_lm_nll = rnn_lm_model(x=x_tokens, y=y_tokens, 
lengths=sentence_lengths) + assert rnn_lm_nll.ndim == 2 + assert rnn_lm_nll.shape[0] == len(token_list) + rnn_lm_scores = -1 * rnn_lm_nll.sum(dim=1) + + ans: Dict[str, List[List[int]]] = {} + for n_scale in ngram_lm_scale_list: + for rnn_scale in rnn_lm_scale_list: + key = f"ngram_lm_scale_{n_scale}_rnn_lm_scale_{rnn_scale}" + tot_scores = ( + am_scores.values + n_scale * ngram_lm_scores + rnn_scale * rnn_lm_scores + ) + ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) + max_indexes = ragged_tot_scores.argmax() + best_path = k2.index_fsa(nbest.fsa, max_indexes) + + if not return_timestamps: + ans[key] = get_texts(best_path) + else: + ans[key] = get_texts_with_timestamp(best_path) + + return ans + + +def modified_beam_search_ngram_rescoring( + model: nn.Module, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + ngram_lm: NgramLm, + ngram_lm_scale: float, + beam: int = 4, + temperature: float = 1.0, +) -> List[List[int]]: + """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded. + + Args: + model: + The transducer model. + encoder_out: + Output from the encoder. Its shape is (N, T, C). + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. + beam: + Number of active paths during the beam search. + temperature: + Softmax temperature. + Returns: + Return a list-of-list of token IDs. ans[i] is the decoding results + for the i-th utterance. + """ + assert encoder_out.ndim == 3, encoder_out.shape + assert encoder_out.size(0) >= 1, encoder_out.size(0) + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) + + blank_id = model.decoder.blank_id + unk_id = getattr(model, "unk_id", blank_id) + context_size = model.decoder.context_size + device = next(model.parameters()).device + lm_scale = ngram_lm_scale + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + B = [HypothesisList() for _ in range(N)] + for i in range(N): + B[i].add( + Hypothesis( + ys=[-1] * (context_size - 1) + [blank_id], + log_prob=torch.zeros(1, dtype=torch.float32, device=device), + state_cost=NgramLmStateCost(ngram_lm), + ) + ) + + encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) + + offset = 0 + finalized_B = [] + for batch_size in batch_size_list: + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) + # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) + offset = end + + finalized_B = B[batch_size:] + finalized_B + B = B[:batch_size] + + hyps_shape = get_hyps_shape(B).to(device) + + A = [list(b) for b in B] + B = [HypothesisList() for _ in range(batch_size)] + + ys_log_probs = torch.cat( + [ + hyp.log_prob.reshape(1, 1) + hyp.state_cost.lm_score * lm_scale + for hyps in A + for hyp in hyps + ] + ) # (num_hyps, 1) + + decoder_input = torch.tensor( + [hyp.ys[-context_size:] for hyps in A for hyp in hyps], + device=device, + dtype=torch.int64, + ) # (num_hyps, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) + decoder_out = model.joiner.decoder_proj(decoder_out) + # decoder_out is of shape (num_hyps, 1, 1, joiner_dim) + + # Note: For torch 1.7.1 and below, it requires a torch.int64 
tensor + # as index, so we use `to(torch.int64)` below. + current_encoder_out = torch.index_select( + current_encoder_out, + dim=0, + index=hyps_shape.row_ids(1).to(torch.int64), + ) # (num_hyps, 1, 1, encoder_out_dim) + + logits = model.joiner( + current_encoder_out, + decoder_out, + project_input=False, + ) # (num_hyps, 1, 1, vocab_size) + + logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) + + log_probs = (logits / temperature).log_softmax(dim=-1) # (num_hyps, vocab_size) + + log_probs.add_(ys_log_probs) + vocab_size = log_probs.size(-1) + log_probs = log_probs.reshape(-1) + + row_splits = hyps_shape.row_splits(1) * vocab_size + log_probs_shape = k2.ragged.create_ragged_shape2( + row_splits=row_splits, cached_tot_size=log_probs.numel() + ) + ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) + + for i in range(batch_size): + topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + topk_hyp_indexes = (topk_indexes // vocab_size).tolist() + topk_token_indexes = (topk_indexes % vocab_size).tolist() + + for k in range(len(topk_hyp_indexes)): + hyp_idx = topk_hyp_indexes[k] + hyp = A[i][hyp_idx] + + new_ys = hyp.ys[:] + new_token = topk_token_indexes[k] + if new_token not in (blank_id, unk_id): + new_ys.append(new_token) + state_cost = hyp.state_cost.forward_one_step(new_token) + else: + state_cost = hyp.state_cost + + # We only keep AM scores in new_hyp.log_prob + new_log_prob = topk_log_probs[k] - hyp.state_cost.lm_score * lm_scale + + new_hyp = Hypothesis( + ys=new_ys, log_prob=new_log_prob, state_cost=state_cost + ) + B[i].add(new_hyp) + + B = B + finalized_B + best_hyps = [b.get_most_probable(length_norm=True) for b in B] + + sorted_ans = [h.ys[context_size:] for h in best_hyps] + ans = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) + + return ans + + +def modified_beam_search_LODR( + model: nn.Module, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + LODR_lm: NgramLm, + LODR_lm_scale: float, + LM: LmScorer, + beam: int = 4, + context_graph: Optional[ContextGraph] = None, +) -> List[List[int]]: + """This function implements LODR (https://arxiv.org/abs/2203.16776) with + `modified_beam_search`. It uses a bi-gram language model as the estimate + of the internal language model and subtracts its score during shallow fusion + with an external language model. This implementation uses a RNNLM as the + external language model. + + Args: + model (Transducer): + The transducer model + encoder_out (torch.Tensor): + Encoder output in (N,T,C) + encoder_out_lens (torch.Tensor): + A 1-D tensor of shape (N,), containing the number of + valid frames in encoder_out before padding. + LODR_lm: + A low order n-gram LM, whose score will be subtracted during shallow fusion + LODR_lm_scale: + The scale of the LODR_lm + LM: + A neural net LM, e.g an RNNLM or transformer LM + beam (int, optional): + Beam size. Defaults to 4. + + Returns: + Return a list-of-list of token IDs. ans[i] is the decoding results + for the i-th utterance. 
+ + """ + assert encoder_out.ndim == 3, encoder_out.shape + assert encoder_out.size(0) >= 1, encoder_out.size(0) + assert LM is not None + lm_scale = LM.lm_scale + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) + + blank_id = model.decoder.blank_id + sos_id = getattr(LM, "sos_id", 1) + unk_id = getattr(model, "unk_id", blank_id) + context_size = model.decoder.context_size + device = next(model.parameters()).device + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + # get initial lm score and lm state by scoring the "sos" token + sos_token = torch.tensor([[sos_id]]).to(torch.int64).to(device) + lens = torch.tensor([1]).to(device) + init_score, init_states = LM.score_token(sos_token, lens) + + B = [HypothesisList() for _ in range(N)] + for i in range(N): + B[i].add( + Hypothesis( + ys=[-1] * (context_size - 1) + [blank_id], + log_prob=torch.zeros(1, dtype=torch.float32, device=device), + state=init_states, # state of the NN LM + lm_score=init_score.reshape(-1), + state_cost=NgramLmStateCost( + LODR_lm + ), # state of the source domain ngram + context_state=None if context_graph is None else context_graph.root, + ) + ) + + encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) + + offset = 0 + finalized_B = [] + for batch_size in batch_size_list: + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] # get batch + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) + # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) + offset = end + + finalized_B = B[batch_size:] + finalized_B + B = B[:batch_size] + + hyps_shape = get_hyps_shape(B).to(device) + + A = [list(b) for b in B] + B = [HypothesisList() for _ in range(batch_size)] + + ys_log_probs = torch.cat( + [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps] + ) + + decoder_input = torch.tensor( + [hyp.ys[-context_size:] for hyps in A for hyp in hyps], + device=device, + dtype=torch.int64, + ) # (num_hyps, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) + decoder_out = model.joiner.decoder_proj(decoder_out) + + current_encoder_out = torch.index_select( + current_encoder_out, + dim=0, + index=hyps_shape.row_ids(1).to(torch.int64), + ) # (num_hyps, 1, 1, encoder_out_dim) + + logits = model.joiner( + current_encoder_out, + decoder_out, + project_input=False, + ) # (num_hyps, 1, 1, vocab_size) + + logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) + + log_probs = logits.log_softmax(dim=-1) # (num_hyps, vocab_size) + + log_probs.add_(ys_log_probs) + + vocab_size = log_probs.size(-1) + + log_probs = log_probs.reshape(-1) + + row_splits = hyps_shape.row_splits(1) * vocab_size + log_probs_shape = k2.ragged.create_ragged_shape2( + row_splits=row_splits, cached_tot_size=log_probs.numel() + ) + ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) + """ + for all hyps with a non-blank new token, score this token. + It is a little confusing here because this for-loop + looks very similar to the one below. Here, we go through all + top-k tokens and only add the non-blanks ones to the token_list. + LM will score those tokens given the LM states. 
Note that + the variable `scores` is the LM score after seeing the new + non-blank token. + """ + token_list = [] + hs = [] + cs = [] + for i in range(batch_size): + topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + topk_hyp_indexes = (topk_indexes // vocab_size).tolist() + topk_token_indexes = (topk_indexes % vocab_size).tolist() + for k in range(len(topk_hyp_indexes)): + hyp_idx = topk_hyp_indexes[k] + hyp = A[i][hyp_idx] + + new_token = topk_token_indexes[k] + if new_token not in (blank_id, unk_id): + if LM.lm_type == "rnn": + token_list.append([new_token]) + # store the LSTM states + hs.append(hyp.state[0]) + cs.append(hyp.state[1]) + else: + # for transformer LM + token_list.append( + [sos_id] + hyp.ys[context_size:] + [new_token] + ) + + # forward NN LM to get new states and scores + if len(token_list) != 0: + x_lens = torch.tensor([len(tokens) for tokens in token_list]).to(device) + if LM.lm_type == "rnn": + tokens_to_score = ( + torch.tensor(token_list).to(torch.int64).to(device).reshape(-1, 1) + ) + hs = torch.cat(hs, dim=1).to(device) + cs = torch.cat(cs, dim=1).to(device) + state = (hs, cs) + else: + # for transformer LM + tokens_list = [torch.tensor(tokens) for tokens in token_list] + tokens_to_score = ( + torch.nn.utils.rnn.pad_sequence( + tokens_list, batch_first=True, padding_value=0.0 + ) + .to(device) + .to(torch.int64) + ) + + state = None + + scores, lm_states = LM.score_token(tokens_to_score, x_lens, state) + + count = 0 # index, used to locate score and lm states + for i in range(batch_size): + topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + topk_hyp_indexes = (topk_indexes // vocab_size).tolist() + topk_token_indexes = (topk_indexes % vocab_size).tolist() + + for k in range(len(topk_hyp_indexes)): + hyp_idx = topk_hyp_indexes[k] + hyp = A[i][hyp_idx] + + ys = hyp.ys[:] + + # current score of hyp + lm_score = hyp.lm_score + state = hyp.state + + hyp_log_prob = topk_log_probs[k] # get score of current hyp + new_token = topk_token_indexes[k] + + context_score = 0 + new_context_state = None if context_graph is None else hyp.context_state + if new_token not in (blank_id, unk_id): + if context_graph is not None: + ( + context_score, + new_context_state, + ) = context_graph.forward_one_step(hyp.context_state, new_token) + + ys.append(new_token) + state_cost = hyp.state_cost.forward_one_step(new_token) + + # calculate the score of the latest token + current_ngram_score = state_cost.lm_score - hyp.state_cost.lm_score + + assert current_ngram_score <= 0.0, ( + state_cost.lm_score, + hyp.state_cost.lm_score, + ) + # score = score + TDLM_score - LODR_score + # LODR_LM_scale should be a negative number here + hyp_log_prob += ( + lm_score[new_token] * lm_scale + + LODR_lm_scale * current_ngram_score + + context_score + ) # add the lm score + + lm_score = scores[count] + if LM.lm_type == "rnn": + state = ( + lm_states[0][:, count, :].unsqueeze(1), + lm_states[1][:, count, :].unsqueeze(1), + ) + count += 1 + else: + state_cost = hyp.state_cost + + new_hyp = Hypothesis( + ys=ys, + log_prob=hyp_log_prob, + state=state, + lm_score=lm_score, + state_cost=state_cost, + context_state=new_context_state, + ) + B[i].add(new_hyp) + + B = B + finalized_B + + # finalize context_state, if the matched contexts do not reach final state + # we need to add the score on the corresponding backoff arc + if context_graph is not None: + 
finalized_B = [HypothesisList() for _ in range(len(B))] + for i, hyps in enumerate(B): + for hyp in list(hyps): + context_score, new_context_state = context_graph.finalize( + hyp.context_state + ) + finalized_B[i].add( + Hypothesis( + ys=hyp.ys, + log_prob=hyp.log_prob + context_score, + timestamp=hyp.timestamp, + context_state=new_context_state, + ) + ) + B = finalized_B + + best_hyps = [b.get_most_probable(length_norm=True) for b in B] + + sorted_ans = [h.ys[context_size:] for h in best_hyps] + ans = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) + + return ans + + +def modified_beam_search_lm_shallow_fusion( + model: nn.Module, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + LM: LmScorer, + beam: int = 4, + return_timestamps: bool = False, +) -> List[List[int]]: + """Modified_beam_search + NN LM shallow fusion + + Args: + model (Transducer): + The transducer model + encoder_out (torch.Tensor): + Encoder output in (N,T,C) + encoder_out_lens (torch.Tensor): + A 1-D tensor of shape (N,), containing the number of + valid frames in encoder_out before padding. + sp: + Sentence piece generator. + LM (LmScorer): + A neural net LM, e.g RNN or Transformer + beam (int, optional): + Beam size. Defaults to 4. + + Returns: + Return a list-of-list of token IDs. ans[i] is the decoding results + for the i-th utterance. + """ + assert encoder_out.ndim == 3, encoder_out.shape + assert encoder_out.size(0) >= 1, encoder_out.size(0) + assert LM is not None + lm_scale = LM.lm_scale + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) + + blank_id = model.decoder.blank_id + sos_id = getattr(LM, "sos_id", 1) + unk_id = getattr(model, "unk_id", blank_id) + context_size = model.decoder.context_size + device = next(model.parameters()).device + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + # get initial lm score and lm state by scoring the "sos" token + sos_token = torch.tensor([[sos_id]]).to(torch.int64).to(device) + lens = torch.tensor([1]).to(device) + init_score, init_states = LM.score_token(sos_token, lens) + + B = [HypothesisList() for _ in range(N)] + for i in range(N): + B[i].add( + Hypothesis( + ys=[-1] * (context_size - 1) + [blank_id], + log_prob=torch.zeros(1, dtype=torch.float32, device=device), + state=init_states, + lm_score=init_score.reshape(-1), + timestamp=[], + ) + ) + + encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) + + offset = 0 + finalized_B = [] + for t, batch_size in enumerate(batch_size_list): + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] # get batch + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) + # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) + offset = end + + finalized_B = B[batch_size:] + finalized_B + B = B[:batch_size] + + hyps_shape = get_hyps_shape(B).to(device) + + A = [list(b) for b in B] + B = [HypothesisList() for _ in range(batch_size)] + + ys_log_probs = torch.cat( + [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps] + ) + + lm_scores = torch.cat( + [hyp.lm_score.reshape(1, -1) for hyps in A for hyp in hyps] + ) + + decoder_input = torch.tensor( + [hyp.ys[-context_size:] for hyps 
in A for hyp in hyps], + device=device, + dtype=torch.int64, + ) # (num_hyps, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) + decoder_out = model.joiner.decoder_proj(decoder_out) + + current_encoder_out = torch.index_select( + current_encoder_out, + dim=0, + index=hyps_shape.row_ids(1).to(torch.int64), + ) # (num_hyps, 1, 1, encoder_out_dim) + + logits = model.joiner( + current_encoder_out, + decoder_out, + project_input=False, + ) # (num_hyps, 1, 1, vocab_size) + + logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) + + log_probs = logits.log_softmax(dim=-1) # (num_hyps, vocab_size) + + log_probs.add_(ys_log_probs) + + vocab_size = log_probs.size(-1) + + log_probs = log_probs.reshape(-1) + + row_splits = hyps_shape.row_splits(1) * vocab_size + log_probs_shape = k2.ragged.create_ragged_shape2( + row_splits=row_splits, cached_tot_size=log_probs.numel() + ) + ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) + """ + for all hyps with a non-blank new token, score this token. + It is a little confusing here because this for-loop + looks very similar to the one below. Here, we go through all + top-k tokens and only add the non-blanks ones to the token_list. + `LM` will score those tokens given the LM states. Note that + the variable `scores` is the LM score after seeing the new + non-blank token. + """ + token_list = [] # a list of list + hs = [] + cs = [] + for i in range(batch_size): + topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + topk_hyp_indexes = (topk_indexes // vocab_size).tolist() + topk_token_indexes = (topk_indexes % vocab_size).tolist() + for k in range(len(topk_hyp_indexes)): + hyp_idx = topk_hyp_indexes[k] + hyp = A[i][hyp_idx] + + new_token = topk_token_indexes[k] + if new_token not in (blank_id, unk_id): + if LM.lm_type == "rnn": + token_list.append([new_token]) + # store the LSTM states + hs.append(hyp.state[0]) + cs.append(hyp.state[1]) + else: + # for transformer LM + token_list.append( + [sos_id] + hyp.ys[context_size:] + [new_token] + ) + + if len(token_list) != 0: + x_lens = torch.tensor([len(tokens) for tokens in token_list]).to(device) + if LM.lm_type == "rnn": + tokens_to_score = ( + torch.tensor(token_list).to(torch.int64).to(device).reshape(-1, 1) + ) + hs = torch.cat(hs, dim=1).to(device) + cs = torch.cat(cs, dim=1).to(device) + state = (hs, cs) + else: + # for transformer LM + tokens_list = [torch.tensor(tokens) for tokens in token_list] + tokens_to_score = ( + torch.nn.utils.rnn.pad_sequence( + tokens_list, batch_first=True, padding_value=0.0 + ) + .to(device) + .to(torch.int64) + ) + + state = None + + scores, lm_states = LM.score_token(tokens_to_score, x_lens, state) + + count = 0 # index, used to locate score and lm states + for i in range(batch_size): + topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + topk_hyp_indexes = (topk_indexes // vocab_size).tolist() + topk_token_indexes = (topk_indexes % vocab_size).tolist() + + for k in range(len(topk_hyp_indexes)): + hyp_idx = topk_hyp_indexes[k] + hyp = A[i][hyp_idx] + + ys = hyp.ys[:] + + lm_score = hyp.lm_score + state = hyp.state + + hyp_log_prob = topk_log_probs[k] # get score of current hyp + new_token = topk_token_indexes[k] + new_timestamp = hyp.timestamp[:] + if new_token not in (blank_id, unk_id): + ys.append(new_token) + new_timestamp.append(t) + + 
hyp_log_prob += lm_score[new_token] * lm_scale # add the lm score + + lm_score = scores[count] + if LM.lm_type == "rnn": + state = ( + lm_states[0][:, count, :].unsqueeze(1), + lm_states[1][:, count, :].unsqueeze(1), + ) + count += 1 + + new_hyp = Hypothesis( + ys=ys, + log_prob=hyp_log_prob, + state=state, + lm_score=lm_score, + timestamp=new_timestamp, + ) + B[i].add(new_hyp) + + B = B + finalized_B + best_hyps = [b.get_most_probable(length_norm=True) for b in B] + + sorted_ans = [h.ys[context_size:] for h in best_hyps] + sorted_timestamps = [h.timestamp for h in best_hyps] + ans = [] + ans_timestamps = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) + ans_timestamps.append(sorted_timestamps[unsorted_indices[i]]) + + if not return_timestamps: + return ans + else: + return DecodingResults( + hyps=ans, + timestamps=ans_timestamps, + ) diff --git a/egs/librispeech/SSL/hubert/ctc_decode.py b/egs/librispeech/SSL/hubert/ctc_decode.py new file mode 100644 index 0000000000..1f0f9bfac3 --- /dev/null +++ b/egs/librispeech/SSL/hubert/ctc_decode.py @@ -0,0 +1,847 @@ +#!/usr/bin/env python3 +# +# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang, +# Liyong Guo, +# Quandong Wang, +# Zengwei Yao) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
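The shallow-fusion and LODR searches above update every surviving non-blank
hypothesis with a weighted sum of scores: the transducer's own log-probability,
a scaled neural-LM log-probability, and (for LODR) a scaled low-order n-gram
term plus an optional contextual-biasing bonus. A minimal sketch of that
per-token update, using made-up scalar inputs and a hypothetical helper name
(the real code applies it tensor-wise over all hypotheses in the beam):

# fused_token_score is illustrative only; it is not part of beam_search.py.
def fused_token_score(
    am_log_prob: float,          # transducer log P(token | history, frame)
    nnlm_log_prob: float,        # neural-LM log P(token | history)
    lm_scale: float,             # LM.lm_scale
    lodr_log_prob: float = 0.0,  # low-order n-gram log P(token | history)
    lodr_scale: float = 0.0,     # LODR_lm_scale, typically negative
    context_score: float = 0.0,  # bonus from an optional ContextGraph
) -> float:
    return (
        am_log_prob
        + lm_scale * nnlm_log_prob
        + lodr_scale * lodr_log_prob
        + context_score
    )

# e.g. plain shallow fusion (no LODR, no context graph):
print(fused_token_score(am_log_prob=-1.2, nnlm_log_prob=-0.7, lm_scale=0.3))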
+""" +Usage: + +(1) ctc-decoding +./zipformer/ctc_decode.py \ + --epoch 30 \ + --avg 15 \ + --exp-dir ./zipformer/exp \ + --use-ctc 1 \ + --max-duration 600 \ + --decoding-method ctc-decoding + +(2) 1best +./zipformer/ctc_decode.py \ + --epoch 30 \ + --avg 15 \ + --exp-dir ./zipformer/exp \ + --use-ctc 1 \ + --max-duration 600 \ + --hlg-scale 0.6 \ + --decoding-method 1best + +(3) nbest +./zipformer/ctc_decode.py \ + --epoch 30 \ + --avg 15 \ + --exp-dir ./zipformer/exp \ + --use-ctc 1 \ + --max-duration 600 \ + --hlg-scale 0.6 \ + --decoding-method nbest + +(4) nbest-rescoring +./zipformer/ctc_decode.py \ + --epoch 30 \ + --avg 15 \ + --exp-dir ./zipformer/exp \ + --use-ctc 1 \ + --max-duration 600 \ + --hlg-scale 0.6 \ + --nbest-scale 1.0 \ + --lm-dir data/lm \ + --decoding-method nbest-rescoring + +(5) whole-lattice-rescoring +./zipformer/ctc_decode.py \ + --epoch 30 \ + --avg 15 \ + --exp-dir ./zipformer/exp \ + --use-ctc 1 \ + --max-duration 600 \ + --hlg-scale 0.6 \ + --nbest-scale 1.0 \ + --lm-dir data/lm \ + --decoding-method whole-lattice-rescoring +""" + + +import argparse +import logging +import math +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import k2 +import sentencepiece as spm +import torch +import torch.nn as nn +from asr_datamodule import LibriSpeechAsrDataModule +from train import add_model_arguments, get_model, get_params + +from icefall.checkpoint import ( + average_checkpoints, + average_checkpoints_with_averaged_model, + find_checkpoints, + load_checkpoint, +) +from icefall.decode import ( + get_lattice, + nbest_decoding, + nbest_oracle, + one_best_decoding, + rescore_with_n_best_list, + rescore_with_whole_lattice, +) +from icefall.lexicon import Lexicon +from icefall.utils import ( + AttributeDict, + get_texts, + setup_logger, + store_transcripts, + str2bool, + write_error_stats, +) + +LOG_EPS = math.log(1e-10) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--epoch", + type=int, + default=30, + help="""It specifies the checkpoint to use for decoding. + Note: Epoch counts from 1. + You can specify --avg to use more checkpoints for model averaging.""", + ) + + parser.add_argument( + "--iter", + type=int, + default=0, + help="""If positive, --epoch is ignored and it + will use the checkpoint exp_dir/checkpoint-iter.pt. + You can specify --avg to use more checkpoints for model averaging. + """, + ) + + parser.add_argument( + "--avg", + type=int, + default=15, + help="Number of checkpoints to average. Automatically select " + "consecutive checkpoints before the checkpoint specified by " + "'--epoch' and '--iter'", + ) + + parser.add_argument( + "--use-averaged-model", + type=str2bool, + default=True, + help="Whether to load averaged model. Currently it only supports " + "using --epoch. If True, it would decode with the averaged model " + "over the epoch range from `epoch-avg` (excluded) to `epoch`." + "Actually only the models with epoch number of `epoch-avg` and " + "`epoch` are loaded for averaging. 
", + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="zipformer/exp", + help="The experiment dir", + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--lang-dir", + type=Path, + default="data/lang_bpe_500", + help="The lang dir containing word table and LG graph", + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; 2 means tri-gram", + ) + + parser.add_argument( + "--decoding-method", + type=str, + default="ctc-decoding", + help="""Decoding method. + Supported values are: + - (1) ctc-decoding. Use CTC decoding. It uses a sentence piece + model, i.e., lang_dir/bpe.model, to convert word pieces to words. + It needs neither a lexicon nor an n-gram LM. + - (2) 1best. Extract the best path from the decoding lattice as the + decoding result. + - (3) nbest. Extract n paths from the decoding lattice; the path + with the highest score is the decoding result. + - (4) nbest-rescoring. Extract n paths from the decoding lattice, + rescore them with an n-gram LM (e.g., a 4-gram LM), the path with + the highest score is the decoding result. + - (5) whole-lattice-rescoring. Rescore the decoding lattice with an + n-gram LM (e.g., a 4-gram LM), the best path of rescored lattice + is the decoding result. + you have trained an RNN LM using ./rnn_lm/train.py + - (6) nbest-oracle. Its WER is the lower bound of any n-best + rescoring method can achieve. Useful for debugging n-best + rescoring method. + """, + ) + + parser.add_argument( + "--num-paths", + type=int, + default=100, + help="""Number of paths for n-best based decoding method. + Used only when "method" is one of the following values: + nbest, nbest-rescoring, and nbest-oracle + """, + ) + + parser.add_argument( + "--nbest-scale", + type=float, + default=1.0, + help="""The scale to be applied to `lattice.scores`. + It's needed if you use any kinds of n-best based rescoring. + Used only when "method" is one of the following values: + nbest, nbest-rescoring, and nbest-oracle + A smaller value results in more unique paths. + """, + ) + + parser.add_argument( + "--hlg-scale", + type=float, + default=0.6, + help="""The scale to be applied to `hlg.scores`. + """, + ) + + parser.add_argument( + "--lm-dir", + type=str, + default="data/lm", + help="""The n-gram LM dir. + It should contain either G_4_gram.pt or G_4_gram.fst.txt + """, + ) + + add_model_arguments(parser) + + return parser + + +def get_decoding_params() -> AttributeDict: + """Parameters for decoding.""" + params = AttributeDict( + { + "frame_shift_ms": 10, + "search_beam": 20, + "output_beam": 8, + "min_active_states": 30, + "max_active_states": 10000, + "use_double_scores": True, + } + ) + return params + + +def decode_one_batch( + params: AttributeDict, + model: nn.Module, + HLG: Optional[k2.Fsa], + H: Optional[k2.Fsa], + bpe_model: Optional[spm.SentencePieceProcessor], + batch: dict, + word_table: k2.SymbolTable, + G: Optional[k2.Fsa] = None, +) -> Dict[str, List[List[str]]]: + """Decode one batch and return the result in a dict. The dict has the + following format: + - key: It indicates the setting used for decoding. For example, + if no rescoring is used, the key is the string `no_rescore`. + If LM rescoring is used, the key is the string `lm_scale_xxx`, + where `xxx` is the value of `lm_scale`. An example key is + `lm_scale_0.7` + - value: It contains the decoding result. 
`len(value)` equals to + batch size. `value[i]` is the decoding result for the i-th + utterance in the given batch. + + Args: + params: + It's the return value of :func:`get_params`. + + - params.decoding_method is "1best", it uses 1best decoding without LM rescoring. + - params.decoding_method is "nbest", it uses nbest decoding without LM rescoring. + - params.decoding_method is "nbest-rescoring", it uses nbest LM rescoring. + - params.decoding_method is "whole-lattice-rescoring", it uses whole lattice LM + rescoring. + + model: + The neural model. + HLG: + The decoding graph. Used only when params.decoding_method is NOT ctc-decoding. + H: + The ctc topo. Used only when params.decoding_method is ctc-decoding. + bpe_model: + The BPE model. Used only when params.decoding_method is ctc-decoding. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + word_table: + The word symbol table. + G: + An LM. It is not None when params.decoding_method is "nbest-rescoring" + or "whole-lattice-rescoring". In general, the G in HLG + is a 3-gram LM, while this G is a 4-gram LM. + Returns: + Return the decoding result. See above description for the format of + the returned dict. Note: If it decodes to nothing, then return None. + """ + if HLG is not None: + device = HLG.device + else: + device = H.device + feature = batch["inputs"] + assert feature.ndim == 3 + feature = feature.to(device) + # at entry, feature is (N, T, C) + + supervisions = batch["supervisions"] + feature_lens = supervisions["num_frames"].to(device) + + if params.causal: + # this seems to cause insertions at the end of the utterance if used with zipformer. + pad_len = 30 + feature_lens += pad_len + feature = torch.nn.functional.pad( + feature, + pad=(0, 0, 0, pad_len), + value=LOG_EPS, + ) + + encoder_out, encoder_out_lens = model.forward_encoder(feature, feature_lens) + ctc_output = model.ctc_output(encoder_out) # (N, T, C) + + supervision_segments = torch.stack( + ( + supervisions["sequence_idx"], + torch.div( + supervisions["start_frame"], + params.subsampling_factor, + rounding_mode="floor", + ), + torch.div( + supervisions["num_frames"], + params.subsampling_factor, + rounding_mode="floor", + ), + ), + 1, + ).to(torch.int32) + + if H is None: + assert HLG is not None + decoding_graph = HLG + else: + assert HLG is None + assert bpe_model is not None + decoding_graph = H + + lattice = get_lattice( + nnet_output=ctc_output, + decoding_graph=decoding_graph, + supervision_segments=supervision_segments, + search_beam=params.search_beam, + output_beam=params.output_beam, + min_active_states=params.min_active_states, + max_active_states=params.max_active_states, + subsampling_factor=params.subsampling_factor, + ) + + if params.decoding_method == "ctc-decoding": + best_path = one_best_decoding( + lattice=lattice, use_double_scores=params.use_double_scores + ) + # Note: `best_path.aux_labels` contains token IDs, not word IDs + # since we are using H, not HLG here. + # + # token_ids is a lit-of-list of IDs + token_ids = get_texts(best_path) + + # hyps is a list of str, e.g., ['xxx yyy zzz', ...] + hyps = bpe_model.decode(token_ids) + + # hyps is a list of list of str, e.g., [['xxx', 'yyy', 'zzz'], ... ] + hyps = [s.split() for s in hyps] + key = "ctc-decoding" + return {key: hyps} + + if params.decoding_method == "nbest-oracle": + # Note: You can also pass rescored lattices to it. 
+ # We choose the HLG decoded lattice for speed reasons + # as HLG decoding is faster and the oracle WER + # is only slightly worse than that of rescored lattices. + best_path = nbest_oracle( + lattice=lattice, + num_paths=params.num_paths, + ref_texts=supervisions["text"], + word_table=word_table, + nbest_scale=params.nbest_scale, + oov="", + ) + hyps = get_texts(best_path) + hyps = [[word_table[i] for i in ids] for ids in hyps] + key = f"oracle_{params.num_paths}_nbest_scale_{params.nbest_scale}" # noqa + return {key: hyps} + + if params.decoding_method in ["1best", "nbest"]: + if params.decoding_method == "1best": + best_path = one_best_decoding( + lattice=lattice, use_double_scores=params.use_double_scores + ) + key = "no_rescore" + else: + best_path = nbest_decoding( + lattice=lattice, + num_paths=params.num_paths, + use_double_scores=params.use_double_scores, + nbest_scale=params.nbest_scale, + ) + key = f"no_rescore-nbest-scale-{params.nbest_scale}-{params.num_paths}" # noqa + + hyps = get_texts(best_path) + hyps = [[word_table[i] for i in ids] for ids in hyps] + return {key: hyps} + + assert params.decoding_method in [ + "nbest-rescoring", + "whole-lattice-rescoring", + ] + + lm_scale_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7] + lm_scale_list += [0.8, 0.9, 1.0, 1.1, 1.2, 1.3] + lm_scale_list += [1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0] + + if params.decoding_method == "nbest-rescoring": + best_path_dict = rescore_with_n_best_list( + lattice=lattice, + G=G, + num_paths=params.num_paths, + lm_scale_list=lm_scale_list, + nbest_scale=params.nbest_scale, + ) + elif params.decoding_method == "whole-lattice-rescoring": + best_path_dict = rescore_with_whole_lattice( + lattice=lattice, + G_with_epsilon_loops=G, + lm_scale_list=lm_scale_list, + ) + else: + assert False, f"Unsupported decoding method: {params.decoding_method}" + + ans = dict() + if best_path_dict is not None: + for lm_scale_str, best_path in best_path_dict.items(): + hyps = get_texts(best_path) + hyps = [[word_table[i] for i in ids] for ids in hyps] + ans[lm_scale_str] = hyps + else: + ans = None + return ans + + +def decode_dataset( + dl: torch.utils.data.DataLoader, + params: AttributeDict, + model: nn.Module, + HLG: Optional[k2.Fsa], + H: Optional[k2.Fsa], + bpe_model: Optional[spm.SentencePieceProcessor], + word_table: k2.SymbolTable, + G: Optional[k2.Fsa] = None, +) -> Dict[str, List[Tuple[str, List[str], List[str]]]]: + """Decode dataset. + + Args: + dl: + PyTorch's dataloader containing the dataset to decode. + params: + It is returned by :func:`get_params`. + model: + The neural model. + HLG: + The decoding graph. Used only when params.decoding_method is NOT ctc-decoding. + H: + The ctc topo. Used only when params.decoding_method is ctc-decoding. + bpe_model: + The BPE model. Used only when params.decoding_method is ctc-decoding. + word_table: + It is the word symbol table. + G: + An LM. It is not None when params.decoding_method is "nbest-rescoring" + or "whole-lattice-rescoring". In general, the G in HLG + is a 3-gram LM, while this G is a 4-gram LM. + Returns: + Return a dict, whose key may be "no-rescore" if no LM rescoring + is used, or it may be "lm_scale_0.7" if LM rescoring is used. + Its value is a list of tuples. Each tuple contains two elements: + The first is the reference transcript, and the second is the + predicted result. + """ + num_cuts = 0 + + try: + num_batches = len(dl) + except TypeError: + num_batches = "?" 
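+    # `results` maps a decoding-setting key (e.g. "ctc-decoding" or
+    # "lm_scale_0.7") to a list of (cut_id, ref_words, hyp_words) tuples
+    # accumulated over the whole dataset.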
+ + results = defaultdict(list) + for batch_idx, batch in enumerate(dl): + texts = batch["supervisions"]["text"] + cut_ids = [cut.id for cut in batch["supervisions"]["cut"]] + + hyps_dict = decode_one_batch( + params=params, + model=model, + HLG=HLG, + H=H, + bpe_model=bpe_model, + batch=batch, + word_table=word_table, + G=G, + ) + + for name, hyps in hyps_dict.items(): + this_batch = [] + assert len(hyps) == len(texts) + for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts): + ref_words = ref_text.split() + this_batch.append((cut_id, ref_words, hyp_words)) + + results[name].extend(this_batch) + + num_cuts += len(texts) + + if batch_idx % 100 == 0: + batch_str = f"{batch_idx}/{num_batches}" + + logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}") + return results + + +def save_results( + params: AttributeDict, + test_set_name: str, + results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]], +): + test_set_wers = dict() + for key, results in results_dict.items(): + recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt" + results = sorted(results) + store_transcripts(filename=recog_path, texts=results) + logging.info(f"The transcripts are stored in {recog_path}") + + # The following prints out WERs, per-word error statistics and aligned + # ref/hyp pairs. + errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt" + with open(errs_filename, "w") as f: + wer = write_error_stats(f, f"{test_set_name}-{key}", results) + test_set_wers[key] = wer + + logging.info("Wrote detailed error stats to {}".format(errs_filename)) + + test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1]) + errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt" + with open(errs_info, "w") as f: + print("settings\tWER", file=f) + for key, val in test_set_wers: + print("{}\t{}".format(key, val), file=f) + + s = "\nFor {}, WER of different settings are:\n".format(test_set_name) + note = "\tbest for {}".format(test_set_name) + for key, val in test_set_wers: + s += "{}\t{}{}\n".format(key, val, note) + note = "" + logging.info(s) + + +@torch.no_grad() +def main(): + parser = get_parser() + LibriSpeechAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + args.lang_dir = Path(args.lang_dir) + args.lm_dir = Path(args.lm_dir) + + params = get_params() + # add decoding params + params.update(get_decoding_params()) + params.update(vars(args)) + + assert params.decoding_method in ( + "ctc-decoding", + "1best", + "nbest", + "nbest-rescoring", + "whole-lattice-rescoring", + "nbest-oracle", + ) + params.res_dir = params.exp_dir / params.decoding_method + + if params.iter > 0: + params.suffix = f"iter-{params.iter}-avg-{params.avg}" + else: + params.suffix = f"epoch-{params.epoch}-avg-{params.avg}" + + if params.causal: + assert ( + "," not in params.chunk_size + ), "chunk_size should be one value in decoding." + assert ( + "," not in params.left_context_frames + ), "left_context_frames should be one value in decoding." 
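+        # record the streaming configuration in the result-file suffix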
+ params.suffix += f"-chunk-{params.chunk_size}" + params.suffix += f"-left-context-{params.left_context_frames}" + + if params.use_averaged_model: + params.suffix += "-use-averaged-model" + + setup_logger(f"{params.res_dir}/log-decode-{params.suffix}") + logging.info("Decoding started") + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", 0) + + logging.info(f"Device: {device}") + logging.info(params) + + lexicon = Lexicon(params.lang_dir) + max_token_id = max(lexicon.tokens) + num_classes = max_token_id + 1 # +1 for the blank + + params.vocab_size = num_classes + # and are defined in local/train_bpe_model.py + params.blank_id = 0 + + if params.decoding_method == "ctc-decoding": + HLG = None + H = k2.ctc_topo( + max_token=max_token_id, + modified=False, + device=device, + ) + bpe_model = spm.SentencePieceProcessor() + bpe_model.load(str(params.lang_dir / "bpe.model")) + else: + H = None + bpe_model = None + HLG = k2.Fsa.from_dict( + torch.load(f"{params.lang_dir}/HLG.pt", map_location=device) + ) + assert HLG.requires_grad is False + + HLG.scores *= params.hlg_scale + if not hasattr(HLG, "lm_scores"): + HLG.lm_scores = HLG.scores.clone() + + if params.decoding_method in ( + "nbest-rescoring", + "whole-lattice-rescoring", + ): + if not (params.lm_dir / "G_4_gram.pt").is_file(): + logging.info("Loading G_4_gram.fst.txt") + logging.warning("It may take 8 minutes.") + with open(params.lm_dir / "G_4_gram.fst.txt") as f: + first_word_disambig_id = lexicon.word_table["#0"] + + G = k2.Fsa.from_openfst(f.read(), acceptor=False) + # G.aux_labels is not needed in later computations, so + # remove it here. + del G.aux_labels + # CAUTION: The following line is crucial. + # Arcs entering the back-off state have label equal to #0. + # We have to change it to 0 here. + G.labels[G.labels >= first_word_disambig_id] = 0 + # See https://github.com/k2-fsa/k2/issues/874 + # for why we need to set G.properties to None + G.__dict__["_properties"] = None + G = k2.Fsa.from_fsas([G]).to(device) + G = k2.arc_sort(G) + # Save a dummy value so that it can be loaded in C++. + # See https://github.com/pytorch/pytorch/issues/67902 + # for why we need to do this. + G.dummy = 1 + + torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt") + else: + logging.info("Loading pre-compiled G_4_gram.pt") + d = torch.load(params.lm_dir / "G_4_gram.pt", map_location=device) + G = k2.Fsa.from_dict(d) + + if params.decoding_method == "whole-lattice-rescoring": + # Add epsilon self-loops to G as we will compose + # it with the whole lattice later + G = k2.add_epsilon_self_loops(G) + G = k2.arc_sort(G) + G = G.to(device) + + # G.lm_scores is used to replace HLG.lm_scores during + # LM rescoring. 
+ G.lm_scores = G.scores.clone() + else: + G = None + + logging.info("About to create model") + model = get_model(params) + + if not params.use_averaged_model: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + elif params.avg == 1: + load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model) + else: + start = params.epoch - params.avg + 1 + filenames = [] + for i in range(start, params.epoch + 1): + if i >= 1: + filenames.append(f"{params.exp_dir}/epoch-{i}.pt") + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + else: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + 1 + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg + 1: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + filename_start = filenames[-1] + filename_end = filenames[0] + logging.info( + "Calculating the averaged model over iteration checkpoints" + f" from {filename_start} (excluded) to {filename_end}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + else: + assert params.avg > 0, params.avg + start = params.epoch - params.avg + assert start >= 1, start + filename_start = f"{params.exp_dir}/epoch-{start}.pt" + filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt" + logging.info( + f"Calculating the averaged model over epoch range from " + f"{start} (excluded) to {params.epoch}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + + model.to(device) + model.eval() + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + # we need cut ids to display recognition results. 
+ args.return_cuts = True + librispeech = LibriSpeechAsrDataModule(args) + + test_clean_cuts = librispeech.test_clean_cuts() + test_other_cuts = librispeech.test_other_cuts() + + test_clean_dl = librispeech.test_dataloaders(test_clean_cuts) + test_other_dl = librispeech.test_dataloaders(test_other_cuts) + + test_sets = ["test-clean", "test-other"] + test_dl = [test_clean_dl, test_other_dl] + + for test_set, test_dl in zip(test_sets, test_dl): + results_dict = decode_dataset( + dl=test_dl, + params=params, + model=model, + HLG=HLG, + H=H, + bpe_model=bpe_model, + word_table=lexicon.word_table, + G=G, + ) + + save_results( + params=params, + test_set_name=test_set, + results_dict=results_dict, + ) + + logging.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/SSL/hubert/dataset.py b/egs/librispeech/SSL/hubert/dataset.py new file mode 100644 index 0000000000..106b27a2c5 --- /dev/null +++ b/egs/librispeech/SSL/hubert/dataset.py @@ -0,0 +1,154 @@ +# Copyright 2023 Xiaomi Corporation (authors: Yifan Yang) +# +# See ../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict + +import torch +from lhotse import validate +from lhotse.audio.utils import suppress_audio_loading_errors +from lhotse.cut import CutSet +from lhotse.dataset.collation import read_audio_from_cuts +from torch.utils.data.dataloader import default_collate +from transformers import Wav2Vec2FeatureExtractor + + +class HubertDataset(torch.utils.data.Dataset): + """ + In this implementation, there will always be a single channel. + + Returns: + + .. code-block:: + + { + 'audio': (B x NumSamples) float tensor + 'audio_lens': (B, ) int tensor + } + """ + + def __init__(self, collate: bool = True) -> None: + super().__init__() + self.feature_extractor = Wav2Vec2FeatureExtractor( + feature_size=1, + sampling_rate=16000, + padding_side="right", + padding_value=0.0, + do_normalize=True, + return_attention_mask=True, + ) + + def __getitem__(self, cuts: CutSet) -> Dict[str, Any]: + self._validate(cuts) + audio, _ = read_audio_from_cuts(cuts, return_tensors=False) + audio = self.feature_extractor( + audio, + padding=True, + return_tensors="pt", + sampling_rate=16000, + ).input_values + audio_lens = torch.tensor([cut.num_samples for cut in cuts], dtype=torch.int32) + + return { + "cuts": cuts, + "audio": audio, + "audio_lens": audio_lens, + } + + def _validate(self, cuts: CutSet) -> None: + validate(cuts) + assert all(cut.has_recording for cut in cuts) + + +class HubertAsrDataset(torch.utils.data.Dataset): + """ + In this implementation, there will always be a single channel. + + Returns: + + .. 
code-block:: + + { + 'audio': (B x NumSamples) float tensor + 'audio_lens': (B, ) int tensor + } + """ + + def __init__(self, collate: bool = True) -> None: + super().__init__() + self.feature_extractor = Wav2Vec2FeatureExtractor( + feature_size=1, + sampling_rate=16000, + padding_side="right", + padding_value=0.0, + do_normalize=True, + return_attention_mask=True, + ) + + def __getitem__(self, cuts: CutSet) -> Dict[str, Any]: + self._validate(cuts) + audio, _ = read_audio_from_cuts(cuts, return_tensors=False) + audio = self.feature_extractor( + audio, + padding=True, + return_tensors="pt", + sampling_rate=16000, + ).input_values + audio_lens = torch.tensor([cut.num_samples for cut in cuts], dtype=torch.int32) + + return { + "cuts": cuts, + "audio": audio, + "audio_lens": audio_lens, + "supervisions": default_collate( + [ + { + "text": supervision.text, + } + for sequence_idx, cut in enumerate(cuts) + for supervision in cut.supervisions + ] + ), + } + + def _validate(self, cuts: CutSet) -> None: + validate(cuts) + assert all(cut.has_recording for cut in cuts) + + +if __name__ == "__main__": + from lhotse import load_manifest_lazy + from lhotse.dataset import DynamicBucketingSampler + from torch.utils.data import DataLoader + + dataset = HubertAsrDataset() + cuts = load_manifest_lazy("data/fbank/librispeech_cuts_train-clean-100.jsonl.gz") + sampler = DynamicBucketingSampler( + cuts, + max_duration=100, + shuffle=False, + ) + dl = DataLoader( + dataset, + batch_size=None, + sampler=sampler, + num_workers=2, + ) + + for batch_idx, batch in enumerate(dl): + import pdb + + pdb.set_trace() + pass diff --git a/egs/librispeech/SSL/hubert/decode.py b/egs/librispeech/SSL/hubert/decode.py new file mode 100644 index 0000000000..604d714531 --- /dev/null +++ b/egs/librispeech/SSL/hubert/decode.py @@ -0,0 +1,1032 @@ +#!/usr/bin/env python3 +# +# Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang, +# Zengwei Yao) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Usage: +(1) greedy search +./hubert/decode.py \ + --epoch 28 \ + --avg 15 \ + --exp-dir ./hubert/exp \ + --max-duration 600 \ + --decoding-method greedy_search + +(2) beam search (not recommended) +./hubert/decode.py \ + --epoch 28 \ + --avg 15 \ + --exp-dir ./hubert/exp \ + --max-duration 600 \ + --decoding-method beam_search \ + --beam-size 4 + +(3) modified beam search +./hubert/decode.py \ + --epoch 28 \ + --avg 15 \ + --exp-dir ./hubert/exp \ + --max-duration 600 \ + --decoding-method modified_beam_search \ + --beam-size 4 + +(4) fast beam search (one best) +./hubert/decode.py \ + --epoch 28 \ + --avg 15 \ + --exp-dir ./hubert/exp \ + --max-duration 600 \ + --decoding-method fast_beam_search \ + --beam 20.0 \ + --max-contexts 8 \ + --max-states 64 + +(5) fast beam search (nbest) +./hubert/decode.py \ + --epoch 28 \ + --avg 15 \ + --exp-dir ./hubert/exp \ + --max-duration 600 \ + --decoding-method fast_beam_search_nbest \ + --beam 20.0 \ + --max-contexts 8 \ + --max-states 64 \ + --num-paths 200 \ + --nbest-scale 0.5 + +(6) fast beam search (nbest oracle WER) +./hubert/decode.py \ + --epoch 28 \ + --avg 15 \ + --exp-dir ./hubert/exp \ + --max-duration 600 \ + --decoding-method fast_beam_search_nbest_oracle \ + --beam 20.0 \ + --max-contexts 8 \ + --max-states 64 \ + --num-paths 200 \ + --nbest-scale 0.5 + +(7) fast beam search (with LG) +./hubert/decode.py \ + --epoch 28 \ + --avg 15 \ + --exp-dir ./hubert/exp \ + --max-duration 600 \ + --decoding-method fast_beam_search_nbest_LG \ + --beam 20.0 \ + --max-contexts 8 \ + --max-states 64 +""" + + +import argparse +import logging +import math +import os +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import k2 +import sentencepiece as spm +import torch +import torch.nn as nn +from asr_datamodule import LibriSpeechAsrDataModule +from beam_search import ( + beam_search, + fast_beam_search_nbest, + fast_beam_search_nbest_LG, + fast_beam_search_nbest_oracle, + fast_beam_search_one_best, + greedy_search, + greedy_search_batch, + modified_beam_search, + modified_beam_search_lm_rescore, + modified_beam_search_lm_rescore_LODR, + modified_beam_search_lm_shallow_fusion, + modified_beam_search_LODR, +) +from train import add_model_arguments, get_model, get_params + +from icefall import ContextGraph, LmScorer, NgramLm +from icefall.checkpoint import ( + average_checkpoints, + average_checkpoints_with_averaged_model, + find_checkpoints, + load_checkpoint, +) +from icefall.lexicon import Lexicon +from icefall.utils import ( + AttributeDict, + make_pad_mask, + setup_logger, + store_transcripts, + str2bool, + write_error_stats, +) + +LOG_EPS = math.log(1e-10) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--epoch", + type=int, + default=30, + help="""It specifies the checkpoint to use for decoding. + Note: Epoch counts from 1. + You can specify --avg to use more checkpoints for model averaging.""", + ) + + parser.add_argument( + "--iter", + type=int, + default=0, + help="""If positive, --epoch is ignored and it + will use the checkpoint exp_dir/checkpoint-iter.pt. + You can specify --avg to use more checkpoints for model averaging. + """, + ) + + parser.add_argument( + "--avg", + type=int, + default=15, + help="Number of checkpoints to average. 
Automatically select " + "consecutive checkpoints before the checkpoint specified by " + "'--epoch' and '--iter'", + ) + + parser.add_argument( + "--use-averaged-model", + type=str2bool, + default=True, + help="Whether to load averaged model. Currently it only supports " + "using --epoch. If True, it would decode with the averaged model " + "over the epoch range from `epoch-avg` (excluded) to `epoch`." + "Actually only the models with epoch number of `epoch-avg` and " + "`epoch` are loaded for averaging. ", + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="hubert/exp", + help="The experiment dir", + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--lang-dir", + type=Path, + default="data/lang_bpe_500", + help="The lang dir containing word table and LG graph", + ) + + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="""Possible values are: + - greedy_search + - beam_search + - modified_beam_search + - modified_beam_search_LODR + - fast_beam_search + - fast_beam_search_nbest + - fast_beam_search_nbest_oracle + - fast_beam_search_nbest_LG + If you use fast_beam_search_nbest_LG, you have to specify + `--lang-dir`, which should contain `LG.pt`. + """, + ) + + parser.add_argument( + "--beam-size", + type=int, + default=4, + help="""An integer indicating how many candidates we will keep for each + frame. Used only when --decoding-method is beam_search or + modified_beam_search.""", + ) + + parser.add_argument( + "--beam", + type=float, + default=20.0, + help="""A floating point value to calculate the cutoff score during beam + search (i.e., `cutoff = max-score - beam`), which is the same as the + `beam` in Kaldi. + Used only when --decoding-method is fast_beam_search, + fast_beam_search_nbest, fast_beam_search_nbest_LG, + and fast_beam_search_nbest_oracle + """, + ) + + parser.add_argument( + "--ngram-lm-scale", + type=float, + default=0.01, + help=""" + Used only when --decoding-method is fast_beam_search_nbest_LG. + It specifies the scale for n-gram LM scores. + """, + ) + + parser.add_argument( + "--max-contexts", + type=int, + default=8, + help="""Used only when --decoding-method is + fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG, + and fast_beam_search_nbest_oracle""", + ) + + parser.add_argument( + "--max-states", + type=int, + default=64, + help="""Used only when --decoding-method is + fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG, + and fast_beam_search_nbest_oracle""", + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; " "2 means tri-gram", + ) + parser.add_argument( + "--max-sym-per-frame", + type=int, + default=1, + help="""Maximum number of symbols per frame. + Used only when --decoding-method is greedy_search""", + ) + + parser.add_argument( + "--num-paths", + type=int, + default=200, + help="""Number of paths for nbest decoding. + Used only when the decoding method is fast_beam_search_nbest, + fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""", + ) + + parser.add_argument( + "--nbest-scale", + type=float, + default=0.5, + help="""Scale applied to lattice scores when computing nbest paths. 
+ Used only when the decoding method is fast_beam_search_nbest, + fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""", + ) + + parser.add_argument( + "--use-shallow-fusion", + type=str2bool, + default=False, + help="""Use neural network LM for shallow fusion. + If you want to use LODR, you will also need to set this to true + """, + ) + + parser.add_argument( + "--lm-type", + type=str, + default="rnn", + help="Type of NN lm", + choices=["rnn", "transformer"], + ) + + parser.add_argument( + "--lm-scale", + type=float, + default=0.3, + help="""The scale of the neural network LM + Used only when `--use-shallow-fusion` is set to True. + """, + ) + + parser.add_argument( + "--tokens-ngram", + type=int, + default=2, + help="""The order of the ngram lm. + """, + ) + + parser.add_argument( + "--backoff-id", + type=int, + default=500, + help="ID of the backoff symbol in the ngram LM", + ) + + parser.add_argument( + "--context-score", + type=float, + default=2, + help=""" + The bonus score of each token for the context biasing words/phrases. + Used only when --decoding-method is modified_beam_search and + modified_beam_search_LODR. + """, + ) + + parser.add_argument( + "--context-file", + type=str, + default="", + help=""" + The path of the context biasing lists, one word/phrase each line + Used only when --decoding-method is modified_beam_search and + modified_beam_search_LODR. + """, + ) + add_model_arguments(parser) + + return parser + + +def decode_one_batch( + params: AttributeDict, + model: nn.Module, + sp: spm.SentencePieceProcessor, + batch: dict, + word_table: Optional[k2.SymbolTable] = None, + decoding_graph: Optional[k2.Fsa] = None, + context_graph: Optional[ContextGraph] = None, + LM: Optional[LmScorer] = None, + ngram_lm=None, + ngram_lm_scale: float = 0.0, +) -> Dict[str, List[List[str]]]: + """Decode one batch and return the result in a dict. The dict has the + following format: + + - key: It indicates the setting used for decoding. For example, + if greedy_search is used, it would be "greedy_search" + If beam search with a beam size of 7 is used, it would be + "beam_7" + - value: It contains the decoding result. `len(value)` equals to + batch size. `value[i]` is the decoding result for the i-th + utterance in the given batch. + Args: + params: + It's the return value of :func:`get_params`. + model: + The neural model. + sp: + The BPE model. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + word_table: + The word symbol table. + decoding_graph: + The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used + only when --decoding-method is fast_beam_search, fast_beam_search_nbest, + fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG. + LM: + A neural network language model. + ngram_lm: + A ngram language model + ngram_lm_scale: + The scale for the ngram language model. + Returns: + Return the decoding result. See above description for the format of + the returned dict. 
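+ For example, with greedy search the returned dict may look like
+ {"greedy_search": [["HELLO", "WORLD"], ["GOOD", "MORNING"]]}.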
+ """ + device = next(model.parameters()).device + feature = batch["inputs"] + assert feature.ndim == 3 + + feature = feature.to(device) + # at entry, feature is (N, T, C) + + supervisions = batch["supervisions"] + feature_lens = supervisions["num_frames"].to(device) + + encoder_out, encoder_out_lens = model.forward_encoder(feature, feature_lens) + + hyps = [] + + if params.decoding_method == "fast_beam_search": + hyp_tokens = fast_beam_search_one_best( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + ) + for hyp in sp.decode(hyp_tokens): + hyps.append(hyp.split()) + elif params.decoding_method == "fast_beam_search_nbest_LG": + hyp_tokens = fast_beam_search_nbest_LG( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + num_paths=params.num_paths, + nbest_scale=params.nbest_scale, + ) + for hyp in hyp_tokens: + hyps.append([word_table[i] for i in hyp]) + elif params.decoding_method == "fast_beam_search_nbest": + hyp_tokens = fast_beam_search_nbest( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + num_paths=params.num_paths, + nbest_scale=params.nbest_scale, + ) + for hyp in sp.decode(hyp_tokens): + hyps.append(hyp.split()) + elif params.decoding_method == "fast_beam_search_nbest_oracle": + hyp_tokens = fast_beam_search_nbest_oracle( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + num_paths=params.num_paths, + ref_texts=sp.encode(supervisions["text"]), + nbest_scale=params.nbest_scale, + ) + for hyp in sp.decode(hyp_tokens): + hyps.append(hyp.split()) + elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1: + hyp_tokens = greedy_search_batch( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + ) + for hyp in sp.decode(hyp_tokens): + hyps.append(hyp.split()) + elif params.decoding_method == "modified_beam_search": + hyp_tokens = modified_beam_search( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam_size, + context_graph=context_graph, + ) + for hyp in sp.decode(hyp_tokens): + hyps.append(hyp.split()) + elif params.decoding_method == "modified_beam_search_lm_shallow_fusion": + hyp_tokens = modified_beam_search_lm_shallow_fusion( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam_size, + LM=LM, + ) + for hyp in sp.decode(hyp_tokens): + hyps.append(hyp.split()) + elif params.decoding_method == "modified_beam_search_LODR": + hyp_tokens = modified_beam_search_LODR( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam_size, + LODR_lm=ngram_lm, + LODR_lm_scale=ngram_lm_scale, + LM=LM, + context_graph=context_graph, + ) + for hyp in sp.decode(hyp_tokens): + hyps.append(hyp.split()) + elif params.decoding_method == "modified_beam_search_lm_rescore": + lm_scale_list = [0.01 * i for i in range(10, 50)] + ans_dict = modified_beam_search_lm_rescore( + model=model, + encoder_out=encoder_out, + 
encoder_out_lens=encoder_out_lens, + beam=params.beam_size, + LM=LM, + lm_scale_list=lm_scale_list, + ) + elif params.decoding_method == "modified_beam_search_lm_rescore_LODR": + lm_scale_list = [0.02 * i for i in range(2, 30)] + ans_dict = modified_beam_search_lm_rescore_LODR( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam_size, + LM=LM, + LODR_lm=ngram_lm, + sp=sp, + lm_scale_list=lm_scale_list, + ) + else: + batch_size = encoder_out.size(0) + + for i in range(batch_size): + # fmt: off + encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]] + # fmt: on + if params.decoding_method == "greedy_search": + hyp = greedy_search( + model=model, + encoder_out=encoder_out_i, + max_sym_per_frame=params.max_sym_per_frame, + ) + elif params.decoding_method == "beam_search": + hyp = beam_search( + model=model, + encoder_out=encoder_out_i, + beam=params.beam_size, + ) + else: + raise ValueError( + f"Unsupported decoding method: {params.decoding_method}" + ) + hyps.append(sp.decode(hyp).split()) + + if params.decoding_method == "greedy_search": + return {"greedy_search": hyps} + elif "fast_beam_search" in params.decoding_method: + key = f"beam_{params.beam}_" + key += f"max_contexts_{params.max_contexts}_" + key += f"max_states_{params.max_states}" + if "nbest" in params.decoding_method: + key += f"_num_paths_{params.num_paths}_" + key += f"nbest_scale_{params.nbest_scale}" + if "LG" in params.decoding_method: + key += f"_ngram_lm_scale_{params.ngram_lm_scale}" + + return {key: hyps} + elif "modified_beam_search" in params.decoding_method: + prefix = f"beam_size_{params.beam_size}" + if params.decoding_method in ( + "modified_beam_search_lm_rescore", + "modified_beam_search_lm_rescore_LODR", + ): + ans = dict() + assert ans_dict is not None + for key, hyps in ans_dict.items(): + hyps = [sp.decode(hyp).split() for hyp in hyps] + ans[f"{prefix}_{key}"] = hyps + return ans + else: + if params.has_contexts: + prefix += f"-context-score-{params.context_score}" + return {prefix: hyps} + else: + return {f"beam_size_{params.beam_size}": hyps} + + +def decode_dataset( + dl: torch.utils.data.DataLoader, + params: AttributeDict, + model: nn.Module, + sp: spm.SentencePieceProcessor, + word_table: Optional[k2.SymbolTable] = None, + decoding_graph: Optional[k2.Fsa] = None, + context_graph: Optional[ContextGraph] = None, + LM: Optional[LmScorer] = None, + ngram_lm=None, + ngram_lm_scale: float = 0.0, +) -> Dict[str, List[Tuple[str, List[str], List[str]]]]: + """Decode dataset. + + Args: + dl: + PyTorch's dataloader containing the dataset to decode. + params: + It is returned by :func:`get_params`. + model: + The neural model. + sp: + The BPE model. + word_table: + The word symbol table. + decoding_graph: + The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used + only when --decoding-method is fast_beam_search, fast_beam_search_nbest, + fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG. + Returns: + Return a dict, whose key may be "greedy_search" if greedy search + is used, or it may be "beam_7" if beam size of 7 is used. + Its value is a list of tuples. Each tuple contains two elements: + The first is the reference transcript, and the second is the + predicted result. + """ + num_cuts = 0 + + try: + num_batches = len(dl) + except TypeError: + num_batches = "?" 
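+ # len(dl) is not always defined, e.g. when the cuts are read lazily the
+ # sampler cannot know the number of batches in advance; in that case "?"
+ # is shown in the progress log below.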
+ + if params.decoding_method == "greedy_search": + log_interval = 50 + else: + log_interval = 20 + + results = defaultdict(list) + for batch_idx, batch in enumerate(dl): + texts = batch["supervisions"]["text"] + cut_ids = [cut.id for cut in batch["supervisions"]["cut"]] + + hyps_dict = decode_one_batch( + params=params, + model=model, + sp=sp, + decoding_graph=decoding_graph, + context_graph=context_graph, + word_table=word_table, + batch=batch, + LM=LM, + ngram_lm=ngram_lm, + ngram_lm_scale=ngram_lm_scale, + ) + + for name, hyps in hyps_dict.items(): + this_batch = [] + assert len(hyps) == len(texts) + for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts): + ref_words = ref_text.split() + this_batch.append((cut_id, ref_words, hyp_words)) + + results[name].extend(this_batch) + + num_cuts += len(texts) + + if batch_idx % log_interval == 0: + batch_str = f"{batch_idx}/{num_batches}" + + logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}") + return results + + +def save_results( + params: AttributeDict, + test_set_name: str, + results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]], +): + test_set_wers = dict() + for key, results in results_dict.items(): + recog_path = ( + params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt" + ) + results = sorted(results) + store_transcripts(filename=recog_path, texts=results) + logging.info(f"The transcripts are stored in {recog_path}") + + # The following prints out WERs, per-word error statistics and aligned + # ref/hyp pairs. + errs_filename = ( + params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt" + ) + with open(errs_filename, "w") as f: + wer = write_error_stats( + f, f"{test_set_name}-{key}", results, enable_log=True + ) + test_set_wers[key] = wer + + logging.info("Wrote detailed error stats to {}".format(errs_filename)) + + test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1]) + errs_info = ( + params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt" + ) + with open(errs_info, "w") as f: + print("settings\tWER", file=f) + for key, val in test_set_wers: + print("{}\t{}".format(key, val), file=f) + + s = "\nFor {}, WER of different settings are:\n".format(test_set_name) + note = "\tbest for {}".format(test_set_name) + for key, val in test_set_wers: + s += "{}\t{}{}\n".format(key, val, note) + note = "" + logging.info(s) + + +@torch.no_grad() +def main(): + parser = get_parser() + LibriSpeechAsrDataModule.add_arguments(parser) + LmScorer.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + params = get_params() + params.update(vars(args)) + + assert params.decoding_method in ( + "greedy_search", + "beam_search", + "fast_beam_search", + "fast_beam_search_nbest", + "fast_beam_search_nbest_LG", + "fast_beam_search_nbest_oracle", + "modified_beam_search", + "modified_beam_search_LODR", + "modified_beam_search_lm_shallow_fusion", + "modified_beam_search_lm_rescore", + "modified_beam_search_lm_rescore_LODR", + ) + params.res_dir = params.exp_dir / params.decoding_method + + if os.path.exists(params.context_file): + params.has_contexts = True + else: + params.has_contexts = False + + if params.iter > 0: + params.suffix = f"iter-{params.iter}-avg-{params.avg}" + else: + params.suffix = f"epoch-{params.epoch}-avg-{params.avg}" + + if "fast_beam_search" in params.decoding_method: + params.suffix += f"-beam-{params.beam}" + params.suffix += f"-max-contexts-{params.max_contexts}" + params.suffix += 
f"-max-states-{params.max_states}" + if "nbest" in params.decoding_method: + params.suffix += f"-nbest-scale-{params.nbest_scale}" + params.suffix += f"-num-paths-{params.num_paths}" + if "LG" in params.decoding_method: + params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}" + elif "beam_search" in params.decoding_method: + params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}" + if params.decoding_method in ( + "modified_beam_search", + "modified_beam_search_LODR", + ): + if params.has_contexts: + params.suffix += f"-context-score-{params.context_score}" + else: + params.suffix += f"-context-{params.context_size}" + params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}" + + if params.use_shallow_fusion: + params.suffix += f"-{params.lm_type}-lm-scale-{params.lm_scale}" + + if "LODR" in params.decoding_method: + params.suffix += ( + f"-LODR-{params.tokens_ngram}gram-scale-{params.ngram_lm_scale}" + ) + + if params.use_averaged_model: + params.suffix += "-use-averaged-model" + + setup_logger(f"{params.res_dir}/log-decode-{params.suffix}") + logging.info("Decoding started") + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", 0) + + logging.info(f"Device: {device}") + + sp = spm.SentencePieceProcessor() + sp.load(params.bpe_model) + + # and are defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("") + params.unk_id = sp.piece_to_id("") + params.vocab_size = sp.get_piece_size() + + logging.info(params) + + logging.info("About to create model") + model = get_model(params) + + if not params.use_averaged_model: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + elif params.avg == 1: + load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model) + else: + start = params.epoch - params.avg + 1 + filenames = [] + for i in range(start, params.epoch + 1): + if i >= 1: + filenames.append(f"{params.exp_dir}/epoch-{i}.pt") + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + else: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + 1 + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg + 1: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + filename_start = filenames[-1] + filename_end = filenames[0] + logging.info( + "Calculating the averaged model over iteration checkpoints" + f" from {filename_start} (excluded) to {filename_end}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + else: + assert params.avg > 0, params.avg + start = params.epoch - params.avg + assert start >= 1, start + filename_start = f"{params.exp_dir}/epoch-{start}.pt" + filename_end = 
f"{params.exp_dir}/epoch-{params.epoch}.pt" + logging.info( + f"Calculating the averaged model over epoch range from " + f"{start} (excluded) to {params.epoch}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + + model.to(device) + model.eval() + + # only load the neural network LM if required + if params.use_shallow_fusion or params.decoding_method in ( + "modified_beam_search_lm_rescore", + "modified_beam_search_lm_rescore_LODR", + "modified_beam_search_lm_shallow_fusion", + "modified_beam_search_LODR", + ): + LM = LmScorer( + lm_type=params.lm_type, + params=params, + device=device, + lm_scale=params.lm_scale, + ) + LM.to(device) + LM.eval() + else: + LM = None + + # only load N-gram LM when needed + if params.decoding_method == "modified_beam_search_lm_rescore_LODR": + try: + import kenlm + except ImportError: + print("Please install kenlm first. You can use") + print(" pip install https://github.com/kpu/kenlm/archive/master.zip") + print("to install it") + import sys + + sys.exit(-1) + ngram_file_name = str(params.lang_dir / f"{params.tokens_ngram}gram.arpa") + logging.info(f"lm filename: {ngram_file_name}") + ngram_lm = kenlm.Model(ngram_file_name) + ngram_lm_scale = None # use a list to search + + elif params.decoding_method == "modified_beam_search_LODR": + lm_filename = f"{params.tokens_ngram}gram.fst.txt" + logging.info(f"Loading token level lm: {lm_filename}") + ngram_lm = NgramLm( + str(params.lang_dir / lm_filename), + backoff_id=params.backoff_id, + is_binary=False, + ) + logging.info(f"num states: {ngram_lm.lm.num_states}") + ngram_lm_scale = params.ngram_lm_scale + else: + ngram_lm = None + ngram_lm_scale = None + + if "fast_beam_search" in params.decoding_method: + if params.decoding_method == "fast_beam_search_nbest_LG": + lexicon = Lexicon(params.lang_dir) + word_table = lexicon.word_table + lg_filename = params.lang_dir / "LG.pt" + logging.info(f"Loading {lg_filename}") + decoding_graph = k2.Fsa.from_dict( + torch.load(lg_filename, map_location=device) + ) + decoding_graph.scores *= params.ngram_lm_scale + else: + word_table = None + decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device) + else: + decoding_graph = None + word_table = None + + if "modified_beam_search" in params.decoding_method: + if os.path.exists(params.context_file): + contexts = [] + for line in open(params.context_file).readlines(): + contexts.append((sp.encode(line.strip()), 0.0)) + context_graph = ContextGraph(params.context_score) + context_graph.build(contexts) + else: + context_graph = None + else: + context_graph = None + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + # we need cut ids to display recognition results. 
+ args.return_cuts = True + librispeech = LibriSpeechAsrDataModule(args) + + test_clean_cuts = librispeech.test_clean_cuts() + test_other_cuts = librispeech.test_other_cuts() + + test_clean_dl = librispeech.test_dataloaders(test_clean_cuts) + test_other_dl = librispeech.test_dataloaders(test_other_cuts) + + test_sets = ["test-clean", "test-other"] + test_dl = [test_clean_dl, test_other_dl] + + for test_set, test_dl in zip(test_sets, test_dl): + results_dict = decode_dataset( + dl=test_dl, + params=params, + model=model, + sp=sp, + word_table=word_table, + decoding_graph=decoding_graph, + context_graph=context_graph, + LM=LM, + ngram_lm=ngram_lm, + ngram_lm_scale=ngram_lm_scale, + ) + + save_results( + params=params, + test_set_name=test_set, + results_dict=results_dict, + ) + + logging.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/SSL/hubert/decoder.py b/egs/librispeech/SSL/hubert/decoder.py new file mode 100644 index 0000000000..7ce44495bf --- /dev/null +++ b/egs/librispeech/SSL/hubert/decoder.py @@ -0,0 +1,134 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from scaling import Balancer + + +class Decoder(nn.Module): + """This class modifies the stateless decoder from the following paper: + + RNN-transducer with stateless prediction network + https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9054419 + + It removes the recurrent connection from the decoder, i.e., the prediction + network. Different from the above paper, it adds an extra Conv1d + right after the embedding layer. + + TODO: Implement https://arxiv.org/pdf/2109.07513.pdf + """ + + def __init__( + self, + vocab_size: int, + decoder_dim: int, + blank_id: int, + context_size: int, + ): + """ + Args: + vocab_size: + Number of tokens of the modeling unit including blank. + decoder_dim: + Dimension of the input embedding, and of the decoder output. + blank_id: + The ID of the blank symbol. + context_size: + Number of previous words to use to predict the next word. + 1 means bigram; 2 means trigram. n means (n+1)-gram. + """ + super().__init__() + + self.embedding = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=decoder_dim, + ) + # the balancers are to avoid any drift in the magnitude of the + # embeddings, which would interact badly with parameter averaging. 
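+ # Balancer (defined in scaling.py) acts only on the gradients during
+ # training, nudging per-channel statistics (sign proportion and
+ # magnitude) towards the configured ranges; its forward pass returns the
+ # input unchanged.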
+ self.balancer = Balancer( + decoder_dim, + channel_dim=-1, + min_positive=0.0, + max_positive=1.0, + min_abs=0.5, + max_abs=1.0, + prob=0.05, + ) + + self.blank_id = blank_id + + assert context_size >= 1, context_size + self.context_size = context_size + self.vocab_size = vocab_size + + if context_size > 1: + self.conv = nn.Conv1d( + in_channels=decoder_dim, + out_channels=decoder_dim, + kernel_size=context_size, + padding=0, + groups=decoder_dim // 4, # group size == 4 + bias=False, + ) + self.balancer2 = Balancer( + decoder_dim, + channel_dim=-1, + min_positive=0.0, + max_positive=1.0, + min_abs=0.5, + max_abs=1.0, + prob=0.05, + ) + else: + # To avoid `RuntimeError: Module 'Decoder' has no attribute 'conv'` + # when inference with torch.jit.script and context_size == 1 + self.conv = nn.Identity() + self.balancer2 = nn.Identity() + + def forward(self, y: torch.Tensor, need_pad: bool = True) -> torch.Tensor: + """ + Args: + y: + A 2-D tensor of shape (N, U). + need_pad: + True to left pad the input. Should be True during training. + False to not pad the input. Should be False during inference. + Returns: + Return a tensor of shape (N, U, decoder_dim). + """ + y = y.to(torch.int64) + # this stuff about clamp() is a temporary fix for a mismatch + # at utterance start, we use negative ids in beam_search.py + embedding_out = self.embedding(y.clamp(min=0)) * (y >= 0).unsqueeze(-1) + + embedding_out = self.balancer(embedding_out) + + if self.context_size > 1: + embedding_out = embedding_out.permute(0, 2, 1) + if need_pad is True: + embedding_out = F.pad(embedding_out, pad=(self.context_size - 1, 0)) + else: + # During inference time, there is no need to do extra padding + # as we only need one output + assert embedding_out.size(-1) == self.context_size + embedding_out = self.conv(embedding_out) + embedding_out = embedding_out.permute(0, 2, 1) + embedding_out = F.relu(embedding_out) + embedding_out = self.balancer2(embedding_out) + + return embedding_out diff --git a/egs/librispeech/SSL/hubert/finetune.py b/egs/librispeech/SSL/hubert/finetune.py new file mode 100644 index 0000000000..612a8a2358 --- /dev/null +++ b/egs/librispeech/SSL/hubert/finetune.py @@ -0,0 +1,1447 @@ +#!/usr/bin/env python3 +# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang, +# Wei Kang, +# Mingshuang Luo, +# Zengwei Yao, +# Yifan Yang, +# Daniel Povey) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Usage: + +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" + +# For hubert model finetuning: +./hubert/finetune.py \ + --world-size 8 \ + --num-epochs 20 \ + --start-epoch 1 \ + --use-fp16 0 \ + --exp-dir hubert/exp \ + --full-libri 1 \ + --max-duration 80 + +It supports finetuning with: + - transducer loss (default), with `--use-transducer True --use-ctc False` + - ctc loss (not recommended), with `--use-transducer False --use-ctc True` + - transducer loss & ctc loss, with `--use-transducer True --use-ctc True` +""" + + +import argparse +import copy +import logging +import warnings +from pathlib import Path +from shutil import copyfile +from typing import Any, Dict, Optional, Tuple, Union + +import k2 +import optim +import sentencepiece as spm +import torch +import torch.multiprocessing as mp +import torch.nn as nn +from asr_datamodule import LibriSpeechAsrDataModule +from decoder import Decoder +from joiner import Joiner +from lhotse.cut import Cut +from lhotse.dataset.sampling.base import CutSampler +from lhotse.utils import fix_random_seed +from model import AsrModel +from optim import Eden, ScaledAdam +from scaling import ScheduledFloat +from subsampling import Conv2dSubsampling +from torch import Tensor +from torch.cuda.amp import GradScaler +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter +from transformers import HubertConfig, HubertModel + +from icefall import diagnostics +from icefall.checkpoint import load_checkpoint, remove_checkpoints +from icefall.checkpoint import save_checkpoint as save_checkpoint_impl +from icefall.checkpoint import ( + save_checkpoint_with_global_batch_idx, + update_averaged_model, +) +from icefall.dist import cleanup_dist, setup_dist +from icefall.env import get_env_info +from icefall.hooks import register_inf_check_hooks +from icefall.utils import ( + AttributeDict, + MetricsTracker, + get_parameter_groups_with_lrs, + setup_logger, + str2bool, +) + +LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler] + + +def get_adjusted_batch_count(params: AttributeDict) -> float: + # returns the number of batches we would have used so far if we had used the reference + # duration. This is for purposes of set_batch_count(). 
+ return ( + params.batch_idx_train + * (params.max_duration * params.world_size) + / params.ref_duration + ) + + +def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None: + if isinstance(model, DDP): + # get underlying nn.Module + model = model.module + for name, module in model.named_modules(): + if hasattr(module, "batch_count"): + module.batch_count = batch_count + if hasattr(module, "name"): + module.name = name + + +def add_model_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "--activation-dropout", + type=float, + default=0.1, + ) + parser.add_argument( + "--apply-spec-augment", + type=str2bool, + default=True, + ) + parser.add_argument( + "--attention-dropout", + type=float, + default=0.1, + ) + parser.add_argument( + "--conv-bias", + type=str2bool, + default=False, + ) + parser.add_argument( + "--conv-dim", + type=str, + default="512,512,512,512,512,512,512", + ) + parser.add_argument( + "--conv-kernel", + type=str, + default="10,3,3,3,3,2,2", + ) + parser.add_argument( + "--conv-stride", + type=str, + default="5,2,2,2,2,2,2", + ) + parser.add_argument( + "--do-stable-layer-norm", + type=str2bool, + default=True, + ) + parser.add_argument( + "--feat-extract-activation", + type=str, + default="gelu", + ) + parser.add_argument( + "--feat-extract-norm", + type=str, + default="layer", + ) + parser.add_argument( + "--feat-proj-dropout", + type=float, + default=0.0, + ) + parser.add_argument( + "--feat-proj-layer-norm", + type=str2bool, + default=True, + ) + parser.add_argument( + "--final-dropout", + type=float, + default=0.1, + ) + parser.add_argument( + "--hidden-act", + type=str, + default="gelu", + ) + parser.add_argument( + "--hidden-dropout", + type=float, + default=0.1, + ) + parser.add_argument( + "--hidden-size", + type=int, + default=1024, + ) + parser.add_argument( + "--initializer-range", + type=float, + default=0.02, + ) + parser.add_argument( + "--intermediate-size", + type=int, + default=4096, + ) + parser.add_argument( + "--layer-norm-eps", + type=float, + default=1e-5, + ) + parser.add_argument( + "--layerdrop", + type=float, + default=0.1, + ) + parser.add_argument( + "--mask-feature-length", + type=int, + default=10, + ) + parser.add_argument( + "--mask-feature-min-masks", + type=int, + default=0, + ) + parser.add_argument( + "--mask-feature-prob", + type=float, + default=0.0, + ) + parser.add_argument( + "--mask-time-length", + type=int, + default=10, + ) + parser.add_argument( + "--mask-time-min-masks", + type=int, + default=2, + ) + parser.add_argument( + "--mask-time-prob", + type=float, + default=0.05, + ) + parser.add_argument( + "--num-attention-heads", + type=int, + default=16, + ) + parser.add_argument( + "--num-conv-pos-embedding-groups", + type=int, + default=16, + ) + parser.add_argument( + "--num-conv-pos-embeddings", + type=int, + default=128, + ) + parser.add_argument( + "--num-hidden-layers", + type=int, + default=24, + ) + + parser.add_argument( + "--encoder-dim", + type=int, + default=1024, + help="Embedding dimension in encoder model.", + ) + + parser.add_argument( + "--decoder-dim", + type=int, + default=512, + help="Embedding dimension in the decoder model.", + ) + + parser.add_argument( + "--joiner-dim", + type=int, + default=512, + help="""Dimension used in the joiner model. + Outputs from the encoder and decoder model are projected + to this dimension before adding. 
+ """, + ) + + parser.add_argument( + "--use-transducer", + type=str2bool, + default=True, + help="If True, use Transducer head.", + ) + + parser.add_argument( + "--use-ctc", + type=str2bool, + default=False, + help="If True, use CTC head.", + ) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--world-size", + type=int, + default=1, + help="Number of GPUs for DDP training.", + ) + + parser.add_argument( + "--master-port", + type=int, + default=12354, + help="Master port to use for DDP training.", + ) + + parser.add_argument( + "--tensorboard", + type=str2bool, + default=True, + help="Should various information be logged in tensorboard.", + ) + + parser.add_argument( + "--num-epochs", + type=int, + default=20, + help="Number of epochs to train.", + ) + + parser.add_argument( + "--start-epoch", + type=int, + default=1, + help="""Resume training from this epoch. It should be positive. + If larger than 1, it will load checkpoint from + exp-dir/epoch-{start_epoch-1}.pt + """, + ) + + parser.add_argument( + "--start-batch", + type=int, + default=0, + help="""If positive, --start-epoch is ignored and + it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt + """, + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="hubert/exp", + help="""The experiment dir. + It specifies the directory where all training related + files, e.g., checkpoints, log, etc, are saved + """, + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--base-lr", type=float, default=0.0005, help="The base learning rate." + ) + + parser.add_argument( + "--lr-batches", + type=float, + default=100000, + help="""Number of steps that affects how rapidly the learning rate + decreases. We suggest not to change this.""", + ) + + parser.add_argument( + "--lr-epochs", + type=float, + default=100, + help="""Number of epochs that affects how rapidly the learning rate decreases. + """, + ) + + parser.add_argument( + "--ref-duration", + type=float, + default=600, + help="Reference batch duration for purposes of adjusting batch counts for setting various " + "schedules inside the model", + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; " "2 means tri-gram", + ) + + parser.add_argument( + "--prune-range", + type=int, + default=5, + help="The prune range for rnnt loss, it means how many symbols(context)" + "we are using to compute the loss", + ) + + parser.add_argument( + "--lm-scale", + type=float, + default=0.25, + help="The scale to smooth the loss with lm " + "(output of prediction network) part.", + ) + + parser.add_argument( + "--am-scale", + type=float, + default=0.0, + help="The scale to smooth the loss with am (output of encoder network)" "part.", + ) + + parser.add_argument( + "--simple-loss-scale", + type=float, + default=0.5, + help="To get pruning ranges, we will calculate a simple version" + "loss(joiner is just addition), this simple loss also uses for" + "training (as a regularization item). 
We will scale the simple loss" + "with this parameter before adding to the final loss.", + ) + + parser.add_argument( + "--ctc-loss-scale", + type=float, + default=0.2, + help="Scale for CTC loss.", + ) + + parser.add_argument( + "--seed", + type=int, + default=42, + help="The seed for random generators intended for reproducibility", + ) + + parser.add_argument( + "--print-diagnostics", + type=str2bool, + default=False, + help="Accumulate stats on activations, print them and exit.", + ) + + parser.add_argument( + "--sanity-check", + type=str2bool, + default=False, + help="Check if any of the batches in epoch 1 would cause OOM.", + ) + + parser.add_argument( + "--inf-check", + type=str2bool, + default=False, + help="Add hooks to check for infinite module outputs and gradients.", + ) + + parser.add_argument( + "--save-every-n", + type=int, + default=4000, + help="""Save checkpoint after processing this number of batches" + periodically. We save checkpoint to exp-dir/ whenever + params.batch_idx_train % save_every_n == 0. The checkpoint filename + has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt' + Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the + end of each epoch where `xxx` is the epoch number counting from 1. + """, + ) + + parser.add_argument( + "--keep-last-k", + type=int, + default=30, + help="""Only keep this number of checkpoints on disk. + For instance, if it is 3, there are only 3 checkpoints + in the exp-dir with filenames `checkpoint-xxx.pt`. + It does not affect checkpoints with name `epoch-xxx.pt`. + """, + ) + + parser.add_argument( + "--average-period", + type=int, + default=200, + help="""Update the averaged model, namely `model_avg`, after processing + this number of batches. `model_avg` is a separate version of model, + in which each floating-point parameter is the average of all the + parameters from the start of training. Each time we take the average, + we do: `model_avg = model * (average_period / batch_idx_train) + + model_avg * ((batch_idx_train - average_period) / batch_idx_train)`. + """, + ) + + parser.add_argument( + "--accum-grad", + type=int, + default=1, + help="""update gradient when batch_idx_train % accum_grad == 0. + """, + ) + + parser.add_argument( + "--use-fp16", + type=str2bool, + default=False, + help="Whether to use half precision training.", + ) + + add_model_arguments(parser) + + return parser + + +def get_params() -> AttributeDict: + """Return a dict containing training parameters. + + All training related parameters that are not passed from the commandline + are saved in the variable `params`. + + Commandline options are merged into `params` after they are parsed, so + you can also access them via `params`. + + Explanation of options saved in `params`: + + - best_train_loss: Best training loss so far. It is used to select + the model that has the lowest training loss. It is + updated during the training. + + - best_valid_loss: Best validation loss so far. It is used to select + the model that has the lowest validation loss. It is + updated during the training. + + - best_train_epoch: It is the epoch that has the best training loss. + + - best_valid_epoch: It is the epoch that has the best validation loss. + + - batch_idx_train: Used to writing statistics to tensorboard. It + contains number of batches trained so far across + epochs. 
+ + - log_interval: Print training loss if batch_idx % log_interval` is 0 + + - reset_interval: Reset statistics if batch_idx % reset_interval is 0 + + - valid_interval: Run validation if batch_idx % valid_interval is 0 + + - warm_step: The warmup period that dictates the decay of the + scale on "simple" (un-pruned) loss. + """ + params = AttributeDict( + { + "best_train_loss": float("inf"), + "best_valid_loss": float("inf"), + "best_train_epoch": -1, + "best_valid_epoch": -1, + "batch_idx_train": 0, + "log_interval": 50, + "reset_interval": 200, + "valid_interval": 3000, # For the 100h subset, use 800 + # parameters for pruned RNN-T loss + "warm_step": 2000, + "env_info": get_env_info(), + } + ) + + return params + + +def _to_int_tuple(s: str): + return tuple(map(int, s.split(","))) + + +def _get_feat_extract_output_lengths( + params, input_lengths: Union[torch.LongTensor, int] +): + def _conv_out_length(input_length, kernel_size, stride): + return (input_length - kernel_size) // stride + 1 + + for kernel_size, stride in zip( + _to_int_tuple(params.conv_kernel), + _to_int_tuple(params.conv_stride), + ): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + return input_lengths + + +def get_encoder_model(params: AttributeDict) -> nn.Module: + config = HubertConfig( + hidden_size=params.hidden_size, + num_hidden_layers=params.num_hidden_layers, + num_attention_heads=params.num_attention_heads, + intermediate_size=params.intermediate_size, + hidden_act=params.hidden_act, + hidden_dropout=params.hidden_dropout, + activation_dropout=params.activation_dropout, + attention_dropout=params.attention_dropout, + feat_proj_layer_norm=params.feat_proj_layer_norm, + feat_proj_dropout=params.feat_proj_dropout, + final_dropout=params.final_dropout, + layerdrop=params.layerdrop, + initializer_range=params.initializer_range, + layer_norm_eps=params.layer_norm_eps, + feat_extract_norm=params.feat_extract_norm, + feat_extract_activation=params.feat_extract_activation, + conv_dim=_to_int_tuple(params.conv_dim), + conv_stride=_to_int_tuple(params.conv_stride), + conv_kernel=_to_int_tuple(params.conv_kernel), + conv_bias=params.conv_bias, + num_conv_pos_embeddings=params.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=params.num_conv_pos_embedding_groups, + do_stable_layer_norm=params.do_stable_layer_norm, + apply_spec_augment=params.apply_spec_augment, + mask_time_prob=params.mask_time_prob, + mask_time_length=params.mask_time_length, + mask_time_min_masks=params.mask_time_min_masks, + mask_feature_prob=params.mask_feature_prob, + mask_feature_length=params.mask_feature_length, + mask_feature_min_masks=params.mask_feature_min_masks, + ) + + encoder = HubertModel(config) + return encoder + + +def get_decoder_model(params: AttributeDict) -> nn.Module: + decoder = Decoder( + vocab_size=params.vocab_size, + decoder_dim=params.decoder_dim, + blank_id=params.blank_id, + context_size=params.context_size, + ) + return decoder + + +def get_joiner_model(params: AttributeDict) -> nn.Module: + joiner = Joiner( + encoder_dim=params.encoder_dim, + decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return joiner + + +def get_model(params: AttributeDict) -> nn.Module: + assert params.use_transducer or params.use_ctc, ( + f"At least one of them should be True, " + f"but got params.use_transducer={params.use_transducer}, " + f"params.use_ctc={params.use_ctc}" + ) + + encoder = get_encoder_model(params) + + if params.use_transducer: + decoder = 
get_decoder_model(params) + joiner = get_joiner_model(params) + else: + decoder = None + joiner = None + + model = AsrModel( + encoder=encoder, + decoder=decoder, + joiner=joiner, + encoder_dim=params.encoder_dim, + decoder_dim=params.decoder_dim, + vocab_size=params.vocab_size, + use_transducer=params.use_transducer, + use_ctc=params.use_ctc, + ) + return model + + +def load_checkpoint_if_available( + params: AttributeDict, + model: nn.Module, + model_avg: nn.Module = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, +) -> Optional[Dict[str, Any]]: + """Load checkpoint from file. + + If params.start_batch is positive, it will load the checkpoint from + `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if + params.start_epoch is larger than 1, it will load the checkpoint from + `params.start_epoch - 1`. + + Apart from loading state dict for `model` and `optimizer` it also updates + `best_train_epoch`, `best_train_loss`, `best_valid_epoch`, + and `best_valid_loss` in `params`. + + Args: + params: + The return value of :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer that we are using. + scheduler: + The scheduler that we are using. + Returns: + Return a dict containing previously saved training info. + """ + if params.start_batch > 0: + filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt" + elif params.start_epoch > 1: + filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt" + else: + return None + + assert filename.is_file(), f"{filename} does not exist!" + + saved_params = load_checkpoint( + filename, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + ) + + keys = [ + "best_train_epoch", + "best_valid_epoch", + "batch_idx_train", + "best_train_loss", + "best_valid_loss", + ] + for k in keys: + params[k] = saved_params[k] + + if params.start_batch > 0: + if "cur_epoch" in saved_params: + params["start_epoch"] = saved_params["cur_epoch"] + + return saved_params + + +def save_checkpoint( + params: AttributeDict, + model: Union[nn.Module, DDP], + model_avg: Optional[nn.Module] = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, + sampler: Optional[CutSampler] = None, + scaler: Optional[GradScaler] = None, + rank: int = 0, +) -> None: + """Save model, optimizer, scheduler and training stats to file. + + Args: + params: + It is returned by :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer used in the training. + sampler: + The sampler for the training dataset. + scaler: + The scaler used for mix precision training. 
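+ scheduler:
+ The learning rate scheduler used in the training.
+ rank:
+ The rank of the node in DDP training. Only the node with rank 0
+ actually writes the checkpoint to disk.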
+ """ + if rank != 0: + return + filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt" + save_checkpoint_impl( + filename=filename, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=sampler, + scaler=scaler, + rank=rank, + ) + + if params.best_train_epoch == params.cur_epoch: + best_train_filename = params.exp_dir / "best-train-loss.pt" + copyfile(src=filename, dst=best_train_filename) + + if params.best_valid_epoch == params.cur_epoch: + best_valid_filename = params.exp_dir / "best-valid-loss.pt" + copyfile(src=filename, dst=best_valid_filename) + + +def compute_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + sp: spm.SentencePieceProcessor, + batch: dict, + is_training: bool, +) -> Tuple[Tensor, MetricsTracker]: + """ + Compute loss given the model and its inputs. + + Args: + params: + Parameters for training. See :func:`get_params`. + model: + The model for training. It is an instance of Zipformer in our case. + batch: + A batch of data. See `dataset.HubertAsrDataset()` + for the content in it. + is_training: + True for training. False for validation. When it is True, this + function enables autograd during computation; when it is False, it + disables autograd. + warmup: a floating point value which increases throughout training; + values >= 1.0 are fully warmed up and have all modules present. + """ + device = model.device if isinstance(model, DDP) else next(model.parameters()).device + audio = batch["audio"].to(device) + audio_lens = batch["audio_lens"].to(device) + + batch_idx_train = params.batch_idx_train + warm_step = params.warm_step + + texts = batch["supervisions"]["text"] + y = sp.encode(texts, out_type=int) + y = k2.RaggedTensor(y) + + with torch.set_grad_enabled(is_training): + simple_loss, pruned_loss, ctc_loss = model( + x=audio, + x_lens=audio_lens, + y=y, + prune_range=params.prune_range, + am_scale=params.am_scale, + lm_scale=params.lm_scale, + ) + + loss = 0.0 + + if params.use_transducer: + s = params.simple_loss_scale + # take down the scale on the simple loss from 1.0 at the start + # to params.simple_loss scale by warm_step. + simple_loss_scale = ( + s + if batch_idx_train >= warm_step + else 1.0 - (batch_idx_train / warm_step) * (1.0 - s) + ) + pruned_loss_scale = ( + 1.0 + if batch_idx_train >= warm_step + else 0.1 + 0.9 * (batch_idx_train / warm_step) + ) + loss += simple_loss_scale * simple_loss + pruned_loss_scale * pruned_loss + + if params.use_ctc: + loss += params.ctc_loss_scale * ctc_loss + + assert loss.requires_grad == is_training + + info = MetricsTracker() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + info["frames"] = ( + _get_feat_extract_output_lengths(params, audio_lens).sum().item() + ) + + # Note: We use reduction=sum while computing the loss. 
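# info["frames"] above is counted with _get_feat_extract_output_lengths, which
# mirrors the HuBERT convolutional front-end: each conv layer shortens the
# sequence as (length - kernel) // stride + 1.  A standalone sketch, assuming
# the usual HuBERT-base front-end of kernels (10, 3, 3, 3, 3, 2, 2) and strides
# (5, 2, 2, 2, 2, 2, 2), i.e. one output frame per 320 samples (20 ms at
# 16 kHz); the recipe takes the real values from params.conv_kernel/conv_stride,
# and the helper name here is ours:
def conv_out_lengths(num_samples: int,
                     kernels=(10, 3, 3, 3, 3, 2, 2),
                     strides=(5, 2, 2, 2, 2, 2, 2)) -> int:
    for k, s in zip(kernels, strides):
        num_samples = (num_samples - k) // s + 1
    return num_samples

print(conv_out_lengths(160000))  # 10 s of 16 kHz audio -> 499 encoder frames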
+ info["loss"] = loss.detach().cpu().item() + if params.use_transducer: + info["simple_loss"] = simple_loss.detach().cpu().item() + info["pruned_loss"] = pruned_loss.detach().cpu().item() + if params.use_ctc: + info["ctc_loss"] = ctc_loss.detach().cpu().item() + + return loss, info + + +def compute_validation_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + sp: spm.SentencePieceProcessor, + valid_dl: torch.utils.data.DataLoader, + world_size: int = 1, +) -> MetricsTracker: + """Run the validation process.""" + model.eval() + + tot_loss = MetricsTracker() + + for batch_idx, batch in enumerate(valid_dl): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=False, + ) + assert loss.requires_grad is False + tot_loss = tot_loss + loss_info + + if world_size > 1: + tot_loss.reduce(loss.device) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + if loss_value < params.best_valid_loss: + params.best_valid_epoch = params.cur_epoch + params.best_valid_loss = loss_value + + return tot_loss + + +def train_one_epoch( + params: AttributeDict, + model: Union[nn.Module, DDP], + optimizer: torch.optim.Optimizer, + scheduler: LRSchedulerType, + sp: spm.SentencePieceProcessor, + train_dl: torch.utils.data.DataLoader, + valid_dl: torch.utils.data.DataLoader, + scaler: GradScaler, + model_avg: Optional[nn.Module] = None, + tb_writer: Optional[SummaryWriter] = None, + world_size: int = 1, + rank: int = 0, +) -> None: + """Train the model for one epoch. + + The training loss from the mean of all frames is saved in + `params.train_loss`. It runs the validation process every + `params.valid_interval` batches. + + Args: + params: + It is returned by :func:`get_params`. + model: + The model for training. + optimizer: + The optimizer we are using. + scheduler: + The learning rate scheduler, we call step() every step. + train_dl: + Dataloader for the training dataset. + valid_dl: + Dataloader for the validation dataset. + scaler: + The scaler used for mix precision training. + model_avg: + The stored model averaged from the start of training. + tb_writer: + Writer to write log messages to tensorboard. + world_size: + Number of nodes in DDP training. If it is 1, DDP is disabled. + rank: + The rank of the node in DDP training. If no DDP is used, it should + be set to 0. + """ + model.train() + + tot_loss = MetricsTracker() + + saved_bad_model = False + + def save_bad_model(suffix: str = ""): + save_checkpoint_impl( + filename=params.exp_dir / f"bad-model{suffix}-{rank}.pt", + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=0, + ) + + for batch_idx, batch in enumerate(train_dl): + if batch_idx % 10 == 0: + set_batch_count(model, get_adjusted_batch_count(params)) + + params.batch_idx_train += 1 + batch_size = len(batch["supervisions"]["text"]) + + try: + with torch.cuda.amp.autocast(enabled=params.use_fp16): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=True, + ) + # summary stats + tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info + + # NOTE: We use reduction==sum and loss is computed over utterances + # in the batch and there is no normalization to it so far. 
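# The lines that follow scale the loss by 1/params.accum_grad, run backward()
# on every batch, and only step the optimizer (through the GradScaler) once
# every accum_grad batches -- standard gradient accumulation under mixed
# precision.  A self-contained sketch of the same pattern with a toy model
# (illustrative only; the real loop also steps the Eden scheduler and handles
# checkpointing and diagnostics):
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
use_fp16 = device == "cuda"
accum_grad = 4

toy_model = torch.nn.Linear(8, 1).to(device)
toy_optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.1)
toy_scaler = torch.cuda.amp.GradScaler(enabled=use_fp16)

for toy_batch_idx in range(1, 17):
    x = torch.randn(32, 8, device=device)
    y = torch.randn(32, 1, device=device)
    with torch.cuda.amp.autocast(enabled=use_fp16):
        toy_loss = torch.nn.functional.mse_loss(toy_model(x), y)
    toy_scaler.scale(toy_loss / accum_grad).backward()  # gradients accumulate
    if toy_batch_idx % accum_grad == 0:
        toy_scaler.step(toy_optimizer)  # unscale + optimizer.step()
        toy_scaler.update()
        toy_optimizer.zero_grad()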
+ scaler.scale(loss / params.accum_grad).backward() + + if params.batch_idx_train % params.accum_grad == 0: + scheduler.step_batch(params.batch_idx_train) + + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + + except: # noqa + save_bad_model() + display_and_save_batch(batch, params=params, sp=sp) + raise + + if params.print_diagnostics and batch_idx == 5: + return + + if ( + rank == 0 + and params.batch_idx_train > 0 + and params.batch_idx_train % params.average_period == 0 + ): + update_averaged_model( + params=params, + model_cur=model, + model_avg=model_avg, + ) + + if ( + params.batch_idx_train > 0 + and params.batch_idx_train % params.save_every_n == 0 + ): + save_checkpoint_with_global_batch_idx( + out_dir=params.exp_dir, + global_batch_idx=params.batch_idx_train, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + remove_checkpoints( + out_dir=params.exp_dir, + topk=params.keep_last_k, + rank=rank, + ) + + if batch_idx % 100 == 0 and params.use_fp16: + # If the grad scale was less than 1, try increasing it. The _growth_interval + # of the grad scaler is configurable, but we can't configure it to have different + # behavior depending on the current grad scale. + cur_grad_scale = scaler._scale.item() + + if cur_grad_scale < 8.0 or (cur_grad_scale < 32.0 and batch_idx % 400 == 0): + scaler.update(cur_grad_scale * 2.0) + if cur_grad_scale < 0.01: + if not saved_bad_model: + save_bad_model(suffix="-first-warning") + saved_bad_model = True + logging.warning(f"Grad scale is small: {cur_grad_scale}") + if cur_grad_scale < 1.0e-05: + save_bad_model() + raise RuntimeError( + f"grad_scale is too small, exiting: {cur_grad_scale}" + ) + + if batch_idx % params.log_interval == 0: + cur_lr = max(scheduler.get_last_lr()) + cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0 + + logging.info( + f"Epoch {params.cur_epoch}, " + f"batch {batch_idx}, loss[{loss_info}], " + f"tot_loss[{tot_loss}], batch size: {batch_size}, " + f"lr: {cur_lr:.2e}, " + + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "") + ) + + if tb_writer is not None: + tb_writer.add_scalar( + "train/learning_rate", cur_lr, params.batch_idx_train + ) + + loss_info.write_summary( + tb_writer, "train/current_", params.batch_idx_train + ) + tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train) + if params.use_fp16: + tb_writer.add_scalar( + "train/grad_scale", cur_grad_scale, params.batch_idx_train + ) + + if batch_idx % params.valid_interval == 0 and not params.print_diagnostics: + logging.info("Computing validation loss") + valid_info = compute_validation_loss( + params=params, + model=model, + sp=sp, + valid_dl=valid_dl, + world_size=world_size, + ) + model.train() + logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}") + logging.info( + f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB" + ) + if tb_writer is not None: + valid_info.write_summary( + tb_writer, "train/valid_", params.batch_idx_train + ) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + params.train_loss = loss_value + if params.train_loss < params.best_train_loss: + params.best_train_epoch = params.cur_epoch + params.best_train_loss = params.train_loss + + +def run(rank, world_size, args): + """ + Args: + rank: + It is a value between 0 and `world_size-1`, which is + passed automatically by `mp.spawn()` in :func:`main`. 
+ The node with rank 0 is responsible for saving checkpoint. + world_size: + Number of GPUs for DDP training. + args: + The return value of get_parser().parse_args() + """ + params = get_params() + params.update(vars(args)) + + fix_random_seed(params.seed) + if world_size > 1: + setup_dist(rank, world_size, params.master_port) + + setup_logger(f"{params.exp_dir}/log/log-train") + logging.info("Training started") + + if args.tensorboard and rank == 0: + tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard") + else: + tb_writer = None + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", rank) + logging.info(f"Device: {device}") + + sp = spm.SentencePieceProcessor() + sp.load(params.bpe_model) + + # is defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("") + params.vocab_size = sp.get_piece_size() + + if not params.use_transducer: + params.ctc_loss_scale = 1.0 + + logging.info(params) + + logging.info("About to create model") + model = get_model(params) + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + assert params.save_every_n >= params.average_period + model_avg: Optional[nn.Module] = None + if rank == 0: + # model_avg is only used with rank 0 + model_avg = copy.deepcopy(model).to(torch.float64) + + assert params.start_epoch > 0, params.start_epoch + checkpoints = load_checkpoint_if_available( + params=params, model=model, model_avg=model_avg + ) + + model.to(device) + if world_size > 1: + logging.info("Using DDP") + model = DDP(model, device_ids=[rank], find_unused_parameters=True) + + optimizer = ScaledAdam( + get_parameter_groups_with_lrs(model, lr=params.base_lr, include_names=True), + lr=params.base_lr, # should have no effect + clipping_scale=2.0, + ) + + scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs) + + if checkpoints and "optimizer" in checkpoints: + logging.info("Loading optimizer state dict") + optimizer.load_state_dict(checkpoints["optimizer"]) + + if ( + checkpoints + and "scheduler" in checkpoints + and checkpoints["scheduler"] is not None + ): + logging.info("Loading scheduler state dict") + scheduler.load_state_dict(checkpoints["scheduler"]) + + if params.print_diagnostics: + opts = diagnostics.TensorDiagnosticOptions( + 512 + ) # allow 4 megabytes per sub-module + diagnostic = diagnostics.attach_diagnostics(model, opts) + + if params.inf_check: + register_inf_check_hooks(model) + + librispeech = LibriSpeechAsrDataModule(args) + + train_cuts = librispeech.train_clean_100_cuts() + if params.full_libri: + train_cuts += librispeech.train_clean_360_cuts() + train_cuts += librispeech.train_other_500_cuts() + + def remove_short_and_long_utt(c: Cut): + # Keep only utterances with duration between 1 second and 20 seconds + # + # Caution: There is a reason to select 20.0 here. Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold + if c.duration < 1.0 or c.duration > 20.0: + # logging.warning( + # f"Exclude cut with ID {c.id} from training. 
Duration: {c.duration}" + # ) + return False + + # In pruned RNN-T, we require that T >= S + # where T is the number of feature frames after subsampling + # and S is the number of tokens in the utterance + + # In HuBERT, the conv module uses the following expression + # for subsampling + T = _get_feat_extract_output_lengths(params, c.num_samples) + tokens = sp.encode(c.supervisions[0].text, out_type=str) + + if T < len(tokens): + logging.warning( + f"Exclude cut with ID {c.id} from training. " + f"Number of frames (before subsampling): {c.num_frames}. " + f"Number of frames (after subsampling): {T}. " + f"Text: {c.supervisions[0].text}. " + f"Tokens: {tokens}. " + f"Number of tokens: {len(tokens)}" + ) + return False + + return True + + train_cuts = train_cuts.filter(remove_short_and_long_utt) + + if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: + # We only load the sampler's state dict when it loads a checkpoint + # saved in the middle of an epoch + sampler_state_dict = checkpoints["sampler"] + else: + sampler_state_dict = None + + train_dl = librispeech.train_dataloaders( + train_cuts, sampler_state_dict=sampler_state_dict + ) + + valid_cuts = librispeech.dev_clean_cuts() + valid_cuts += librispeech.dev_other_cuts() + valid_dl = librispeech.valid_dataloaders(valid_cuts) + + if params.sanity_check and not params.print_diagnostics: + scan_pessimistic_batches_for_oom( + model=model, + train_dl=train_dl, + optimizer=optimizer, + sp=sp, + params=params, + ) + + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) + if checkpoints and "grad_scaler" in checkpoints: + logging.info("Loading grad scaler state dict") + scaler.load_state_dict(checkpoints["grad_scaler"]) + + for epoch in range(params.start_epoch, params.num_epochs + 1): + scheduler.step_epoch(epoch - 1) + fix_random_seed(params.seed + epoch - 1) + train_dl.sampler.set_epoch(epoch - 1) + + if tb_writer is not None: + tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train) + + params.cur_epoch = epoch + + train_one_epoch( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sp=sp, + train_dl=train_dl, + valid_dl=valid_dl, + scaler=scaler, + tb_writer=tb_writer, + world_size=world_size, + rank=rank, + ) + + if params.print_diagnostics: + diagnostic.print_diagnostics() + break + + save_checkpoint( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + + logging.info("Done!") + + if world_size > 1: + torch.distributed.barrier() + cleanup_dist() + + +def display_and_save_batch( + batch: dict, + params: AttributeDict, + sp: spm.SentencePieceProcessor, +) -> None: + """Display the batch statistics and save the batch into disk. + + Args: + batch: + A batch of data. See `dataset.HubertAsrDataset()` + for the content in it. + params: + Parameters for training. See :func:`get_params`. + sp: + The BPE model. 
+ """ + from lhotse.utils import uuid4 + + filename = f"{params.exp_dir}/batch-{uuid4()}.pt" + logging.info(f"Saving batch to {filename}") + torch.save(batch, filename) + + audio = batch["audio"] + logging.info(f"audio shape: {audio.shape}") + + y = sp.encode(batch["supervisions"]["text"], out_type=int) + num_tokens = sum(len(i) for i in y) + logging.info(f"num tokens: {num_tokens}") + + +def scan_pessimistic_batches_for_oom( + model: Union[nn.Module, DDP], + train_dl: torch.utils.data.DataLoader, + optimizer: torch.optim.Optimizer, + sp: spm.SentencePieceProcessor, + params: AttributeDict, +): + from lhotse.dataset import find_pessimistic_batches + + logging.info( + "Sanity check -- see if any of the batches in epoch 1 would cause OOM." + ) + batches, crit_values = find_pessimistic_batches(train_dl.sampler) + for criterion, cuts in batches.items(): + batch = train_dl.dataset[cuts] + try: + with torch.cuda.amp.autocast(enabled=params.use_fp16): + loss, _ = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=True, + ) + loss.backward() + optimizer.zero_grad() + except Exception as e: + if "CUDA out of memory" in str(e): + logging.error( + "Your GPU ran out of memory with the current " + "max_duration setting. We recommend decreasing " + "max_duration and trying again.\n" + f"Failing criterion: {criterion} " + f"(={crit_values[criterion]}) ..." + ) + display_and_save_batch(batch, params=params, sp=sp) + raise + logging.info( + f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB" + ) + + +def main(): + parser = get_parser() + LibriSpeechAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + world_size = args.world_size + assert world_size >= 1 + if world_size > 1: + mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True) + else: + run(rank=0, world_size=1, args=args) + + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/SSL/hubert/joiner.py b/egs/librispeech/SSL/hubert/joiner.py new file mode 100644 index 0000000000..dfb0a0057b --- /dev/null +++ b/egs/librispeech/SSL/hubert/joiner.py @@ -0,0 +1,67 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
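# A shape sketch for the Joiner defined below, as it would be used from a
# separate script during pruned RNN-T training: the projected encoder and
# decoder embeddings are summed, passed through tanh, and mapped to the
# vocabulary.  The dimensions are illustrative; in the recipe they come from
# command-line options (encoder_dim/decoder_dim/vocab_size default to
# 1024/512/500 in model.py, joiner_dim is assumed here):
#
#   import torch
#   from joiner import Joiner
#
#   joiner = Joiner(encoder_dim=1024, decoder_dim=512,
#                   joiner_dim=512, vocab_size=500)
#   enc = torch.randn(2, 50, 5, 1024)  # (N, T, s_range, encoder_dim), pruned
#   dec = torch.randn(2, 50, 5, 512)   # (N, T, s_range, decoder_dim), pruned
#   logits = joiner(enc, dec)          # project_input=True applies both projections
#   print(logits.shape)                # torch.Size([2, 50, 5, 500])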
+ +import torch +import torch.nn as nn +from scaling import ScaledLinear + + +class Joiner(nn.Module): + def __init__( + self, + encoder_dim: int, + decoder_dim: int, + joiner_dim: int, + vocab_size: int, + ): + super().__init__() + + self.encoder_proj = ScaledLinear(encoder_dim, joiner_dim, initial_scale=0.25) + self.decoder_proj = ScaledLinear(decoder_dim, joiner_dim, initial_scale=0.25) + self.output_linear = nn.Linear(joiner_dim, vocab_size) + + def forward( + self, + encoder_out: torch.Tensor, + decoder_out: torch.Tensor, + project_input: bool = True, + ) -> torch.Tensor: + """ + Args: + encoder_out: + Output from the encoder. Its shape is (N, T, s_range, C). + decoder_out: + Output from the decoder. Its shape is (N, T, s_range, C). + project_input: + If true, apply input projections encoder_proj and decoder_proj. + If this is false, it is the user's responsibility to do this + manually. + Returns: + Return a tensor of shape (N, T, s_range, C). + """ + assert encoder_out.ndim == decoder_out.ndim, ( + encoder_out.shape, + decoder_out.shape, + ) + + if project_input: + logit = self.encoder_proj(encoder_out) + self.decoder_proj(decoder_out) + else: + logit = encoder_out + decoder_out + + logit = self.output_linear(torch.tanh(logit)) + + return logit diff --git a/egs/librispeech/SSL/hubert/model.py b/egs/librispeech/SSL/hubert/model.py new file mode 100644 index 0000000000..ce203e3e0c --- /dev/null +++ b/egs/librispeech/SSL/hubert/model.py @@ -0,0 +1,343 @@ +# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang, +# Wei Kang, +# Zengwei Yao) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple + +import k2 +import torch +import torch.nn as nn +from scaling import ScaledLinear + +from icefall.utils import add_sos, make_pad_mask + + +class AsrModel(nn.Module): + def __init__( + self, + encoder, + decoder: Optional[nn.Module] = None, + joiner: Optional[nn.Module] = None, + encoder_dim: int = 1024, + decoder_dim: int = 512, + vocab_size: int = 500, + use_transducer: bool = True, + use_ctc: bool = False, + ): + """A joint CTC & Transducer ASR model. + + - Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks (http://imagine.enpc.fr/~obozinsg/teaching/mva_gm/papers/ctc.pdf) + - Sequence Transduction with Recurrent Neural Networks (https://arxiv.org/pdf/1211.3711.pdf) + - Pruned RNN-T for fast, memory-efficient ASR training (https://arxiv.org/pdf/2206.13236.pdf) + + Args: + encoder: + It is the transcription network in the paper. Its accepts + two inputs: `x` of (N, T, encoder_dim) and `x_lens` of shape (N,). + It returns two tensors: `logits` of shape (N, T, encoder_dim) and + `logit_lens` of shape (N,). + decoder: + It is the prediction network in the paper. Its input shape + is (N, U) and its output shape is (N, U, decoder_dim). + It should contain one attribute: `blank_id`. + It is used when use_transducer is True. 
+ joiner: + It has two inputs with shapes: (N, T, encoder_dim) and (N, U, decoder_dim). + Its output shape is (N, T, U, vocab_size). Note that its output contains + unnormalized probs, i.e., not processed by log-softmax. + It is used when use_transducer is True. + use_transducer: + Whether use transducer head. Default: True. + use_ctc: + Whether use CTC head. Default: False. + """ + super().__init__() + + assert ( + use_transducer or use_ctc + ), f"At least one of them should be True, but got use_transducer={use_transducer}, use_ctc={use_ctc}" + + self.encoder = encoder + + self.use_transducer = use_transducer + if use_transducer: + # Modules for Transducer head + assert decoder is not None + assert hasattr(decoder, "blank_id") + assert joiner is not None + + self.decoder = decoder + self.joiner = joiner + + self.simple_am_proj = ScaledLinear( + encoder_dim, vocab_size, initial_scale=0.25 + ) + self.simple_lm_proj = ScaledLinear( + decoder_dim, vocab_size, initial_scale=0.25 + ) + else: + assert decoder is None + assert joiner is None + + self.use_ctc = use_ctc + if use_ctc: + # Modules for CTC head + self.ctc_output = nn.Sequential( + nn.Dropout(p=0.1), + nn.Linear(encoder_dim, vocab_size), + nn.LogSoftmax(dim=-1), + ) + + def forward_encoder( + self, x: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute encoder outputs. + Args: + x: + A 2-D tensor of shape (N, T). + x_lens: + A 1-D tensor of shape (N,). It contains the number of frames in `x` + before padding. + + Returns: + encoder_out: + Encoder output, of shape (N, T, C). + encoder_out_lens: + Encoder output lengths, of shape (N,). + """ + encoder_out_lens = self.encoder._get_feat_extract_output_lengths(x_lens) + assert torch.all(encoder_out_lens > 0), (x_lens, encoder_out_lens) + + src_key_padding_mask = make_pad_mask(x_lens) + encoder_out = self.encoder(x, src_key_padding_mask).last_hidden_state + + return encoder_out, encoder_out_lens + + def forward_ctc( + self, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + targets: torch.Tensor, + target_lengths: torch.Tensor, + ) -> torch.Tensor: + """Compute CTC loss. + Args: + encoder_out: + Encoder output, of shape (N, T, C). + encoder_out_lens: + Encoder output lengths, of shape (N,). + targets: + Target Tensor of shape (sum(target_lengths)). The targets are assumed + to be un-padded and concatenated within 1 dimension. + """ + # Compute CTC log-prob + ctc_output = self.ctc_output(encoder_out) # (N, T, C) + + ctc_loss = torch.nn.functional.ctc_loss( + log_probs=ctc_output.permute(1, 0, 2), # (T, N, C) + targets=targets, + input_lengths=encoder_out_lens, + target_lengths=target_lengths, + reduction="sum", + ) + return ctc_loss + + def forward_transducer( + self, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + y: k2.RaggedTensor, + y_lens: torch.Tensor, + prune_range: int = 5, + am_scale: float = 0.0, + lm_scale: float = 0.0, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute Transducer loss. + Args: + encoder_out: + Encoder output, of shape (N, T, C). + encoder_out_lens: + Encoder output lengths, of shape (N,). + y: + A ragged tensor with 2 axes [utt][label]. It contains labels of each + utterance. + prune_range: + The prune range for rnnt loss, it means how many symbols(context) + we are considering for each frame to compute the loss. 
+ am_scale: + The scale to smooth the loss with am (output of encoder network) + part + lm_scale: + The scale to smooth the loss with lm (output of predictor network) + part + """ + # Now for the decoder, i.e., the prediction network + blank_id = self.decoder.blank_id + sos_y = add_sos(y, sos_id=blank_id) + + # sos_y_padded: [B, S + 1], start with SOS. + sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id) + + # decoder_out: [B, S + 1, decoder_dim] + decoder_out = self.decoder(sos_y_padded) + + # Note: y does not start with SOS + # y_padded : [B, S] + y_padded = y.pad(mode="constant", padding_value=0) + + y_padded = y_padded.to(torch.int64) + boundary = torch.zeros( + (encoder_out.size(0), 4), + dtype=torch.int64, + device=encoder_out.device, + ) + boundary[:, 2] = y_lens + boundary[:, 3] = encoder_out_lens + + lm = self.simple_lm_proj(decoder_out) + am = self.simple_am_proj(encoder_out) + + # if self.training and random.random() < 0.25: + # lm = penalize_abs_values_gt(lm, 100.0, 1.0e-04) + # if self.training and random.random() < 0.25: + # am = penalize_abs_values_gt(am, 30.0, 1.0e-04) + + with torch.cuda.amp.autocast(enabled=False): + simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( + lm=lm.float(), + am=am.float(), + symbols=y_padded, + termination_symbol=blank_id, + lm_only_scale=lm_scale, + am_only_scale=am_scale, + boundary=boundary, + reduction="sum", + return_grad=True, + ) + + # ranges : [B, T, prune_range] + ranges = k2.get_rnnt_prune_ranges( + px_grad=px_grad, + py_grad=py_grad, + boundary=boundary, + s_range=prune_range, + ) + + # am_pruned : [B, T, prune_range, encoder_dim] + # lm_pruned : [B, T, prune_range, decoder_dim] + am_pruned, lm_pruned = k2.do_rnnt_pruning( + am=self.joiner.encoder_proj(encoder_out), + lm=self.joiner.decoder_proj(decoder_out), + ranges=ranges, + ) + + # logits : [B, T, prune_range, vocab_size] + + # project_input=False since we applied the decoder's input projections + # prior to do_rnnt_pruning (this is an optimization for speed). + logits = self.joiner(am_pruned, lm_pruned, project_input=False) + + with torch.cuda.amp.autocast(enabled=False): + pruned_loss = k2.rnnt_loss_pruned( + logits=logits.float(), + symbols=y_padded, + ranges=ranges, + termination_symbol=blank_id, + boundary=boundary, + reduction="sum", + ) + + return simple_loss, pruned_loss + + def forward( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + y: k2.RaggedTensor, + prune_range: int = 5, + am_scale: float = 0.0, + lm_scale: float = 0.0, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Args: + x: + A 2-D tensor of shape (N, T). + x_lens: + A 1-D tensor of shape (N,). It contains the number of frames in `x` + before padding. + y: + A ragged tensor with 2 axes [utt][label]. It contains labels of each + utterance. + prune_range: + The prune range for rnnt loss, it means how many symbols(context) + we are considering for each frame to compute the loss. 
+ am_scale: + The scale to smooth the loss with am (output of encoder network) + part + lm_scale: + The scale to smooth the loss with lm (output of predictor network) + part + Returns: + Return the transducer losses and CTC loss, + in form of (simple_loss, pruned_loss, ctc_loss) + + Note: + Regarding am_scale & lm_scale, it will make the loss-function one of + the form: + lm_scale * lm_probs + am_scale * am_probs + + (1-lm_scale-am_scale) * combined_probs + """ + assert x.ndim == 2, x.shape + assert x_lens.ndim == 1, x_lens.shape + assert y.num_axes == 2, y.num_axes + + assert x.size(0) == x_lens.size(0) == y.dim0, (x.shape, x_lens.shape, y.dim0) + + # Compute encoder outputs + encoder_out, encoder_out_lens = self.forward_encoder(x, x_lens) + + row_splits = y.shape.row_splits(1) + y_lens = row_splits[1:] - row_splits[:-1] + + if self.use_transducer: + # Compute transducer loss + simple_loss, pruned_loss = self.forward_transducer( + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + y=y.to(x.device), + y_lens=y_lens, + prune_range=prune_range, + am_scale=am_scale, + lm_scale=lm_scale, + ) + else: + simple_loss = torch.empty(0) + pruned_loss = torch.empty(0) + + if self.use_ctc: + # Compute CTC loss + targets = y.values + ctc_loss = self.forward_ctc( + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + targets=targets, + target_lengths=y_lens, + ) + else: + ctc_loss = torch.empty(0) + + return simple_loss, pruned_loss, ctc_loss diff --git a/egs/librispeech/SSL/hubert/optim.py b/egs/librispeech/SSL/hubert/optim.py new file mode 100644 index 0000000000..b83359a1ac --- /dev/null +++ b/egs/librispeech/SSL/hubert/optim.py @@ -0,0 +1,1244 @@ +# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey) +# +# See ../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import logging +import random +from collections import defaultdict +from typing import Dict, List, Optional, Tuple, Union + +import torch +from lhotse.utils import fix_random_seed +from torch import Tensor, nn +from torch.optim import Optimizer + + +class BatchedOptimizer(Optimizer): + """ + This class adds to class Optimizer the capability to optimize parameters in batches: + it will stack the parameters and their grads for you so the optimizer can work + on tensors with an extra leading dimension. This is intended for speed with GPUs, + as it reduces the number of kernels launched in the optimizer. 
+ + Args: + params: + """ + + def __init__(self, params, defaults): + super(BatchedOptimizer, self).__init__(params, defaults) + + @contextlib.contextmanager + def batched_params(self, param_group, group_params_names): + """ + This function returns (technically, yields) a list of + of tuples (p, state), where + p is a `fake` parameter that is stacked (over axis 0) from real parameters + that share the same shape, and its gradient is also stacked; + `state` is the state corresponding to this batch of parameters + (it will be physically located in the "state" for one of the real + parameters, the last one that has any particular shape and dtype). + + This function is decorated as a context manager so that it can + write parameters back to their "real" locations. + + The idea is, instead of doing: + + for p in group["params"]: + state = self.state[p] + ... + + you can do: + + with self.batched_params(group["params"]) as batches: + for p, state, p_names in batches: + ... + + + Args: + group: a parameter group, which is a list of parameters; should be + one of self.param_groups. + group_params_names: name for each parameter in group, + which is List[str]. + """ + batches = defaultdict( + list + ) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter + batches_names = defaultdict( + list + ) # `batches` maps from tuple (dtype_as_str,*shape) to list of str + + assert len(param_group) == len(group_params_names) + for p, named_p in zip(param_group, group_params_names): + key = (str(p.dtype), *p.shape) + batches[key].append(p) + batches_names[key].append(named_p) + + batches_names_keys = list(batches_names.keys()) + sorted_idx = sorted( + range(len(batches_names)), key=lambda i: batches_names_keys[i] + ) + batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx] + batches = [batches[batches_names_keys[idx]] for idx in sorted_idx] + + stacked_params_dict = dict() + + # turn batches into a list, in deterministic order. + # tuples will contain tuples of (stacked_param, state, stacked_params_names), + # one for each batch in `batches`. + tuples = [] + + for batch, batch_names in zip(batches, batches_names): + p = batch[0] + # we arbitrarily store the state in the + # state corresponding to the 1st parameter in the + # group. class Optimizer will take care of saving/loading state. + state = self.state[p] + p_stacked = torch.stack(batch) + grad = torch.stack( + [torch.zeros_like(p) if p.grad is None else p.grad for p in batch] + ) + p_stacked.grad = grad + stacked_params_dict[key] = p_stacked + tuples.append((p_stacked, state, batch_names)) + + yield tuples # <-- calling code will do the actual optimization here! + + for (stacked_params, _state, _names), batch in zip(tuples, batches): + for i, p in enumerate(batch): # batch is list of Parameter + p.copy_(stacked_params[i]) + + +class ScaledAdam(BatchedOptimizer): + """ + Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update + proportional to the norm of that parameter; and also learn the scale of the parameter, + in log space, subject to upper and lower limits (as if we had factored each parameter as + param = underlying_param * log_scale.exp()) + + + Args: + params: The parameters or param_groups to optimize (like other Optimizer subclasses) + Unlike common optimizers, which accept model.parameters() or groups of parameters(), + this optimizer could accept model.named_parameters() or groups of named_parameters(). 
+ See comments of function _get_names_of_parameters for its 4 possible cases. + lr: The learning rate. We will typically use a learning rate schedule that starts + at 0.03 and decreases over time, i.e. much higher than other common + optimizers. + clipping_scale: (e.g. 2.0) + A scale for gradient-clipping: if specified, the normalized gradients + over the whole model will be clipped to have 2-norm equal to + `clipping_scale` times the median 2-norm over the most recent period + of `clipping_update_period` minibatches. By "normalized gradients", + we mean after multiplying by the rms parameter value for this tensor + [for non-scalars]; this is appropriate because our update is scaled + by this quantity. + betas: beta1,beta2 are momentum constants for regular momentum, and moving sum-sq grad. + Must satisfy 0 < beta <= beta2 < 1. + scalar_lr_scale: A scaling factor on the learning rate, that we use to update the + scale of each parameter tensor and scalar parameters of the mode.. + If each parameter were decomposed + as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale + would be a the scaling factor on the learning rate of p_scale. + eps: A general-purpose epsilon to prevent division by zero + param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of + learning the scale on the parameters (we'll constrain the rms of each non-scalar + parameter tensor to be >= this value) + param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of + learning the scale on the parameters (we'll constrain the rms of each non-scalar + parameter tensor to be <= this value) + scalar_max: Maximum absolute value for scalar parameters (applicable if your + model has any parameters with numel() == 1). + size_update_period: The periodicity, in steps, with which we update the size (scale) + of the parameter tensor. This is provided to save a little time + in the update. + clipping_update_period: if clipping_scale is specified, this is the period + """ + + def __init__( + self, + params, + lr=3e-02, + clipping_scale=None, + betas=(0.9, 0.98), + scalar_lr_scale=0.1, + eps=1.0e-08, + param_min_rms=1.0e-05, + param_max_rms=3.0, + scalar_max=10.0, + size_update_period=4, + clipping_update_period=100, + ): + defaults = dict( + lr=lr, + clipping_scale=clipping_scale, + betas=betas, + scalar_lr_scale=scalar_lr_scale, + eps=eps, + param_min_rms=param_min_rms, + param_max_rms=param_max_rms, + scalar_max=scalar_max, + size_update_period=size_update_period, + clipping_update_period=clipping_update_period, + ) + + # If params only contains parameters or group of parameters, + # i.e when parameter names are not given, + # this flag will be set to False in funciton _get_names_of_parameters. 
+ self.show_dominant_parameters = True + param_groups, parameters_names = self._get_names_of_parameters(params) + super(ScaledAdam, self).__init__(param_groups, defaults) + assert len(self.param_groups) == len(parameters_names) + self.parameters_names = parameters_names + + def _get_names_of_parameters( + self, params_or_named_params + ) -> Tuple[List[Dict], List[List[str]]]: + """ + Args: + params_or_named_params: according to the way ScaledAdam is initialized in train.py, + this argument could be one of following 4 cases, + case 1, a generator of parameter, e.g.: + optimizer = ScaledAdam(model.parameters(), lr=params.base_lr, clipping_scale=3.0) + + case 2, a list of parameter groups with different config, e.g.: + model_param_groups = [ + {'params': model.encoder.parameters(), 'lr': 0.05}, + {'params': model.decoder.parameters(), 'lr': 0.01}, + {'params': model.joiner.parameters(), 'lr': 0.03}, + ] + optimizer = ScaledAdam(model_param_groups, lr=params.base_lr, clipping_scale=3.0) + + case 3, a generator of named_parameter, e.g.: + optimizer = ScaledAdam(model.named_parameters(), lr=params.base_lr, clipping_scale=3.0) + + case 4, a list of named_parameter groups with different config, e.g.: + model_named_param_groups = [ + {'named_params': model.encoder.named_parameters(), 'lr': 0.05}, + {'named_params': model.decoder.named_parameters(), 'lr': 0.01}, + {'named_params': model.joiner.named_parameters(), 'lr': 0.03}, + ] + optimizer = ScaledAdam(model_named_param_groups, lr=params.base_lr, clipping_scale=3.0) + + For case 1 and case 2, input params is used to initialize the underlying torch.optimizer. + For case 3 and case 4, firstly, names and params are extracted from input named_params, + then, these extracted params are used to initialize the underlying torch.optimizer, + and these extracted names are mainly used by function + `_show_gradient_dominating_parameter` + + Returns: + Returns a tuple containing 2 elements: + - `param_groups` with type List[Dict], each Dict element is a parameter group. + An example of `param_groups` could be: + [ + {'params': `one iterable of Parameter`, 'lr': 0.05}, + {'params': `another iterable of Parameter`, 'lr': 0.08}, + {'params': `a third iterable of Parameter`, 'lr': 0.1}, + ] + - `param_gruops_names` with type List[List[str]], + each `List[str]` is for a group['params'] in param_groups, + and each `str` is the name of a parameter. + A dummy name "foo" is related to each parameter, + if input are params without names, i.e. case 1 or case 2. + """ + # variable naming convention in this function: + # p is short for param. + # np is short for named_param. + # p_or_np is short for param_or_named_param. + # cur is short for current. + # group is a dict, e.g. {'params': iterable of parameter, 'lr': 0.05, other fields}. + # groups is a List[group] + + iterable_or_groups = list(params_or_named_params) + if len(iterable_or_groups) == 0: + raise ValueError("optimizer got an empty parameter list") + + # The first value of returned tuple. A list of dicts containing at + # least 'params' as a key. + param_groups = [] + + # The second value of returned tuple, + # a List[List[str]], each sub-List is for a group. + param_groups_names = [] + + if not isinstance(iterable_or_groups[0], dict): + # case 1 or case 3, + # the input is an iterable of parameter or named parameter. 
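# The branch below walks that iterable and pairs every parameter with either
# its real name (case 3) or the dummy placeholder "foo" (case 1, which also
# disables the dominant-parameter diagnostics).  A small construction sketch
# for those two cases with a toy model, assuming this file is importable as
# `optim` as in the recipe scripts (lr and clipping_scale values are
# illustrative, not the recipe's settings):
import torch
from optim import ScaledAdam

toy = torch.nn.Sequential(torch.nn.Linear(10, 10), torch.nn.Linear(10, 2))

# case 1: plain parameters -- no names available
opt_plain = ScaledAdam(toy.parameters(), lr=0.04, clipping_scale=2.0)

# case 3: named parameters -- names are kept for gradient diagnostics
opt_named = ScaledAdam(toy.named_parameters(), lr=0.04, clipping_scale=2.0)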
+ param_iterable_cur_group = [] + param_names_cur_group = [] + for p_or_np in iterable_or_groups: + if isinstance(p_or_np, tuple): + # case 3 + name, param = p_or_np + else: + # case 1 + assert isinstance(p_or_np, torch.Tensor) + param = p_or_np + # Assign a dummy name as a placeholder + name = "foo" + self.show_dominant_parameters = False + param_iterable_cur_group.append(param) + param_names_cur_group.append(name) + param_groups.append({"params": param_iterable_cur_group}) + param_groups_names.append(param_names_cur_group) + else: + # case 2 or case 4 + # the input is groups of parameter or named parameter. + for cur_group in iterable_or_groups: + assert "named_params" in cur_group + name_list = [x[0] for x in cur_group["named_params"]] + p_list = [x[1] for x in cur_group["named_params"]] + del cur_group["named_params"] + cur_group["params"] = p_list + param_groups.append(cur_group) + param_groups_names.append(name_list) + + return param_groups, param_groups_names + + def __setstate__(self, state): + super(ScaledAdam, self).__setstate__(state) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + batch = True + + for group, group_params_names in zip(self.param_groups, self.parameters_names): + with self.batched_params(group["params"], group_params_names) as batches: + # batches is list of pairs (stacked_param, state). stacked_param is like + # a regular parameter, and will have a .grad, but the 1st dim corresponds to + # a stacking dim, it is not a real dim. + + if ( + len(batches[0][1]) == 0 + ): # if len(first state) == 0: not yet initialized + clipping_scale = 1 + else: + clipping_scale = self._get_clipping_scale(group, batches) + + for p, state, _ in batches: + # Perform optimization step. + # grad is not going to be None, we handled that when creating the batches. + grad = p.grad + if grad.is_sparse: + raise RuntimeError( + "ScaledAdam optimizer does not support sparse gradients" + ) + # State initialization + if len(state) == 0: + self._init_state(group, p, state) + + self._step_one_batch(group, p, state, clipping_scale) + + return loss + + def _init_state(self, group: dict, p: Tensor, state: dict): + """ + Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p + is actually the batch dimension, corresponding to batched-together + parameters of a given shape. + + + Args: + group: Dict to look up configuration values. + p: The parameter that we are initializing the state for + state: Dict from string to whatever state we are initializing + """ + size_update_period = group["size_update_period"] + + state["step"] = 0 + + kwargs = {"device": p.device, "dtype": p.dtype} + + # 'delta' implements conventional momentum. There are + # several different kinds of update going on, so rather than + # compute "exp_avg" like in Adam, we store and decay a + # parameter-change "delta", which combines all forms of + # update. this is equivalent to how it's done in Adam, + # except for the first few steps. + state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format) + + batch_size = p.shape[0] + numel = p.numel() // batch_size + + if numel > 1: + # "param_rms" just periodically records the scalar root-mean-square value of + # the parameter tensor. 
+ # it has a shape like (batch_size, 1, 1, 1, 1) + param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt() + state["param_rms"] = param_rms + + state["scale_exp_avg_sq"] = torch.zeros_like(param_rms) + state["scale_grads"] = torch.zeros( + size_update_period, *param_rms.shape, **kwargs + ) + + # exp_avg_sq is the weighted sum of scaled gradients. as in Adam. + state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format) + + def _get_clipping_scale( + self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]] + ) -> float: + """ + Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients + by this amount before applying the rest of the update. + + Args: + group: the parameter group, an item in self.param_groups + tuples: a list of tuples of (param, state, param_names) + where param is a batched set of parameters, + with a .grad (1st dim is batch dim) + and state is the state-dict where optimization parameters are kept. + param_names is a List[str] while each str is name for a parameter + in batched set of parameters "param". + """ + assert len(tuples) >= 1 + clipping_scale = group["clipping_scale"] + (first_p, first_state, _) = tuples[0] + step = first_state["step"] + if clipping_scale is None or step == 0: + # no clipping. return early on step == 0 because the other + # parameters' state won't have been initialized yet. + return 1.0 + clipping_update_period = group["clipping_update_period"] + scalar_lr_scale = group["scalar_lr_scale"] + + tot_sumsq = torch.tensor(0.0, device=first_p.device) + for p, state, param_names in tuples: + grad = p.grad + if grad.is_sparse: + raise RuntimeError( + "ScaledAdam optimizer does not support sparse gradients" + ) + if p.numel() == p.shape[0]: # a batch of scalars + tot_sumsq += (grad**2).sum() * ( + scalar_lr_scale**2 + ) # sum() to change shape [1] to [] + else: + tot_sumsq += ((grad * state["param_rms"]) ** 2).sum() + + tot_norm = tot_sumsq.sqrt() + if "model_norms" not in first_state: + first_state["model_norms"] = torch.zeros( + clipping_update_period, device=p.device + ) + first_state["model_norms"][step % clipping_update_period] = tot_norm + + irregular_estimate_steps = [ + i for i in [10, 20, 40] if i < clipping_update_period + ] + if step % clipping_update_period == 0 or step in irregular_estimate_steps: + # Print some stats. + # We don't reach here if step == 0 because we would have returned + # above. + sorted_norms = first_state["model_norms"].sort()[0].to("cpu") + if step in irregular_estimate_steps: + sorted_norms = sorted_norms[-step:] + num_norms = sorted_norms.numel() + quartiles = [] + for n in range(0, 5): + index = min(num_norms - 1, (num_norms // 4) * n) + quartiles.append(sorted_norms[index].item()) + + median = quartiles[2] + if median - median != 0: + raise RuntimeError("Too many grads were not finite") + threshold = clipping_scale * median + if step in irregular_estimate_steps: + # use larger thresholds on first few steps of estimating threshold, + # as norm may be changing rapidly. 
+ threshold = threshold * 2.0 + first_state["model_norm_threshold"] = threshold + percent_clipped = ( + first_state["num_clipped"] * 100.0 / num_norms + if "num_clipped" in first_state + else 0.0 + ) + first_state["num_clipped"] = 0 + quartiles = " ".join(["%.3e" % x for x in quartiles]) + logging.warn( + f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, " + f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}" + ) + + try: + model_norm_threshold = first_state["model_norm_threshold"] + except KeyError: + return 1.0 # threshold has not yet been set. + + ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item()) + if ans != ans: # e.g. ans is nan + ans = 0.0 + if ans < 1.0: + first_state["num_clipped"] += 1 + if ans < 0.1: + logging.warn( + f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}" + ) + if self.show_dominant_parameters: + assert p.shape[0] == len(param_names) + self._show_gradient_dominating_parameter( + tuples, tot_sumsq, group["scalar_lr_scale"] + ) + + if ans == 0.0: + for p, state, param_names in tuples: + p.grad.zero_() # get rid of infinity() + + return ans + + def _show_gradient_dominating_parameter( + self, + tuples: List[Tuple[Tensor, dict, List[str]]], + tot_sumsq: Tensor, + scalar_lr_scale: float, + ): + """ + Show information of parameter which dominates tot_sumsq. + + Args: + tuples: a list of tuples of (param, state, param_names) + where param is a batched set of parameters, + with a .grad (1st dim is batch dim) + and state is the state-dict where optimization parameters are kept. + param_names is a List[str] while each str is name for a parameter + in batched set of parameters "param". + tot_sumsq: sumsq of all parameters. Though it's could be calculated + from tuples, we still pass it to save some time. + """ + all_sumsq_orig = {} + for p, state, batch_param_names in tuples: + # p is a stacked batch parameters. + batch_grad = p.grad + if p.numel() == p.shape[0]: # a batch of scalars + # Dummy values used by following `zip` statement. + batch_rms_orig = torch.full( + p.shape, scalar_lr_scale, device=batch_grad.device + ) + else: + batch_rms_orig = state["param_rms"] + batch_sumsq_orig = (batch_grad * batch_rms_orig) ** 2 + if batch_grad.ndim > 1: + # need to guard it with if-statement because sum() sums over + # all dims if dim == (). + batch_sumsq_orig = batch_sumsq_orig.sum( + dim=list(range(1, batch_grad.ndim)) + ) + for name, sumsq_orig, rms, grad in zip( + batch_param_names, batch_sumsq_orig, batch_rms_orig, batch_grad + ): + proportion_orig = sumsq_orig / tot_sumsq + all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad) + + sorted_by_proportion = { + k: v + for k, v in sorted( + all_sumsq_orig.items(), key=lambda item: item[1][0], reverse=True + ) + } + dominant_param_name = next(iter(sorted_by_proportion)) + ( + dominant_proportion, + dominant_sumsq, + dominant_rms, + dominant_grad, + ) = sorted_by_proportion[dominant_param_name] + logging.warn( + f"Parameter dominating tot_sumsq {dominant_param_name}" + f" with proportion {dominant_proportion:.2f}," + f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)" + f"={dominant_sumsq:.3e}," + f" grad_sumsq={(dominant_grad**2).sum():.3e}," + f" orig_rms_sq={(dominant_rms**2).item():.3e}" + ) + + def _step_one_batch( + self, group: dict, p: Tensor, state: dict, clipping_scale: float + ): + """ + Do the step for one parameter, which is actually going to be a batch of + `real` parameters, with dim 0 as the batch dim. 
+ Args: + group: dict to look up configuration values + p: parameter to update (actually multiple parameters stacked together + as a batch) + state: state-dict for p, to look up the optimizer state + """ + lr = group["lr"] + size_update_period = group["size_update_period"] + beta1 = group["betas"][0] + + grad = p.grad + if clipping_scale != 1.0: + grad *= clipping_scale + step = state["step"] + delta = state["delta"] + + delta.mul_(beta1) + batch_size = p.shape[0] + numel = p.numel() // batch_size + if numel > 1: + # Update the size/scale of p, and set param_rms + scale_grads = state["scale_grads"] + scale_grads[step % size_update_period] = (p * grad).sum( + dim=list(range(1, p.ndim)), keepdim=True + ) + if step % size_update_period == size_update_period - 1: + param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..) + param_rms.copy_( + (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt() + ) + if step > 0: + # self._size_update() learns the overall scale on the + # parameter, by shrinking or expanding it. + self._size_update(group, scale_grads, p, state) + + if numel == 1: + # For parameters with 1 element we just use regular Adam. + # Updates delta. + self._step_scalar(group, p, state) + else: + self._step(group, p, state) + + state["step"] = step + 1 + + def _size_update( + self, group: dict, scale_grads: Tensor, p: Tensor, state: dict + ) -> None: + """ + Called only where p.numel() > 1, this updates the scale of the parameter. + If we imagine: p = underlying_param * scale.exp(), and we are doing + gradient descent on underlying param and on scale, this function does the update + on `scale`. + + Args: + group: dict to look up configuration values + scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing + grads w.r.t. the scales. + p: The parameter to update + state: The state-dict of p + """ + + param_rms = state["param_rms"] + beta1, beta2 = group["betas"] + size_lr = group["lr"] * group["scalar_lr_scale"] + param_min_rms = group["param_min_rms"] + param_max_rms = group["param_max_rms"] + eps = group["eps"] + step = state["step"] + batch_size = p.shape[0] + + size_update_period = scale_grads.shape[0] + # correct beta2 for the size update period: we will have + # faster decay at this level. + beta2_corr = beta2**size_update_period + + scale_exp_avg_sq = state["scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..) + scale_exp_avg_sq.mul_(beta2_corr).add_( + (scale_grads**2).mean(dim=0), # mean over dim `size_update_period` + alpha=1 - beta2_corr, + ) # shape is (batch_size, 1, 1, ...) + + # The 1st time we reach here is when size_step == 1. + size_step = (step + 1) // size_update_period + bias_correction2 = 1 - beta2_corr**size_step + # we don't bother with bias_correction1; this will help prevent divergence + # at the start of training. + + denom = scale_exp_avg_sq.sqrt() + eps + + scale_step = ( + -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom + ) + + is_too_small = param_rms < param_min_rms + + # when the param gets too small, just don't shrink it any further. + scale_step.masked_fill_(is_too_small, 0.0) + + # and ensure the parameter rms after update never exceeds param_max_rms. + # We have to look at the trained model for parameters at or around the + # param_max_rms, because sometimes they can indicate a problem with the + # topology or settings. + scale_step = torch.minimum(scale_step, (param_max_rms - param_rms) / param_rms) + + delta = state["delta"] + # the factor of (1-beta1) relates to momentum. 
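# scale_grads, filled in _step_one_batch as (p * grad).sum(...), is exactly the
# gradient of the loss w.r.t. a log-space scale s if we write p = u * s.exp();
# the delta update on the next line then applies the resulting scale step with
# momentum weight (1 - beta1).  A quick standalone autograd check of that
# identity (toy shapes; `u` and `s` are just illustrative names):
import torch

u = torch.randn(3, 4)                    # fixed "underlying" parameter
s = torch.zeros((), requires_grad=True)  # log-space scale
p = u * s.exp()
loss = (p ** 2).sum()                    # any differentiable loss
loss.backward()

# dloss/ds == sum_i (dloss/dp_i) * p_i, the quantity stored in scale_grads
manual = (2 * p.detach() * p.detach()).sum()
assert torch.allclose(s.grad, manual)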
+ delta.add_(p * scale_step, alpha=(1 - beta1)) + + def _step(self, group: dict, p: Tensor, state: dict): + """ + This function does the core update of self.step(), in the case where the members of + the batch have more than 1 element. + + Args: + group: A dict which will be used to look up configuration values + p: The parameter to be updated + grad: The grad of p + state: The state-dict corresponding to parameter p + + This function modifies p. + """ + grad = p.grad + lr = group["lr"] + beta1, beta2 = group["betas"] + eps = group["eps"] + param_min_rms = group["param_min_rms"] + step = state["step"] + + exp_avg_sq = state["exp_avg_sq"] + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2)) + + this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0) + bias_correction2 = 1 - beta2 ** (this_step + 1) + if bias_correction2 < 0.99: + # note: not in-place. + exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2) + + denom = exp_avg_sq.sqrt() + denom += eps + grad = grad / denom + + alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms) + + delta = state["delta"] + delta.add_(grad * alpha) + p.add_(delta) + + def _step_scalar(self, group: dict, p: Tensor, state: dict): + """ + A simplified form of the core update for scalar tensors, where we cannot get a good + estimate of the parameter rms. + """ + beta1, beta2 = group["betas"] + scalar_max = group["scalar_max"] + eps = group["eps"] + lr = group["lr"] * group["scalar_lr_scale"] + grad = p.grad + + exp_avg_sq = state["exp_avg_sq"] # shape: (batch_size,) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + + # bias_correction2 is like in Adam. Don't bother with bias_correction1; + # slower update at the start will help stability anyway. + bias_correction2 = 1 - beta2 ** (state["step"] + 1) + denom = (exp_avg_sq / bias_correction2).sqrt() + eps + + delta = state["delta"] + delta.add_(grad / denom, alpha=-lr * (1 - beta1)) + p.clamp_(min=-scalar_max, max=scalar_max) + p.add_(delta) + + +class LRScheduler(object): + """ + Base-class for learning rate schedulers where the learning-rate depends on both the + batch and the epoch. + """ + + def __init__(self, optimizer: Optimizer, verbose: bool = False): + # Attach optimizer + if not isinstance(optimizer, Optimizer): + raise TypeError("{} is not an Optimizer".format(type(optimizer).__name__)) + self.optimizer = optimizer + self.verbose = verbose + + for group in optimizer.param_groups: + group.setdefault("base_lr", group["lr"]) + + self.base_lrs = [group["base_lr"] for group in optimizer.param_groups] + + self.epoch = 0 + self.batch = 0 + + def state_dict(self): + """Returns the state of the scheduler as a :class:`dict`. + + It contains an entry for every variable in self.__dict__ which + is not the optimizer. + """ + return { + "base_lrs": self.base_lrs, + "epoch": self.epoch, + "batch": self.batch, + } + + def load_state_dict(self, state_dict): + """Loads the schedulers state. + + Args: + state_dict (dict): scheduler state. Should be an object returned + from a call to :meth:`state_dict`. + """ + self.__dict__.update(state_dict) + + def get_last_lr(self) -> List[float]: + """Return last computed learning rate by current scheduler. Will be a list of float.""" + return self._last_lr + + def get_lr(self): + # Compute list of learning rates from self.epoch and self.batch and + # self.base_lrs; this must be overloaded by the user. + # e.g. 
return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs ] + raise NotImplementedError + + def step_batch(self, batch: Optional[int] = None) -> None: + # Step the batch index, or just set it. If `batch` is specified, it + # must be the batch index from the start of training, i.e. summed over + # all epochs. + # You can call this in any order; if you don't provide 'batch', it should + # of course be called once per batch. + if batch is not None: + self.batch = batch + else: + self.batch = self.batch + 1 + self._set_lrs() + + def step_epoch(self, epoch: Optional[int] = None): + # Step the epoch index, or just set it. If you provide the 'epoch' arg, + # you should call this at the start of the epoch; if you don't provide the 'epoch' + # arg, you should call it at the end of the epoch. + if epoch is not None: + self.epoch = epoch + else: + self.epoch = self.epoch + 1 + self._set_lrs() + + def _set_lrs(self): + values = self.get_lr() + assert len(values) == len(self.optimizer.param_groups) + + for i, data in enumerate(zip(self.optimizer.param_groups, values)): + param_group, lr = data + param_group["lr"] = lr + self.print_lr(self.verbose, i, lr) + self._last_lr = [group["lr"] for group in self.optimizer.param_groups] + + def print_lr(self, is_verbose, group, lr): + """Display the current learning rate.""" + if is_verbose: + logging.warn( + f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate" + f" of group {group} to {lr:.4e}." + ) + + +class Eden(LRScheduler): + """ + Eden scheduler. + The basic formula (before warmup) is: + lr = base_lr * (((batch**2 + lr_batches**2) / lr_batches**2) ** -0.25 * + (((epoch**2 + lr_epochs**2) / lr_epochs**2) ** -0.25)) * warmup + where `warmup` increases from linearly 0.5 to 1 over `warmup_batches` batches + and then stays constant at 1. + + If you don't have the concept of epochs, or one epoch takes a very long time, + you can replace the notion of 'epoch' with some measure of the amount of data + processed, e.g. hours of data or frames of data, with 'lr_epochs' being set to + some measure representing "quite a lot of data": say, one fifth or one third + of an entire training run, but it doesn't matter much. You could also use + Eden2 which has only the notion of batches. + + We suggest base_lr = 0.04 (passed to optimizer) if used with ScaledAdam + + Args: + optimizer: the optimizer to change the learning rates on + lr_batches: the number of batches after which we start significantly + decreasing the learning rate, suggest 5000. + lr_epochs: the number of epochs after which we start significantly + decreasing the learning rate, suggest 6 if you plan to do e.g. + 20 to 40 epochs, but may need smaller number if dataset is huge + and you will do few epochs. 
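+
+    Example (an illustrative sketch only; `model`, `num_epochs`, `train_dl` and
+    `compute_loss` are assumed to be defined by the caller):
+
+        optimizer = ScaledAdam(model.parameters(), lr=0.04)
+        scheduler = Eden(optimizer, lr_batches=5000, lr_epochs=6)
+        for epoch in range(num_epochs):
+            scheduler.step_epoch(epoch)
+            for batch in train_dl:
+                loss = compute_loss(model, batch)  # placeholder loss function
+                loss.backward()
+                optimizer.step()
+                optimizer.zero_grad()
+                scheduler.step_batch()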
+ """ + + def __init__( + self, + optimizer: Optimizer, + lr_batches: Union[int, float], + lr_epochs: Union[int, float], + warmup_batches: Union[int, float] = 500.0, + warmup_start: float = 0.5, + verbose: bool = False, + ): + super(Eden, self).__init__(optimizer, verbose) + self.lr_batches = lr_batches + self.lr_epochs = lr_epochs + self.warmup_batches = warmup_batches + + assert 0.0 <= warmup_start <= 1.0, warmup_start + self.warmup_start = warmup_start + + def get_lr(self): + factor = ( + (self.batch**2 + self.lr_batches**2) / self.lr_batches**2 + ) ** -0.25 * ( + ((self.epoch**2 + self.lr_epochs**2) / self.lr_epochs**2) ** -0.25 + ) + warmup_factor = ( + 1.0 + if self.batch >= self.warmup_batches + else self.warmup_start + + (1.0 - self.warmup_start) * (self.batch / self.warmup_batches) + # else 0.5 + 0.5 * (self.batch / self.warmup_batches) + ) + + return [x * factor * warmup_factor for x in self.base_lrs] + + +class Eden2(LRScheduler): + """ + Eden2 scheduler, simpler than Eden because it does not use the notion of epoch, + only batches. + + The basic formula (before warmup) is: + lr = base_lr * ((batch**2 + lr_batches**2) / lr_batches**2) ** -0.5) * warmup + + where `warmup` increases from linearly 0.5 to 1 over `warmup_batches` batches + and then stays constant at 1. + + + E.g. suggest base_lr = 0.04 (passed to optimizer) if used with ScaledAdam + + Args: + optimizer: the optimizer to change the learning rates on + lr_batches: the number of batches after which we start significantly + decreasing the learning rate, suggest 5000. + """ + + def __init__( + self, + optimizer: Optimizer, + lr_batches: Union[int, float], + warmup_batches: Union[int, float] = 500.0, + warmup_start: float = 0.5, + verbose: bool = False, + ): + super().__init__(optimizer, verbose) + self.lr_batches = lr_batches + self.warmup_batches = warmup_batches + + assert 0.0 <= warmup_start <= 1.0, warmup_start + self.warmup_start = warmup_start + + def get_lr(self): + factor = ( + (self.batch**2 + self.lr_batches**2) / self.lr_batches**2 + ) ** -0.5 + warmup_factor = ( + 1.0 + if self.batch >= self.warmup_batches + else self.warmup_start + + (1.0 - self.warmup_start) * (self.batch / self.warmup_batches) + # else 0.5 + 0.5 * (self.batch / self.warmup_batches) + ) + + return [x * factor * warmup_factor for x in self.base_lrs] + + +def _test_eden(): + m = torch.nn.Linear(100, 100) + optim = ScaledAdam(m.parameters(), lr=0.03) + + scheduler = Eden(optim, lr_batches=100, lr_epochs=2, verbose=True) + + for epoch in range(10): + scheduler.step_epoch(epoch) # sets epoch to `epoch` + + for step in range(20): + x = torch.randn(200, 100).detach() + x.requires_grad = True + y = m(x) + dy = torch.randn(200, 100).detach() + f = (y * dy).sum() + f.backward() + + optim.step() + scheduler.step_batch() + optim.zero_grad() + + logging.info(f"last lr = {scheduler.get_last_lr()}") + logging.info(f"state dict = {scheduler.state_dict()}") + + +# This is included mostly as a baseline for ScaledAdam. +class Eve(Optimizer): + """ + Implements Eve algorithm. This is a modified version of AdamW with a special + way of setting the weight-decay / shrinkage-factor, which is designed to make the + rms of the parameters approach a particular target_rms (default: 0.1). This is + for use with networks with 'scaled' versions of modules (see scaling.py), which + will be close to invariant to the absolute scale on the parameter matrix. + + The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. 
+ The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. + Eve is unpublished so far. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay coefficient (default: 3e-4; + this value means that the weight would decay significantly after + about 3k minibatches. Is not multiplied by learning rate, but + is conditional on RMS-value of parameter being > target_rms. + target_rms (float, optional): target root-mean-square value of + parameters, if they fall below this we will stop applying weight decay. + + + .. _Adam: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _Decoupled Weight Decay Regularization: + https://arxiv.org/abs/1711.05101 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.98), + eps=1e-8, + weight_decay=1e-3, + target_rms=0.1, + ): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0 <= weight_decay <= 0.1: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + if not 0 < target_rms <= 10.0: + raise ValueError("Invalid target_rms value: {}".format(target_rms)) + defaults = dict( + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + target_rms=target_rms, + ) + super(Eve, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Eve, self).__setstate__(state) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + for p in group["params"]: + if p.grad is None: + continue + + # Perform optimization step + grad = p.grad + if grad.is_sparse: + raise RuntimeError("AdamW does not support sparse gradients") + + state = self.state[p] + + # State initialization + if len(state) == 0: + state["step"] = 0 + # Exponential moving average of gradient values + state["exp_avg"] = torch.zeros_like( + p, memory_format=torch.preserve_format + ) + # Exponential moving average of squared gradient values + state["exp_avg_sq"] = torch.zeros_like( + p, memory_format=torch.preserve_format + ) + + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + + beta1, beta2 = group["betas"] + + state["step"] += 1 + bias_correction1 = 1 - beta1 ** state["step"] + bias_correction2 = 1 - beta2 ** state["step"] + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + denom = (exp_avg_sq.sqrt() * (bias_correction2**-0.5)).add_( + group["eps"] + ) + + step_size = group["lr"] / bias_correction1 + target_rms = group["target_rms"] + weight_decay = group["weight_decay"] + + if p.numel() > 1: + # avoid applying this weight-decay on "scaling factors" + # (which are scalar). + is_above_target_rms = p.norm() > (target_rms * (p.numel() ** 0.5)) + p.mul_(1 - (weight_decay * is_above_target_rms)) + + p.addcdiv_(exp_avg, denom, value=-step_size) + + if random.random() < 0.0005: + step = (exp_avg / denom) * step_size + logging.info( + f"Delta rms = {(step**2).mean().item()}, shape = {step.shape}" + ) + + return loss + + +def _test_scaled_adam(hidden_dim: int): + import timeit + + from scaling import ScaledLinear + + E = 100 + B = 4 + T = 2 + logging.info("in test_eve_cain") + # device = torch.device('cuda') + device = torch.device("cpu") + dtype = torch.float32 + + fix_random_seed(42) + # these input_magnitudes and output_magnitudes are to test that + # Abel is working as we expect and is able to adjust scales of + # different dims differently. + input_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp() + output_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp() + + for iter in [1, 0]: + fix_random_seed(42) + Linear = torch.nn.Linear if iter == 0 else ScaledLinear + + m = torch.nn.Sequential( + Linear(E, hidden_dim), + torch.nn.PReLU(), + Linear(hidden_dim, hidden_dim), + torch.nn.PReLU(), + Linear(hidden_dim, E), + ).to(device) + + train_pairs = [ + ( + 100.0 + * torch.randn(B, T, E, device=device, dtype=dtype) + * input_magnitudes, + torch.randn(B, T, E, device=device, dtype=dtype) * output_magnitudes, + ) + for _ in range(20) + ] + + if iter == 0: + optim = Eve(m.parameters(), lr=0.003) + elif iter == 1: + optim = ScaledAdam(m.parameters(), lr=0.03, clipping_scale=2.0) + scheduler = Eden(optim, lr_batches=200, lr_epochs=5, verbose=False) + + start = timeit.default_timer() + avg_loss = 0.0 + for epoch in range(180): + scheduler.step_epoch() + # if epoch == 100 and iter in [2,3]: + # optim.reset_speedup() # check it doesn't crash. 
+ + # if epoch == 130: + # opts = diagnostics.TensorDiagnosticOptions( + # 512 + # ) # allow 4 megabytes per sub-module + # diagnostic = diagnostics.attach_diagnostics(m, opts) + + for n, (x, y) in enumerate(train_pairs): + y_out = m(x) + loss = ((y_out - y) ** 2).mean() * 100.0 + if epoch == 0 and n == 0: + avg_loss = loss.item() + else: + avg_loss = 0.98 * avg_loss + 0.02 * loss.item() + if n == 0 and epoch % 5 == 0: + # norm1 = '%.2e' % (m[0].weight**2).mean().sqrt().item() + # norm1b = '%.2e' % (m[0].bias**2).mean().sqrt().item() + # norm2 = '%.2e' % (m[2].weight**2).mean().sqrt().item() + # norm2b = '%.2e' % (m[2].bias**2).mean().sqrt().item() + # scale1 = '%.2e' % (m[0].weight_scale.exp().item()) + # scale1b = '%.2e' % (m[0].bias_scale.exp().item()) + # scale2 = '%.2e' % (m[2].weight_scale.exp().item()) + # scale2b = '%.2e' % (m[2].bias_scale.exp().item()) + lr = scheduler.get_last_lr()[0] + logging.info( + f"Iter {iter}, epoch {epoch}, batch {n}, avg_loss {avg_loss:.4g}, lr={lr:.4e}" + ) # , norms={norm1,norm1b,norm2,norm2b}") # scales={scale1,scale1b,scale2,scale2b} + loss.log().backward() + optim.step() + optim.zero_grad() + scheduler.step_batch() + + # diagnostic.print_diagnostics() + + stop = timeit.default_timer() + logging.info(f"Iter={iter}, Time taken: {stop - start}") + + logging.info(f"last lr = {scheduler.get_last_lr()}") + # logging.info("state dict = ", scheduler.state_dict()) + # logging.info("optim state_dict = ", optim.state_dict()) + logging.info(f"input_magnitudes = {input_magnitudes}") + logging.info(f"output_magnitudes = {output_magnitudes}") + + +if __name__ == "__main__": + torch.set_num_threads(1) + torch.set_num_interop_threads(1) + logging.getLogger().setLevel(logging.INFO) + import subprocess + + s = subprocess.check_output( + "git status -uno .; git log -1; git diff HEAD .", shell=True + ) + logging.info(s) + import sys + + if len(sys.argv) > 1: + hidden_dim = int(sys.argv[1]) + else: + hidden_dim = 200 + + _test_scaled_adam(hidden_dim) + _test_eden() diff --git a/egs/librispeech/SSL/hubert/scaling.py b/egs/librispeech/SSL/hubert/scaling.py new file mode 100644 index 0000000000..29ac33c02b --- /dev/null +++ b/egs/librispeech/SSL/hubert/scaling.py @@ -0,0 +1,1908 @@ +# Copyright 2022-2023 Xiaomi Corp. (authors: Daniel Povey) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import logging +import math +import random +from typing import Optional, Tuple, Union + +import k2 +import torch +import torch.nn as nn +from torch import Tensor +from torch.cuda.amp import custom_bwd, custom_fwd + + +def logaddexp_onnx(x: Tensor, y: Tensor) -> Tensor: + max_value = torch.max(x, y) + diff = torch.abs(x - y) + return max_value + torch.log1p(torch.exp(-diff)) + + +# RuntimeError: Exporting the operator logaddexp to ONNX opset version +# 14 is not supported. Please feel free to request support or submit +# a pull request on PyTorch GitHub. 
+# +# The following function is to solve the above error when exporting +# models to ONNX via torch.jit.trace() +def logaddexp(x: Tensor, y: Tensor) -> Tensor: + # Caution(fangjun): Put torch.jit.is_scripting() before + # torch.onnx.is_in_onnx_export(); + # otherwise, it will cause errors for torch.jit.script(). + # + # torch.logaddexp() works for both torch.jit.script() and + # torch.jit.trace() but it causes errors for ONNX export. + # + if torch.jit.is_scripting(): + # Note: We cannot use torch.jit.is_tracing() here as it also + # matches torch.onnx.export(). + return torch.logaddexp(x, y) + elif torch.onnx.is_in_onnx_export(): + return logaddexp_onnx(x, y) + else: + # for torch.jit.trace() + return torch.logaddexp(x, y) + + +class PiecewiseLinear(object): + """ + Piecewise linear function, from float to float, specified as nonempty list of (x,y) pairs with + the x values in order. x values <[initial x] or >[final x] are map to [initial y], [final y] + respectively. + """ + + def __init__(self, *args): + assert len(args) >= 1, len(args) + if len(args) == 1 and isinstance(args[0], PiecewiseLinear): + self.pairs = list(args[0].pairs) + else: + self.pairs = [(float(x), float(y)) for x, y in args] + for x, y in self.pairs: + assert isinstance(x, (float, int)), type(x) + assert isinstance(y, (float, int)), type(y) + + for i in range(len(self.pairs) - 1): + assert self.pairs[i + 1][0] > self.pairs[i][0], ( + i, + self.pairs[i], + self.pairs[i + 1], + ) + + def __str__(self): + # e.g. 'PiecewiseLinear((0., 10.), (100., 0.))' + return f"PiecewiseLinear({str(self.pairs)[1:-1]})" + + def __call__(self, x): + if x <= self.pairs[0][0]: + return self.pairs[0][1] + elif x >= self.pairs[-1][0]: + return self.pairs[-1][1] + else: + cur_x, cur_y = self.pairs[0] + for i in range(1, len(self.pairs)): + next_x, next_y = self.pairs[i] + if x >= cur_x and x <= next_x: + return cur_y + (next_y - cur_y) * (x - cur_x) / (next_x - cur_x) + cur_x, cur_y = next_x, next_y + assert False + + def __mul__(self, alpha): + return PiecewiseLinear(*[(x, y * alpha) for x, y in self.pairs]) + + def __add__(self, x): + if isinstance(x, (float, int)): + return PiecewiseLinear(*[(p[0], p[1] + x) for p in self.pairs]) + s, x = self.get_common_basis(x) + return PiecewiseLinear( + *[(sp[0], sp[1] + xp[1]) for sp, xp in zip(s.pairs, x.pairs)] + ) + + def max(self, x): + if isinstance(x, (float, int)): + x = PiecewiseLinear((0, x)) + s, x = self.get_common_basis(x, include_crossings=True) + return PiecewiseLinear( + *[(sp[0], max(sp[1], xp[1])) for sp, xp in zip(s.pairs, x.pairs)] + ) + + def min(self, x): + if isinstance(x, float) or isinstance(x, int): + x = PiecewiseLinear((0, x)) + s, x = self.get_common_basis(x, include_crossings=True) + return PiecewiseLinear( + *[(sp[0], min(sp[1], xp[1])) for sp, xp in zip(s.pairs, x.pairs)] + ) + + def __eq__(self, other): + return self.pairs == other.pairs + + def get_common_basis(self, p: "PiecewiseLinear", include_crossings: bool = False): + """ + Returns (self_mod, p_mod) which are equivalent piecewise linear + functions to self and p, but with the same x values. + + p: the other piecewise linear function + include_crossings: if true, include in the x values positions + where the functions indicate by this and p crosss. + """ + assert isinstance(p, PiecewiseLinear), type(p) + + # get sorted x-values without repetition. 
+ x_vals = sorted(set([x for x, _ in self.pairs] + [x for x, _ in p.pairs])) + y_vals1 = [self(x) for x in x_vals] + y_vals2 = [p(x) for x in x_vals] + + if include_crossings: + extra_x_vals = [] + for i in range(len(x_vals) - 1): + if (y_vals1[i] > y_vals2[i]) != (y_vals1[i + 1] > y_vals2[i + 1]): + # if the two lines in this subsegment potentially cross each other.. + diff_cur = abs(y_vals1[i] - y_vals2[i]) + diff_next = abs(y_vals1[i + 1] - y_vals2[i + 1]) + # `pos`, between 0 and 1, gives the relative x position, + # with 0 being x_vals[i] and 1 being x_vals[i+1]. + pos = diff_cur / (diff_cur + diff_next) + extra_x_val = x_vals[i] + pos * (x_vals[i + 1] - x_vals[i]) + extra_x_vals.append(extra_x_val) + if len(extra_x_vals) > 0: + x_vals = sorted(set(x_vals + extra_x_vals)) + y_vals1 = [self(x) for x in x_vals] + y_vals2 = [p(x) for x in x_vals] + return ( + PiecewiseLinear(*zip(x_vals, y_vals1)), + PiecewiseLinear(*zip(x_vals, y_vals2)), + ) + + +class ScheduledFloat(torch.nn.Module): + """ + This object is a torch.nn.Module only because we want it to show up in [top_level module].modules(); + it does not have a working forward() function. You are supposed to cast it to float, as + in, float(parent_module.whatever), and use it as something like a dropout prob. + + It is a floating point value whose value changes depending on the batch count of the + training loop. It is a piecewise linear function where you specify the (x,y) pairs + in sorted order on x; x corresponds to the batch index. For batch-index values before the + first x or after the last x, we just use the first or last y value. + + Example: + self.dropout = ScheduledFloat((0.0, 0.2), (4000.0, 0.0), default=0.0) + + `default` is used when self.batch_count is not set or not in training mode or in + torch.jit scripting mode. + """ + + def __init__(self, *args, default: float = 0.0): + super().__init__() + # self.batch_count and self.name will be written to in the training loop. + self.batch_count = None + self.name = None + self.default = default + self.schedule = PiecewiseLinear(*args) + + def extra_repr(self) -> str: + return ( + f"batch_count={self.batch_count}, schedule={str(self.schedule.pairs[1:-1])}" + ) + + def __float__(self): + batch_count = self.batch_count + if ( + batch_count is None + or not self.training + or torch.jit.is_scripting() + or torch.jit.is_tracing() + ): + return float(self.default) + else: + ans = self.schedule(self.batch_count) + if random.random() < 0.0002: + logging.info( + f"ScheduledFloat: name={self.name}, batch_count={self.batch_count}, ans={ans}" + ) + return ans + + def __add__(self, x): + if isinstance(x, float) or isinstance(x, int): + return ScheduledFloat(self.schedule + x, default=self.default) + else: + return ScheduledFloat( + self.schedule + x.schedule, default=self.default + x.default + ) + + def max(self, x): + if isinstance(x, float) or isinstance(x, int): + return ScheduledFloat(self.schedule.max(x), default=self.default) + else: + return ScheduledFloat( + self.schedule.max(x.schedule), default=max(self.default, x.default) + ) + + +FloatLike = Union[float, ScheduledFloat] + + +def random_cast_to_half(x: Tensor, min_abs: float = 5.0e-06) -> Tensor: + """ + A randomized way of casting a floating point value to half precision. + """ + if x.dtype == torch.float16: + return x + x_abs = x.abs() + is_too_small = x_abs < min_abs + # for elements where is_too_small is true, random_val will contain +-min_abs with + # probability (x.abs() / min_abs), and 0.0 otherwise. 
[so this preserves expectations, + # for those elements]. + random_val = min_abs * x.sign() * (torch.rand_like(x) * min_abs < x_abs) + return torch.where(is_too_small, random_val, x).to(torch.float16) + + +class CutoffEstimator: + """ + Estimates cutoffs of an arbitrary numerical quantity such that a specified + proportion of items will be above the cutoff on average. + + p is the proportion of items that should be above the cutoff. + """ + + def __init__(self, p: float): + self.p = p + # total count of items + self.count = 0 + # total count of items that were above the cutoff + self.count_above = 0 + # initial cutoff value + self.cutoff = 0 + + def __call__(self, x: float) -> bool: + """ + Returns true if x is above the cutoff. + """ + ans = x > self.cutoff + self.count += 1 + if ans: + self.count_above += 1 + cur_p = self.count_above / self.count + delta_p = cur_p - self.p + if (delta_p > 0) == ans: + q = abs(delta_p) + self.cutoff = x * q + self.cutoff * (1 - q) + return ans + + +class SoftmaxFunction(torch.autograd.Function): + """ + Tries to handle half-precision derivatives in a randomized way that should + be more accurate for training than the default behavior. + """ + + @staticmethod + def forward(ctx, x: Tensor, dim: int): + ans = x.softmax(dim=dim) + # if x dtype is float16, x.softmax() returns a float32 because + # (presumably) that op does not support float16, and autocast + # is enabled. + if torch.is_autocast_enabled(): + ans = ans.to(torch.float16) + ctx.save_for_backward(ans) + ctx.x_dtype = x.dtype + ctx.dim = dim + return ans + + @staticmethod + def backward(ctx, ans_grad: Tensor): + (ans,) = ctx.saved_tensors + with torch.cuda.amp.autocast(enabled=False): + ans_grad = ans_grad.to(torch.float32) + ans = ans.to(torch.float32) + x_grad = ans_grad * ans + x_grad = x_grad - ans * x_grad.sum(dim=ctx.dim, keepdim=True) + return x_grad, None + + +def softmax(x: Tensor, dim: int): + if not x.requires_grad or torch.jit.is_scripting() or torch.jit.is_tracing(): + return x.softmax(dim=dim) + + return SoftmaxFunction.apply(x, dim) + + +class MaxEigLimiterFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x: Tensor, + coeffs: Tensor, + direction: Tensor, + channel_dim: int, + grad_scale: float, + ) -> Tensor: + ctx.channel_dim = channel_dim + ctx.grad_scale = grad_scale + ctx.save_for_backward(x.detach(), coeffs.detach(), direction.detach()) + return x + + @staticmethod + def backward(ctx, x_grad, *args): + with torch.enable_grad(): + (x_orig, coeffs, new_direction) = ctx.saved_tensors + x_orig.requires_grad = True + num_channels = x_orig.shape[ctx.channel_dim] + x = x_orig.transpose(ctx.channel_dim, -1).reshape(-1, num_channels) + new_direction.requires_grad = False + x = x - x.mean(dim=0) + x_var = (x**2).mean() + x_residual = x - coeffs * new_direction + x_residual_var = (x_residual**2).mean() + # `variance_proportion` is the proportion of the variance accounted for + # by the top eigen-direction. This is to be minimized. 
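+            # Backpropagating through this ratio gives a gradient on x that
+            # reduces the share of variance along `new_direction`; it is then
+            # rescaled so its norm is grad_scale times the norm of the incoming
+            # x_grad before being added to x_grad.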
+ variance_proportion = (x_var - x_residual_var) / (x_var + 1.0e-20) + variance_proportion.backward() + x_orig_grad = x_orig.grad + x_extra_grad = ( + x_orig.grad + * ctx.grad_scale + * x_grad.norm() + / (x_orig_grad.norm() + 1.0e-20) + ) + return x_grad + x_extra_grad.detach(), None, None, None, None + + +class BiasNormFunction(torch.autograd.Function): + # This computes: + # scales = (torch.mean((x - bias) ** 2, keepdim=True)) ** -0.5 * log_scale.exp() + # return x * scales + # (after unsqueezing the bias), but it does it in a memory-efficient way so that + # it can just store the returned value (chances are, this will also be needed for + # some other reason, related to the next operation, so we can save memory). + @staticmethod + def forward( + ctx, + x: Tensor, + bias: Tensor, + log_scale: Tensor, + channel_dim: int, + store_output_for_backprop: bool, + ) -> Tensor: + assert bias.ndim == 1 + if channel_dim < 0: + channel_dim = channel_dim + x.ndim + ctx.store_output_for_backprop = store_output_for_backprop + ctx.channel_dim = channel_dim + for _ in range(channel_dim + 1, x.ndim): + bias = bias.unsqueeze(-1) + scales = ( + torch.mean((x - bias) ** 2, dim=channel_dim, keepdim=True) ** -0.5 + ) * log_scale.exp() + ans = x * scales + ctx.save_for_backward( + ans.detach() if store_output_for_backprop else x, + scales.detach(), + bias.detach(), + log_scale.detach(), + ) + return ans + + @staticmethod + def backward(ctx, ans_grad: Tensor) -> Tensor: + ans_or_x, scales, bias, log_scale = ctx.saved_tensors + if ctx.store_output_for_backprop: + x = ans_or_x / scales + else: + x = ans_or_x + x = x.detach() + x.requires_grad = True + bias.requires_grad = True + log_scale.requires_grad = True + with torch.enable_grad(): + # recompute scales from x, bias and log_scale. + scales = ( + torch.mean((x - bias) ** 2, dim=ctx.channel_dim, keepdim=True) ** -0.5 + ) * log_scale.exp() + ans = x * scales + ans.backward(gradient=ans_grad) + return x.grad, bias.grad.flatten(), log_scale.grad, None, None + + +class BiasNorm(torch.nn.Module): + """ + This is intended to be a simpler, and hopefully cheaper, replacement for + LayerNorm. The observation this is based on, is that Transformer-type + networks, especially with pre-norm, sometimes seem to set one of the + feature dimensions to a large constant value (e.g. 50), which "defeats" + the LayerNorm because the output magnitude is then not strongly dependent + on the other (useful) features. Presumably the weight and bias of the + LayerNorm are required to allow it to do this. + + Instead, we give the BiasNorm a trainable bias that it can use when + computing the scale for normalization. We also give it a (scalar) + trainable scale on the output. + + + Args: + num_channels: the number of channels, e.g. 512. + channel_dim: the axis/dimension corresponding to the channel, + interpreted as an offset from the input's ndim if negative. + This is NOT the num_channels; it should typically be one of + {-2, -1, 0, 1, 2, 3}. + log_scale: the initial log-scale that we multiply the output by; this + is learnable. + log_scale_min: FloatLike, minimum allowed value of log_scale + log_scale_max: FloatLike, maximum allowed value of log_scale + store_output_for_backprop: only possibly affects memory use; recommend + to set to True if you think the output of this module is more likely + than the input of this module to be required to be stored for the + backprop. + """ + + def __init__( + self, + num_channels: int, + channel_dim: int = -1, # CAUTION: see documentation. 
+ log_scale: float = 1.0, + log_scale_min: float = -1.5, + log_scale_max: float = 1.5, + store_output_for_backprop: bool = False, + ) -> None: + super(BiasNorm, self).__init__() + self.num_channels = num_channels + self.channel_dim = channel_dim + self.log_scale = nn.Parameter(torch.tensor(log_scale)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + + self.log_scale_min = log_scale_min + self.log_scale_max = log_scale_max + + self.store_output_for_backprop = store_output_for_backprop + + def forward(self, x: Tensor) -> Tensor: + assert x.shape[self.channel_dim] == self.num_channels + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + channel_dim = self.channel_dim + if channel_dim < 0: + channel_dim += x.ndim + bias = self.bias + for _ in range(channel_dim + 1, x.ndim): + bias = bias.unsqueeze(-1) + scales = ( + torch.mean((x - bias) ** 2, dim=channel_dim, keepdim=True) ** -0.5 + ) * self.log_scale.exp() + return x * scales + + log_scale = limit_param_value( + self.log_scale, + min=float(self.log_scale_min), + max=float(self.log_scale_max), + training=self.training, + ) + + return BiasNormFunction.apply( + x, self.bias, log_scale, self.channel_dim, self.store_output_for_backprop + ) + + +def ScaledLinear(*args, initial_scale: float = 1.0, **kwargs) -> nn.Linear: + """ + Behaves like a constructor of a modified version of nn.Linear + that gives an easy way to set the default initial parameter scale. + + Args: + Accepts the standard args and kwargs that nn.Linear accepts + e.g. in_features, out_features, bias=False. + + initial_scale: you can override this if you want to increase + or decrease the initial magnitude of the module's output + (affects the initialization of weight_scale and bias_scale). + Another option, if you want to do something like this, is + to re-initialize the parameters. + """ + ans = nn.Linear(*args, **kwargs) + with torch.no_grad(): + ans.weight[:] *= initial_scale + if ans.bias is not None: + torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, 0.1 * initial_scale) + return ans + + +def ScaledConv1d(*args, initial_scale: float = 1.0, **kwargs) -> nn.Conv1d: + """ + Behaves like a constructor of a modified version of nn.Conv1d + that gives an easy way to set the default initial parameter scale. + + Args: + Accepts the standard args and kwargs that nn.Linear accepts + e.g. in_features, out_features, bias=False. + + initial_scale: you can override this if you want to increase + or decrease the initial magnitude of the module's output + (affects the initialization of weight_scale and bias_scale). + Another option, if you want to do something like this, is + to re-initialize the parameters. + """ + ans = nn.Conv1d(*args, **kwargs) + with torch.no_grad(): + ans.weight[:] *= initial_scale + if ans.bias is not None: + torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, 0.1 * initial_scale) + return ans + + +def ScaledConv2d(*args, initial_scale: float = 1.0, **kwargs) -> nn.Conv2d: + """ + Behaves like a constructor of a modified version of nn.Conv2d + that gives an easy way to set the default initial parameter scale. + + Args: + Accepts the standard args and kwargs that nn.Linear accepts + e.g. in_features, out_features, bias=False, but: + NO PADDING-RELATED ARGS. + + initial_scale: you can override this if you want to increase + or decrease the initial magnitude of the module's output + (affects the initialization of weight_scale and bias_scale). + Another option, if you want to do something like this, is + to re-initialize the parameters. 
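+
+    Example (illustrative only; the argument values are arbitrary):
+
+        conv = ScaledConv2d(in_channels=1, out_channels=8, kernel_size=3,
+                            initial_scale=0.25)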
+ """ + ans = nn.Conv2d(*args, **kwargs) + with torch.no_grad(): + ans.weight[:] *= initial_scale + if ans.bias is not None: + torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, 0.1 * initial_scale) + return ans + + +class ChunkCausalDepthwiseConv1d(torch.nn.Module): + """ + Behaves like a depthwise 1d convolution, except that it is causal in + a chunkwise way, as if we had a block-triangular attention mask. + The chunk size is provided at test time (it should probably be + kept in sync with the attention mask). + + This has a little more than twice the parameters of a conventional + depthwise conv1d module: we implement it by having one + depthwise convolution, of half the width, that is causal (via + right-padding); and one depthwise convolution that is applied only + within chunks, that we multiply by a scaling factor which depends + on the position within the chunk. + + Args: + Accepts the standard args and kwargs that nn.Linear accepts + e.g. in_features, out_features, bias=False. + + initial_scale: you can override this if you want to increase + or decrease the initial magnitude of the module's output + (affects the initialization of weight_scale and bias_scale). + Another option, if you want to do something like this, is + to re-initialize the parameters. + """ + + def __init__( + self, + channels: int, + kernel_size: int, + initial_scale: float = 1.0, + bias: bool = True, + ): + super().__init__() + assert kernel_size % 2 == 1 + + half_kernel_size = (kernel_size + 1) // 2 + # will pad manually, on one side. + self.causal_conv = nn.Conv1d( + in_channels=channels, + out_channels=channels, + groups=channels, + kernel_size=half_kernel_size, + padding=0, + bias=True, + ) + + self.chunkwise_conv = nn.Conv1d( + in_channels=channels, + out_channels=channels, + groups=channels, + kernel_size=kernel_size, + padding=kernel_size // 2, + bias=bias, + ) + + # first row is correction factors added to the scale near the left edge of the chunk, + # second row is correction factors added to the scale near the right edge of the chunk, + # both of these are added to a default scale of 1.0. + self.chunkwise_conv_scale = nn.Parameter(torch.zeros(2, channels, kernel_size)) + self.kernel_size = kernel_size + + with torch.no_grad(): + self.causal_conv.weight[:] *= initial_scale + self.chunkwise_conv.weight[:] *= initial_scale + if bias: + torch.nn.init.uniform_( + self.causal_conv.bias, -0.1 * initial_scale, 0.1 * initial_scale + ) + + def forward(self, x: Tensor, chunk_size: int = -1) -> Tensor: + """ + Forward function. Args: + x: a Tensor of shape (batch_size, channels, seq_len) + chunk_size: the chunk size, in frames; does not have to divide seq_len exactly. + """ + (batch_size, num_channels, seq_len) = x.shape + + # half_kernel_size = self.kernel_size + 1 // 2 + # left_pad is half_kernel_size - 1 where half_kernel_size is the size used + # in the causal conv. It's the amount by which we must pad on the left, + # to make the convolution causal. 
+ left_pad = self.kernel_size // 2 + + if chunk_size < 0 or chunk_size > seq_len: + chunk_size = seq_len + right_pad = -seq_len % chunk_size + + x = torch.nn.functional.pad(x, (left_pad, right_pad)) + + x_causal = self.causal_conv(x[..., : left_pad + seq_len]) + assert x_causal.shape == (batch_size, num_channels, seq_len) + + x_chunk = x[..., left_pad:] + num_chunks = x_chunk.shape[2] // chunk_size + x_chunk = x_chunk.reshape(batch_size, num_channels, num_chunks, chunk_size) + x_chunk = x_chunk.permute(0, 2, 1, 3).reshape( + batch_size * num_chunks, num_channels, chunk_size + ) + x_chunk = self.chunkwise_conv(x_chunk) # does not change shape + + chunk_scale = self._get_chunk_scale(chunk_size) + + x_chunk = x_chunk * chunk_scale + x_chunk = x_chunk.reshape( + batch_size, num_chunks, num_channels, chunk_size + ).permute(0, 2, 1, 3) + x_chunk = x_chunk.reshape(batch_size, num_channels, num_chunks * chunk_size)[ + ..., :seq_len + ] + + return x_chunk + x_causal + + def _get_chunk_scale(self, chunk_size: int): + """Returns tensor of shape (num_channels, chunk_size) that will be used to + scale the output of self.chunkwise_conv.""" + left_edge = self.chunkwise_conv_scale[0] + right_edge = self.chunkwise_conv_scale[1] + if chunk_size < self.kernel_size: + left_edge = left_edge[:, :chunk_size] + right_edge = right_edge[:, -chunk_size:] + else: + t = chunk_size - self.kernel_size + channels = left_edge.shape[0] + pad = torch.zeros( + channels, t, device=left_edge.device, dtype=left_edge.dtype + ) + left_edge = torch.cat((left_edge, pad), dim=-1) + right_edge = torch.cat((pad, right_edge), dim=-1) + return 1.0 + (left_edge + right_edge) + + def streaming_forward( + self, + x: Tensor, + cache: Tensor, + ) -> Tuple[Tensor, Tensor]: + """Streaming Forward function. + + Args: + x: a Tensor of shape (batch_size, channels, seq_len) + cache: cached left context of shape (batch_size, channels, left_pad) + """ + (batch_size, num_channels, seq_len) = x.shape + + # left_pad is half_kernel_size - 1 where half_kernel_size is the size used + # in the causal conv. It's the amount by which we must pad on the left, + # to make the convolution causal. 
+ left_pad = self.kernel_size // 2 + + # Pad cache + assert cache.shape[-1] == left_pad, (cache.shape[-1], left_pad) + x = torch.cat([cache, x], dim=2) + # Update cache + cache = x[..., -left_pad:] + + x_causal = self.causal_conv(x) + assert x_causal.shape == (batch_size, num_channels, seq_len) + + x_chunk = x[..., left_pad:] + x_chunk = self.chunkwise_conv(x_chunk) # does not change shape + + chunk_scale = self._get_chunk_scale(chunk_size=seq_len) + x_chunk = x_chunk * chunk_scale + + return x_chunk + x_causal, cache + + +class BalancerFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x: Tensor, + min_mean: float, + max_mean: float, + min_rms: float, + max_rms: float, + grad_scale: float, + channel_dim: int, + ) -> Tensor: + if channel_dim < 0: + channel_dim += x.ndim + ctx.channel_dim = channel_dim + ctx.save_for_backward(x) + ctx.config = (min_mean, max_mean, min_rms, max_rms, grad_scale, channel_dim) + return x + + @staticmethod + def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None, None, None]: + (x,) = ctx.saved_tensors + (min_mean, max_mean, min_rms, max_rms, grad_scale, channel_dim) = ctx.config + + try: + with torch.enable_grad(): + with torch.cuda.amp.autocast(enabled=False): + x = x.to(torch.float32) + x = x.detach() + x.requires_grad = True + mean_dims = [i for i in range(x.ndim) if i != channel_dim] + uncentered_var = (x**2).mean(dim=mean_dims, keepdim=True) + mean = x.mean(dim=mean_dims, keepdim=True) + stddev = (uncentered_var - (mean * mean)).clamp(min=1.0e-20).sqrt() + rms = uncentered_var.clamp(min=1.0e-20).sqrt() + + m = mean / stddev + # part of loss that relates to mean / stddev + m_loss = (m - m.clamp(min=min_mean, max=max_mean)).abs() + + # put a much larger scale on the RMS-max-limit loss, so that if both it and the + # m_loss are violated we fix the RMS loss first. + rms_clamped = rms.clamp(min=min_rms, max=max_rms) + r_loss = (rms_clamped / rms).log().abs() + + loss = m_loss + r_loss + + loss.backward(gradient=torch.ones_like(loss)) + loss_grad = x.grad + loss_grad_rms = ( + (loss_grad**2) + .mean(dim=mean_dims, keepdim=True) + .sqrt() + .clamp(min=1.0e-20) + ) + + loss_grad = loss_grad * (grad_scale / loss_grad_rms) + + x_grad_float = x_grad.to(torch.float32) + # scale each element of loss_grad by the absolute value of the corresponding + # element of x_grad, which we view as a noisy estimate of its magnitude for that + # (frame and dimension). later we can consider factored versions. + x_grad_mod = x_grad_float + (x_grad_float.abs() * loss_grad) + x_grad = x_grad_mod.to(x_grad.dtype) + except Exception as e: + logging.info( + f"Caught exception in Balancer backward: {e}, size={list(x_grad.shape)}, will continue." + ) + + return x_grad, None, None, None, None, None, None + + +class Balancer(torch.nn.Module): + """ + Modifies the backpropped derivatives of a function to try to encourage, for + each channel, that it is positive at least a proportion `threshold` of the + time. It does this by multiplying negative derivative values by up to + (1+max_factor), and positive derivative values by up to (1-max_factor), + interpolated from 1 at the threshold to those extremal values when none + of the inputs are positive. + + Args: + num_channels: the number of channels + channel_dim: the dimension/axis corresponding to the channel, e.g. + -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative. 
+ min_positive: the minimum, per channel, of the proportion of the time + that (x > 0), below which we start to modify the derivatives. + max_positive: the maximum, per channel, of the proportion of the time + that (x > 0), above which we start to modify the derivatives. + scale_gain_factor: determines the 'gain' with which we increase the + change in gradient once the constraints on min_abs and max_abs + are violated. + min_abs: the minimum average-absolute-value difference from the mean + value per channel, which we allow, before we start to modify + the derivatives to prevent this. + max_abs: the maximum average-absolute-value difference from the mean + value per channel, which we allow, before we start to modify + the derivatives to prevent this. + prob: determines the minimum probability with which we modify the + gradients for the {min,max}_positive and {min,max}_abs constraints, + on each forward(). This is done randomly to prevent all layers + from doing it at the same time. + """ + + def __init__( + self, + num_channels: int, + channel_dim: int, + min_positive: FloatLike = 0.05, + max_positive: FloatLike = 0.95, + min_abs: FloatLike = 0.2, + max_abs: FloatLike = 100.0, + grad_scale: FloatLike = 0.04, + prob: Optional[FloatLike] = None, + ): + super().__init__() + + if prob is None: + prob = ScheduledFloat((0.0, 0.5), (8000.0, 0.125), default=0.4) + self.prob = prob + # 5% of the time we will return and do nothing because memory usage is + # too high. + self.mem_cutoff = CutoffEstimator(0.05) + + # actually self.num_channels is no longer needed except for an assertion. + self.num_channels = num_channels + self.channel_dim = channel_dim + self.min_positive = min_positive + self.max_positive = max_positive + self.min_abs = min_abs + self.max_abs = max_abs + self.grad_scale = grad_scale + + def forward(self, x: Tensor) -> Tensor: + if ( + torch.jit.is_scripting() + or not x.requires_grad + or (x.is_cuda and self.mem_cutoff(torch.cuda.memory_allocated())) + ): + return _no_op(x) + + prob = float(self.prob) + if random.random() < prob: + # The following inner-functions convert from the way we historically specified + # these limitations, as limits on the absolute value and the proportion of positive + # values, to limits on the RMS value and the (mean / stddev). + def _abs_to_rms(x): + # for normally distributed data, if the expected absolute value is x, the + # expected rms value will be sqrt(pi/2) * x. + return 1.25331413732 * x + + def _proportion_positive_to_mean(x): + def _atanh(x): + eps = 1.0e-10 + # eps is to prevent crashes if x is exactly 0 or 1. + # we'll just end up returning a fairly large value. + return (math.log(1 + x + eps) - math.log(1 - x + eps)) / 2.0 + + def _approx_inverse_erf(x): + # 1 / (sqrt(pi) * ln(2)), + # see https://math.stackexchange.com/questions/321569/approximating-the-error-function-erf-by-analytical-functions + # this approximation is extremely crude and gets progressively worse for + # x very close to -1 or +1, but we mostly care about the "middle" region + # e.g. _approx_inverse_erf(0.05) = 0.0407316414078772, + # and math.erf(0.0407316414078772) = 0.045935330944660666, + # which is pretty close to 0.05. 
+ return 0.8139535143 * _atanh(x) + + # first convert x from the range 0..1 to the range -1..1 which the error + # function returns + x = -1 + (2 * x) + return _approx_inverse_erf(x) + + min_mean = _proportion_positive_to_mean(float(self.min_positive)) + max_mean = _proportion_positive_to_mean(float(self.max_positive)) + min_rms = _abs_to_rms(float(self.min_abs)) + max_rms = _abs_to_rms(float(self.max_abs)) + grad_scale = float(self.grad_scale) + + assert x.shape[self.channel_dim] == self.num_channels + + return BalancerFunction.apply( + x, min_mean, max_mean, min_rms, max_rms, grad_scale, self.channel_dim + ) + else: + return _no_op(x) + + +def penalize_abs_values_gt( + x: Tensor, limit: float, penalty: float, name: str = None +) -> Tensor: + """ + Returns x unmodified, but in backprop will put a penalty for the excess of + the absolute values of elements of x over the limit "limit". E.g. if + limit == 10.0, then if x has any values over 10 it will get a penalty. + + Caution: the value of this penalty will be affected by grad scaling used + in automatic mixed precision training. For this reasons we use this, + it shouldn't really matter, or may even be helpful; we just use this + to disallow really implausible values of scores to be given to softmax. + + The name is for randomly printed debug info. + """ + x_sign = x.sign() + over_limit = (x.abs() - limit) > 0 + # The following is a memory efficient way to penalize the absolute values of + # x that's over the limit. (The memory efficiency comes when you think + # about which items torch needs to cache for the autograd, and which ones it + # can throw away). The numerical value of aux_loss as computed here will + # actually be larger than it should be, by limit * over_limit.sum(), but it + # has the same derivative as the real aux_loss which is penalty * (x.abs() - + # limit).relu(). + aux_loss = penalty * ((x_sign * over_limit).to(torch.int8) * x) + # note: we don't do sum() here on aux)_loss, but it's as if we had done + # sum() due to how with_loss() works. + x = with_loss(x, aux_loss, name) + # you must use x for something, or this will be ineffective. + return x + + +def _diag(x: Tensor): # like .diag(), but works for tensors with 3 dims. + if x.ndim == 2: + return x.diag() + else: + (batch, dim, dim) = x.shape + x = x.reshape(batch, dim * dim) + x = x[:, :: dim + 1] + assert x.shape == (batch, dim) + return x + + +def _whitening_metric(x: Tensor, num_groups: int): + """ + Computes the "whitening metric", a value which will be 1.0 if all the eigenvalues of + of the centered feature covariance are the same within each group's covariance matrix + and also between groups. + Args: + x: a Tensor of shape (*, num_channels) + num_groups: the number of groups of channels, a number >=1 that divides num_channels + Returns: + Returns a scalar Tensor that will be 1.0 if the data is "perfectly white" and + greater than 1.0 otherwise. + """ + assert x.dtype != torch.float16 + x = x.reshape(-1, x.shape[-1]) + (num_frames, num_channels) = x.shape + assert num_channels % num_groups == 0 + channels_per_group = num_channels // num_groups + x = x.reshape(num_frames, num_groups, channels_per_group).transpose(0, 1) + # x now has shape (num_groups, num_frames, channels_per_group) + # subtract the mean so we use the centered, not uncentered, covariance. + # My experience has been that when we "mess with the gradients" like this, + # it's better not do anything that tries to move the mean around, because + # that can easily cause instability. 
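+    # In terms of the eigenvalues lambda_i of the per-group covariances computed
+    # below, the metric equals mean(lambda_i**2) / (mean(lambda_i))**2, which is
+    # 1.0 exactly when all eigenvalues (pooled over the groups) are equal and
+    # larger otherwise.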
+ x = x - x.mean(dim=1, keepdim=True) + # x_covar: (num_groups, channels_per_group, channels_per_group) + x_covar = torch.matmul(x.transpose(1, 2), x) + x_covar_mean_diag = _diag(x_covar).mean() + # the following expression is what we'd get if we took the matrix product + # of each covariance and measured the mean of its trace, i.e. + # the same as _diag(torch.matmul(x_covar, x_covar)).mean(). + x_covarsq_mean_diag = (x_covar**2).sum() / (num_groups * channels_per_group) + # this metric will be >= 1.0; the larger it is, the less 'white' the data was. + metric = x_covarsq_mean_diag / (x_covar_mean_diag**2 + 1.0e-20) + return metric + + +class WhiteningPenaltyFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, x: Tensor, module: nn.Module) -> Tensor: + ctx.save_for_backward(x) + ctx.module = module + return x + + @staticmethod + def backward(ctx, x_grad: Tensor): + (x_orig,) = ctx.saved_tensors + w = ctx.module + + try: + with torch.enable_grad(): + with torch.cuda.amp.autocast(enabled=False): + x_detached = x_orig.to(torch.float32).detach() + x_detached.requires_grad = True + + metric = _whitening_metric(x_detached, w.num_groups) + + if random.random() < 0.005 or __name__ == "__main__": + logging.info( + f"Whitening: name={w.name}, num_groups={w.num_groups}, num_channels={x_orig.shape[-1]}, " + f"metric={metric.item():.2f} vs. limit={float(w.whitening_limit)}" + ) + + if metric < float(w.whitening_limit): + w.prob = w.min_prob + return x_grad, None + else: + w.prob = w.max_prob + metric.backward() + penalty_grad = x_detached.grad + scale = w.grad_scale * ( + x_grad.to(torch.float32).norm() + / (penalty_grad.norm() + 1.0e-20) + ) + penalty_grad = penalty_grad * scale + return x_grad + penalty_grad.to(x_grad.dtype), None + except Exception as e: + logging.info( + f"Caught exception in Whiten backward: {e}, size={list(x_grad.shape)}, will continue." + ) + return x_grad, None + + +class Whiten(nn.Module): + def __init__( + self, + num_groups: int, + whitening_limit: FloatLike, + prob: Union[float, Tuple[float, float]], + grad_scale: FloatLike, + ): + """ + Args: + num_groups: the number of groups to divide the channel dim into before + whitening. We will attempt to make the feature covariance + within each group, after mean subtraction, as "white" as possible, + while having the same trace across all groups. + whitening_limit: a value greater than 1.0, that dictates how much + freedom we have to violate the constraints. 1.0 would mean perfectly + white, with exactly the same trace across groups; larger values + give more freedom. E.g. 2.0. + prob: the probability with which we apply the gradient modification + (also affects the grad scale). May be supplied as a float, + or as a pair (min_prob, max_prob) + + grad_scale: determines the scale on the gradient term from this object, + relative to the rest of the gradient on the attention weights. + E.g. 0.02 (you may want to use smaller values than this if prob is large) + """ + super(Whiten, self).__init__() + assert num_groups >= 1 + assert float(whitening_limit) >= 1 + assert grad_scale >= 0 + self.num_groups = num_groups + self.whitening_limit = whitening_limit + self.grad_scale = grad_scale + + if isinstance(prob, float): + prob = (prob, prob) + (self.min_prob, self.max_prob) = prob + assert 0 < self.min_prob <= self.max_prob <= 1 + self.prob = self.max_prob + self.name = None # will be set in training loop + + def forward(self, x: Tensor) -> Tensor: + """ + In the forward pass, this function just returns the input unmodified. 
+ In the backward pass, it will modify the gradients to ensure that the + distribution in each group has close to (lambda times I) as the covariance + after mean subtraction, with the same lambda across groups. + For whitening_limit > 1, there will be more freedom to violate this + constraint. + + Args: + x: the input of shape (*, num_channels) + + Returns: + x, unmodified. You should make sure + you use the returned value, or the graph will be freed + and nothing will happen in backprop. + """ + grad_scale = float(self.grad_scale) + if not x.requires_grad or random.random() > self.prob or grad_scale == 0: + return _no_op(x) + else: + return WhiteningPenaltyFunction.apply(x, self) + + +class WithLoss(torch.autograd.Function): + @staticmethod + def forward(ctx, x: Tensor, y: Tensor, name: str): + ctx.y_shape = y.shape + if random.random() < 0.002 and name is not None: + loss_sum = y.sum().item() + logging.info(f"WithLoss: name={name}, loss-sum={loss_sum:.3e}") + return x + + @staticmethod + def backward(ctx, ans_grad: Tensor): + return ( + ans_grad, + torch.ones(ctx.y_shape, dtype=ans_grad.dtype, device=ans_grad.device), + None, + ) + + +def with_loss(x, y, name): + # returns x but adds y.sum() to the loss function. + return WithLoss.apply(x, y, name) + + +class ScaleGradFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, x: Tensor, alpha: float) -> Tensor: + ctx.alpha = alpha + return x + + @staticmethod + def backward(ctx, grad: Tensor): + return grad * ctx.alpha, None + + +def scale_grad(x: Tensor, alpha: float): + return ScaleGradFunction.apply(x, alpha) + + +class ScaleGrad(nn.Module): + def __init__(self, alpha: float): + super().__init__() + self.alpha = alpha + + def forward(self, x: Tensor) -> Tensor: + if torch.jit.is_scripting() or torch.jit.is_tracing() or not self.training: + return x + return scale_grad(x, self.alpha) + + +class LimitParamValue(torch.autograd.Function): + @staticmethod + def forward(ctx, x: Tensor, min: float, max: float): + ctx.save_for_backward(x) + assert max >= min + ctx.min = min + ctx.max = max + return x + + @staticmethod + def backward(ctx, x_grad: Tensor): + (x,) = ctx.saved_tensors + # where x < ctx.min, ensure all grads are negative (this will tend to make + # x more positive). + x_grad = x_grad * torch.where( + torch.logical_and(x_grad > 0, x < ctx.min), -1.0, 1.0 + ) + # where x > ctx.max, ensure all grads are positive (this will tend to make + # x more negative). + x_grad *= torch.where(torch.logical_and(x_grad < 0, x > ctx.max), -1.0, 1.0) + return x_grad, None, None + + +def limit_param_value( + x: Tensor, min: float, max: float, prob: float = 0.6, training: bool = True +): + # You apply this to (typically) an nn.Parameter during training to ensure that its + # (elements mostly) stays within a supplied range. This is done by modifying the + # gradients in backprop. + # It's not necessary to do this on every batch: do it only some of the time, + # to save a little time. 
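+    # Typical use (a sketch of what BiasNorm.forward above does):
+    #   log_scale = limit_param_value(self.log_scale, min=-1.5, max=1.5,
+    #                                 training=self.training)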
+ if training and random.random() < prob: + return LimitParamValue.apply(x, min, max) + else: + return x + + +def _no_op(x: Tensor) -> Tensor: + if torch.jit.is_scripting() or torch.jit.is_tracing(): + return x + else: + # a no-op function that will have a node in the autograd graph, + # to avoid certain bugs relating to backward hooks + return x.chunk(1, dim=-1)[0] + + +class Identity(torch.nn.Module): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return _no_op(x) + + +class DoubleSwishFunction(torch.autograd.Function): + """ + double_swish(x) = x * torch.sigmoid(x-1) + + This is a definition, originally motivated by its close numerical + similarity to swish(swish(x)), where swish(x) = x * sigmoid(x). + + Memory-efficient derivative computation: + double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1) + double_swish'(x) = d/dx double_swish(x) = x * s'(x) + x' * s(x) = x * s'(x) + s(x). + Now, s'(x) = s(x) * (1-s(x)). + double_swish'(x) = x * s'(x) + s(x). + = x * s(x) * (1-s(x)) + s(x). + = double_swish(x) * (1-s(x)) + s(x) + ... so we just need to remember s(x) but not x itself. + """ + + @staticmethod + def forward(ctx, x: Tensor) -> Tensor: + requires_grad = x.requires_grad + if x.dtype == torch.float16: + x = x.to(torch.float32) + + s = torch.sigmoid(x - 1.0) + y = x * s + + if requires_grad: + deriv = y * (1 - s) + s + + # notes on derivative of x * sigmoid(x - 1): + # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29 + # min \simeq -0.043638. Take floor as -0.044 so it's a lower bund + # max \simeq 1.1990. Take ceil to be 1.2 so it's an upper bound. + # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which + # floors), should be expectation-preserving. + floor = -0.044 + ceil = 1.2 + d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like( + deriv + ) + if __name__ == "__main__": + # for self-testing only. + assert d_scaled.min() >= 0.0 + assert d_scaled.max() < 256.0 + d_int = d_scaled.to(torch.uint8) + ctx.save_for_backward(d_int) + if x.dtype == torch.float16 or torch.is_autocast_enabled(): + y = y.to(torch.float16) + return y + + @staticmethod + def backward(ctx, y_grad: Tensor) -> Tensor: + (d,) = ctx.saved_tensors + # the same constants as used in forward pass. + floor = -0.043637 + ceil = 1.2 + + d = d * ((ceil - floor) / 255.0) + floor + return y_grad * d + + +class DoubleSwish(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x: Tensor) -> Tensor: + """Return double-swish activation function which is an approximation to Swish(Swish(x)), + that we approximate closely with x * sigmoid(x-1). + """ + if torch.jit.is_scripting() or torch.jit.is_tracing(): + return x * torch.sigmoid(x - 1.0) + return DoubleSwishFunction.apply(x) + + +# Dropout2 is just like normal dropout, except it supports schedules on the dropout rates. +class Dropout2(nn.Module): + def __init__(self, p: FloatLike): + super().__init__() + self.p = p + + def forward(self, x: Tensor) -> Tensor: + return torch.nn.functional.dropout(x, p=float(self.p), training=self.training) + + +class MulForDropout3(torch.autograd.Function): + # returns (x * y * alpha) where alpha is a float and y doesn't require + # grad and is zero-or-one. 
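+    # Only the product `ans` is saved for backward; since y is zero-or-one, the
+    # dropout mask can be recovered as (ans != 0) (up to positions where x is
+    # exactly zero), so neither x nor y needs to be stored.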
+ @staticmethod + @custom_fwd + def forward(ctx, x, y, alpha): + assert not y.requires_grad + ans = x * y * alpha + ctx.save_for_backward(ans) + ctx.alpha = alpha + return ans + + @staticmethod + @custom_bwd + def backward(ctx, ans_grad): + (ans,) = ctx.saved_tensors + x_grad = ctx.alpha * ans_grad * (ans != 0) + return x_grad, None, None + + +# Dropout3 is just like normal dropout, except it supports schedules on the dropout rates, +# and it lets you choose one dimension to share the dropout mask over +class Dropout3(nn.Module): + def __init__(self, p: FloatLike, shared_dim: int): + super().__init__() + self.p = p + self.shared_dim = shared_dim + + def forward(self, x: Tensor) -> Tensor: + p = float(self.p) + if not self.training or p == 0: + return _no_op(x) + scale = 1.0 / (1 - p) + rand_shape = list(x.shape) + rand_shape[self.shared_dim] = 1 + mask = torch.rand(*rand_shape, device=x.device) > p + ans = MulForDropout3.apply(x, mask, scale) + return ans + + +class SwooshLFunction(torch.autograd.Function): + """ + swoosh_l(x) = log(1 + exp(x-4)) - 0.08*x - 0.035 + """ + + @staticmethod + def forward(ctx, x: Tensor) -> Tensor: + requires_grad = x.requires_grad + if x.dtype == torch.float16: + x = x.to(torch.float32) + + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + + coeff = -0.08 + + with torch.cuda.amp.autocast(enabled=False): + with torch.enable_grad(): + x = x.detach() + x.requires_grad = True + y = torch.logaddexp(zero, x - 4.0) + coeff * x - 0.035 + + if not requires_grad: + return y + + y.backward(gradient=torch.ones_like(y)) + + grad = x.grad + floor = coeff + ceil = 1.0 + coeff + 0.005 + + d_scaled = (grad - floor) * (255.0 / (ceil - floor)) + torch.rand_like( + grad + ) + if __name__ == "__main__": + # for self-testing only. + assert d_scaled.min() >= 0.0 + assert d_scaled.max() < 256.0 + + d_int = d_scaled.to(torch.uint8) + ctx.save_for_backward(d_int) + if x.dtype == torch.float16 or torch.is_autocast_enabled(): + y = y.to(torch.float16) + return y + + @staticmethod + def backward(ctx, y_grad: Tensor) -> Tensor: + (d,) = ctx.saved_tensors + # the same constants as used in forward pass. + + coeff = -0.08 + floor = coeff + ceil = 1.0 + coeff + 0.005 + d = d * ((ceil - floor) / 255.0) + floor + return y_grad * d + + +class SwooshL(torch.nn.Module): + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-L activation.""" + if torch.jit.is_scripting() or torch.jit.is_tracing(): + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp(zero, x - 4.0) - 0.08 * x - 0.035 + if not x.requires_grad: + return k2.swoosh_l_forward(x) + else: + return k2.swoosh_l(x) + # return SwooshLFunction.apply(x) + + +class SwooshLOnnx(torch.nn.Module): + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-L activation.""" + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp_onnx(zero, x - 4.0) - 0.08 * x - 0.035 + + +class SwooshRFunction(torch.autograd.Function): + """ + swoosh_r(x) = log(1 + exp(x-1)) - 0.08*x - 0.313261687 + + derivatives are between -0.08 and 0.92. 
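+    (Since d/dx log(1 + exp(x-1)) = sigmoid(x-1), the derivative equals
+    sigmoid(x-1) - 0.08, which lies in (-0.08, 0.92).)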
+ """ + + @staticmethod + def forward(ctx, x: Tensor) -> Tensor: + requires_grad = x.requires_grad + + if x.dtype == torch.float16: + x = x.to(torch.float32) + + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + + with torch.cuda.amp.autocast(enabled=False): + with torch.enable_grad(): + x = x.detach() + x.requires_grad = True + y = torch.logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687 + + if not requires_grad: + return y + y.backward(gradient=torch.ones_like(y)) + + grad = x.grad + floor = -0.08 + ceil = 0.925 + + d_scaled = (grad - floor) * (255.0 / (ceil - floor)) + torch.rand_like( + grad + ) + if __name__ == "__main__": + # for self-testing only. + assert d_scaled.min() >= 0.0 + assert d_scaled.max() < 256.0 + + d_int = d_scaled.to(torch.uint8) + ctx.save_for_backward(d_int) + if x.dtype == torch.float16 or torch.is_autocast_enabled(): + y = y.to(torch.float16) + return y + + @staticmethod + def backward(ctx, y_grad: Tensor) -> Tensor: + (d,) = ctx.saved_tensors + # the same constants as used in forward pass. + floor = -0.08 + ceil = 0.925 + d = d * ((ceil - floor) / 255.0) + floor + return y_grad * d + + +class SwooshR(torch.nn.Module): + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-R activation.""" + if torch.jit.is_scripting() or torch.jit.is_tracing(): + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687 + if not x.requires_grad: + return k2.swoosh_r_forward(x) + else: + return k2.swoosh_r(x) + # return SwooshRFunction.apply(x) + + +class SwooshROnnx(torch.nn.Module): + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-R activation.""" + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp_onnx(zero, x - 1.0) - 0.08 * x - 0.313261687 + + +# simple version of SwooshL that does not redefine the backprop, used in +# ActivationDropoutAndLinearFunction. +def SwooshLForward(x: Tensor): + x_offset = x - 4.0 + log_sum = (1.0 + x_offset.exp()).log().to(x.dtype) + log_sum = torch.where(log_sum == float("inf"), x_offset, log_sum) + return log_sum - 0.08 * x - 0.035 + + +# simple version of SwooshR that does not redefine the backprop, used in +# ActivationDropoutAndLinearFunction. +def SwooshRForward(x: Tensor): + x_offset = x - 1.0 + log_sum = (1.0 + x_offset.exp()).log().to(x.dtype) + log_sum = torch.where(log_sum == float("inf"), x_offset, log_sum) + return log_sum - 0.08 * x - 0.313261687 + + +class ActivationDropoutAndLinearFunction(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward( + ctx, + x: Tensor, + weight: Tensor, + bias: Optional[Tensor], + activation: str, + dropout_p: float, + dropout_shared_dim: Optional[int], + ): + if dropout_p != 0.0: + dropout_shape = list(x.shape) + if dropout_shared_dim is not None: + dropout_shape[dropout_shared_dim] = 1 + # else it won't be very memory efficient. + dropout_mask = (1.0 / (1.0 - dropout_p)) * ( + torch.rand(*dropout_shape, device=x.device, dtype=x.dtype) > dropout_p + ) + else: + dropout_mask = None + + ctx.save_for_backward(x, weight, bias, dropout_mask) + + ctx.activation = activation + + forward_activation_dict = { + "SwooshL": k2.swoosh_l_forward, + "SwooshR": k2.swoosh_r_forward, + } + # it will raise a KeyError if this fails. This will be an error. We let it + # propagate to the user. 
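+        # Note: only (x, weight, bias, dropout_mask) were saved above; the
+        # activation output is not stored and is recomputed in backward(), which
+        # is what makes this function memory efficient.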
+        activation_func = forward_activation_dict[activation]
+        x = activation_func(x)
+        if dropout_mask is not None:
+            x = x * dropout_mask
+        x = torch.nn.functional.linear(x, weight, bias)
+        return x
+
+    @staticmethod
+    @custom_bwd
+    def backward(ctx, ans_grad: Tensor):
+        saved = ctx.saved_tensors
+        (x, weight, bias, dropout_mask) = saved
+
+        forward_and_deriv_activation_dict = {
+            "SwooshL": k2.swoosh_l_forward_and_deriv,
+            "SwooshR": k2.swoosh_r_forward_and_deriv,
+        }
+        # the following line will raise a KeyError if the activation is
+        # unrecognized; we let that error propagate to the user.
+        func = forward_and_deriv_activation_dict[ctx.activation]
+
+        y, func_deriv = func(x)
+        if dropout_mask is not None:
+            y = y * dropout_mask
+        # now compute the gradients w.r.t. weight and bias.
+        # y: (..., in_channels), ans_grad: (..., out_channels),
+        (out_channels, in_channels) = weight.shape
+
+        in_channels = y.shape[-1]
+        g = ans_grad.reshape(-1, out_channels)
+        weight_deriv = torch.matmul(g.t(), y.reshape(-1, in_channels))
+        y_deriv = torch.matmul(ans_grad, weight)
+        bias_deriv = None if bias is None else g.sum(dim=0)
+        x_deriv = y_deriv * func_deriv
+        if dropout_mask is not None:
+            # order versus func_deriv does not matter
+            x_deriv = x_deriv * dropout_mask
+
+        return x_deriv, weight_deriv, bias_deriv, None, None, None
+
+
+class ActivationDropoutAndLinear(torch.nn.Module):
+    """
+    This merges an activation function followed by dropout and then a nn.Linear module;
+    it does so in a memory efficient way so that it only stores the input to the whole
+    module. If activation == SwooshL and dropout_shared_dim != None, this will be
+    equivalent to:
+       nn.Sequential(SwooshL(),
+                     Dropout3(dropout_p, shared_dim=dropout_shared_dim),
+                     ScaledLinear(in_channels, out_channels, bias=bias,
+                                  initial_scale=initial_scale))
+    If dropout_shared_dim is None, the dropout would be equivalent to
+    Dropout2(dropout_p). Note: Dropout3 will be more memory efficient as the dropout
+    mask is smaller.
+
+    Args:
+        in_channels: number of input channels, e.g. 256
+        out_channels: number of output channels, e.g. 256
+        bias: if True, include a bias term
+        activation: the activation function; currently "SwooshL" and "SwooshR"
+            are supported.
+        dropout_p: the dropout probability or schedule (happens after nonlinearity).
+        dropout_shared_dim: the dimension, if any, across which the dropout mask is
+             shared (e.g. the time dimension). If None, this may be less memory
+             efficient if there are modules before this one that cache the input
+             for their backprop (e.g. Balancer or Whiten).
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        bias: bool = True,
+        activation: str = "SwooshL",
+        dropout_p: FloatLike = 0.0,
+        dropout_shared_dim: Optional[int] = -1,
+        initial_scale: float = 1.0,
+    ):
+        super().__init__()
+        # create a temporary module of nn.Linear that we'll steal the
+        # weights and bias from
+        l = ScaledLinear(
+            in_channels, out_channels, bias=bias, initial_scale=initial_scale
+        )
+
+        self.weight = l.weight
+        # register_parameter properly handles making it a parameter when l.bias
+        # is None. I think there is some reason for doing it this way rather
+        # than just setting it to None, but I don't know what it is; maybe
+        # something to do with exporting the module.
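+        # Only the weight and bias tensors are kept; the temporary ScaledLinear
+        # module `l` itself is discarded and never registered as a child module.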
+ self.register_parameter("bias", l.bias) + + self.activation = activation + self.dropout_p = dropout_p + self.dropout_shared_dim = dropout_shared_dim + + def forward(self, x: Tensor): + if torch.jit.is_scripting() or torch.jit.is_tracing(): + if self.activation == "SwooshL": + x = SwooshLForward(x) + elif self.activation == "SwooshR": + x = SwooshRForward(x) + else: + assert False, self.activation + return torch.nn.functional.linear(x, self.weight, self.bias) + + return ActivationDropoutAndLinearFunction.apply( + x, + self.weight, + self.bias, + self.activation, + float(self.dropout_p), + self.dropout_shared_dim, + ) + + +def convert_num_channels(x: Tensor, num_channels: int) -> Tensor: + if num_channels <= x.shape[-1]: + return x[..., :num_channels] + else: + shape = list(x.shape) + shape[-1] = num_channels - shape[-1] + zeros = torch.zeros(shape, dtype=x.dtype, device=x.device) + return torch.cat((x, zeros), dim=-1) + + +def _test_whiten(): + for proportion in [0.1, 0.5, 10.0]: + logging.info(f"_test_whiten(): proportion = {proportion}") + x = torch.randn(100, 128) + direction = torch.randn(128) + coeffs = torch.randn(100, 1) + x += proportion * direction * coeffs + + x.requires_grad = True + + m = Whiten( + 1, 5.0, prob=1.0, grad_scale=0.1 # num_groups # whitening_limit, + ) # grad_scale + + for _ in range(4): + y = m(x) + + y_grad = torch.randn_like(x) + y.backward(gradient=y_grad) + + if proportion < 0.2: + assert torch.allclose(x.grad, y_grad) + elif proportion > 1.0: + assert not torch.allclose(x.grad, y_grad) + + +def _test_balancer_sign(): + probs = torch.arange(0, 1, 0.01) + N = 1000 + x = 1.0 * ((2.0 * (torch.rand(probs.numel(), N) < probs.unsqueeze(-1))) - 1.0) + x = x.detach() + x.requires_grad = True + m = Balancer( + probs.numel(), + channel_dim=0, + min_positive=0.05, + max_positive=0.95, + min_abs=0.0, + prob=1.0, + ) + + y_grad = torch.sign(torch.randn(probs.numel(), N)) + + y = m(x) + y.backward(gradient=y_grad) + print("_test_balancer_sign: x = ", x) + print("_test_balancer_sign: y grad = ", y_grad) + print("_test_balancer_sign: x grad = ", x.grad) + + +def _test_balancer_magnitude(): + magnitudes = torch.arange(0, 1, 0.01) + N = 1000 + x = torch.sign(torch.randn(magnitudes.numel(), N)) * magnitudes.unsqueeze(-1) + x = x.detach() + x.requires_grad = True + m = Balancer( + magnitudes.numel(), + channel_dim=0, + min_positive=0.0, + max_positive=1.0, + min_abs=0.2, + max_abs=0.7, + prob=1.0, + ) + + y_grad = torch.sign(torch.randn(magnitudes.numel(), N)) + + y = m(x) + y.backward(gradient=y_grad) + print("_test_balancer_magnitude: x = ", x) + print("_test_balancer_magnitude: y grad = ", y_grad) + print("_test_balancer_magnitude: x grad = ", x.grad) + + +def _test_double_swish_deriv(): + x = torch.randn(10, 12, dtype=torch.double) * 3.0 + x.requires_grad = True + m = DoubleSwish() + + tol = (1.2 - (-0.043637)) / 255.0 + torch.autograd.gradcheck(m, x, atol=tol) + + # for self-test. + x = torch.randn(1000, 1000, dtype=torch.double) * 3.0 + x.requires_grad = True + y = m(x) + + +def _test_swooshl_deriv(): + x = torch.randn(10, 12, dtype=torch.double) * 3.0 + x.requires_grad = True + m = SwooshL() + + tol = 1.0 / 255.0 + torch.autograd.gradcheck(m, x, atol=tol, eps=0.01) + + # for self-test. 
+ x = torch.randn(1000, 1000, dtype=torch.double) * 3.0 + x.requires_grad = True + y = m(x) + + +def _test_swooshr_deriv(): + x = torch.randn(10, 12, dtype=torch.double) * 3.0 + x.requires_grad = True + m = SwooshR() + + tol = 1.0 / 255.0 + torch.autograd.gradcheck(m, x, atol=tol, eps=0.01) + + # for self-test. + x = torch.randn(1000, 1000, dtype=torch.double) * 3.0 + x.requires_grad = True + y = m(x) + + +def _test_softmax(): + a = torch.randn(2, 10, dtype=torch.float64) + b = a.clone() + a.requires_grad = True + b.requires_grad = True + a.softmax(dim=1)[:, 0].sum().backward() + print("a grad = ", a.grad) + softmax(b, dim=1)[:, 0].sum().backward() + print("b grad = ", b.grad) + assert torch.allclose(a.grad, b.grad) + + +def _test_piecewise_linear(): + p = PiecewiseLinear((0, 10.0)) + for x in [-100, 0, 100]: + assert p(x) == 10.0 + p = PiecewiseLinear((0, 10.0), (1, 0.0)) + for x, y in [(-100, 10.0), (0, 10.0), (0.5, 5.0), (1, 0.0), (2, 0.0)]: + print("x, y = ", x, y) + assert p(x) == y, (x, p(x), y) + + q = PiecewiseLinear((0.5, 15.0), (0.6, 1.0)) + x_vals = [-1.0, 0.0, 0.1, 0.2, 0.5, 0.6, 0.7, 0.9, 1.0, 2.0] + pq = p.max(q) + for x in x_vals: + y1 = max(p(x), q(x)) + y2 = pq(x) + assert abs(y1 - y2) < 0.001 + pq = p.min(q) + for x in x_vals: + y1 = min(p(x), q(x)) + y2 = pq(x) + assert abs(y1 - y2) < 0.001 + pq = p + q + for x in x_vals: + y1 = p(x) + q(x) + y2 = pq(x) + assert abs(y1 - y2) < 0.001 + + +def _test_activation_dropout_and_linear(): + in_channels = 20 + out_channels = 30 + + for bias in [True, False]: + # actually we don't test for dropout_p != 0.0 because forward functions will give + # different answers. This is because we are using the k2 implementation of + # swoosh_l an swoosh_r inside SwooshL() and SwooshR(), and they call randn() + # internally, messing up the random state. + for dropout_p in [0.0]: + for activation in ["SwooshL", "SwooshR"]: + m1 = nn.Sequential( + SwooshL() if activation == "SwooshL" else SwooshR(), + Dropout3(p=dropout_p, shared_dim=-1), + ScaledLinear( + in_channels, out_channels, bias=bias, initial_scale=0.5 + ), + ) + m2 = ActivationDropoutAndLinear( + in_channels, + out_channels, + bias=bias, + initial_scale=0.5, + activation=activation, + dropout_p=dropout_p, + ) + with torch.no_grad(): + m2.weight[:] = m1[2].weight + if bias: + m2.bias[:] = m1[2].bias + # make sure forward gives same result. + x1 = torch.randn(10, in_channels) + x1.requires_grad = True + + # TEMP. + assert torch.allclose( + SwooshRFunction.apply(x1), SwooshRForward(x1), atol=1.0e-03 + ) + + x2 = x1.clone().detach() + x2.requires_grad = True + seed = 10 + torch.manual_seed(seed) + y1 = m1(x1) + y_grad = torch.randn_like(y1) + y1.backward(gradient=y_grad) + torch.manual_seed(seed) + y2 = m2(x2) + y2.backward(gradient=y_grad) + + print( + f"bias = {bias}, dropout_p = {dropout_p}, activation = {activation}" + ) + print("y1 = ", y1) + print("y2 = ", y2) + assert torch.allclose(y1, y2, atol=0.02) + assert torch.allclose(m1[2].weight.grad, m2.weight.grad, atol=1.0e-05) + if bias: + assert torch.allclose(m1[2].bias.grad, m2.bias.grad, atol=1.0e-05) + print("x1.grad = ", x1.grad) + print("x2.grad = ", x2.grad) + + def isclose(a, b): + # return true if cosine similarity is > 0.9. + return (a * b).sum() > 0.9 * ( + (a**2).sum() * (b**2).sum() + ).sqrt() + + # the SwooshL() implementation has a noisy gradient due to 1-byte + # storage of it. 
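+                # Hence the check below only requires the gradients to point in
+                # roughly the same direction (cosine similarity > 0.9) rather
+                # than being elementwise close.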
+ assert isclose(x1.grad, x2.grad) + + +if __name__ == "__main__": + logging.getLogger().setLevel(logging.INFO) + torch.set_num_threads(1) + torch.set_num_interop_threads(1) + _test_piecewise_linear() + _test_softmax() + _test_whiten() + _test_balancer_sign() + _test_balancer_magnitude() + _test_double_swish_deriv() + _test_swooshr_deriv() + _test_swooshl_deriv() + _test_activation_dropout_and_linear() diff --git a/egs/librispeech/SSL/hubert/ssl_datamodule.py b/egs/librispeech/SSL/hubert/ssl_datamodule.py new file mode 100644 index 0000000000..e7cd28aa8d --- /dev/null +++ b/egs/librispeech/SSL/hubert/ssl_datamodule.py @@ -0,0 +1,262 @@ +# Copyright 2021 Piotr Żelasko +# Copyright 2023 Xiaomi Corporation (Author: Yifan Yang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse +import logging +from functools import lru_cache +from pathlib import Path +from typing import Any, Dict, Optional + +import torch +from dataset import HubertDataset +from lhotse import CutSet, load_manifest_lazy +from lhotse.dataset import DynamicBucketingSampler, SimpleCutSampler +from lhotse.utils import fix_random_seed +from torch.utils.data import DataLoader + +from icefall.utils import str2bool + + +class _SeedWorkers: + def __init__(self, seed: int): + self.seed = seed + + def __call__(self, worker_id: int): + fix_random_seed(self.seed + worker_id) + + +class LibriSpeechSslDataModule: + """ + DataModule for SSL experiments. + It assumes there is always one train and valid dataloader, + but there can be multiple test dataloaders (e.g. LibriSpeech test-clean + and test-other). + + It contains all the common data pipeline modules used in SSL + experiments, e.g.: + - dynamic batch size, + - bucketing samplers, + + This class should be derived for specific corpora used in ASR tasks. + """ + + def __init__(self, args: argparse.Namespace): + self.args = args + + @classmethod + def add_arguments(cls, parser: argparse.ArgumentParser): + group = parser.add_argument_group( + title="SSL data related options", + description="These options are used for the preparation of " + "PyTorch DataLoaders from Lhotse CutSet's -- they control the " + "effective batch sizes, sampling strategies.", + ) + group.add_argument( + "--full-libri", + type=str2bool, + default=True, + help="When enabled use 960h LibriSpeech. " "Otherwise, use 100h subset.", + ) + + group.add_argument( + "--manifest-dir", + type=Path, + default=Path("data/fbank"), + help="Path to directory with train/valid/test cuts.", + ) + group.add_argument( + "--max-duration", + type=int, + default=200.0, + help="Maximum pooled recordings duration (seconds) in a " + "single batch. 
You can reduce it if it causes CUDA OOM.", + ) + group.add_argument( + "--bucketing-sampler", + type=str2bool, + default=True, + help="When enabled, the batches will come from buckets of " + "similar duration (saves padding frames).", + ) + group.add_argument( + "--num-buckets", + type=int, + default=30, + help="The number of buckets for the DynamicBucketingSampler" + "(you might want to increase it for larger datasets).", + ) + group.add_argument( + "--shuffle", + type=str2bool, + default=True, + help="When enabled (=default), the examples will be " + "shuffled for each epoch.", + ) + group.add_argument( + "--drop-last", + type=str2bool, + default=True, + help="Whether to drop last batch. Used by sampler.", + ) + group.add_argument( + "--num-workers", + type=int, + default=2, + help="The number of training dataloader workers that " + "collect the batches.", + ) + + def train_dataloaders( + self, + cuts_train: CutSet, + sampler_state_dict: Optional[Dict[str, Any]] = None, + ) -> DataLoader: + """ + Args: + cuts_train: + CutSet for training. + sampler_state_dict: + The state dict for the training sampler. + """ + logging.info("About to create train dataset") + train = HubertDataset() + + if self.args.bucketing_sampler: + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( + cuts_train, + max_duration=self.args.max_duration, + shuffle=self.args.shuffle, + num_buckets=self.args.num_buckets, + drop_last=self.args.drop_last, + ) + else: + logging.info("Using SimpleCutSampler.") + train_sampler = SimpleCutSampler( + cuts_train, + max_duration=self.args.max_duration, + shuffle=self.args.shuffle, + ) + logging.info("About to create train dataloader") + + if sampler_state_dict is not None: + logging.info("Loading sampler state dict") + train_sampler.load_state_dict(sampler_state_dict) + + # 'seed' is derived from the current random state, which will have + # previously been set in the main process. 
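+        # Each dataloader worker then derives its own deterministic seed as
+        # (seed + worker_id) via _SeedWorkers, so workers produce different but
+        # reproducible random streams.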
+ seed = torch.randint(0, 100000, ()).item() + worker_init_fn = _SeedWorkers(seed) + + train_dl = DataLoader( + train, + sampler=train_sampler, + batch_size=None, + num_workers=self.args.num_workers, + persistent_workers=False, + worker_init_fn=worker_init_fn, + ) + + return train_dl + + def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader: + logging.info("About to create dev dataset") + validate = HubertDataset() + valid_sampler = DynamicBucketingSampler( + cuts_valid, + max_duration=self.args.max_duration, + shuffle=False, + ) + logging.info("About to create dev dataloader") + valid_dl = DataLoader( + validate, + sampler=valid_sampler, + batch_size=None, + num_workers=2, + persistent_workers=False, + ) + + return valid_dl + + def test_dataloaders(self, cuts: CutSet) -> DataLoader: + logging.debug("About to create test dataset") + test = HubertDataset( + return_cuts=self.args.return_cuts, + ) + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, + ) + logging.debug("About to create test dataloader") + test_dl = DataLoader( + test, + batch_size=None, + sampler=sampler, + num_workers=self.args.num_workers, + ) + return test_dl + + @lru_cache() + def train_clean_100_cuts(self) -> CutSet: + logging.info("About to get train-clean-100 cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz" + ) + + @lru_cache() + def train_clean_360_cuts(self) -> CutSet: + logging.info("About to get train-clean-360 cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz" + ) + + @lru_cache() + def train_other_500_cuts(self) -> CutSet: + logging.info("About to get train-other-500 cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz" + ) + + @lru_cache() + def dev_clean_cuts(self) -> CutSet: + logging.info("About to get dev-clean cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz" + ) + + @lru_cache() + def dev_other_cuts(self) -> CutSet: + logging.info("About to get dev-other cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz" + ) + + @lru_cache() + def test_clean_cuts(self) -> CutSet: + logging.info("About to get test-clean cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz" + ) + + @lru_cache() + def test_other_cuts(self) -> CutSet: + logging.info("About to get test-other cuts") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz" + ) diff --git a/egs/librispeech/SSL/hubert/subsampling.py b/egs/librispeech/SSL/hubert/subsampling.py new file mode 100644 index 0000000000..b2f769d3f6 --- /dev/null +++ b/egs/librispeech/SSL/hubert/subsampling.py @@ -0,0 +1,406 @@ +#!/usr/bin/env python3 +# Copyright 2023 Xiaomi Corp. (authors: Daniel Povey, +# Zengwei Yao) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Tuple + +import torch +from scaling import ( + Balancer, + BiasNorm, + Dropout3, + FloatLike, + Optional, + ScaledConv2d, + ScaleGrad, + ScheduledFloat, + SwooshL, + SwooshR, + Whiten, +) +from torch import Tensor, nn + + +class ConvNeXt(nn.Module): + """ + Our interpretation of the ConvNeXt module as used in https://arxiv.org/pdf/2206.14747.pdf + """ + + def __init__( + self, + channels: int, + hidden_ratio: int = 3, + kernel_size: Tuple[int, int] = (7, 7), + layerdrop_rate: FloatLike = None, + ): + super().__init__() + self.padding = ((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2) + hidden_channels = channels * hidden_ratio + if layerdrop_rate is None: + layerdrop_rate = ScheduledFloat((0.0, 0.2), (20000.0, 0.015)) + self.layerdrop_rate = layerdrop_rate + + self.depthwise_conv = nn.Conv2d( + in_channels=channels, + out_channels=channels, + groups=channels, + kernel_size=kernel_size, + padding=self.padding, + ) + + self.pointwise_conv1 = nn.Conv2d( + in_channels=channels, out_channels=hidden_channels, kernel_size=1 + ) + + self.hidden_balancer = Balancer( + hidden_channels, + channel_dim=1, + min_positive=0.3, + max_positive=1.0, + min_abs=0.75, + max_abs=5.0, + ) + + self.activation = SwooshL() + self.pointwise_conv2 = ScaledConv2d( + in_channels=hidden_channels, + out_channels=channels, + kernel_size=1, + initial_scale=0.01, + ) + + self.out_balancer = Balancer( + channels, + channel_dim=1, + min_positive=0.4, + max_positive=0.6, + min_abs=1.0, + max_abs=6.0, + ) + self.out_whiten = Whiten( + num_groups=1, + whitening_limit=5.0, + prob=(0.025, 0.25), + grad_scale=0.01, + ) + + def forward(self, x: Tensor) -> Tensor: + if torch.jit.is_scripting() or torch.jit.is_tracing() or not self.training: + return self.forward_internal(x) + layerdrop_rate = float(self.layerdrop_rate) + + if layerdrop_rate != 0.0: + batch_size = x.shape[0] + mask = ( + torch.rand((batch_size, 1, 1, 1), dtype=x.dtype, device=x.device) + > layerdrop_rate + ) + else: + mask = None + # turns out this caching idea does not work with --world-size > 1 + # return caching_eval(self.forward_internal, x, mask) + return self.forward_internal(x, mask) + + def forward_internal( + self, x: Tensor, layer_skip_mask: Optional[Tensor] = None + ) -> Tensor: + """ + x layout: (N, C, H, W), i.e. (batch_size, num_channels, num_frames, num_freqs) + + The returned value has the same shape as x. + """ + bypass = x + x = self.depthwise_conv(x) + x = self.pointwise_conv1(x) + x = self.hidden_balancer(x) + x = self.activation(x) + x = self.pointwise_conv2(x) + + if layer_skip_mask is not None: + x = x * layer_skip_mask + + x = bypass + x + x = self.out_balancer(x) + + if x.requires_grad: + x = x.transpose(1, 3) # (N, W, H, C); need channel dim to be last + x = self.out_whiten(x) + x = x.transpose(1, 3) # (N, C, H, W) + + return x + + def streaming_forward( + self, + x: Tensor, + cached_left_pad: Tensor, + ) -> Tuple[Tensor, Tensor]: + """ + Args: + x layout: (N, C, H, W), i.e. (batch_size, num_channels, num_frames, num_freqs) + cached_left_pad: (batch_size, num_channels, left_pad, num_freqs) + + Returns: + - The returned value has the same shape as x. + - Updated cached_left_pad. 
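+          (cached_left_pad holds the trailing frames of the current chunk,
+          excluding the right-padding frames, which the depthwise conv will use
+          as left context for the next chunk.)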
+        """
+        padding = self.padding
+
+        # The length without right padding for depth-wise conv
+        T = x.size(2) - padding[0]
+
+        bypass = x[:, :, :T, :]
+
+        # Pad left side
+        assert cached_left_pad.size(2) == padding[0], (
+            cached_left_pad.size(2),
+            padding[0],
+        )
+        x = torch.cat([cached_left_pad, x], dim=2)
+        # Update cached left padding
+        cached_left_pad = x[:, :, T : padding[0] + T, :]
+
+        # depthwise_conv
+        x = torch.nn.functional.conv2d(
+            x,
+            weight=self.depthwise_conv.weight,
+            bias=self.depthwise_conv.bias,
+            padding=(0, padding[1]),
+            groups=self.depthwise_conv.groups,
+        )
+        x = self.pointwise_conv1(x)
+        x = self.hidden_balancer(x)
+        x = self.activation(x)
+        x = self.pointwise_conv2(x)
+
+        x = bypass + x
+        return x, cached_left_pad
+
+
+class Conv2dSubsampling(nn.Module):
+    """Convolutional 2D subsampling (to 1/2 length).
+
+    Convert an input of shape (N, T, idim) to an output
+    with shape (N, T', odim), where
+    T' = (T-3)//2 - 2 == (T-7)//2
+
+    It is based on
+    https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py  # noqa
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        layer1_channels: int = 8,
+        layer2_channels: int = 32,
+        layer3_channels: int = 128,
+        dropout: FloatLike = 0.1,
+    ) -> None:
+        """
+        Args:
+          in_channels:
+            Number of input channels. The input shape is (N, T, in_channels).
+            Caution: it requires T >= 7 and in_channels >= 7.
+          out_channels:
+            Output dim. The output shape is (N, (T-7)//2, out_channels).
+          layer1_channels:
+            Number of channels in layer1
+          layer2_channels:
+            Number of channels in layer2
+          layer3_channels:
+            Number of channels in layer3
+          dropout:
+            Dropout rate (possibly a schedule), applied after the output
+            projection.
+        """
+        assert in_channels >= 7
+        super().__init__()
+
+        # The ScaleGrad module is there to prevent the gradients
+        # w.r.t. the weight or bias of the first Conv2d module in self.conv from
+        # exceeding the range of fp16 when using automatic mixed precision (amp)
+        # training. (The second one is necessary to stop its bias from getting
+        # a too-large gradient).
+
+        self.conv = nn.Sequential(
+            nn.Conv2d(
+                in_channels=1,
+                out_channels=layer1_channels,
+                kernel_size=3,
+                padding=(0, 1),  # (time, freq)
+            ),
+            ScaleGrad(0.2),
+            Balancer(layer1_channels, channel_dim=1, max_abs=1.0),
+            SwooshR(),
+            nn.Conv2d(
+                in_channels=layer1_channels,
+                out_channels=layer2_channels,
+                kernel_size=3,
+                stride=2,
+                padding=0,
+            ),
+            Balancer(layer2_channels, channel_dim=1, max_abs=4.0),
+            SwooshR(),
+            nn.Conv2d(
+                in_channels=layer2_channels,
+                out_channels=layer3_channels,
+                kernel_size=3,
+                stride=(1, 2),  # (time, freq)
+            ),
+            Balancer(layer3_channels, channel_dim=1, max_abs=4.0),
+            SwooshR(),
+        )
+
+        # just one convnext layer
+        self.convnext = ConvNeXt(layer3_channels, kernel_size=(7, 7))
+
+        # (in_channels-3)//4
+        self.out_width = (((in_channels - 1) // 2) - 1) // 2
+        self.layer3_channels = layer3_channels
+
+        self.out = nn.Linear(self.out_width * layer3_channels, out_channels)
+        # use a larger than normal grad_scale on this whitening module; there is
+        # only one such module, so there is not a concern about adding together
+        # many copies of this extra gradient term.
+        self.out_whiten = Whiten(
+            num_groups=1,
+            whitening_limit=ScheduledFloat((0.0, 4.0), (20000.0, 8.0), default=4.0),
+            prob=(0.025, 0.25),
+            grad_scale=0.02,
+        )
+
+        # max_log_eps=0.0 is to prevent both eps and the output of self.out from
+        # getting large; otherwise there is an unnecessary degree of freedom.
+ self.out_norm = BiasNorm(out_channels) + self.dropout = Dropout3(dropout, shared_dim=1) + + def forward( + self, x: torch.Tensor, x_lens: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Subsample x. + + Args: + x: + Its shape is (N, T, idim). + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + + Returns: + - a tensor of shape (N, (T-7)//2, odim) + - output lengths, of shape (batch_size,) + """ + # On entry, x is (N, T, idim) + x = x.unsqueeze(1) # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W) + # scaling x by 0.1 allows us to use a larger grad-scale in fp16 "amp" (automatic mixed precision) + # training, since the weights in the first convolution are otherwise the limiting factor for getting infinite + # gradients. + x = self.conv(x) + x = self.convnext(x) + + # Now x is of shape (N, odim, (T-7)//2, (idim-3)//4) + b, c, t, f = x.size() + + x = x.transpose(1, 2).reshape(b, t, c * f) + # now x: (N, (T-7)//2, out_width * layer3_channels)) + + x = self.out(x) + # Now x is of shape (N, (T-7)//2, odim) + x = self.out_whiten(x) + x = self.out_norm(x) + x = self.dropout(x) + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + x_lens = (x_lens - 7) // 2 + else: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + x_lens = (x_lens - 7) // 2 + assert x.size(1) == x_lens.max().item(), (x.size(1), x_lens.max()) + + return x, x_lens + + def streaming_forward( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + cached_left_pad: Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Subsample x. + + Args: + x: + Its shape is (N, T, idim). + x_lens: + A tensor of shape (batch_size,) containing the number of frames in + + Returns: + - a tensor of shape (N, (T-7)//2, odim) + - output lengths, of shape (batch_size,) + - updated cache + """ + # On entry, x is (N, T, idim) + x = x.unsqueeze(1) # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W) + + # T' = (T-7)//2 + x = self.conv(x) + + # T' = (T-7)//2-3 + x, cached_left_pad = self.convnext.streaming_forward( + x, cached_left_pad=cached_left_pad + ) + + # Now x is of shape (N, odim, T', ((idim-1)//2 - 1)//2) + b, c, t, f = x.size() + + x = x.transpose(1, 2).reshape(b, t, c * f) + # now x: (N, T', out_width * layer3_channels)) + + x = self.out(x) + # Now x is of shape (N, T', odim) + x = self.out_norm(x) + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + assert self.convnext.padding[0] == 3 + # The ConvNeXt module needs 3 frames of right padding after subsampling + x_lens = (x_lens - 7) // 2 - 3 + else: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # The ConvNeXt module needs 3 frames of right padding after subsampling + assert self.convnext.padding[0] == 3 + x_lens = (x_lens - 7) // 2 - 3 + + assert x.size(1) == x_lens.max().item(), (x.shape, x_lens.max()) + + return x, x_lens, cached_left_pad + + @torch.jit.export + def get_init_states( + self, + batch_size: int = 1, + device: torch.device = torch.device("cpu"), + ) -> Tensor: + """Get initial states for Conv2dSubsampling module. 
+ It is the cached left padding for ConvNeXt module, + of shape (batch_size, num_channels, left_pad, num_freqs) + """ + left_pad = self.convnext.padding[0] + freq = self.out_width + channels = self.layer3_channels + cached_embed_left_pad = torch.zeros(batch_size, channels, left_pad, freq).to( + device + ) + + return cached_embed_left_pad diff --git a/egs/librispeech/SSL/shared b/egs/librispeech/SSL/shared new file mode 120000 index 0000000000..4cbd91a7e9 --- /dev/null +++ b/egs/librispeech/SSL/shared @@ -0,0 +1 @@ +../../../icefall/shared \ No newline at end of file From 75c5389979f39ce8e79b0c265b35254bc6adeafd Mon Sep 17 00:00:00 2001 From: yifanyeung Date: Sat, 23 Dec 2023 15:27:34 +0800 Subject: [PATCH 2/4] update --- egs/librispeech/SSL/hubert/beam_search.py | 2943 +-------------------- egs/librispeech/SSL/hubert/ctc_decode.py | 24 +- egs/librispeech/SSL/hubert/dataset.py | 12 +- egs/librispeech/SSL/hubert/decoder.py | 135 +- egs/librispeech/SSL/hubert/finetune.py | 36 +- egs/librispeech/SSL/hubert/joiner.py | 68 +- egs/librispeech/SSL/hubert/optim.py | 1245 +-------- egs/librispeech/SSL/hubert/scaling.py | 1909 +------------ egs/librispeech/SSL/hubert/subsampling.py | 406 --- 9 files changed, 42 insertions(+), 6736 deletions(-) mode change 100644 => 120000 egs/librispeech/SSL/hubert/beam_search.py mode change 100644 => 120000 egs/librispeech/SSL/hubert/decoder.py mode change 100644 => 120000 egs/librispeech/SSL/hubert/joiner.py mode change 100644 => 120000 egs/librispeech/SSL/hubert/optim.py mode change 100644 => 120000 egs/librispeech/SSL/hubert/scaling.py delete mode 100644 egs/librispeech/SSL/hubert/subsampling.py diff --git a/egs/librispeech/SSL/hubert/beam_search.py b/egs/librispeech/SSL/hubert/beam_search.py deleted file mode 100644 index 7fcd242fcd..0000000000 --- a/egs/librispeech/SSL/hubert/beam_search.py +++ /dev/null @@ -1,2942 +0,0 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang -# Xiaoyu Yang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import warnings -from dataclasses import dataclass, field -from typing import Dict, List, Optional, Tuple, Union - -import k2 -import sentencepiece as spm -import torch -from torch import nn - -from icefall import ContextGraph, ContextState, NgramLm, NgramLmStateCost -from icefall.decode import Nbest, one_best_decoding -from icefall.lm_wrapper import LmScorer -from icefall.rnn_lm.model import RnnLmModel -from icefall.transformer_lm.model import TransformerLM -from icefall.utils import ( - DecodingResults, - add_eos, - add_sos, - get_texts, - get_texts_with_timestamp, -) - - -def fast_beam_search_one_best( - model: nn.Module, - decoding_graph: k2.Fsa, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - beam: float, - max_states: int, - max_contexts: int, - temperature: float = 1.0, - ilme_scale: float = 0.0, - blank_penalty: float = 0.0, - return_timestamps: bool = False, - allow_partial: bool = False, -) -> Union[List[List[int]], DecodingResults]: - """It limits the maximum number of symbols per frame to 1. - - A lattice is first obtained using fast beam search, and then - the shortest path within the lattice is used as the final output. - - Args: - model: - An instance of `Transducer`. - decoding_graph: - Decoding graph used for decoding, may be a TrivialGraph or a LG. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - encoder_out_lens: - A tensor of shape (N,) containing the number of frames in `encoder_out` - before padding. - beam: - Beam value, similar to the beam used in Kaldi.. - max_states: - Max states per stream per frame. - max_contexts: - Max contexts pre stream per frame. - temperature: - Softmax temperature. - return_timestamps: - Whether to return timestamps. - Returns: - If return_timestamps is False, return the decoded result. - Else, return a DecodingResults object containing - decoded result and corresponding timestamps. - """ - lattice = fast_beam_search( - model=model, - decoding_graph=decoding_graph, - encoder_out=encoder_out, - encoder_out_lens=encoder_out_lens, - beam=beam, - max_states=max_states, - max_contexts=max_contexts, - temperature=temperature, - ilme_scale=ilme_scale, - allow_partial=allow_partial, - blank_penalty=blank_penalty, - ) - - best_path = one_best_decoding(lattice) - - if not return_timestamps: - return get_texts(best_path) - else: - return get_texts_with_timestamp(best_path) - - -def fast_beam_search_nbest_LG( - model: nn.Module, - decoding_graph: k2.Fsa, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - beam: float, - max_states: int, - max_contexts: int, - num_paths: int, - nbest_scale: float = 0.5, - use_double_scores: bool = True, - temperature: float = 1.0, - blank_penalty: float = 0.0, - ilme_scale: float = 0.0, - return_timestamps: bool = False, - allow_partial: bool = False, -) -> Union[List[List[int]], DecodingResults]: - """It limits the maximum number of symbols per frame to 1. - - The process to get the results is: - - (1) Use fast beam search to get a lattice - - (2) Select `num_paths` paths from the lattice using k2.random_paths() - - (3) Unique the selected paths - - (4) Intersect the selected paths with the lattice and compute the - shortest path from the intersection result - - (5) The path with the largest score is used as the decoding output. - - Args: - model: - An instance of `Transducer`. - decoding_graph: - Decoding graph used for decoding, may be a TrivialGraph or a LG. - encoder_out: - A tensor of shape (N, T, C) from the encoder. 
- encoder_out_lens: - A tensor of shape (N,) containing the number of frames in `encoder_out` - before padding. - beam: - Beam value, similar to the beam used in Kaldi.. - max_states: - Max states per stream per frame. - max_contexts: - Max contexts pre stream per frame. - num_paths: - Number of paths to extract from the decoded lattice. - nbest_scale: - It's the scale applied to the lattice.scores. A smaller value - yields more unique paths. - use_double_scores: - True to use double precision for computation. False to use - single precision. - temperature: - Softmax temperature. - return_timestamps: - Whether to return timestamps. - Returns: - If return_timestamps is False, return the decoded result. - Else, return a DecodingResults object containing - decoded result and corresponding timestamps. - """ - lattice = fast_beam_search( - model=model, - decoding_graph=decoding_graph, - encoder_out=encoder_out, - encoder_out_lens=encoder_out_lens, - beam=beam, - max_states=max_states, - max_contexts=max_contexts, - temperature=temperature, - allow_partial=allow_partial, - blank_penalty=blank_penalty, - ilme_scale=ilme_scale, - ) - - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=num_paths, - use_double_scores=use_double_scores, - nbest_scale=nbest_scale, - ) - - # The following code is modified from nbest.intersect() - word_fsa = k2.invert(nbest.fsa) - if hasattr(lattice, "aux_labels"): - # delete token IDs as it is not needed - del word_fsa.aux_labels - word_fsa.scores.zero_() - word_fsa_with_epsilon_loops = k2.linear_fsa_with_self_loops(word_fsa) - path_to_utt_map = nbest.shape.row_ids(1) - - if hasattr(lattice, "aux_labels"): - # lattice has token IDs as labels and word IDs as aux_labels. - # inv_lattice has word IDs as labels and token IDs as aux_labels - inv_lattice = k2.invert(lattice) - inv_lattice = k2.arc_sort(inv_lattice) - else: - inv_lattice = k2.arc_sort(lattice) - - if inv_lattice.shape[0] == 1: - path_lattice = k2.intersect_device( - inv_lattice, - word_fsa_with_epsilon_loops, - b_to_a_map=torch.zeros_like(path_to_utt_map), - sorted_match_a=True, - ) - else: - path_lattice = k2.intersect_device( - inv_lattice, - word_fsa_with_epsilon_loops, - b_to_a_map=path_to_utt_map, - sorted_match_a=True, - ) - - # path_lattice has word IDs as labels and token IDs as aux_labels - path_lattice = k2.top_sort(k2.connect(path_lattice)) - tot_scores = path_lattice.get_tot_scores( - use_double_scores=use_double_scores, - log_semiring=True, # Note: we always use True - ) - # See https://github.com/k2-fsa/icefall/pull/420 for why - # we always use log_semiring=True - - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - best_hyp_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, best_hyp_indexes) - - if not return_timestamps: - return get_texts(best_path) - else: - return get_texts_with_timestamp(best_path) - - -def fast_beam_search_nbest( - model: nn.Module, - decoding_graph: k2.Fsa, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - beam: float, - max_states: int, - max_contexts: int, - num_paths: int, - nbest_scale: float = 0.5, - use_double_scores: bool = True, - temperature: float = 1.0, - blank_penalty: float = 0.0, - return_timestamps: bool = False, - allow_partial: bool = False, -) -> Union[List[List[int]], DecodingResults]: - """It limits the maximum number of symbols per frame to 1. 
- - The process to get the results is: - - (1) Use fast beam search to get a lattice - - (2) Select `num_paths` paths from the lattice using k2.random_paths() - - (3) Unique the selected paths - - (4) Intersect the selected paths with the lattice and compute the - shortest path from the intersection result - - (5) The path with the largest score is used as the decoding output. - - Args: - model: - An instance of `Transducer`. - decoding_graph: - Decoding graph used for decoding, may be a TrivialGraph or a LG. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - encoder_out_lens: - A tensor of shape (N,) containing the number of frames in `encoder_out` - before padding. - beam: - Beam value, similar to the beam used in Kaldi.. - max_states: - Max states per stream per frame. - max_contexts: - Max contexts pre stream per frame. - num_paths: - Number of paths to extract from the decoded lattice. - nbest_scale: - It's the scale applied to the lattice.scores. A smaller value - yields more unique paths. - use_double_scores: - True to use double precision for computation. False to use - single precision. - temperature: - Softmax temperature. - return_timestamps: - Whether to return timestamps. - Returns: - If return_timestamps is False, return the decoded result. - Else, return a DecodingResults object containing - decoded result and corresponding timestamps. - """ - lattice = fast_beam_search( - model=model, - decoding_graph=decoding_graph, - encoder_out=encoder_out, - encoder_out_lens=encoder_out_lens, - beam=beam, - max_states=max_states, - max_contexts=max_contexts, - blank_penalty=blank_penalty, - temperature=temperature, - allow_partial=allow_partial, - ) - - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=num_paths, - use_double_scores=use_double_scores, - nbest_scale=nbest_scale, - ) - - # at this point, nbest.fsa.scores are all zeros. - - nbest = nbest.intersect(lattice) - # Now nbest.fsa.scores contains acoustic scores - - max_indexes = nbest.tot_scores().argmax() - - best_path = k2.index_fsa(nbest.fsa, max_indexes) - - if not return_timestamps: - return get_texts(best_path) - else: - return get_texts_with_timestamp(best_path) - - -def fast_beam_search_nbest_oracle( - model: nn.Module, - decoding_graph: k2.Fsa, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - beam: float, - max_states: int, - max_contexts: int, - num_paths: int, - ref_texts: List[List[int]], - use_double_scores: bool = True, - nbest_scale: float = 0.5, - temperature: float = 1.0, - blank_penalty: float = 0.0, - return_timestamps: bool = False, - allow_partial: bool = False, -) -> Union[List[List[int]], DecodingResults]: - """It limits the maximum number of symbols per frame to 1. - - A lattice is first obtained using fast beam search, and then - we select `num_paths` linear paths from the lattice. The path - that has the minimum edit distance with the given reference transcript - is used as the output. - - This is the best result we can achieve for any nbest based rescoring - methods. - - Args: - model: - An instance of `Transducer`. - decoding_graph: - Decoding graph used for decoding, may be a TrivialGraph or a LG. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - encoder_out_lens: - A tensor of shape (N,) containing the number of frames in `encoder_out` - before padding. - beam: - Beam value, similar to the beam used in Kaldi.. - max_states: - Max states per stream per frame. - max_contexts: - Max contexts pre stream per frame. 
- num_paths: - Number of paths to extract from the decoded lattice. - ref_texts: - A list-of-list of integers containing the reference transcripts. - If the decoding_graph is a trivial_graph, the integer ID is the - BPE token ID. - use_double_scores: - True to use double precision for computation. False to use - single precision. - nbest_scale: - It's the scale applied to the lattice.scores. A smaller value - yields more unique paths. - temperature: - Softmax temperature. - return_timestamps: - Whether to return timestamps. - Returns: - If return_timestamps is False, return the decoded result. - Else, return a DecodingResults object containing - decoded result and corresponding timestamps. - """ - lattice = fast_beam_search( - model=model, - decoding_graph=decoding_graph, - encoder_out=encoder_out, - encoder_out_lens=encoder_out_lens, - beam=beam, - max_states=max_states, - max_contexts=max_contexts, - temperature=temperature, - allow_partial=allow_partial, - blank_penalty=blank_penalty, - ) - - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=num_paths, - use_double_scores=use_double_scores, - nbest_scale=nbest_scale, - ) - - hyps = nbest.build_levenshtein_graphs() - refs = k2.levenshtein_graph(ref_texts, device=hyps.device) - - levenshtein_alignment = k2.levenshtein_alignment( - refs=refs, - hyps=hyps, - hyp_to_ref_map=nbest.shape.row_ids(1), - sorted_match_ref=True, - ) - - tot_scores = levenshtein_alignment.get_tot_scores( - use_double_scores=False, log_semiring=False - ) - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - - max_indexes = ragged_tot_scores.argmax() - - best_path = k2.index_fsa(nbest.fsa, max_indexes) - - if not return_timestamps: - return get_texts(best_path) - else: - return get_texts_with_timestamp(best_path) - - -def fast_beam_search( - model: nn.Module, - decoding_graph: k2.Fsa, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - beam: float, - max_states: int, - max_contexts: int, - temperature: float = 1.0, - subtract_ilme: bool = False, - ilme_scale: float = 0.1, - allow_partial: bool = False, - blank_penalty: float = 0.0, -) -> k2.Fsa: - """It limits the maximum number of symbols per frame to 1. - - Args: - model: - An instance of `Transducer`. - decoding_graph: - Decoding graph used for decoding, may be a TrivialGraph or a LG. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - encoder_out_lens: - A tensor of shape (N,) containing the number of frames in `encoder_out` - before padding. - beam: - Beam value, similar to the beam used in Kaldi.. - max_states: - Max states per stream per frame. - max_contexts: - Max contexts pre stream per frame. - temperature: - Softmax temperature. - Returns: - Return an FsaVec with axes [utt][state][arc] containing the decoded - lattice. Note: When the input graph is a TrivialGraph, the returned - lattice is actually an acceptor. 
- """ - assert encoder_out.ndim == 3 - - context_size = model.decoder.context_size - vocab_size = model.decoder.vocab_size - - B, T, C = encoder_out.shape - - config = k2.RnntDecodingConfig( - vocab_size=vocab_size, - decoder_history_len=context_size, - beam=beam, - max_contexts=max_contexts, - max_states=max_states, - ) - individual_streams = [] - for i in range(B): - individual_streams.append(k2.RnntDecodingStream(decoding_graph)) - decoding_streams = k2.RnntDecodingStreams(individual_streams, config) - - encoder_out = model.joiner.encoder_proj(encoder_out) - - for t in range(T): - # shape is a RaggedShape of shape (B, context) - # contexts is a Tensor of shape (shape.NumElements(), context_size) - shape, contexts = decoding_streams.get_contexts() - # `nn.Embedding()` in torch below v1.7.1 supports only torch.int64 - contexts = contexts.to(torch.int64) - # decoder_out is of shape (shape.NumElements(), 1, decoder_out_dim) - decoder_out = model.decoder(contexts, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - # current_encoder_out is of shape - # (shape.NumElements(), 1, joiner_dim) - # fmt: off - current_encoder_out = torch.index_select( - encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64) - ) - # fmt: on - logits = model.joiner( - current_encoder_out.unsqueeze(2), - decoder_out.unsqueeze(1), - project_input=False, - ) - logits = logits.squeeze(1).squeeze(1) - - if blank_penalty != 0: - logits[:, 0] -= blank_penalty - - log_probs = (logits / temperature).log_softmax(dim=-1) - - if ilme_scale != 0: - ilme_logits = model.joiner( - torch.zeros_like( - current_encoder_out, device=current_encoder_out.device - ).unsqueeze(2), - decoder_out.unsqueeze(1), - project_input=False, - ) - ilme_logits = ilme_logits.squeeze(1).squeeze(1) - if blank_penalty != 0: - ilme_logits[:, 0] -= blank_penalty - ilme_log_probs = (ilme_logits / temperature).log_softmax(dim=-1) - log_probs -= ilme_scale * ilme_log_probs - - decoding_streams.advance(log_probs) - decoding_streams.terminate_and_flush_to_streams() - lattice = decoding_streams.format_output( - encoder_out_lens.tolist(), allow_partial=allow_partial - ) - - return lattice - - -def greedy_search( - model: nn.Module, - encoder_out: torch.Tensor, - max_sym_per_frame: int, - blank_penalty: float = 0.0, - return_timestamps: bool = False, -) -> Union[List[int], DecodingResults]: - """Greedy search for a single utterance. - Args: - model: - An instance of `Transducer`. - encoder_out: - A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. - max_sym_per_frame: - Maximum number of symbols per frame. If it is set to 0, the WER - would be 100%. - return_timestamps: - Whether to return timestamps. - Returns: - If return_timestamps is False, return the decoded result. - Else, return a DecodingResults object containing - decoded result and corresponding timestamps. 
- """ - assert encoder_out.ndim == 3 - - # support only batch_size == 1 for now - assert encoder_out.size(0) == 1, encoder_out.size(0) - - blank_id = model.decoder.blank_id - context_size = model.decoder.context_size - unk_id = getattr(model, "unk_id", blank_id) - - device = next(model.parameters()).device - - decoder_input = torch.tensor( - [-1] * (context_size - 1) + [blank_id], device=device, dtype=torch.int64 - ).reshape(1, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - - encoder_out = model.joiner.encoder_proj(encoder_out) - - T = encoder_out.size(1) - t = 0 - hyp = [blank_id] * context_size - - # timestamp[i] is the frame index after subsampling - # on which hyp[i] is decoded - timestamp = [] - - # Maximum symbols per utterance. - max_sym_per_utt = 1000 - - # symbols per frame - sym_per_frame = 0 - - # symbols per utterance decoded so far - sym_per_utt = 0 - - while t < T and sym_per_utt < max_sym_per_utt: - if sym_per_frame >= max_sym_per_frame: - sym_per_frame = 0 - t += 1 - continue - - # fmt: off - current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2) - # fmt: on - logits = model.joiner( - current_encoder_out, decoder_out.unsqueeze(1), project_input=False - ) - # logits is (1, 1, 1, vocab_size) - - if blank_penalty != 0: - logits[:, :, :, 0] -= blank_penalty - - y = logits.argmax().item() - if y not in (blank_id, unk_id): - hyp.append(y) - timestamp.append(t) - decoder_input = torch.tensor([hyp[-context_size:]], device=device).reshape( - 1, context_size - ) - - decoder_out = model.decoder(decoder_input, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - - sym_per_utt += 1 - sym_per_frame += 1 - else: - sym_per_frame = 0 - t += 1 - hyp = hyp[context_size:] # remove blanks - - if not return_timestamps: - return hyp - else: - return DecodingResults( - hyps=[hyp], - timestamps=[timestamp], - ) - - -def greedy_search_batch( - model: nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - blank_penalty: float = 0, - return_timestamps: bool = False, -) -> Union[List[List[int]], DecodingResults]: - """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1. - Args: - model: - The transducer model. - encoder_out: - Output from the encoder. Its shape is (N, T, C), where N >= 1. - encoder_out_lens: - A 1-D tensor of shape (N,), containing number of valid frames in - encoder_out before padding. - return_timestamps: - Whether to return timestamps. - Returns: - If return_timestamps is False, return the decoded result. - Else, return a DecodingResults object containing - decoded result and corresponding timestamps. 
- """ - assert encoder_out.ndim == 3 - assert encoder_out.size(0) >= 1, encoder_out.size(0) - - packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( - input=encoder_out, - lengths=encoder_out_lens.cpu(), - batch_first=True, - enforce_sorted=False, - ) - - device = next(model.parameters()).device - - blank_id = model.decoder.blank_id - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - - batch_size_list = packed_encoder_out.batch_sizes.tolist() - N = encoder_out.size(0) - assert torch.all(encoder_out_lens > 0), encoder_out_lens - assert N == batch_size_list[0], (N, batch_size_list) - - hyps = [[-1] * (context_size - 1) + [blank_id] for _ in range(N)] - - # timestamp[n][i] is the frame index after subsampling - # on which hyp[n][i] is decoded - timestamps = [[] for _ in range(N)] - # scores[n][i] is the logits on which hyp[n][i] is decoded - scores = [[] for _ in range(N)] - - decoder_input = torch.tensor( - hyps, - device=device, - dtype=torch.int64, - ) # (N, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - # decoder_out: (N, 1, decoder_out_dim) - - encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) - - offset = 0 - for t, batch_size in enumerate(batch_size_list): - start = offset - end = offset + batch_size - current_encoder_out = encoder_out.data[start:end] - current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) - # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim) - offset = end - - decoder_out = decoder_out[:batch_size] - - logits = model.joiner( - current_encoder_out, decoder_out.unsqueeze(1), project_input=False - ) - # logits'shape (batch_size, 1, 1, vocab_size) - - logits = logits.squeeze(1).squeeze(1) # (batch_size, vocab_size) - assert logits.ndim == 2, logits.shape - - if blank_penalty != 0: - logits[:, 0] -= blank_penalty - - y = logits.argmax(dim=1).tolist() - emitted = False - for i, v in enumerate(y): - if v not in (blank_id, unk_id): - hyps[i].append(v) - timestamps[i].append(t) - scores[i].append(logits[i, v].item()) - emitted = True - if emitted: - # update decoder output - decoder_input = [h[-context_size:] for h in hyps[:batch_size]] - decoder_input = torch.tensor( - decoder_input, - device=device, - dtype=torch.int64, - ) - decoder_out = model.decoder(decoder_input, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - - sorted_ans = [h[context_size:] for h in hyps] - ans = [] - ans_timestamps = [] - ans_scores = [] - unsorted_indices = packed_encoder_out.unsorted_indices.tolist() - for i in range(N): - ans.append(sorted_ans[unsorted_indices[i]]) - ans_timestamps.append(timestamps[unsorted_indices[i]]) - ans_scores.append(scores[unsorted_indices[i]]) - - if not return_timestamps: - return ans - else: - return DecodingResults( - hyps=ans, - timestamps=ans_timestamps, - scores=ans_scores, - ) - - -@dataclass -class Hypothesis: - # The predicted tokens so far. - # Newly predicted tokens are appended to `ys`. - ys: List[int] - - # The log prob of ys. - # It contains only one entry. 
-    log_prob: torch.Tensor
-
-    # timestamp[i] is the frame index after subsampling
-    # on which ys[i] is decoded
-    timestamp: List[int] = field(default_factory=list)
-
-    # the lm score for next token given the current ys
-    lm_score: Optional[torch.Tensor] = None
-
-    # the RNNLM states (h and c in LSTM)
-    state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
-
-    # N-gram LM state
-    state_cost: Optional[NgramLmStateCost] = None
-
-    # Context graph state
-    context_state: Optional[ContextState] = None
-
-    @property
-    def key(self) -> str:
-        """Return a string representation of self.ys"""
-        return "_".join(map(str, self.ys))
-
-
-class HypothesisList(object):
-    def __init__(self, data: Optional[Dict[str, Hypothesis]] = None) -> None:
-        """
-        Args:
-          data:
-            A dict of Hypotheses. Its key is its `value.key`.
-        """
-        if data is None:
-            self._data = {}
-        else:
-            self._data = data
-
-    @property
-    def data(self) -> Dict[str, Hypothesis]:
-        return self._data
-
-    def add(self, hyp: Hypothesis) -> None:
-        """Add a Hypothesis to `self`.
-
-        If `hyp` already exists in `self`, its probability is updated using
-        `log-sum-exp` with the existing one.
-
-        Args:
-          hyp:
-            The hypothesis to be added.
-        """
-        key = hyp.key
-        if key in self:
-            old_hyp = self._data[key]  # shallow copy
-            torch.logaddexp(old_hyp.log_prob, hyp.log_prob, out=old_hyp.log_prob)
-        else:
-            self._data[key] = hyp
-
-    def get_most_probable(self, length_norm: bool = False) -> Hypothesis:
-        """Get the most probable hypothesis, i.e., the one with
-        the largest `log_prob`.
-
-        Args:
-          length_norm:
-            If True, the `log_prob` of a hypothesis is normalized by the
-            number of tokens in it.
-        Returns:
-          Return the hypothesis that has the largest `log_prob`.
-        """
-        if length_norm:
-            return max(self._data.values(), key=lambda hyp: hyp.log_prob / len(hyp.ys))
-        else:
-            return max(self._data.values(), key=lambda hyp: hyp.log_prob)
-
-    def remove(self, hyp: Hypothesis) -> None:
-        """Remove a given hypothesis.
-
-        Caution:
-          `self` is modified **in-place**.
-
-        Args:
-          hyp:
-            The hypothesis to be removed from `self`.
-            Note: It must be contained in `self`. Otherwise,
-            an exception is raised.
-        """
-        key = hyp.key
-        assert key in self, f"{key} does not exist"
-        del self._data[key]
-
-    def filter(self, threshold: torch.Tensor) -> "HypothesisList":
-        """Remove all Hypotheses whose log_prob is less than threshold.
-
-        Caution:
-          `self` is not modified. Instead, a new HypothesisList is returned.
-
-        Returns:
-          Return a new HypothesisList containing all hypotheses from `self`
-          with `log_prob` being greater than the given `threshold`.
-        """
-        ans = HypothesisList()
-        for _, hyp in self._data.items():
-            if hyp.log_prob > threshold:
-                ans.add(hyp)  # shallow copy
-        return ans
-
-    def topk(self, k: int, length_norm: bool = False) -> "HypothesisList":
-        """Return the top-k hypotheses.
-
-        Args:
-          length_norm:
-            If True, the `log_prob` of a hypothesis is normalized by the
-            number of tokens in it.
-        """
-        hyps = list(self._data.items())
-
-        if length_norm:
-            hyps = sorted(
-                hyps, key=lambda h: h[1].log_prob / len(h[1].ys), reverse=True
-            )[:k]
-        else:
-            hyps = sorted(hyps, key=lambda h: h[1].log_prob, reverse=True)[:k]
-
-        ans = HypothesisList(dict(hyps))
-        return ans
-
-    def __contains__(self, key: str):
-        return key in self._data
-
-    def __iter__(self):
-        return iter(self._data.values())
-
-    def __len__(self) -> int:
-        return len(self._data)
-
-    def __str__(self) -> str:
-        s = []
-        for key in self:
-            s.append(key)
-        return ", ".join(s)
-
-
-def get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape:
-    """Return a ragged shape with axes [utt][num_hyps].
-
-    Args:
-      hyps:
-        len(hyps) == batch_size. It contains the current hypotheses for
-        each utterance in the batch.
-    Returns:
-      Return a ragged shape with 2 axes [utt][num_hyps]. Note that
-      the shape is on CPU.
-    """
-    num_hyps = [len(h) for h in hyps]
-
-    # torch.cumsum() is inclusive sum, so we put a 0 at the beginning
-    # to get exclusive sum later.
-    num_hyps.insert(0, 0)
-
-    num_hyps = torch.tensor(num_hyps)
-    row_splits = torch.cumsum(num_hyps, dim=0, dtype=torch.int32)
-    ans = k2.ragged.create_ragged_shape2(
-        row_splits=row_splits, cached_tot_size=row_splits[-1].item()
-    )
-    return ans
-
-
-def modified_beam_search(
-    model: nn.Module,
-    encoder_out: torch.Tensor,
-    encoder_out_lens: torch.Tensor,
-    context_graph: Optional[ContextGraph] = None,
-    beam: int = 4,
-    temperature: float = 1.0,
-    blank_penalty: float = 0.0,
-    return_timestamps: bool = False,
-) -> Union[List[List[int]], DecodingResults]:
-    """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded.
-
-    Args:
-      model:
-        The transducer model.
-      encoder_out:
-        Output from the encoder. Its shape is (N, T, C).
-      encoder_out_lens:
-        A 1-D tensor of shape (N,), containing number of valid frames in
-        encoder_out before padding.
-      beam:
-        Number of active paths during the beam search.
-      temperature:
-        Softmax temperature.
-      return_timestamps:
-        Whether to return timestamps.
-    Returns:
-      If return_timestamps is False, return the decoded result.
-      Else, return a DecodingResults object containing
-      decoded result and corresponding timestamps.
- """ - assert encoder_out.ndim == 3, encoder_out.shape - assert encoder_out.size(0) >= 1, encoder_out.size(0) - - packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( - input=encoder_out, - lengths=encoder_out_lens.cpu(), - batch_first=True, - enforce_sorted=False, - ) - - blank_id = model.decoder.blank_id - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - device = next(model.parameters()).device - - batch_size_list = packed_encoder_out.batch_sizes.tolist() - N = encoder_out.size(0) - assert torch.all(encoder_out_lens > 0), encoder_out_lens - assert N == batch_size_list[0], (N, batch_size_list) - - B = [HypothesisList() for _ in range(N)] - for i in range(N): - B[i].add( - Hypothesis( - ys=[-1] * (context_size - 1) + [blank_id], - log_prob=torch.zeros(1, dtype=torch.float32, device=device), - context_state=None if context_graph is None else context_graph.root, - timestamp=[], - ) - ) - - encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) - - offset = 0 - finalized_B = [] - for t, batch_size in enumerate(batch_size_list): - start = offset - end = offset + batch_size - current_encoder_out = encoder_out.data[start:end] - current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) - # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) - offset = end - - finalized_B = B[batch_size:] + finalized_B - B = B[:batch_size] - - hyps_shape = get_hyps_shape(B).to(device) - - A = [list(b) for b in B] - - B = [HypothesisList() for _ in range(batch_size)] - - ys_log_probs = torch.cat( - [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps] - ) # (num_hyps, 1) - - decoder_input = torch.tensor( - [hyp.ys[-context_size:] for hyps in A for hyp in hyps], - device=device, - dtype=torch.int64, - ) # (num_hyps, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) - decoder_out = model.joiner.decoder_proj(decoder_out) - # decoder_out is of shape (num_hyps, 1, 1, joiner_dim) - - # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor - # as index, so we use `to(torch.int64)` below. 
- current_encoder_out = torch.index_select( - current_encoder_out, - dim=0, - index=hyps_shape.row_ids(1).to(torch.int64), - ) # (num_hyps, 1, 1, encoder_out_dim) - - logits = model.joiner( - current_encoder_out, - decoder_out, - project_input=False, - ) # (num_hyps, 1, 1, vocab_size) - - logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) - - if blank_penalty != 0: - logits[:, 0] -= blank_penalty - - log_probs = (logits / temperature).log_softmax(dim=-1) # (num_hyps, vocab_size) - - log_probs.add_(ys_log_probs) - - vocab_size = log_probs.size(-1) - - log_probs = log_probs.reshape(-1) - - row_splits = hyps_shape.row_splits(1) * vocab_size - log_probs_shape = k2.ragged.create_ragged_shape2( - row_splits=row_splits, cached_tot_size=log_probs.numel() - ) - ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) - - for i in range(batch_size): - topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - topk_hyp_indexes = (topk_indexes // vocab_size).tolist() - topk_token_indexes = (topk_indexes % vocab_size).tolist() - - for k in range(len(topk_hyp_indexes)): - hyp_idx = topk_hyp_indexes[k] - hyp = A[i][hyp_idx] - new_ys = hyp.ys[:] - new_token = topk_token_indexes[k] - new_timestamp = hyp.timestamp[:] - context_score = 0 - new_context_state = None if context_graph is None else hyp.context_state - if new_token not in (blank_id, unk_id): - new_ys.append(new_token) - new_timestamp.append(t) - if context_graph is not None: - ( - context_score, - new_context_state, - ) = context_graph.forward_one_step(hyp.context_state, new_token) - - new_log_prob = topk_log_probs[k] + context_score - - new_hyp = Hypothesis( - ys=new_ys, - log_prob=new_log_prob, - timestamp=new_timestamp, - context_state=new_context_state, - ) - B[i].add(new_hyp) - - B = B + finalized_B - - # finalize context_state, if the matched contexts do not reach final state - # we need to add the score on the corresponding backoff arc - if context_graph is not None: - finalized_B = [HypothesisList() for _ in range(len(B))] - for i, hyps in enumerate(B): - for hyp in list(hyps): - context_score, new_context_state = context_graph.finalize( - hyp.context_state - ) - finalized_B[i].add( - Hypothesis( - ys=hyp.ys, - log_prob=hyp.log_prob + context_score, - timestamp=hyp.timestamp, - context_state=new_context_state, - ) - ) - B = finalized_B - - best_hyps = [b.get_most_probable(length_norm=True) for b in B] - - sorted_ans = [h.ys[context_size:] for h in best_hyps] - sorted_timestamps = [h.timestamp for h in best_hyps] - ans = [] - ans_timestamps = [] - unsorted_indices = packed_encoder_out.unsorted_indices.tolist() - for i in range(N): - ans.append(sorted_ans[unsorted_indices[i]]) - ans_timestamps.append(sorted_timestamps[unsorted_indices[i]]) - - if not return_timestamps: - return ans - else: - return DecodingResults( - hyps=ans, - timestamps=ans_timestamps, - ) - - -def modified_beam_search_lm_rescore( - model: nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - LM: LmScorer, - lm_scale_list: List[int], - beam: int = 4, - temperature: float = 1.0, - return_timestamps: bool = False, -) -> Union[List[List[int]], DecodingResults]: - """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded. - Rescore the final results with RNNLM and return the one with the highest score - - Args: - model: - The transducer model. - encoder_out: - Output from the encoder. Its shape is (N, T, C). 
- encoder_out_lens: - A 1-D tensor of shape (N,), containing number of valid frames in - encoder_out before padding. - beam: - Number of active paths during the beam search. - temperature: - Softmax temperature. - LM: - A neural network language model - return_timestamps: - Whether to return timestamps. - Returns: - If return_timestamps is False, return the decoded result. - Else, return a DecodingResults object containing - decoded result and corresponding timestamps. - """ - assert encoder_out.ndim == 3, encoder_out.shape - assert encoder_out.size(0) >= 1, encoder_out.size(0) - - packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( - input=encoder_out, - lengths=encoder_out_lens.cpu(), - batch_first=True, - enforce_sorted=False, - ) - - blank_id = model.decoder.blank_id - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - device = next(model.parameters()).device - - batch_size_list = packed_encoder_out.batch_sizes.tolist() - N = encoder_out.size(0) - assert torch.all(encoder_out_lens > 0), encoder_out_lens - assert N == batch_size_list[0], (N, batch_size_list) - - B = [HypothesisList() for _ in range(N)] - for i in range(N): - B[i].add( - Hypothesis( - ys=[-1] * (context_size - 1) + [blank_id], - log_prob=torch.zeros(1, dtype=torch.float32, device=device), - timestamp=[], - ) - ) - - encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) - - offset = 0 - finalized_B = [] - for t, batch_size in enumerate(batch_size_list): - start = offset - end = offset + batch_size - current_encoder_out = encoder_out.data[start:end] - current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) - # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) - offset = end - - finalized_B = B[batch_size:] + finalized_B - B = B[:batch_size] - - hyps_shape = get_hyps_shape(B).to(device) - - A = [list(b) for b in B] - B = [HypothesisList() for _ in range(batch_size)] - - ys_log_probs = torch.cat( - [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps] - ) # (num_hyps, 1) - - decoder_input = torch.tensor( - [hyp.ys[-context_size:] for hyps in A for hyp in hyps], - device=device, - dtype=torch.int64, - ) # (num_hyps, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) - decoder_out = model.joiner.decoder_proj(decoder_out) - # decoder_out is of shape (num_hyps, 1, 1, joiner_dim) - - # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor - # as index, so we use `to(torch.int64)` below. 
- current_encoder_out = torch.index_select( - current_encoder_out, - dim=0, - index=hyps_shape.row_ids(1).to(torch.int64), - ) # (num_hyps, 1, 1, encoder_out_dim) - - logits = model.joiner( - current_encoder_out, - decoder_out, - project_input=False, - ) # (num_hyps, 1, 1, vocab_size) - - logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) - - log_probs = (logits / temperature).log_softmax(dim=-1) # (num_hyps, vocab_size) - - log_probs.add_(ys_log_probs) - - vocab_size = log_probs.size(-1) - - log_probs = log_probs.reshape(-1) - - row_splits = hyps_shape.row_splits(1) * vocab_size - log_probs_shape = k2.ragged.create_ragged_shape2( - row_splits=row_splits, cached_tot_size=log_probs.numel() - ) - ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) - - for i in range(batch_size): - topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - topk_hyp_indexes = (topk_indexes // vocab_size).tolist() - topk_token_indexes = (topk_indexes % vocab_size).tolist() - - for k in range(len(topk_hyp_indexes)): - hyp_idx = topk_hyp_indexes[k] - hyp = A[i][hyp_idx] - - new_ys = hyp.ys[:] - new_token = topk_token_indexes[k] - new_timestamp = hyp.timestamp[:] - if new_token not in (blank_id, unk_id): - new_ys.append(new_token) - new_timestamp.append(t) - - new_log_prob = topk_log_probs[k] - new_hyp = Hypothesis( - ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp - ) - B[i].add(new_hyp) - - B = B + finalized_B - - # get the am_scores for n-best list - hyps_shape = get_hyps_shape(B) - am_scores = torch.tensor([hyp.log_prob.item() for b in B for hyp in b]) - am_scores = k2.RaggedTensor(value=am_scores, shape=hyps_shape).to(device) - - # now LM rescore - # prepare input data to LM - candidate_seqs = [hyp.ys[context_size:] for b in B for hyp in b] - possible_seqs = k2.RaggedTensor(candidate_seqs) - row_splits = possible_seqs.shape.row_splits(1) - sentence_token_lengths = row_splits[1:] - row_splits[:-1] - possible_seqs_with_sos = add_sos(possible_seqs, sos_id=1) - possible_seqs_with_eos = add_eos(possible_seqs, eos_id=1) - sentence_token_lengths += 1 - - x = possible_seqs_with_sos.pad(mode="constant", padding_value=blank_id) - y = possible_seqs_with_eos.pad(mode="constant", padding_value=blank_id) - x = x.to(device).to(torch.int64) - y = y.to(device).to(torch.int64) - sentence_token_lengths = sentence_token_lengths.to(device).to(torch.int64) - - lm_scores = LM.lm(x=x, y=y, lengths=sentence_token_lengths) - assert lm_scores.ndim == 2 - lm_scores = -1 * lm_scores.sum(dim=1) - - ans = {} - unsorted_indices = packed_encoder_out.unsorted_indices.tolist() - - # get the best hyp with different lm_scale - for lm_scale in lm_scale_list: - key = f"nnlm_scale_{lm_scale:.2f}" - tot_scores = am_scores.values + lm_scores * lm_scale - ragged_tot_scores = k2.RaggedTensor(shape=am_scores.shape, value=tot_scores) - max_indexes = ragged_tot_scores.argmax().tolist() - unsorted_hyps = [candidate_seqs[idx] for idx in max_indexes] - hyps = [] - for idx in unsorted_indices: - hyps.append(unsorted_hyps[idx]) - - ans[key] = hyps - return ans - - -def modified_beam_search_lm_rescore_LODR( - model: nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - LM: LmScorer, - LODR_lm: NgramLm, - sp: spm.SentencePieceProcessor, - lm_scale_list: List[int], - beam: int = 4, - temperature: float = 1.0, - return_timestamps: bool = False, -) -> Union[List[List[int]], DecodingResults]: - """Beam search in batch mode 
with --max-sym-per-frame=1 being hardcoded. - Rescore the final results with RNNLM and return the one with the highest score - - Args: - model: - The transducer model. - encoder_out: - Output from the encoder. Its shape is (N, T, C). - encoder_out_lens: - A 1-D tensor of shape (N,), containing number of valid frames in - encoder_out before padding. - beam: - Number of active paths during the beam search. - temperature: - Softmax temperature. - LM: - A neural network language model - return_timestamps: - Whether to return timestamps. - Returns: - If return_timestamps is False, return the decoded result. - Else, return a DecodingResults object containing - decoded result and corresponding timestamps. - """ - assert encoder_out.ndim == 3, encoder_out.shape - assert encoder_out.size(0) >= 1, encoder_out.size(0) - - packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( - input=encoder_out, - lengths=encoder_out_lens.cpu(), - batch_first=True, - enforce_sorted=False, - ) - - blank_id = model.decoder.blank_id - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - device = next(model.parameters()).device - - batch_size_list = packed_encoder_out.batch_sizes.tolist() - N = encoder_out.size(0) - assert torch.all(encoder_out_lens > 0), encoder_out_lens - assert N == batch_size_list[0], (N, batch_size_list) - - B = [HypothesisList() for _ in range(N)] - for i in range(N): - B[i].add( - Hypothesis( - ys=[-1] * (context_size - 1) + [blank_id], - log_prob=torch.zeros(1, dtype=torch.float32, device=device), - timestamp=[], - ) - ) - - encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) - - offset = 0 - finalized_B = [] - for t, batch_size in enumerate(batch_size_list): - start = offset - end = offset + batch_size - current_encoder_out = encoder_out.data[start:end] - current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) - # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) - offset = end - - finalized_B = B[batch_size:] + finalized_B - B = B[:batch_size] - - hyps_shape = get_hyps_shape(B).to(device) - - A = [list(b) for b in B] - B = [HypothesisList() for _ in range(batch_size)] - - ys_log_probs = torch.cat( - [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps] - ) # (num_hyps, 1) - - decoder_input = torch.tensor( - [hyp.ys[-context_size:] for hyps in A for hyp in hyps], - device=device, - dtype=torch.int64, - ) # (num_hyps, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) - decoder_out = model.joiner.decoder_proj(decoder_out) - # decoder_out is of shape (num_hyps, 1, 1, joiner_dim) - - # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor - # as index, so we use `to(torch.int64)` below. 
- current_encoder_out = torch.index_select( - current_encoder_out, - dim=0, - index=hyps_shape.row_ids(1).to(torch.int64), - ) # (num_hyps, 1, 1, encoder_out_dim) - - logits = model.joiner( - current_encoder_out, - decoder_out, - project_input=False, - ) # (num_hyps, 1, 1, vocab_size) - - logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) - - log_probs = (logits / temperature).log_softmax(dim=-1) # (num_hyps, vocab_size) - - log_probs.add_(ys_log_probs) - - vocab_size = log_probs.size(-1) - - log_probs = log_probs.reshape(-1) - - row_splits = hyps_shape.row_splits(1) * vocab_size - log_probs_shape = k2.ragged.create_ragged_shape2( - row_splits=row_splits, cached_tot_size=log_probs.numel() - ) - ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) - - for i in range(batch_size): - topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - topk_hyp_indexes = (topk_indexes // vocab_size).tolist() - topk_token_indexes = (topk_indexes % vocab_size).tolist() - - for k in range(len(topk_hyp_indexes)): - hyp_idx = topk_hyp_indexes[k] - hyp = A[i][hyp_idx] - - new_ys = hyp.ys[:] - new_token = topk_token_indexes[k] - new_timestamp = hyp.timestamp[:] - if new_token not in (blank_id, unk_id): - new_ys.append(new_token) - new_timestamp.append(t) - - new_log_prob = topk_log_probs[k] - new_hyp = Hypothesis( - ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp - ) - B[i].add(new_hyp) - - B = B + finalized_B - - # get the am_scores for n-best list - hyps_shape = get_hyps_shape(B) - am_scores = torch.tensor([hyp.log_prob.item() for b in B for hyp in b]) - am_scores = k2.RaggedTensor(value=am_scores, shape=hyps_shape).to(device) - - # now LM rescore - # prepare input data to LM - candidate_seqs = [hyp.ys[context_size:] for b in B for hyp in b] - possible_seqs = k2.RaggedTensor(candidate_seqs) - row_splits = possible_seqs.shape.row_splits(1) - sentence_token_lengths = row_splits[1:] - row_splits[:-1] - possible_seqs_with_sos = add_sos(possible_seqs, sos_id=1) - possible_seqs_with_eos = add_eos(possible_seqs, eos_id=1) - sentence_token_lengths += 1 - - x = possible_seqs_with_sos.pad(mode="constant", padding_value=blank_id) - y = possible_seqs_with_eos.pad(mode="constant", padding_value=blank_id) - x = x.to(device).to(torch.int64) - y = y.to(device).to(torch.int64) - sentence_token_lengths = sentence_token_lengths.to(device).to(torch.int64) - - lm_scores = LM.lm(x=x, y=y, lengths=sentence_token_lengths) - assert lm_scores.ndim == 2 - lm_scores = -1 * lm_scores.sum(dim=1) - - # now LODR scores - import math - - LODR_scores = [] - for seq in candidate_seqs: - tokens = " ".join(sp.id_to_piece(seq)) - LODR_scores.append(LODR_lm.score(tokens)) - LODR_scores = torch.tensor(LODR_scores).to(device) * math.log( - 10 - ) # arpa scores are 10-based - assert lm_scores.shape == LODR_scores.shape - - ans = {} - unsorted_indices = packed_encoder_out.unsorted_indices.tolist() - - LODR_scale_list = [0.05 * i for i in range(1, 20)] - # get the best hyp with different lm_scale and lodr_scale - for lm_scale in lm_scale_list: - for lodr_scale in LODR_scale_list: - key = f"nnlm_scale_{lm_scale:.2f}_lodr_scale_{lodr_scale:.2f}" - tot_scores = ( - am_scores.values / lm_scale + lm_scores - LODR_scores * lodr_scale - ) - ragged_tot_scores = k2.RaggedTensor(shape=am_scores.shape, value=tot_scores) - max_indexes = ragged_tot_scores.argmax().tolist() - unsorted_hyps = [candidate_seqs[idx] for idx in max_indexes] - hyps = [] - 
for idx in unsorted_indices: - hyps.append(unsorted_hyps[idx]) - - ans[key] = hyps - return ans - - -def _deprecated_modified_beam_search( - model: nn.Module, - encoder_out: torch.Tensor, - beam: int = 4, - return_timestamps: bool = False, -) -> Union[List[int], DecodingResults]: - """It limits the maximum number of symbols per frame to 1. - - It decodes only one utterance at a time. We keep it only for reference. - The function :func:`modified_beam_search` should be preferred as it - supports batch decoding. - - - Args: - model: - An instance of `Transducer`. - encoder_out: - A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. - beam: - Beam size. - return_timestamps: - Whether to return timestamps. - - Returns: - If return_timestamps is False, return the decoded result. - Else, return a DecodingResults object containing - decoded result and corresponding timestamps. - """ - - assert encoder_out.ndim == 3 - - # support only batch_size == 1 for now - assert encoder_out.size(0) == 1, encoder_out.size(0) - blank_id = model.decoder.blank_id - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - - device = next(model.parameters()).device - - T = encoder_out.size(1) - - B = HypothesisList() - B.add( - Hypothesis( - ys=[-1] * (context_size - 1) + [blank_id], - log_prob=torch.zeros(1, dtype=torch.float32, device=device), - timestamp=[], - ) - ) - encoder_out = model.joiner.encoder_proj(encoder_out) - - for t in range(T): - # fmt: off - current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2) - # current_encoder_out is of shape (1, 1, 1, encoder_out_dim) - # fmt: on - A = list(B) - B = HypothesisList() - - ys_log_probs = torch.cat([hyp.log_prob.reshape(1, 1) for hyp in A]) - # ys_log_probs is of shape (num_hyps, 1) - - decoder_input = torch.tensor( - [hyp.ys[-context_size:] for hyp in A], - device=device, - dtype=torch.int64, - ) - # decoder_input is of shape (num_hyps, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) - decoder_out = model.joiner.decoder_proj(decoder_out) - # decoder_output is of shape (num_hyps, 1, 1, joiner_dim) - - current_encoder_out = current_encoder_out.expand( - decoder_out.size(0), 1, 1, -1 - ) # (num_hyps, 1, 1, encoder_out_dim) - - logits = model.joiner( - current_encoder_out, - decoder_out, - project_input=False, - ) - # logits is of shape (num_hyps, 1, 1, vocab_size) - logits = logits.squeeze(1).squeeze(1) - - # now logits is of shape (num_hyps, vocab_size) - log_probs = logits.log_softmax(dim=-1) - - log_probs.add_(ys_log_probs) - - log_probs = log_probs.reshape(-1) - topk_log_probs, topk_indexes = log_probs.topk(beam) - - # topk_hyp_indexes are indexes into `A` - topk_hyp_indexes = topk_indexes // logits.size(-1) - topk_token_indexes = topk_indexes % logits.size(-1) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - topk_hyp_indexes = topk_hyp_indexes.tolist() - topk_token_indexes = topk_token_indexes.tolist() - - for i in range(len(topk_hyp_indexes)): - hyp = A[topk_hyp_indexes[i]] - new_ys = hyp.ys[:] - new_timestamp = hyp.timestamp[:] - new_token = topk_token_indexes[i] - if new_token not in (blank_id, unk_id): - new_ys.append(new_token) - new_timestamp.append(t) - new_log_prob = topk_log_probs[i] - new_hyp = Hypothesis( - ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp - ) - B.add(new_hyp) - - best_hyp = B.get_most_probable(length_norm=True) - ys = best_hyp.ys[context_size:] # [context_size:] to remove blanks - - if not 
return_timestamps: - return ys - else: - return DecodingResults(hyps=[ys], timestamps=[best_hyp.timestamp]) - - -def beam_search( - model: nn.Module, - encoder_out: torch.Tensor, - beam: int = 4, - temperature: float = 1.0, - blank_penalty: float = 0.0, - return_timestamps: bool = False, -) -> Union[List[int], DecodingResults]: - """ - It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf - - espnet/nets/beam_search_transducer.py#L247 is used as a reference. - - Args: - model: - An instance of `Transducer`. - encoder_out: - A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. - beam: - Beam size. - temperature: - Softmax temperature. - return_timestamps: - Whether to return timestamps. - - Returns: - If return_timestamps is False, return the decoded result. - Else, return a DecodingResults object containing - decoded result and corresponding timestamps. - """ - assert encoder_out.ndim == 3 - - # support only batch_size == 1 for now - assert encoder_out.size(0) == 1, encoder_out.size(0) - blank_id = model.decoder.blank_id - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - - device = next(model.parameters()).device - - decoder_input = torch.tensor( - [blank_id] * context_size, - device=device, - dtype=torch.int64, - ).reshape(1, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - - encoder_out = model.joiner.encoder_proj(encoder_out) - - T = encoder_out.size(1) - t = 0 - - B = HypothesisList() - B.add( - Hypothesis( - ys=[-1] * (context_size - 1) + [blank_id], log_prob=0.0, timestamp=[] - ) - ) - - max_sym_per_utt = 20000 - - sym_per_utt = 0 - - decoder_cache: Dict[str, torch.Tensor] = {} - - while t < T and sym_per_utt < max_sym_per_utt: - # fmt: off - current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2) - # fmt: on - A = B - B = HypothesisList() - - joint_cache: Dict[str, torch.Tensor] = {} - - # TODO(fangjun): Implement prefix search to update the `log_prob` - # of hypotheses in A - - while True: - y_star = A.get_most_probable() - A.remove(y_star) - - cached_key = y_star.key - - if cached_key not in decoder_cache: - decoder_input = torch.tensor( - [y_star.ys[-context_size:]], - device=device, - dtype=torch.int64, - ).reshape(1, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False) - decoder_out = model.joiner.decoder_proj(decoder_out) - decoder_cache[cached_key] = decoder_out - else: - decoder_out = decoder_cache[cached_key] - - cached_key += f"-t-{t}" - if cached_key not in joint_cache: - logits = model.joiner( - current_encoder_out, - decoder_out.unsqueeze(1), - project_input=False, - ) - - if blank_penalty != 0: - logits[:, :, :, 0] -= blank_penalty - - # TODO(fangjun): Scale the blank posterior - log_prob = (logits / temperature).log_softmax(dim=-1) - # log_prob is (1, 1, 1, vocab_size) - log_prob = log_prob.squeeze() - # Now log_prob is (vocab_size,) - joint_cache[cached_key] = log_prob - else: - log_prob = joint_cache[cached_key] - - # First, process the blank symbol - skip_log_prob = log_prob[blank_id] - new_y_star_log_prob = y_star.log_prob + skip_log_prob - - # ys[:] returns a copy of ys - B.add( - Hypothesis( - ys=y_star.ys[:], - log_prob=new_y_star_log_prob, - timestamp=y_star.timestamp[:], - ) - ) - - # Second, process other non-blank labels - values, indices = log_prob.topk(beam + 1) - for i, v in zip(indices.tolist(), values.tolist()): - if i in (blank_id, unk_id): - continue - new_ys = 
y_star.ys + [i] - new_log_prob = y_star.log_prob + v - new_timestamp = y_star.timestamp + [t] - A.add( - Hypothesis( - ys=new_ys, - log_prob=new_log_prob, - timestamp=new_timestamp, - ) - ) - - # Check whether B contains more than "beam" elements more probable - # than the most probable in A - A_most_probable = A.get_most_probable() - - kept_B = B.filter(A_most_probable.log_prob) - - if len(kept_B) >= beam: - B = kept_B.topk(beam) - break - - t += 1 - - best_hyp = B.get_most_probable(length_norm=True) - ys = best_hyp.ys[context_size:] # [context_size:] to remove blanks - - if not return_timestamps: - return ys - else: - return DecodingResults(hyps=[ys], timestamps=[best_hyp.timestamp]) - - -def fast_beam_search_with_nbest_rescoring( - model: nn.Module, - decoding_graph: k2.Fsa, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - beam: float, - max_states: int, - max_contexts: int, - ngram_lm_scale_list: List[float], - num_paths: int, - G: k2.Fsa, - sp: spm.SentencePieceProcessor, - word_table: k2.SymbolTable, - oov_word: str = "", - use_double_scores: bool = True, - nbest_scale: float = 0.5, - temperature: float = 1.0, - return_timestamps: bool = False, -) -> Dict[str, Union[List[List[int]], DecodingResults]]: - """It limits the maximum number of symbols per frame to 1. - A lattice is first obtained using fast beam search, num_path are selected - and rescored using a given language model. The shortest path within the - lattice is used as the final output. - - Args: - model: - An instance of `Transducer`. - decoding_graph: - Decoding graph used for decoding, may be a TrivialGraph or a LG. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - encoder_out_lens: - A tensor of shape (N,) containing the number of frames in `encoder_out` - before padding. - beam: - Beam value, similar to the beam used in Kaldi. - max_states: - Max states per stream per frame. - max_contexts: - Max contexts pre stream per frame. - ngram_lm_scale_list: - A list of floats representing LM score scales. - num_paths: - Number of paths to extract from the decoded lattice. - G: - An FsaVec containing only a single FSA. It is an n-gram LM. - sp: - The BPE model. - word_table: - The word symbol table. - oov_word: - OOV words are replaced with this word. - use_double_scores: - True to use double precision for computation. False to use - single precision. - nbest_scale: - It's the scale applied to the lattice.scores. A smaller value - yields more unique paths. - temperature: - Softmax temperature. - return_timestamps: - Whether to return timestamps. - Returns: - Return the decoded result in a dict, where the key has the form - 'ngram_lm_scale_xx' and the value is the decoded results - optionally with timestamps. `xx` is the ngram LM scale value - used during decoding, i.e., 0.1. - """ - lattice = fast_beam_search( - model=model, - decoding_graph=decoding_graph, - encoder_out=encoder_out, - encoder_out_lens=encoder_out_lens, - beam=beam, - max_states=max_states, - max_contexts=max_contexts, - temperature=temperature, - ) - - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=num_paths, - use_double_scores=use_double_scores, - nbest_scale=nbest_scale, - ) - # at this point, nbest.fsa.scores are all zeros. - - nbest = nbest.intersect(lattice) - # Now nbest.fsa.scores contains acoustic scores - - am_scores = nbest.tot_scores() - - # Now we need to compute the LM scores of each path. - # (1) Get the token IDs of each Path. 
We assume the decoding_graph - # is an acceptor, i.e., lattice is also an acceptor - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) # [path][arc] - - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.labels.contiguous()) - tokens = tokens.remove_values_leq(0) # remove -1 and 0 - - token_list: List[List[int]] = tokens.tolist() - word_list: List[List[str]] = sp.decode(token_list) - - assert isinstance(oov_word, str), oov_word - assert oov_word in word_table, oov_word - oov_word_id = word_table[oov_word] - - word_ids_list: List[List[int]] = [] - - for words in word_list: - this_word_ids = [] - for w in words.split(): - if w in word_table: - this_word_ids.append(word_table[w]) - else: - this_word_ids.append(oov_word_id) - word_ids_list.append(this_word_ids) - - word_fsas = k2.linear_fsa(word_ids_list, device=lattice.device) - word_fsas_with_self_loops = k2.add_epsilon_self_loops(word_fsas) - - num_unique_paths = len(word_ids_list) - - b_to_a_map = torch.zeros( - num_unique_paths, - dtype=torch.int32, - device=lattice.device, - ) - - rescored_word_fsas = k2.intersect_device( - a_fsas=G, - b_fsas=word_fsas_with_self_loops, - b_to_a_map=b_to_a_map, - sorted_match_a=True, - ret_arc_maps=False, - ) - - rescored_word_fsas = k2.remove_epsilon_self_loops(rescored_word_fsas) - rescored_word_fsas = k2.top_sort(k2.connect(rescored_word_fsas)) - ngram_lm_scores = rescored_word_fsas.get_tot_scores( - use_double_scores=True, - log_semiring=False, - ) - - ans: Dict[str, Union[List[List[int]], DecodingResults]] = {} - for s in ngram_lm_scale_list: - key = f"ngram_lm_scale_{s}" - tot_scores = am_scores.values + s * ngram_lm_scores - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - - if not return_timestamps: - ans[key] = get_texts(best_path) - else: - ans[key] = get_texts_with_timestamp(best_path) - - return ans - - -def fast_beam_search_with_nbest_rnn_rescoring( - model: nn.Module, - decoding_graph: k2.Fsa, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - beam: float, - max_states: int, - max_contexts: int, - ngram_lm_scale_list: List[float], - num_paths: int, - G: k2.Fsa, - sp: spm.SentencePieceProcessor, - word_table: k2.SymbolTable, - rnn_lm_model: torch.nn.Module, - rnn_lm_scale_list: List[float], - oov_word: str = "", - use_double_scores: bool = True, - nbest_scale: float = 0.5, - temperature: float = 1.0, - return_timestamps: bool = False, -) -> Dict[str, Union[List[List[int]], DecodingResults]]: - """It limits the maximum number of symbols per frame to 1. - A lattice is first obtained using fast beam search, num_path are selected - and rescored using a given language model and a rnn-lm. - The shortest path within the lattice is used as the final output. - - Args: - model: - An instance of `Transducer`. - decoding_graph: - Decoding graph used for decoding, may be a TrivialGraph or a LG. - encoder_out: - A tensor of shape (N, T, C) from the encoder. - encoder_out_lens: - A tensor of shape (N,) containing the number of frames in `encoder_out` - before padding. - beam: - Beam value, similar to the beam used in Kaldi. - max_states: - Max states per stream per frame. - max_contexts: - Max contexts pre stream per frame. - ngram_lm_scale_list: - A list of floats representing LM score scales. - num_paths: - Number of paths to extract from the decoded lattice. - G: - An FsaVec containing only a single FSA. It is an n-gram LM. - sp: - The BPE model. 
- word_table: - The word symbol table. - rnn_lm_model: - A rnn-lm model used for LM rescoring - rnn_lm_scale_list: - A list of floats representing RNN score scales. - oov_word: - OOV words are replaced with this word. - use_double_scores: - True to use double precision for computation. False to use - single precision. - nbest_scale: - It's the scale applied to the lattice.scores. A smaller value - yields more unique paths. - temperature: - Softmax temperature. - return_timestamps: - Whether to return timestamps. - Returns: - Return the decoded result in a dict, where the key has the form - 'ngram_lm_scale_xx' and the value is the decoded results - optionally with timestamps. `xx` is the ngram LM scale value - used during decoding, i.e., 0.1. - """ - lattice = fast_beam_search( - model=model, - decoding_graph=decoding_graph, - encoder_out=encoder_out, - encoder_out_lens=encoder_out_lens, - beam=beam, - max_states=max_states, - max_contexts=max_contexts, - temperature=temperature, - ) - - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=num_paths, - use_double_scores=use_double_scores, - nbest_scale=nbest_scale, - ) - # at this point, nbest.fsa.scores are all zeros. - - nbest = nbest.intersect(lattice) - # Now nbest.fsa.scores contains acoustic scores - - am_scores = nbest.tot_scores() - - # Now we need to compute the LM scores of each path. - # (1) Get the token IDs of each Path. We assume the decoding_graph - # is an acceptor, i.e., lattice is also an acceptor - tokens_shape = nbest.fsa.arcs.shape().remove_axis(1) # [path][arc] - - tokens = k2.RaggedTensor(tokens_shape, nbest.fsa.labels.contiguous()) - tokens = tokens.remove_values_leq(0) # remove -1 and 0 - - token_list: List[List[int]] = tokens.tolist() - word_list: List[List[str]] = sp.decode(token_list) - - assert isinstance(oov_word, str), oov_word - assert oov_word in word_table, oov_word - oov_word_id = word_table[oov_word] - - word_ids_list: List[List[int]] = [] - - for words in word_list: - this_word_ids = [] - for w in words.split(): - if w in word_table: - this_word_ids.append(word_table[w]) - else: - this_word_ids.append(oov_word_id) - word_ids_list.append(this_word_ids) - - word_fsas = k2.linear_fsa(word_ids_list, device=lattice.device) - word_fsas_with_self_loops = k2.add_epsilon_self_loops(word_fsas) - - num_unique_paths = len(word_ids_list) - - b_to_a_map = torch.zeros( - num_unique_paths, - dtype=torch.int32, - device=lattice.device, - ) - - rescored_word_fsas = k2.intersect_device( - a_fsas=G, - b_fsas=word_fsas_with_self_loops, - b_to_a_map=b_to_a_map, - sorted_match_a=True, - ret_arc_maps=False, - ) - - rescored_word_fsas = k2.remove_epsilon_self_loops(rescored_word_fsas) - rescored_word_fsas = k2.top_sort(k2.connect(rescored_word_fsas)) - ngram_lm_scores = rescored_word_fsas.get_tot_scores( - use_double_scores=True, - log_semiring=False, - ) - - # Now RNN-LM - blank_id = model.decoder.blank_id - sos_id = sp.piece_to_id("sos_id") - eos_id = sp.piece_to_id("eos_id") - - sos_tokens = add_sos(tokens, sos_id) - tokens_eos = add_eos(tokens, eos_id) - sos_tokens_row_splits = sos_tokens.shape.row_splits(1) - sentence_lengths = sos_tokens_row_splits[1:] - sos_tokens_row_splits[:-1] - - x_tokens = sos_tokens.pad(mode="constant", padding_value=blank_id) - y_tokens = tokens_eos.pad(mode="constant", padding_value=blank_id) - - x_tokens = x_tokens.to(torch.int64) - y_tokens = y_tokens.to(torch.int64) - sentence_lengths = sentence_lengths.to(torch.int64) - - rnn_lm_nll = rnn_lm_model(x=x_tokens, y=y_tokens, 
lengths=sentence_lengths) - assert rnn_lm_nll.ndim == 2 - assert rnn_lm_nll.shape[0] == len(token_list) - rnn_lm_scores = -1 * rnn_lm_nll.sum(dim=1) - - ans: Dict[str, List[List[int]]] = {} - for n_scale in ngram_lm_scale_list: - for rnn_scale in rnn_lm_scale_list: - key = f"ngram_lm_scale_{n_scale}_rnn_lm_scale_{rnn_scale}" - tot_scores = ( - am_scores.values + n_scale * ngram_lm_scores + rnn_scale * rnn_lm_scores - ) - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - max_indexes = ragged_tot_scores.argmax() - best_path = k2.index_fsa(nbest.fsa, max_indexes) - - if not return_timestamps: - ans[key] = get_texts(best_path) - else: - ans[key] = get_texts_with_timestamp(best_path) - - return ans - - -def modified_beam_search_ngram_rescoring( - model: nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - ngram_lm: NgramLm, - ngram_lm_scale: float, - beam: int = 4, - temperature: float = 1.0, -) -> List[List[int]]: - """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded. - - Args: - model: - The transducer model. - encoder_out: - Output from the encoder. Its shape is (N, T, C). - encoder_out_lens: - A 1-D tensor of shape (N,), containing number of valid frames in - encoder_out before padding. - beam: - Number of active paths during the beam search. - temperature: - Softmax temperature. - Returns: - Return a list-of-list of token IDs. ans[i] is the decoding results - for the i-th utterance. - """ - assert encoder_out.ndim == 3, encoder_out.shape - assert encoder_out.size(0) >= 1, encoder_out.size(0) - - packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( - input=encoder_out, - lengths=encoder_out_lens.cpu(), - batch_first=True, - enforce_sorted=False, - ) - - blank_id = model.decoder.blank_id - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - device = next(model.parameters()).device - lm_scale = ngram_lm_scale - - batch_size_list = packed_encoder_out.batch_sizes.tolist() - N = encoder_out.size(0) - assert torch.all(encoder_out_lens > 0), encoder_out_lens - assert N == batch_size_list[0], (N, batch_size_list) - - B = [HypothesisList() for _ in range(N)] - for i in range(N): - B[i].add( - Hypothesis( - ys=[-1] * (context_size - 1) + [blank_id], - log_prob=torch.zeros(1, dtype=torch.float32, device=device), - state_cost=NgramLmStateCost(ngram_lm), - ) - ) - - encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) - - offset = 0 - finalized_B = [] - for batch_size in batch_size_list: - start = offset - end = offset + batch_size - current_encoder_out = encoder_out.data[start:end] - current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) - # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) - offset = end - - finalized_B = B[batch_size:] + finalized_B - B = B[:batch_size] - - hyps_shape = get_hyps_shape(B).to(device) - - A = [list(b) for b in B] - B = [HypothesisList() for _ in range(batch_size)] - - ys_log_probs = torch.cat( - [ - hyp.log_prob.reshape(1, 1) + hyp.state_cost.lm_score * lm_scale - for hyps in A - for hyp in hyps - ] - ) # (num_hyps, 1) - - decoder_input = torch.tensor( - [hyp.ys[-context_size:] for hyps in A for hyp in hyps], - device=device, - dtype=torch.int64, - ) # (num_hyps, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) - decoder_out = model.joiner.decoder_proj(decoder_out) - # decoder_out is of shape (num_hyps, 1, 1, joiner_dim) - - # Note: For torch 1.7.1 and below, it requires a torch.int64 
tensor - # as index, so we use `to(torch.int64)` below. - current_encoder_out = torch.index_select( - current_encoder_out, - dim=0, - index=hyps_shape.row_ids(1).to(torch.int64), - ) # (num_hyps, 1, 1, encoder_out_dim) - - logits = model.joiner( - current_encoder_out, - decoder_out, - project_input=False, - ) # (num_hyps, 1, 1, vocab_size) - - logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) - - log_probs = (logits / temperature).log_softmax(dim=-1) # (num_hyps, vocab_size) - - log_probs.add_(ys_log_probs) - vocab_size = log_probs.size(-1) - log_probs = log_probs.reshape(-1) - - row_splits = hyps_shape.row_splits(1) * vocab_size - log_probs_shape = k2.ragged.create_ragged_shape2( - row_splits=row_splits, cached_tot_size=log_probs.numel() - ) - ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) - - for i in range(batch_size): - topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - topk_hyp_indexes = (topk_indexes // vocab_size).tolist() - topk_token_indexes = (topk_indexes % vocab_size).tolist() - - for k in range(len(topk_hyp_indexes)): - hyp_idx = topk_hyp_indexes[k] - hyp = A[i][hyp_idx] - - new_ys = hyp.ys[:] - new_token = topk_token_indexes[k] - if new_token not in (blank_id, unk_id): - new_ys.append(new_token) - state_cost = hyp.state_cost.forward_one_step(new_token) - else: - state_cost = hyp.state_cost - - # We only keep AM scores in new_hyp.log_prob - new_log_prob = topk_log_probs[k] - hyp.state_cost.lm_score * lm_scale - - new_hyp = Hypothesis( - ys=new_ys, log_prob=new_log_prob, state_cost=state_cost - ) - B[i].add(new_hyp) - - B = B + finalized_B - best_hyps = [b.get_most_probable(length_norm=True) for b in B] - - sorted_ans = [h.ys[context_size:] for h in best_hyps] - ans = [] - unsorted_indices = packed_encoder_out.unsorted_indices.tolist() - for i in range(N): - ans.append(sorted_ans[unsorted_indices[i]]) - - return ans - - -def modified_beam_search_LODR( - model: nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - LODR_lm: NgramLm, - LODR_lm_scale: float, - LM: LmScorer, - beam: int = 4, - context_graph: Optional[ContextGraph] = None, -) -> List[List[int]]: - """This function implements LODR (https://arxiv.org/abs/2203.16776) with - `modified_beam_search`. It uses a bi-gram language model as the estimate - of the internal language model and subtracts its score during shallow fusion - with an external language model. This implementation uses a RNNLM as the - external language model. - - Args: - model (Transducer): - The transducer model - encoder_out (torch.Tensor): - Encoder output in (N,T,C) - encoder_out_lens (torch.Tensor): - A 1-D tensor of shape (N,), containing the number of - valid frames in encoder_out before padding. - LODR_lm: - A low order n-gram LM, whose score will be subtracted during shallow fusion - LODR_lm_scale: - The scale of the LODR_lm - LM: - A neural net LM, e.g an RNNLM or transformer LM - beam (int, optional): - Beam size. Defaults to 4. - - Returns: - Return a list-of-list of token IDs. ans[i] is the decoding results - for the i-th utterance. 
- - """ - assert encoder_out.ndim == 3, encoder_out.shape - assert encoder_out.size(0) >= 1, encoder_out.size(0) - assert LM is not None - lm_scale = LM.lm_scale - - packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( - input=encoder_out, - lengths=encoder_out_lens.cpu(), - batch_first=True, - enforce_sorted=False, - ) - - blank_id = model.decoder.blank_id - sos_id = getattr(LM, "sos_id", 1) - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - device = next(model.parameters()).device - - batch_size_list = packed_encoder_out.batch_sizes.tolist() - N = encoder_out.size(0) - assert torch.all(encoder_out_lens > 0), encoder_out_lens - assert N == batch_size_list[0], (N, batch_size_list) - - # get initial lm score and lm state by scoring the "sos" token - sos_token = torch.tensor([[sos_id]]).to(torch.int64).to(device) - lens = torch.tensor([1]).to(device) - init_score, init_states = LM.score_token(sos_token, lens) - - B = [HypothesisList() for _ in range(N)] - for i in range(N): - B[i].add( - Hypothesis( - ys=[-1] * (context_size - 1) + [blank_id], - log_prob=torch.zeros(1, dtype=torch.float32, device=device), - state=init_states, # state of the NN LM - lm_score=init_score.reshape(-1), - state_cost=NgramLmStateCost( - LODR_lm - ), # state of the source domain ngram - context_state=None if context_graph is None else context_graph.root, - ) - ) - - encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) - - offset = 0 - finalized_B = [] - for batch_size in batch_size_list: - start = offset - end = offset + batch_size - current_encoder_out = encoder_out.data[start:end] # get batch - current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) - # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) - offset = end - - finalized_B = B[batch_size:] + finalized_B - B = B[:batch_size] - - hyps_shape = get_hyps_shape(B).to(device) - - A = [list(b) for b in B] - B = [HypothesisList() for _ in range(batch_size)] - - ys_log_probs = torch.cat( - [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps] - ) - - decoder_input = torch.tensor( - [hyp.ys[-context_size:] for hyps in A for hyp in hyps], - device=device, - dtype=torch.int64, - ) # (num_hyps, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) - decoder_out = model.joiner.decoder_proj(decoder_out) - - current_encoder_out = torch.index_select( - current_encoder_out, - dim=0, - index=hyps_shape.row_ids(1).to(torch.int64), - ) # (num_hyps, 1, 1, encoder_out_dim) - - logits = model.joiner( - current_encoder_out, - decoder_out, - project_input=False, - ) # (num_hyps, 1, 1, vocab_size) - - logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) - - log_probs = logits.log_softmax(dim=-1) # (num_hyps, vocab_size) - - log_probs.add_(ys_log_probs) - - vocab_size = log_probs.size(-1) - - log_probs = log_probs.reshape(-1) - - row_splits = hyps_shape.row_splits(1) * vocab_size - log_probs_shape = k2.ragged.create_ragged_shape2( - row_splits=row_splits, cached_tot_size=log_probs.numel() - ) - ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) - """ - for all hyps with a non-blank new token, score this token. - It is a little confusing here because this for-loop - looks very similar to the one below. Here, we go through all - top-k tokens and only add the non-blanks ones to the token_list. - LM will score those tokens given the LM states. 
Note that - the variable `scores` is the LM score after seeing the new - non-blank token. - """ - token_list = [] - hs = [] - cs = [] - for i in range(batch_size): - topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - topk_hyp_indexes = (topk_indexes // vocab_size).tolist() - topk_token_indexes = (topk_indexes % vocab_size).tolist() - for k in range(len(topk_hyp_indexes)): - hyp_idx = topk_hyp_indexes[k] - hyp = A[i][hyp_idx] - - new_token = topk_token_indexes[k] - if new_token not in (blank_id, unk_id): - if LM.lm_type == "rnn": - token_list.append([new_token]) - # store the LSTM states - hs.append(hyp.state[0]) - cs.append(hyp.state[1]) - else: - # for transformer LM - token_list.append( - [sos_id] + hyp.ys[context_size:] + [new_token] - ) - - # forward NN LM to get new states and scores - if len(token_list) != 0: - x_lens = torch.tensor([len(tokens) for tokens in token_list]).to(device) - if LM.lm_type == "rnn": - tokens_to_score = ( - torch.tensor(token_list).to(torch.int64).to(device).reshape(-1, 1) - ) - hs = torch.cat(hs, dim=1).to(device) - cs = torch.cat(cs, dim=1).to(device) - state = (hs, cs) - else: - # for transformer LM - tokens_list = [torch.tensor(tokens) for tokens in token_list] - tokens_to_score = ( - torch.nn.utils.rnn.pad_sequence( - tokens_list, batch_first=True, padding_value=0.0 - ) - .to(device) - .to(torch.int64) - ) - - state = None - - scores, lm_states = LM.score_token(tokens_to_score, x_lens, state) - - count = 0 # index, used to locate score and lm states - for i in range(batch_size): - topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - topk_hyp_indexes = (topk_indexes // vocab_size).tolist() - topk_token_indexes = (topk_indexes % vocab_size).tolist() - - for k in range(len(topk_hyp_indexes)): - hyp_idx = topk_hyp_indexes[k] - hyp = A[i][hyp_idx] - - ys = hyp.ys[:] - - # current score of hyp - lm_score = hyp.lm_score - state = hyp.state - - hyp_log_prob = topk_log_probs[k] # get score of current hyp - new_token = topk_token_indexes[k] - - context_score = 0 - new_context_state = None if context_graph is None else hyp.context_state - if new_token not in (blank_id, unk_id): - if context_graph is not None: - ( - context_score, - new_context_state, - ) = context_graph.forward_one_step(hyp.context_state, new_token) - - ys.append(new_token) - state_cost = hyp.state_cost.forward_one_step(new_token) - - # calculate the score of the latest token - current_ngram_score = state_cost.lm_score - hyp.state_cost.lm_score - - assert current_ngram_score <= 0.0, ( - state_cost.lm_score, - hyp.state_cost.lm_score, - ) - # score = score + TDLM_score - LODR_score - # LODR_LM_scale should be a negative number here - hyp_log_prob += ( - lm_score[new_token] * lm_scale - + LODR_lm_scale * current_ngram_score - + context_score - ) # add the lm score - - lm_score = scores[count] - if LM.lm_type == "rnn": - state = ( - lm_states[0][:, count, :].unsqueeze(1), - lm_states[1][:, count, :].unsqueeze(1), - ) - count += 1 - else: - state_cost = hyp.state_cost - - new_hyp = Hypothesis( - ys=ys, - log_prob=hyp_log_prob, - state=state, - lm_score=lm_score, - state_cost=state_cost, - context_state=new_context_state, - ) - B[i].add(new_hyp) - - B = B + finalized_B - - # finalize context_state, if the matched contexts do not reach final state - # we need to add the score on the corresponding backoff arc - if context_graph is not None: - 
finalized_B = [HypothesisList() for _ in range(len(B))] - for i, hyps in enumerate(B): - for hyp in list(hyps): - context_score, new_context_state = context_graph.finalize( - hyp.context_state - ) - finalized_B[i].add( - Hypothesis( - ys=hyp.ys, - log_prob=hyp.log_prob + context_score, - timestamp=hyp.timestamp, - context_state=new_context_state, - ) - ) - B = finalized_B - - best_hyps = [b.get_most_probable(length_norm=True) for b in B] - - sorted_ans = [h.ys[context_size:] for h in best_hyps] - ans = [] - unsorted_indices = packed_encoder_out.unsorted_indices.tolist() - for i in range(N): - ans.append(sorted_ans[unsorted_indices[i]]) - - return ans - - -def modified_beam_search_lm_shallow_fusion( - model: nn.Module, - encoder_out: torch.Tensor, - encoder_out_lens: torch.Tensor, - LM: LmScorer, - beam: int = 4, - return_timestamps: bool = False, -) -> List[List[int]]: - """Modified_beam_search + NN LM shallow fusion - - Args: - model (Transducer): - The transducer model - encoder_out (torch.Tensor): - Encoder output in (N,T,C) - encoder_out_lens (torch.Tensor): - A 1-D tensor of shape (N,), containing the number of - valid frames in encoder_out before padding. - sp: - Sentence piece generator. - LM (LmScorer): - A neural net LM, e.g RNN or Transformer - beam (int, optional): - Beam size. Defaults to 4. - - Returns: - Return a list-of-list of token IDs. ans[i] is the decoding results - for the i-th utterance. - """ - assert encoder_out.ndim == 3, encoder_out.shape - assert encoder_out.size(0) >= 1, encoder_out.size(0) - assert LM is not None - lm_scale = LM.lm_scale - - packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( - input=encoder_out, - lengths=encoder_out_lens.cpu(), - batch_first=True, - enforce_sorted=False, - ) - - blank_id = model.decoder.blank_id - sos_id = getattr(LM, "sos_id", 1) - unk_id = getattr(model, "unk_id", blank_id) - context_size = model.decoder.context_size - device = next(model.parameters()).device - - batch_size_list = packed_encoder_out.batch_sizes.tolist() - N = encoder_out.size(0) - assert torch.all(encoder_out_lens > 0), encoder_out_lens - assert N == batch_size_list[0], (N, batch_size_list) - - # get initial lm score and lm state by scoring the "sos" token - sos_token = torch.tensor([[sos_id]]).to(torch.int64).to(device) - lens = torch.tensor([1]).to(device) - init_score, init_states = LM.score_token(sos_token, lens) - - B = [HypothesisList() for _ in range(N)] - for i in range(N): - B[i].add( - Hypothesis( - ys=[-1] * (context_size - 1) + [blank_id], - log_prob=torch.zeros(1, dtype=torch.float32, device=device), - state=init_states, - lm_score=init_score.reshape(-1), - timestamp=[], - ) - ) - - encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) - - offset = 0 - finalized_B = [] - for t, batch_size in enumerate(batch_size_list): - start = offset - end = offset + batch_size - current_encoder_out = encoder_out.data[start:end] # get batch - current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) - # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) - offset = end - - finalized_B = B[batch_size:] + finalized_B - B = B[:batch_size] - - hyps_shape = get_hyps_shape(B).to(device) - - A = [list(b) for b in B] - B = [HypothesisList() for _ in range(batch_size)] - - ys_log_probs = torch.cat( - [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps] - ) - - lm_scores = torch.cat( - [hyp.lm_score.reshape(1, -1) for hyps in A for hyp in hyps] - ) - - decoder_input = torch.tensor( - [hyp.ys[-context_size:] for hyps 
in A for hyp in hyps], - device=device, - dtype=torch.int64, - ) # (num_hyps, context_size) - - decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) - decoder_out = model.joiner.decoder_proj(decoder_out) - - current_encoder_out = torch.index_select( - current_encoder_out, - dim=0, - index=hyps_shape.row_ids(1).to(torch.int64), - ) # (num_hyps, 1, 1, encoder_out_dim) - - logits = model.joiner( - current_encoder_out, - decoder_out, - project_input=False, - ) # (num_hyps, 1, 1, vocab_size) - - logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) - - log_probs = logits.log_softmax(dim=-1) # (num_hyps, vocab_size) - - log_probs.add_(ys_log_probs) - - vocab_size = log_probs.size(-1) - - log_probs = log_probs.reshape(-1) - - row_splits = hyps_shape.row_splits(1) * vocab_size - log_probs_shape = k2.ragged.create_ragged_shape2( - row_splits=row_splits, cached_tot_size=log_probs.numel() - ) - ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) - """ - for all hyps with a non-blank new token, score this token. - It is a little confusing here because this for-loop - looks very similar to the one below. Here, we go through all - top-k tokens and only add the non-blanks ones to the token_list. - `LM` will score those tokens given the LM states. Note that - the variable `scores` is the LM score after seeing the new - non-blank token. - """ - token_list = [] # a list of list - hs = [] - cs = [] - for i in range(batch_size): - topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - topk_hyp_indexes = (topk_indexes // vocab_size).tolist() - topk_token_indexes = (topk_indexes % vocab_size).tolist() - for k in range(len(topk_hyp_indexes)): - hyp_idx = topk_hyp_indexes[k] - hyp = A[i][hyp_idx] - - new_token = topk_token_indexes[k] - if new_token not in (blank_id, unk_id): - if LM.lm_type == "rnn": - token_list.append([new_token]) - # store the LSTM states - hs.append(hyp.state[0]) - cs.append(hyp.state[1]) - else: - # for transformer LM - token_list.append( - [sos_id] + hyp.ys[context_size:] + [new_token] - ) - - if len(token_list) != 0: - x_lens = torch.tensor([len(tokens) for tokens in token_list]).to(device) - if LM.lm_type == "rnn": - tokens_to_score = ( - torch.tensor(token_list).to(torch.int64).to(device).reshape(-1, 1) - ) - hs = torch.cat(hs, dim=1).to(device) - cs = torch.cat(cs, dim=1).to(device) - state = (hs, cs) - else: - # for transformer LM - tokens_list = [torch.tensor(tokens) for tokens in token_list] - tokens_to_score = ( - torch.nn.utils.rnn.pad_sequence( - tokens_list, batch_first=True, padding_value=0.0 - ) - .to(device) - .to(torch.int64) - ) - - state = None - - scores, lm_states = LM.score_token(tokens_to_score, x_lens, state) - - count = 0 # index, used to locate score and lm states - for i in range(batch_size): - topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - topk_hyp_indexes = (topk_indexes // vocab_size).tolist() - topk_token_indexes = (topk_indexes % vocab_size).tolist() - - for k in range(len(topk_hyp_indexes)): - hyp_idx = topk_hyp_indexes[k] - hyp = A[i][hyp_idx] - - ys = hyp.ys[:] - - lm_score = hyp.lm_score - state = hyp.state - - hyp_log_prob = topk_log_probs[k] # get score of current hyp - new_token = topk_token_indexes[k] - new_timestamp = hyp.timestamp[:] - if new_token not in (blank_id, unk_id): - ys.append(new_token) - new_timestamp.append(t) - - 
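# Shallow fusion combines the scores additively in log space:
#     score += log p_transducer(new_token) + lm_scale * log p_LM(new_token | history)
# The line below adds the (scaled) neural-LM term for the newly appended token.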
hyp_log_prob += lm_score[new_token] * lm_scale # add the lm score - - lm_score = scores[count] - if LM.lm_type == "rnn": - state = ( - lm_states[0][:, count, :].unsqueeze(1), - lm_states[1][:, count, :].unsqueeze(1), - ) - count += 1 - - new_hyp = Hypothesis( - ys=ys, - log_prob=hyp_log_prob, - state=state, - lm_score=lm_score, - timestamp=new_timestamp, - ) - B[i].add(new_hyp) - - B = B + finalized_B - best_hyps = [b.get_most_probable(length_norm=True) for b in B] - - sorted_ans = [h.ys[context_size:] for h in best_hyps] - sorted_timestamps = [h.timestamp for h in best_hyps] - ans = [] - ans_timestamps = [] - unsorted_indices = packed_encoder_out.unsorted_indices.tolist() - for i in range(N): - ans.append(sorted_ans[unsorted_indices[i]]) - ans_timestamps.append(sorted_timestamps[unsorted_indices[i]]) - - if not return_timestamps: - return ans - else: - return DecodingResults( - hyps=ans, - timestamps=ans_timestamps, - ) diff --git a/egs/librispeech/SSL/hubert/beam_search.py b/egs/librispeech/SSL/hubert/beam_search.py new file mode 120000 index 0000000000..f4d4b57326 --- /dev/null +++ b/egs/librispeech/SSL/hubert/beam_search.py @@ -0,0 +1 @@ +../../ASR/zipformer/beam_search.py \ No newline at end of file diff --git a/egs/librispeech/SSL/hubert/ctc_decode.py b/egs/librispeech/SSL/hubert/ctc_decode.py index 1f0f9bfac3..f3a17be2a2 100644 --- a/egs/librispeech/SSL/hubert/ctc_decode.py +++ b/egs/librispeech/SSL/hubert/ctc_decode.py @@ -22,39 +22,39 @@ Usage: (1) ctc-decoding -./zipformer/ctc_decode.py \ +./hubert/ctc_decode.py \ --epoch 30 \ --avg 15 \ - --exp-dir ./zipformer/exp \ + --exp-dir ./hubert/exp \ --use-ctc 1 \ --max-duration 600 \ --decoding-method ctc-decoding (2) 1best -./zipformer/ctc_decode.py \ +./hubert/ctc_decode.py \ --epoch 30 \ --avg 15 \ - --exp-dir ./zipformer/exp \ + --exp-dir ./hubert/exp \ --use-ctc 1 \ --max-duration 600 \ --hlg-scale 0.6 \ --decoding-method 1best (3) nbest -./zipformer/ctc_decode.py \ +./hubert/ctc_decode.py \ --epoch 30 \ --avg 15 \ - --exp-dir ./zipformer/exp \ + --exp-dir ./hubert/exp \ --use-ctc 1 \ --max-duration 600 \ --hlg-scale 0.6 \ --decoding-method nbest (4) nbest-rescoring -./zipformer/ctc_decode.py \ +./hubert/ctc_decode.py \ --epoch 30 \ --avg 15 \ - --exp-dir ./zipformer/exp \ + --exp-dir ./hubert/exp \ --use-ctc 1 \ --max-duration 600 \ --hlg-scale 0.6 \ @@ -63,10 +63,10 @@ --decoding-method nbest-rescoring (5) whole-lattice-rescoring -./zipformer/ctc_decode.py \ +./hubert/ctc_decode.py \ --epoch 30 \ --avg 15 \ - --exp-dir ./zipformer/exp \ + --exp-dir ./hubert/exp \ --use-ctc 1 \ --max-duration 600 \ --hlg-scale 0.6 \ @@ -164,7 +164,7 @@ def get_parser(): parser.add_argument( "--exp-dir", type=str, - default="zipformer/exp", + default="hubert/exp", help="The experiment dir", ) @@ -340,7 +340,7 @@ def decode_one_batch( feature_lens = supervisions["num_frames"].to(device) if params.causal: - # this seems to cause insertions at the end of the utterance if used with zipformer. + # this seems to cause insertions at the end of the utterance if used with hubert. 
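# Right-pad the features with pad_len extra frames (and grow feature_lens to
# match) so a causal encoder can flush out the last real frames of the
# utterance; pad_len = 30 is a heuristic value.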
pad_len = 30 feature_lens += pad_len feature = torch.nn.functional.pad( diff --git a/egs/librispeech/SSL/hubert/dataset.py b/egs/librispeech/SSL/hubert/dataset.py index 106b27a2c5..c3442df51a 100644 --- a/egs/librispeech/SSL/hubert/dataset.py +++ b/egs/librispeech/SSL/hubert/dataset.py @@ -92,9 +92,9 @@ def __init__(self, collate: bool = True) -> None: feature_size=1, sampling_rate=16000, padding_side="right", - padding_value=0.0, + padding_value=0, do_normalize=True, - return_attention_mask=True, + return_attention_mask=False, ) def __getitem__(self, cuts: CutSet) -> Dict[str, Any]: @@ -148,7 +148,7 @@ def _validate(self, cuts: CutSet) -> None: ) for batch_idx, batch in enumerate(dl): - import pdb - - pdb.set_trace() - pass + print(batch["audio"]) + print(batch["audio_lens"]) + print(batch["supervisions"]["text"]) + print(batch["cuts"]) diff --git a/egs/librispeech/SSL/hubert/decoder.py b/egs/librispeech/SSL/hubert/decoder.py deleted file mode 100644 index 7ce44495bf..0000000000 --- a/egs/librispeech/SSL/hubert/decoder.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -import torch.nn.functional as F -from scaling import Balancer - - -class Decoder(nn.Module): - """This class modifies the stateless decoder from the following paper: - - RNN-transducer with stateless prediction network - https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9054419 - - It removes the recurrent connection from the decoder, i.e., the prediction - network. Different from the above paper, it adds an extra Conv1d - right after the embedding layer. - - TODO: Implement https://arxiv.org/pdf/2109.07513.pdf - """ - - def __init__( - self, - vocab_size: int, - decoder_dim: int, - blank_id: int, - context_size: int, - ): - """ - Args: - vocab_size: - Number of tokens of the modeling unit including blank. - decoder_dim: - Dimension of the input embedding, and of the decoder output. - blank_id: - The ID of the blank symbol. - context_size: - Number of previous words to use to predict the next word. - 1 means bigram; 2 means trigram. n means (n+1)-gram. - """ - super().__init__() - - self.embedding = nn.Embedding( - num_embeddings=vocab_size, - embedding_dim=decoder_dim, - ) - # the balancers are to avoid any drift in the magnitude of the - # embeddings, which would interact badly with parameter averaging. 
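# (Roughly, Balancer, defined in scaling.py, acts as an identity on its input
#  in the forward pass and only adds gradient terms during training that push
#  each channel's statistics toward the configured positive-fraction and
#  abs-value ranges.)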
- self.balancer = Balancer( - decoder_dim, - channel_dim=-1, - min_positive=0.0, - max_positive=1.0, - min_abs=0.5, - max_abs=1.0, - prob=0.05, - ) - - self.blank_id = blank_id - - assert context_size >= 1, context_size - self.context_size = context_size - self.vocab_size = vocab_size - - if context_size > 1: - self.conv = nn.Conv1d( - in_channels=decoder_dim, - out_channels=decoder_dim, - kernel_size=context_size, - padding=0, - groups=decoder_dim // 4, # group size == 4 - bias=False, - ) - self.balancer2 = Balancer( - decoder_dim, - channel_dim=-1, - min_positive=0.0, - max_positive=1.0, - min_abs=0.5, - max_abs=1.0, - prob=0.05, - ) - else: - # To avoid `RuntimeError: Module 'Decoder' has no attribute 'conv'` - # when inference with torch.jit.script and context_size == 1 - self.conv = nn.Identity() - self.balancer2 = nn.Identity() - - def forward(self, y: torch.Tensor, need_pad: bool = True) -> torch.Tensor: - """ - Args: - y: - A 2-D tensor of shape (N, U). - need_pad: - True to left pad the input. Should be True during training. - False to not pad the input. Should be False during inference. - Returns: - Return a tensor of shape (N, U, decoder_dim). - """ - y = y.to(torch.int64) - # this stuff about clamp() is a temporary fix for a mismatch - # at utterance start, we use negative ids in beam_search.py - embedding_out = self.embedding(y.clamp(min=0)) * (y >= 0).unsqueeze(-1) - - embedding_out = self.balancer(embedding_out) - - if self.context_size > 1: - embedding_out = embedding_out.permute(0, 2, 1) - if need_pad is True: - embedding_out = F.pad(embedding_out, pad=(self.context_size - 1, 0)) - else: - # During inference time, there is no need to do extra padding - # as we only need one output - assert embedding_out.size(-1) == self.context_size - embedding_out = self.conv(embedding_out) - embedding_out = embedding_out.permute(0, 2, 1) - embedding_out = F.relu(embedding_out) - embedding_out = self.balancer2(embedding_out) - - return embedding_out diff --git a/egs/librispeech/SSL/hubert/decoder.py b/egs/librispeech/SSL/hubert/decoder.py new file mode 120000 index 0000000000..a2138e5da4 --- /dev/null +++ b/egs/librispeech/SSL/hubert/decoder.py @@ -0,0 +1 @@ +../../ASR/zipformer/decoder.py \ No newline at end of file diff --git a/egs/librispeech/SSL/hubert/finetune.py b/egs/librispeech/SSL/hubert/finetune.py index 612a8a2358..0c0095f9f2 100644 --- a/egs/librispeech/SSL/hubert/finetune.py +++ b/egs/librispeech/SSL/hubert/finetune.py @@ -64,7 +64,6 @@ from model import AsrModel from optim import Eden, ScaledAdam from scaling import ScheduledFloat -from subsampling import Conv2dSubsampling from torch import Tensor from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP @@ -152,7 +151,7 @@ def add_model_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--do-stable-layer-norm", type=str2bool, - default=True, + default=False, ) parser.add_argument( "--feat-extract-activation", @@ -162,12 +161,12 @@ def add_model_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--feat-extract-norm", type=str, - default="layer", + default="group", ) parser.add_argument( "--feat-proj-dropout", type=float, - default=0.0, + default=0.1, ) parser.add_argument( "--feat-proj-layer-norm", @@ -192,7 +191,7 @@ def add_model_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--hidden-size", type=int, - default=1024, + default=768, ) parser.add_argument( "--initializer-range", @@ -202,7 +201,7 @@ def add_model_arguments(parser: 
argparse.ArgumentParser): parser.add_argument( "--intermediate-size", type=int, - default=4096, + default=3072, ) parser.add_argument( "--layer-norm-eps", @@ -247,7 +246,7 @@ def add_model_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--num-attention-heads", type=int, - default=16, + default=12, ) parser.add_argument( "--num-conv-pos-embedding-groups", @@ -262,14 +261,7 @@ def add_model_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--num-hidden-layers", type=int, - default=24, - ) - - parser.add_argument( - "--encoder-dim", - type=int, - default=1024, - help="Embedding dimension in encoder model.", + default=12, ) parser.add_argument( @@ -366,6 +358,14 @@ def get_parser(): """, ) + parser.add_argument( + "--pretrained-dir", + type=str, + default="download/hubert-base-ls960", + help="""The pretrained model dir. + It specifies the directory where the pretrained checkpoint is saved.""", + ) + parser.add_argument( "--bpe-model", type=str, @@ -657,7 +657,7 @@ def get_decoder_model(params: AttributeDict) -> nn.Module: def get_joiner_model(params: AttributeDict) -> nn.Module: joiner = Joiner( - encoder_dim=params.encoder_dim, + encoder_dim=params.hidden_size, decoder_dim=params.decoder_dim, joiner_dim=params.joiner_dim, vocab_size=params.vocab_size, @@ -685,7 +685,7 @@ def get_model(params: AttributeDict) -> nn.Module: encoder=encoder, decoder=decoder, joiner=joiner, - encoder_dim=params.encoder_dim, + encoder_dim=params.hidden_size, decoder_dim=params.decoder_dim, vocab_size=params.vocab_size, use_transducer=params.use_transducer, @@ -731,6 +731,8 @@ def load_checkpoint_if_available( elif params.start_epoch > 1: filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt" else: + logging.info(f"Loading {params.pretrained_dir}") + model.encoder = HubertModel.from_pretrained(params.pretrained_dir) return None assert filename.is_file(), f"{filename} does not exist!" diff --git a/egs/librispeech/SSL/hubert/joiner.py b/egs/librispeech/SSL/hubert/joiner.py deleted file mode 100644 index dfb0a0057b..0000000000 --- a/egs/librispeech/SSL/hubert/joiner.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -import torch.nn as nn -from scaling import ScaledLinear - - -class Joiner(nn.Module): - def __init__( - self, - encoder_dim: int, - decoder_dim: int, - joiner_dim: int, - vocab_size: int, - ): - super().__init__() - - self.encoder_proj = ScaledLinear(encoder_dim, joiner_dim, initial_scale=0.25) - self.decoder_proj = ScaledLinear(decoder_dim, joiner_dim, initial_scale=0.25) - self.output_linear = nn.Linear(joiner_dim, vocab_size) - - def forward( - self, - encoder_out: torch.Tensor, - decoder_out: torch.Tensor, - project_input: bool = True, - ) -> torch.Tensor: - """ - Args: - encoder_out: - Output from the encoder. Its shape is (N, T, s_range, C). - decoder_out: - Output from the decoder. 
Its shape is (N, T, s_range, C). - project_input: - If true, apply input projections encoder_proj and decoder_proj. - If this is false, it is the user's responsibility to do this - manually. - Returns: - Return a tensor of shape (N, T, s_range, C). - """ - assert encoder_out.ndim == decoder_out.ndim, ( - encoder_out.shape, - decoder_out.shape, - ) - - if project_input: - logit = self.encoder_proj(encoder_out) + self.decoder_proj(decoder_out) - else: - logit = encoder_out + decoder_out - - logit = self.output_linear(torch.tanh(logit)) - - return logit diff --git a/egs/librispeech/SSL/hubert/joiner.py b/egs/librispeech/SSL/hubert/joiner.py new file mode 120000 index 0000000000..aa3362cda4 --- /dev/null +++ b/egs/librispeech/SSL/hubert/joiner.py @@ -0,0 +1 @@ +../../ASR/zipformer/joiner.py \ No newline at end of file diff --git a/egs/librispeech/SSL/hubert/optim.py b/egs/librispeech/SSL/hubert/optim.py deleted file mode 100644 index b83359a1ac..0000000000 --- a/egs/librispeech/SSL/hubert/optim.py +++ /dev/null @@ -1,1244 +0,0 @@ -# Copyright 2022 Xiaomi Corp. (authors: Daniel Povey) -# -# See ../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import contextlib -import logging -import random -from collections import defaultdict -from typing import Dict, List, Optional, Tuple, Union - -import torch -from lhotse.utils import fix_random_seed -from torch import Tensor, nn -from torch.optim import Optimizer - - -class BatchedOptimizer(Optimizer): - """ - This class adds to class Optimizer the capability to optimize parameters in batches: - it will stack the parameters and their grads for you so the optimizer can work - on tensors with an extra leading dimension. This is intended for speed with GPUs, - as it reduces the number of kernels launched in the optimizer. - - Args: - params: - """ - - def __init__(self, params, defaults): - super(BatchedOptimizer, self).__init__(params, defaults) - - @contextlib.contextmanager - def batched_params(self, param_group, group_params_names): - """ - This function returns (technically, yields) a list of - of tuples (p, state), where - p is a `fake` parameter that is stacked (over axis 0) from real parameters - that share the same shape, and its gradient is also stacked; - `state` is the state corresponding to this batch of parameters - (it will be physically located in the "state" for one of the real - parameters, the last one that has any particular shape and dtype). - - This function is decorated as a context manager so that it can - write parameters back to their "real" locations. - - The idea is, instead of doing: - - for p in group["params"]: - state = self.state[p] - ... - - you can do: - - with self.batched_params(group["params"]) as batches: - for p, state, p_names in batches: - ... - - - Args: - group: a parameter group, which is a list of parameters; should be - one of self.param_groups. - group_params_names: name for each parameter in group, - which is List[str]. 
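        Example (illustrative shapes): if the group holds two weight matrices
        of shape (512, 512) and one bias of shape (512,), the two matrices are
        stacked into one "fake" parameter of shape (2, 512, 512) whose .grad is
        stacked the same way, while the bias forms its own batch of shape
        (1, 512); when the `with` block exits, the possibly-updated rows are
        copied back into the real parameters.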
- """ - batches = defaultdict( - list - ) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter - batches_names = defaultdict( - list - ) # `batches` maps from tuple (dtype_as_str,*shape) to list of str - - assert len(param_group) == len(group_params_names) - for p, named_p in zip(param_group, group_params_names): - key = (str(p.dtype), *p.shape) - batches[key].append(p) - batches_names[key].append(named_p) - - batches_names_keys = list(batches_names.keys()) - sorted_idx = sorted( - range(len(batches_names)), key=lambda i: batches_names_keys[i] - ) - batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx] - batches = [batches[batches_names_keys[idx]] for idx in sorted_idx] - - stacked_params_dict = dict() - - # turn batches into a list, in deterministic order. - # tuples will contain tuples of (stacked_param, state, stacked_params_names), - # one for each batch in `batches`. - tuples = [] - - for batch, batch_names in zip(batches, batches_names): - p = batch[0] - # we arbitrarily store the state in the - # state corresponding to the 1st parameter in the - # group. class Optimizer will take care of saving/loading state. - state = self.state[p] - p_stacked = torch.stack(batch) - grad = torch.stack( - [torch.zeros_like(p) if p.grad is None else p.grad for p in batch] - ) - p_stacked.grad = grad - stacked_params_dict[key] = p_stacked - tuples.append((p_stacked, state, batch_names)) - - yield tuples # <-- calling code will do the actual optimization here! - - for (stacked_params, _state, _names), batch in zip(tuples, batches): - for i, p in enumerate(batch): # batch is list of Parameter - p.copy_(stacked_params[i]) - - -class ScaledAdam(BatchedOptimizer): - """ - Implements 'Scaled Adam', a variant of Adam where we scale each parameter's update - proportional to the norm of that parameter; and also learn the scale of the parameter, - in log space, subject to upper and lower limits (as if we had factored each parameter as - param = underlying_param * log_scale.exp()) - - - Args: - params: The parameters or param_groups to optimize (like other Optimizer subclasses) - Unlike common optimizers, which accept model.parameters() or groups of parameters(), - this optimizer could accept model.named_parameters() or groups of named_parameters(). - See comments of function _get_names_of_parameters for its 4 possible cases. - lr: The learning rate. We will typically use a learning rate schedule that starts - at 0.03 and decreases over time, i.e. much higher than other common - optimizers. - clipping_scale: (e.g. 2.0) - A scale for gradient-clipping: if specified, the normalized gradients - over the whole model will be clipped to have 2-norm equal to - `clipping_scale` times the median 2-norm over the most recent period - of `clipping_update_period` minibatches. By "normalized gradients", - we mean after multiplying by the rms parameter value for this tensor - [for non-scalars]; this is appropriate because our update is scaled - by this quantity. - betas: beta1,beta2 are momentum constants for regular momentum, and moving sum-sq grad. - Must satisfy 0 < beta <= beta2 < 1. - scalar_lr_scale: A scaling factor on the learning rate, that we use to update the - scale of each parameter tensor and scalar parameters of the mode.. - If each parameter were decomposed - as p * p_scale.exp(), where (p**2).mean().sqrt() == 1.0, scalar_lr_scale - would be a the scaling factor on the learning rate of p_scale. 
- eps: A general-purpose epsilon to prevent division by zero - param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of - learning the scale on the parameters (we'll constrain the rms of each non-scalar - parameter tensor to be >= this value) - param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of - learning the scale on the parameters (we'll constrain the rms of each non-scalar - parameter tensor to be <= this value) - scalar_max: Maximum absolute value for scalar parameters (applicable if your - model has any parameters with numel() == 1). - size_update_period: The periodicity, in steps, with which we update the size (scale) - of the parameter tensor. This is provided to save a little time - in the update. - clipping_update_period: if clipping_scale is specified, this is the period - """ - - def __init__( - self, - params, - lr=3e-02, - clipping_scale=None, - betas=(0.9, 0.98), - scalar_lr_scale=0.1, - eps=1.0e-08, - param_min_rms=1.0e-05, - param_max_rms=3.0, - scalar_max=10.0, - size_update_period=4, - clipping_update_period=100, - ): - defaults = dict( - lr=lr, - clipping_scale=clipping_scale, - betas=betas, - scalar_lr_scale=scalar_lr_scale, - eps=eps, - param_min_rms=param_min_rms, - param_max_rms=param_max_rms, - scalar_max=scalar_max, - size_update_period=size_update_period, - clipping_update_period=clipping_update_period, - ) - - # If params only contains parameters or group of parameters, - # i.e when parameter names are not given, - # this flag will be set to False in funciton _get_names_of_parameters. - self.show_dominant_parameters = True - param_groups, parameters_names = self._get_names_of_parameters(params) - super(ScaledAdam, self).__init__(param_groups, defaults) - assert len(self.param_groups) == len(parameters_names) - self.parameters_names = parameters_names - - def _get_names_of_parameters( - self, params_or_named_params - ) -> Tuple[List[Dict], List[List[str]]]: - """ - Args: - params_or_named_params: according to the way ScaledAdam is initialized in train.py, - this argument could be one of following 4 cases, - case 1, a generator of parameter, e.g.: - optimizer = ScaledAdam(model.parameters(), lr=params.base_lr, clipping_scale=3.0) - - case 2, a list of parameter groups with different config, e.g.: - model_param_groups = [ - {'params': model.encoder.parameters(), 'lr': 0.05}, - {'params': model.decoder.parameters(), 'lr': 0.01}, - {'params': model.joiner.parameters(), 'lr': 0.03}, - ] - optimizer = ScaledAdam(model_param_groups, lr=params.base_lr, clipping_scale=3.0) - - case 3, a generator of named_parameter, e.g.: - optimizer = ScaledAdam(model.named_parameters(), lr=params.base_lr, clipping_scale=3.0) - - case 4, a list of named_parameter groups with different config, e.g.: - model_named_param_groups = [ - {'named_params': model.encoder.named_parameters(), 'lr': 0.05}, - {'named_params': model.decoder.named_parameters(), 'lr': 0.01}, - {'named_params': model.joiner.named_parameters(), 'lr': 0.03}, - ] - optimizer = ScaledAdam(model_named_param_groups, lr=params.base_lr, clipping_scale=3.0) - - For case 1 and case 2, input params is used to initialize the underlying torch.optimizer. 
- For case 3 and case 4, firstly, names and params are extracted from input named_params, - then, these extracted params are used to initialize the underlying torch.optimizer, - and these extracted names are mainly used by function - `_show_gradient_dominating_parameter` - - Returns: - Returns a tuple containing 2 elements: - - `param_groups` with type List[Dict], each Dict element is a parameter group. - An example of `param_groups` could be: - [ - {'params': `one iterable of Parameter`, 'lr': 0.05}, - {'params': `another iterable of Parameter`, 'lr': 0.08}, - {'params': `a third iterable of Parameter`, 'lr': 0.1}, - ] - - `param_gruops_names` with type List[List[str]], - each `List[str]` is for a group['params'] in param_groups, - and each `str` is the name of a parameter. - A dummy name "foo" is related to each parameter, - if input are params without names, i.e. case 1 or case 2. - """ - # variable naming convention in this function: - # p is short for param. - # np is short for named_param. - # p_or_np is short for param_or_named_param. - # cur is short for current. - # group is a dict, e.g. {'params': iterable of parameter, 'lr': 0.05, other fields}. - # groups is a List[group] - - iterable_or_groups = list(params_or_named_params) - if len(iterable_or_groups) == 0: - raise ValueError("optimizer got an empty parameter list") - - # The first value of returned tuple. A list of dicts containing at - # least 'params' as a key. - param_groups = [] - - # The second value of returned tuple, - # a List[List[str]], each sub-List is for a group. - param_groups_names = [] - - if not isinstance(iterable_or_groups[0], dict): - # case 1 or case 3, - # the input is an iterable of parameter or named parameter. - param_iterable_cur_group = [] - param_names_cur_group = [] - for p_or_np in iterable_or_groups: - if isinstance(p_or_np, tuple): - # case 3 - name, param = p_or_np - else: - # case 1 - assert isinstance(p_or_np, torch.Tensor) - param = p_or_np - # Assign a dummy name as a placeholder - name = "foo" - self.show_dominant_parameters = False - param_iterable_cur_group.append(param) - param_names_cur_group.append(name) - param_groups.append({"params": param_iterable_cur_group}) - param_groups_names.append(param_names_cur_group) - else: - # case 2 or case 4 - # the input is groups of parameter or named parameter. - for cur_group in iterable_or_groups: - assert "named_params" in cur_group - name_list = [x[0] for x in cur_group["named_params"]] - p_list = [x[1] for x in cur_group["named_params"]] - del cur_group["named_params"] - cur_group["params"] = p_list - param_groups.append(cur_group) - param_groups_names.append(name_list) - - return param_groups, param_groups_names - - def __setstate__(self, state): - super(ScaledAdam, self).__setstate__(state) - - @torch.no_grad() - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - batch = True - - for group, group_params_names in zip(self.param_groups, self.parameters_names): - with self.batched_params(group["params"], group_params_names) as batches: - # batches is list of pairs (stacked_param, state). stacked_param is like - # a regular parameter, and will have a .grad, but the 1st dim corresponds to - # a stacking dim, it is not a real dim. 
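# Overall flow of one step: on the very first step no optimizer state exists
# yet, so clipping is skipped (scale 1.0); on later steps a single clipping
# scale is computed for the whole group from the recent history of
# rms-normalized gradient norms, and each stacked batch is then lazily
# initialized (if needed) and updated with that scale.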
- - if ( - len(batches[0][1]) == 0 - ): # if len(first state) == 0: not yet initialized - clipping_scale = 1 - else: - clipping_scale = self._get_clipping_scale(group, batches) - - for p, state, _ in batches: - # Perform optimization step. - # grad is not going to be None, we handled that when creating the batches. - grad = p.grad - if grad.is_sparse: - raise RuntimeError( - "ScaledAdam optimizer does not support sparse gradients" - ) - # State initialization - if len(state) == 0: - self._init_state(group, p, state) - - self._step_one_batch(group, p, state, clipping_scale) - - return loss - - def _init_state(self, group: dict, p: Tensor, state: dict): - """ - Initializes state dict for parameter 'p'. Assumes that dim 0 of tensor p - is actually the batch dimension, corresponding to batched-together - parameters of a given shape. - - - Args: - group: Dict to look up configuration values. - p: The parameter that we are initializing the state for - state: Dict from string to whatever state we are initializing - """ - size_update_period = group["size_update_period"] - - state["step"] = 0 - - kwargs = {"device": p.device, "dtype": p.dtype} - - # 'delta' implements conventional momentum. There are - # several different kinds of update going on, so rather than - # compute "exp_avg" like in Adam, we store and decay a - # parameter-change "delta", which combines all forms of - # update. this is equivalent to how it's done in Adam, - # except for the first few steps. - state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format) - - batch_size = p.shape[0] - numel = p.numel() // batch_size - - if numel > 1: - # "param_rms" just periodically records the scalar root-mean-square value of - # the parameter tensor. - # it has a shape like (batch_size, 1, 1, 1, 1) - param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt() - state["param_rms"] = param_rms - - state["scale_exp_avg_sq"] = torch.zeros_like(param_rms) - state["scale_grads"] = torch.zeros( - size_update_period, *param_rms.shape, **kwargs - ) - - # exp_avg_sq is the weighted sum of scaled gradients. as in Adam. - state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format) - - def _get_clipping_scale( - self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]] - ) -> float: - """ - Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients - by this amount before applying the rest of the update. - - Args: - group: the parameter group, an item in self.param_groups - tuples: a list of tuples of (param, state, param_names) - where param is a batched set of parameters, - with a .grad (1st dim is batch dim) - and state is the state-dict where optimization parameters are kept. - param_names is a List[str] while each str is name for a parameter - in batched set of parameters "param". - """ - assert len(tuples) >= 1 - clipping_scale = group["clipping_scale"] - (first_p, first_state, _) = tuples[0] - step = first_state["step"] - if clipping_scale is None or step == 0: - # no clipping. return early on step == 0 because the other - # parameters' state won't have been initialized yet. 
- return 1.0 - clipping_update_period = group["clipping_update_period"] - scalar_lr_scale = group["scalar_lr_scale"] - - tot_sumsq = torch.tensor(0.0, device=first_p.device) - for p, state, param_names in tuples: - grad = p.grad - if grad.is_sparse: - raise RuntimeError( - "ScaledAdam optimizer does not support sparse gradients" - ) - if p.numel() == p.shape[0]: # a batch of scalars - tot_sumsq += (grad**2).sum() * ( - scalar_lr_scale**2 - ) # sum() to change shape [1] to [] - else: - tot_sumsq += ((grad * state["param_rms"]) ** 2).sum() - - tot_norm = tot_sumsq.sqrt() - if "model_norms" not in first_state: - first_state["model_norms"] = torch.zeros( - clipping_update_period, device=p.device - ) - first_state["model_norms"][step % clipping_update_period] = tot_norm - - irregular_estimate_steps = [ - i for i in [10, 20, 40] if i < clipping_update_period - ] - if step % clipping_update_period == 0 or step in irregular_estimate_steps: - # Print some stats. - # We don't reach here if step == 0 because we would have returned - # above. - sorted_norms = first_state["model_norms"].sort()[0].to("cpu") - if step in irregular_estimate_steps: - sorted_norms = sorted_norms[-step:] - num_norms = sorted_norms.numel() - quartiles = [] - for n in range(0, 5): - index = min(num_norms - 1, (num_norms // 4) * n) - quartiles.append(sorted_norms[index].item()) - - median = quartiles[2] - if median - median != 0: - raise RuntimeError("Too many grads were not finite") - threshold = clipping_scale * median - if step in irregular_estimate_steps: - # use larger thresholds on first few steps of estimating threshold, - # as norm may be changing rapidly. - threshold = threshold * 2.0 - first_state["model_norm_threshold"] = threshold - percent_clipped = ( - first_state["num_clipped"] * 100.0 / num_norms - if "num_clipped" in first_state - else 0.0 - ) - first_state["num_clipped"] = 0 - quartiles = " ".join(["%.3e" % x for x in quartiles]) - logging.warn( - f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, " - f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}" - ) - - try: - model_norm_threshold = first_state["model_norm_threshold"] - except KeyError: - return 1.0 # threshold has not yet been set. - - ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item()) - if ans != ans: # e.g. ans is nan - ans = 0.0 - if ans < 1.0: - first_state["num_clipped"] += 1 - if ans < 0.1: - logging.warn( - f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}" - ) - if self.show_dominant_parameters: - assert p.shape[0] == len(param_names) - self._show_gradient_dominating_parameter( - tuples, tot_sumsq, group["scalar_lr_scale"] - ) - - if ans == 0.0: - for p, state, param_names in tuples: - p.grad.zero_() # get rid of infinity() - - return ans - - def _show_gradient_dominating_parameter( - self, - tuples: List[Tuple[Tensor, dict, List[str]]], - tot_sumsq: Tensor, - scalar_lr_scale: float, - ): - """ - Show information of parameter which dominates tot_sumsq. - - Args: - tuples: a list of tuples of (param, state, param_names) - where param is a batched set of parameters, - with a .grad (1st dim is batch dim) - and state is the state-dict where optimization parameters are kept. - param_names is a List[str] while each str is name for a parameter - in batched set of parameters "param". - tot_sumsq: sumsq of all parameters. Though it's could be calculated - from tuples, we still pass it to save some time. 
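        For each parameter the reported proportion is
            ((grad * param_rms) ** 2).sum() / tot_sumsq
        (scalar parameters use scalar_lr_scale in place of param_rms), so the
        "dominating" parameter is the one contributing the largest share of
        the clipped quantity.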
- """ - all_sumsq_orig = {} - for p, state, batch_param_names in tuples: - # p is a stacked batch parameters. - batch_grad = p.grad - if p.numel() == p.shape[0]: # a batch of scalars - # Dummy values used by following `zip` statement. - batch_rms_orig = torch.full( - p.shape, scalar_lr_scale, device=batch_grad.device - ) - else: - batch_rms_orig = state["param_rms"] - batch_sumsq_orig = (batch_grad * batch_rms_orig) ** 2 - if batch_grad.ndim > 1: - # need to guard it with if-statement because sum() sums over - # all dims if dim == (). - batch_sumsq_orig = batch_sumsq_orig.sum( - dim=list(range(1, batch_grad.ndim)) - ) - for name, sumsq_orig, rms, grad in zip( - batch_param_names, batch_sumsq_orig, batch_rms_orig, batch_grad - ): - proportion_orig = sumsq_orig / tot_sumsq - all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad) - - sorted_by_proportion = { - k: v - for k, v in sorted( - all_sumsq_orig.items(), key=lambda item: item[1][0], reverse=True - ) - } - dominant_param_name = next(iter(sorted_by_proportion)) - ( - dominant_proportion, - dominant_sumsq, - dominant_rms, - dominant_grad, - ) = sorted_by_proportion[dominant_param_name] - logging.warn( - f"Parameter dominating tot_sumsq {dominant_param_name}" - f" with proportion {dominant_proportion:.2f}," - f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)" - f"={dominant_sumsq:.3e}," - f" grad_sumsq={(dominant_grad**2).sum():.3e}," - f" orig_rms_sq={(dominant_rms**2).item():.3e}" - ) - - def _step_one_batch( - self, group: dict, p: Tensor, state: dict, clipping_scale: float - ): - """ - Do the step for one parameter, which is actually going to be a batch of - `real` parameters, with dim 0 as the batch dim. - Args: - group: dict to look up configuration values - p: parameter to update (actually multiple parameters stacked together - as a batch) - state: state-dict for p, to look up the optimizer state - """ - lr = group["lr"] - size_update_period = group["size_update_period"] - beta1 = group["betas"][0] - - grad = p.grad - if clipping_scale != 1.0: - grad *= clipping_scale - step = state["step"] - delta = state["delta"] - - delta.mul_(beta1) - batch_size = p.shape[0] - numel = p.numel() // batch_size - if numel > 1: - # Update the size/scale of p, and set param_rms - scale_grads = state["scale_grads"] - scale_grads[step % size_update_period] = (p * grad).sum( - dim=list(range(1, p.ndim)), keepdim=True - ) - if step % size_update_period == size_update_period - 1: - param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..) - param_rms.copy_( - (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt() - ) - if step > 0: - # self._size_update() learns the overall scale on the - # parameter, by shrinking or expanding it. - self._size_update(group, scale_grads, p, state) - - if numel == 1: - # For parameters with 1 element we just use regular Adam. - # Updates delta. - self._step_scalar(group, p, state) - else: - self._step(group, p, state) - - state["step"] = step + 1 - - def _size_update( - self, group: dict, scale_grads: Tensor, p: Tensor, state: dict - ) -> None: - """ - Called only where p.numel() > 1, this updates the scale of the parameter. - If we imagine: p = underlying_param * scale.exp(), and we are doing - gradient descent on underlying param and on scale, this function does the update - on `scale`. - - Args: - group: dict to look up configuration values - scale_grads: a tensor of shape (size_update_period, batch_size, 1, 1,...) containing - grads w.r.t. the scales. 
- p: The parameter to update - state: The state-dict of p - """ - - param_rms = state["param_rms"] - beta1, beta2 = group["betas"] - size_lr = group["lr"] * group["scalar_lr_scale"] - param_min_rms = group["param_min_rms"] - param_max_rms = group["param_max_rms"] - eps = group["eps"] - step = state["step"] - batch_size = p.shape[0] - - size_update_period = scale_grads.shape[0] - # correct beta2 for the size update period: we will have - # faster decay at this level. - beta2_corr = beta2**size_update_period - - scale_exp_avg_sq = state["scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..) - scale_exp_avg_sq.mul_(beta2_corr).add_( - (scale_grads**2).mean(dim=0), # mean over dim `size_update_period` - alpha=1 - beta2_corr, - ) # shape is (batch_size, 1, 1, ...) - - # The 1st time we reach here is when size_step == 1. - size_step = (step + 1) // size_update_period - bias_correction2 = 1 - beta2_corr**size_step - # we don't bother with bias_correction1; this will help prevent divergence - # at the start of training. - - denom = scale_exp_avg_sq.sqrt() + eps - - scale_step = ( - -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom - ) - - is_too_small = param_rms < param_min_rms - - # when the param gets too small, just don't shrink it any further. - scale_step.masked_fill_(is_too_small, 0.0) - - # and ensure the parameter rms after update never exceeds param_max_rms. - # We have to look at the trained model for parameters at or around the - # param_max_rms, because sometimes they can indicate a problem with the - # topology or settings. - scale_step = torch.minimum(scale_step, (param_max_rms - param_rms) / param_rms) - - delta = state["delta"] - # the factor of (1-beta1) relates to momentum. - delta.add_(p * scale_step, alpha=(1 - beta1)) - - def _step(self, group: dict, p: Tensor, state: dict): - """ - This function does the core update of self.step(), in the case where the members of - the batch have more than 1 element. - - Args: - group: A dict which will be used to look up configuration values - p: The parameter to be updated - grad: The grad of p - state: The state-dict corresponding to parameter p - - This function modifies p. - """ - grad = p.grad - lr = group["lr"] - beta1, beta2 = group["betas"] - eps = group["eps"] - param_min_rms = group["param_min_rms"] - step = state["step"] - - exp_avg_sq = state["exp_avg_sq"] - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2)) - - this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0) - bias_correction2 = 1 - beta2 ** (this_step + 1) - if bias_correction2 < 0.99: - # note: not in-place. - exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2) - - denom = exp_avg_sq.sqrt() - denom += eps - grad = grad / denom - - alpha = -lr * (1 - beta1) * state["param_rms"].clamp(min=param_min_rms) - - delta = state["delta"] - delta.add_(grad * alpha) - p.add_(delta) - - def _step_scalar(self, group: dict, p: Tensor, state: dict): - """ - A simplified form of the core update for scalar tensors, where we cannot get a good - estimate of the parameter rms. - """ - beta1, beta2 = group["betas"] - scalar_max = group["scalar_max"] - eps = group["eps"] - lr = group["lr"] * group["scalar_lr_scale"] - grad = p.grad - - exp_avg_sq = state["exp_avg_sq"] # shape: (batch_size,) - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) - - # bias_correction2 is like in Adam. Don't bother with bias_correction1; - # slower update at the start will help stability anyway. 
- bias_correction2 = 1 - beta2 ** (state["step"] + 1) - denom = (exp_avg_sq / bias_correction2).sqrt() + eps - - delta = state["delta"] - delta.add_(grad / denom, alpha=-lr * (1 - beta1)) - p.clamp_(min=-scalar_max, max=scalar_max) - p.add_(delta) - - -class LRScheduler(object): - """ - Base-class for learning rate schedulers where the learning-rate depends on both the - batch and the epoch. - """ - - def __init__(self, optimizer: Optimizer, verbose: bool = False): - # Attach optimizer - if not isinstance(optimizer, Optimizer): - raise TypeError("{} is not an Optimizer".format(type(optimizer).__name__)) - self.optimizer = optimizer - self.verbose = verbose - - for group in optimizer.param_groups: - group.setdefault("base_lr", group["lr"]) - - self.base_lrs = [group["base_lr"] for group in optimizer.param_groups] - - self.epoch = 0 - self.batch = 0 - - def state_dict(self): - """Returns the state of the scheduler as a :class:`dict`. - - It contains an entry for every variable in self.__dict__ which - is not the optimizer. - """ - return { - "base_lrs": self.base_lrs, - "epoch": self.epoch, - "batch": self.batch, - } - - def load_state_dict(self, state_dict): - """Loads the schedulers state. - - Args: - state_dict (dict): scheduler state. Should be an object returned - from a call to :meth:`state_dict`. - """ - self.__dict__.update(state_dict) - - def get_last_lr(self) -> List[float]: - """Return last computed learning rate by current scheduler. Will be a list of float.""" - return self._last_lr - - def get_lr(self): - # Compute list of learning rates from self.epoch and self.batch and - # self.base_lrs; this must be overloaded by the user. - # e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs ] - raise NotImplementedError - - def step_batch(self, batch: Optional[int] = None) -> None: - # Step the batch index, or just set it. If `batch` is specified, it - # must be the batch index from the start of training, i.e. summed over - # all epochs. - # You can call this in any order; if you don't provide 'batch', it should - # of course be called once per batch. - if batch is not None: - self.batch = batch - else: - self.batch = self.batch + 1 - self._set_lrs() - - def step_epoch(self, epoch: Optional[int] = None): - # Step the epoch index, or just set it. If you provide the 'epoch' arg, - # you should call this at the start of the epoch; if you don't provide the 'epoch' - # arg, you should call it at the end of the epoch. - if epoch is not None: - self.epoch = epoch - else: - self.epoch = self.epoch + 1 - self._set_lrs() - - def _set_lrs(self): - values = self.get_lr() - assert len(values) == len(self.optimizer.param_groups) - - for i, data in enumerate(zip(self.optimizer.param_groups, values)): - param_group, lr = data - param_group["lr"] = lr - self.print_lr(self.verbose, i, lr) - self._last_lr = [group["lr"] for group in self.optimizer.param_groups] - - def print_lr(self, is_verbose, group, lr): - """Display the current learning rate.""" - if is_verbose: - logging.warn( - f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate" - f" of group {group} to {lr:.4e}." - ) - - -class Eden(LRScheduler): - """ - Eden scheduler. - The basic formula (before warmup) is: - lr = base_lr * (((batch**2 + lr_batches**2) / lr_batches**2) ** -0.25 * - (((epoch**2 + lr_epochs**2) / lr_epochs**2) ** -0.25)) * warmup - where `warmup` increases from linearly 0.5 to 1 over `warmup_batches` batches - and then stays constant at 1. 
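    As a concrete example, at batch == lr_batches and epoch == lr_epochs each of
    the two factors equals 2 ** -0.25, so after warmup the learning rate is about
    0.71 * base_lr; well beyond those points the factors decay roughly like
    batch ** -0.5 and epoch ** -0.5.

    Typical usage looks like the sketch below (the numeric values and the names
    compute_loss / train_dl are placeholders, not part of this file):

        optimizer = ScaledAdam(model.named_parameters(), lr=0.045, clipping_scale=2.0)
        scheduler = Eden(optimizer, lr_batches=7500, lr_epochs=3.5)
        for epoch in range(num_epochs):
            scheduler.step_epoch(epoch)
            for batch in train_dl:
                loss = compute_loss(model, batch)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step_batch()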
- - If you don't have the concept of epochs, or one epoch takes a very long time, - you can replace the notion of 'epoch' with some measure of the amount of data - processed, e.g. hours of data or frames of data, with 'lr_epochs' being set to - some measure representing "quite a lot of data": say, one fifth or one third - of an entire training run, but it doesn't matter much. You could also use - Eden2 which has only the notion of batches. - - We suggest base_lr = 0.04 (passed to optimizer) if used with ScaledAdam - - Args: - optimizer: the optimizer to change the learning rates on - lr_batches: the number of batches after which we start significantly - decreasing the learning rate, suggest 5000. - lr_epochs: the number of epochs after which we start significantly - decreasing the learning rate, suggest 6 if you plan to do e.g. - 20 to 40 epochs, but may need smaller number if dataset is huge - and you will do few epochs. - """ - - def __init__( - self, - optimizer: Optimizer, - lr_batches: Union[int, float], - lr_epochs: Union[int, float], - warmup_batches: Union[int, float] = 500.0, - warmup_start: float = 0.5, - verbose: bool = False, - ): - super(Eden, self).__init__(optimizer, verbose) - self.lr_batches = lr_batches - self.lr_epochs = lr_epochs - self.warmup_batches = warmup_batches - - assert 0.0 <= warmup_start <= 1.0, warmup_start - self.warmup_start = warmup_start - - def get_lr(self): - factor = ( - (self.batch**2 + self.lr_batches**2) / self.lr_batches**2 - ) ** -0.25 * ( - ((self.epoch**2 + self.lr_epochs**2) / self.lr_epochs**2) ** -0.25 - ) - warmup_factor = ( - 1.0 - if self.batch >= self.warmup_batches - else self.warmup_start - + (1.0 - self.warmup_start) * (self.batch / self.warmup_batches) - # else 0.5 + 0.5 * (self.batch / self.warmup_batches) - ) - - return [x * factor * warmup_factor for x in self.base_lrs] - - -class Eden2(LRScheduler): - """ - Eden2 scheduler, simpler than Eden because it does not use the notion of epoch, - only batches. - - The basic formula (before warmup) is: - lr = base_lr * ((batch**2 + lr_batches**2) / lr_batches**2) ** -0.5) * warmup - - where `warmup` increases from linearly 0.5 to 1 over `warmup_batches` batches - and then stays constant at 1. - - - E.g. suggest base_lr = 0.04 (passed to optimizer) if used with ScaledAdam - - Args: - optimizer: the optimizer to change the learning rates on - lr_batches: the number of batches after which we start significantly - decreasing the learning rate, suggest 5000. 
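        With the formula above, the post-warmup learning rate is about
        0.71 * base_lr at batch == lr_batches, about 0.5 * base_lr near
        batch == 1.7 * lr_batches, and decays like 1 / batch far beyond that.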
- """ - - def __init__( - self, - optimizer: Optimizer, - lr_batches: Union[int, float], - warmup_batches: Union[int, float] = 500.0, - warmup_start: float = 0.5, - verbose: bool = False, - ): - super().__init__(optimizer, verbose) - self.lr_batches = lr_batches - self.warmup_batches = warmup_batches - - assert 0.0 <= warmup_start <= 1.0, warmup_start - self.warmup_start = warmup_start - - def get_lr(self): - factor = ( - (self.batch**2 + self.lr_batches**2) / self.lr_batches**2 - ) ** -0.5 - warmup_factor = ( - 1.0 - if self.batch >= self.warmup_batches - else self.warmup_start - + (1.0 - self.warmup_start) * (self.batch / self.warmup_batches) - # else 0.5 + 0.5 * (self.batch / self.warmup_batches) - ) - - return [x * factor * warmup_factor for x in self.base_lrs] - - -def _test_eden(): - m = torch.nn.Linear(100, 100) - optim = ScaledAdam(m.parameters(), lr=0.03) - - scheduler = Eden(optim, lr_batches=100, lr_epochs=2, verbose=True) - - for epoch in range(10): - scheduler.step_epoch(epoch) # sets epoch to `epoch` - - for step in range(20): - x = torch.randn(200, 100).detach() - x.requires_grad = True - y = m(x) - dy = torch.randn(200, 100).detach() - f = (y * dy).sum() - f.backward() - - optim.step() - scheduler.step_batch() - optim.zero_grad() - - logging.info(f"last lr = {scheduler.get_last_lr()}") - logging.info(f"state dict = {scheduler.state_dict()}") - - -# This is included mostly as a baseline for ScaledAdam. -class Eve(Optimizer): - """ - Implements Eve algorithm. This is a modified version of AdamW with a special - way of setting the weight-decay / shrinkage-factor, which is designed to make the - rms of the parameters approach a particular target_rms (default: 0.1). This is - for use with networks with 'scaled' versions of modules (see scaling.py), which - will be close to invariant to the absolute scale on the parameter matrix. - - The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. - The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. - Eve is unpublished so far. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay coefficient (default: 3e-4; - this value means that the weight would decay significantly after - about 3k minibatches. Is not multiplied by learning rate, but - is conditional on RMS-value of parameter being > target_rms. - target_rms (float, optional): target root-mean-square value of - parameters, if they fall below this we will stop applying weight decay. - - - .. _Adam: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _Decoupled Weight Decay Regularization: - https://arxiv.org/abs/1711.05101 - .. 
_On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - - def __init__( - self, - params, - lr=1e-3, - betas=(0.9, 0.98), - eps=1e-8, - weight_decay=1e-3, - target_rms=0.1, - ): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - if not 0 <= weight_decay <= 0.1: - raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - if not 0 < target_rms <= 10.0: - raise ValueError("Invalid target_rms value: {}".format(target_rms)) - defaults = dict( - lr=lr, - betas=betas, - eps=eps, - weight_decay=weight_decay, - target_rms=target_rms, - ) - super(Eve, self).__init__(params, defaults) - - def __setstate__(self, state): - super(Eve, self).__setstate__(state) - - @torch.no_grad() - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - for group in self.param_groups: - for p in group["params"]: - if p.grad is None: - continue - - # Perform optimization step - grad = p.grad - if grad.is_sparse: - raise RuntimeError("AdamW does not support sparse gradients") - - state = self.state[p] - - # State initialization - if len(state) == 0: - state["step"] = 0 - # Exponential moving average of gradient values - state["exp_avg"] = torch.zeros_like( - p, memory_format=torch.preserve_format - ) - # Exponential moving average of squared gradient values - state["exp_avg_sq"] = torch.zeros_like( - p, memory_format=torch.preserve_format - ) - - exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] - - beta1, beta2 = group["betas"] - - state["step"] += 1 - bias_correction1 = 1 - beta1 ** state["step"] - bias_correction2 = 1 - beta2 ** state["step"] - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) - denom = (exp_avg_sq.sqrt() * (bias_correction2**-0.5)).add_( - group["eps"] - ) - - step_size = group["lr"] / bias_correction1 - target_rms = group["target_rms"] - weight_decay = group["weight_decay"] - - if p.numel() > 1: - # avoid applying this weight-decay on "scaling factors" - # (which are scalar). - is_above_target_rms = p.norm() > (target_rms * (p.numel() ** 0.5)) - p.mul_(1 - (weight_decay * is_above_target_rms)) - - p.addcdiv_(exp_avg, denom, value=-step_size) - - if random.random() < 0.0005: - step = (exp_avg / denom) * step_size - logging.info( - f"Delta rms = {(step**2).mean().item()}, shape = {step.shape}" - ) - - return loss - - -def _test_scaled_adam(hidden_dim: int): - import timeit - - from scaling import ScaledLinear - - E = 100 - B = 4 - T = 2 - logging.info("in test_eve_cain") - # device = torch.device('cuda') - device = torch.device("cpu") - dtype = torch.float32 - - fix_random_seed(42) - # these input_magnitudes and output_magnitudes are to test that - # Abel is working as we expect and is able to adjust scales of - # different dims differently. 
- input_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp() - output_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp() - - for iter in [1, 0]: - fix_random_seed(42) - Linear = torch.nn.Linear if iter == 0 else ScaledLinear - - m = torch.nn.Sequential( - Linear(E, hidden_dim), - torch.nn.PReLU(), - Linear(hidden_dim, hidden_dim), - torch.nn.PReLU(), - Linear(hidden_dim, E), - ).to(device) - - train_pairs = [ - ( - 100.0 - * torch.randn(B, T, E, device=device, dtype=dtype) - * input_magnitudes, - torch.randn(B, T, E, device=device, dtype=dtype) * output_magnitudes, - ) - for _ in range(20) - ] - - if iter == 0: - optim = Eve(m.parameters(), lr=0.003) - elif iter == 1: - optim = ScaledAdam(m.parameters(), lr=0.03, clipping_scale=2.0) - scheduler = Eden(optim, lr_batches=200, lr_epochs=5, verbose=False) - - start = timeit.default_timer() - avg_loss = 0.0 - for epoch in range(180): - scheduler.step_epoch() - # if epoch == 100 and iter in [2,3]: - # optim.reset_speedup() # check it doesn't crash. - - # if epoch == 130: - # opts = diagnostics.TensorDiagnosticOptions( - # 512 - # ) # allow 4 megabytes per sub-module - # diagnostic = diagnostics.attach_diagnostics(m, opts) - - for n, (x, y) in enumerate(train_pairs): - y_out = m(x) - loss = ((y_out - y) ** 2).mean() * 100.0 - if epoch == 0 and n == 0: - avg_loss = loss.item() - else: - avg_loss = 0.98 * avg_loss + 0.02 * loss.item() - if n == 0 and epoch % 5 == 0: - # norm1 = '%.2e' % (m[0].weight**2).mean().sqrt().item() - # norm1b = '%.2e' % (m[0].bias**2).mean().sqrt().item() - # norm2 = '%.2e' % (m[2].weight**2).mean().sqrt().item() - # norm2b = '%.2e' % (m[2].bias**2).mean().sqrt().item() - # scale1 = '%.2e' % (m[0].weight_scale.exp().item()) - # scale1b = '%.2e' % (m[0].bias_scale.exp().item()) - # scale2 = '%.2e' % (m[2].weight_scale.exp().item()) - # scale2b = '%.2e' % (m[2].bias_scale.exp().item()) - lr = scheduler.get_last_lr()[0] - logging.info( - f"Iter {iter}, epoch {epoch}, batch {n}, avg_loss {avg_loss:.4g}, lr={lr:.4e}" - ) # , norms={norm1,norm1b,norm2,norm2b}") # scales={scale1,scale1b,scale2,scale2b} - loss.log().backward() - optim.step() - optim.zero_grad() - scheduler.step_batch() - - # diagnostic.print_diagnostics() - - stop = timeit.default_timer() - logging.info(f"Iter={iter}, Time taken: {stop - start}") - - logging.info(f"last lr = {scheduler.get_last_lr()}") - # logging.info("state dict = ", scheduler.state_dict()) - # logging.info("optim state_dict = ", optim.state_dict()) - logging.info(f"input_magnitudes = {input_magnitudes}") - logging.info(f"output_magnitudes = {output_magnitudes}") - - -if __name__ == "__main__": - torch.set_num_threads(1) - torch.set_num_interop_threads(1) - logging.getLogger().setLevel(logging.INFO) - import subprocess - - s = subprocess.check_output( - "git status -uno .; git log -1; git diff HEAD .", shell=True - ) - logging.info(s) - import sys - - if len(sys.argv) > 1: - hidden_dim = int(sys.argv[1]) - else: - hidden_dim = 200 - - _test_scaled_adam(hidden_dim) - _test_eden() diff --git a/egs/librispeech/SSL/hubert/optim.py b/egs/librispeech/SSL/hubert/optim.py new file mode 120000 index 0000000000..56b827b8ae --- /dev/null +++ b/egs/librispeech/SSL/hubert/optim.py @@ -0,0 +1 @@ +../../ASR/zipformer/optim.py \ No newline at end of file diff --git a/egs/librispeech/SSL/hubert/scaling.py b/egs/librispeech/SSL/hubert/scaling.py deleted file mode 100644 index 29ac33c02b..0000000000 --- a/egs/librispeech/SSL/hubert/scaling.py +++ /dev/null @@ -1,1908 +0,0 @@ -# 
Copyright 2022-2023 Xiaomi Corp. (authors: Daniel Povey) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import logging -import math -import random -from typing import Optional, Tuple, Union - -import k2 -import torch -import torch.nn as nn -from torch import Tensor -from torch.cuda.amp import custom_bwd, custom_fwd - - -def logaddexp_onnx(x: Tensor, y: Tensor) -> Tensor: - max_value = torch.max(x, y) - diff = torch.abs(x - y) - return max_value + torch.log1p(torch.exp(-diff)) - - -# RuntimeError: Exporting the operator logaddexp to ONNX opset version -# 14 is not supported. Please feel free to request support or submit -# a pull request on PyTorch GitHub. -# -# The following function is to solve the above error when exporting -# models to ONNX via torch.jit.trace() -def logaddexp(x: Tensor, y: Tensor) -> Tensor: - # Caution(fangjun): Put torch.jit.is_scripting() before - # torch.onnx.is_in_onnx_export(); - # otherwise, it will cause errors for torch.jit.script(). - # - # torch.logaddexp() works for both torch.jit.script() and - # torch.jit.trace() but it causes errors for ONNX export. - # - if torch.jit.is_scripting(): - # Note: We cannot use torch.jit.is_tracing() here as it also - # matches torch.onnx.export(). - return torch.logaddexp(x, y) - elif torch.onnx.is_in_onnx_export(): - return logaddexp_onnx(x, y) - else: - # for torch.jit.trace() - return torch.logaddexp(x, y) - - -class PiecewiseLinear(object): - """ - Piecewise linear function, from float to float, specified as nonempty list of (x,y) pairs with - the x values in order. x values <[initial x] or >[final x] are map to [initial y], [final y] - respectively. - """ - - def __init__(self, *args): - assert len(args) >= 1, len(args) - if len(args) == 1 and isinstance(args[0], PiecewiseLinear): - self.pairs = list(args[0].pairs) - else: - self.pairs = [(float(x), float(y)) for x, y in args] - for x, y in self.pairs: - assert isinstance(x, (float, int)), type(x) - assert isinstance(y, (float, int)), type(y) - - for i in range(len(self.pairs) - 1): - assert self.pairs[i + 1][0] > self.pairs[i][0], ( - i, - self.pairs[i], - self.pairs[i + 1], - ) - - def __str__(self): - # e.g. 
'PiecewiseLinear((0., 10.), (100., 0.))' - return f"PiecewiseLinear({str(self.pairs)[1:-1]})" - - def __call__(self, x): - if x <= self.pairs[0][0]: - return self.pairs[0][1] - elif x >= self.pairs[-1][0]: - return self.pairs[-1][1] - else: - cur_x, cur_y = self.pairs[0] - for i in range(1, len(self.pairs)): - next_x, next_y = self.pairs[i] - if x >= cur_x and x <= next_x: - return cur_y + (next_y - cur_y) * (x - cur_x) / (next_x - cur_x) - cur_x, cur_y = next_x, next_y - assert False - - def __mul__(self, alpha): - return PiecewiseLinear(*[(x, y * alpha) for x, y in self.pairs]) - - def __add__(self, x): - if isinstance(x, (float, int)): - return PiecewiseLinear(*[(p[0], p[1] + x) for p in self.pairs]) - s, x = self.get_common_basis(x) - return PiecewiseLinear( - *[(sp[0], sp[1] + xp[1]) for sp, xp in zip(s.pairs, x.pairs)] - ) - - def max(self, x): - if isinstance(x, (float, int)): - x = PiecewiseLinear((0, x)) - s, x = self.get_common_basis(x, include_crossings=True) - return PiecewiseLinear( - *[(sp[0], max(sp[1], xp[1])) for sp, xp in zip(s.pairs, x.pairs)] - ) - - def min(self, x): - if isinstance(x, float) or isinstance(x, int): - x = PiecewiseLinear((0, x)) - s, x = self.get_common_basis(x, include_crossings=True) - return PiecewiseLinear( - *[(sp[0], min(sp[1], xp[1])) for sp, xp in zip(s.pairs, x.pairs)] - ) - - def __eq__(self, other): - return self.pairs == other.pairs - - def get_common_basis(self, p: "PiecewiseLinear", include_crossings: bool = False): - """ - Returns (self_mod, p_mod) which are equivalent piecewise linear - functions to self and p, but with the same x values. - - p: the other piecewise linear function - include_crossings: if true, include in the x values positions - where the functions indicate by this and p crosss. - """ - assert isinstance(p, PiecewiseLinear), type(p) - - # get sorted x-values without repetition. - x_vals = sorted(set([x for x, _ in self.pairs] + [x for x, _ in p.pairs])) - y_vals1 = [self(x) for x in x_vals] - y_vals2 = [p(x) for x in x_vals] - - if include_crossings: - extra_x_vals = [] - for i in range(len(x_vals) - 1): - if (y_vals1[i] > y_vals2[i]) != (y_vals1[i + 1] > y_vals2[i + 1]): - # if the two lines in this subsegment potentially cross each other.. - diff_cur = abs(y_vals1[i] - y_vals2[i]) - diff_next = abs(y_vals1[i + 1] - y_vals2[i + 1]) - # `pos`, between 0 and 1, gives the relative x position, - # with 0 being x_vals[i] and 1 being x_vals[i+1]. - pos = diff_cur / (diff_cur + diff_next) - extra_x_val = x_vals[i] + pos * (x_vals[i + 1] - x_vals[i]) - extra_x_vals.append(extra_x_val) - if len(extra_x_vals) > 0: - x_vals = sorted(set(x_vals + extra_x_vals)) - y_vals1 = [self(x) for x in x_vals] - y_vals2 = [p(x) for x in x_vals] - return ( - PiecewiseLinear(*zip(x_vals, y_vals1)), - PiecewiseLinear(*zip(x_vals, y_vals2)), - ) - - -class ScheduledFloat(torch.nn.Module): - """ - This object is a torch.nn.Module only because we want it to show up in [top_level module].modules(); - it does not have a working forward() function. You are supposed to cast it to float, as - in, float(parent_module.whatever), and use it as something like a dropout prob. - - It is a floating point value whose value changes depending on the batch count of the - training loop. It is a piecewise linear function where you specify the (x,y) pairs - in sorted order on x; x corresponds to the batch index. For batch-index values before the - first x or after the last x, we just use the first or last y value. 
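# ---------------------------------------------------------------------------
# A small usage sketch (not from the original file) of the PiecewiseLinear
# schedule defined above, assuming the `scaling` module in this directory is
# importable; the values follow from the interpolation rule in `__call__`.
from scaling import PiecewiseLinear

p = PiecewiseLinear((0, 10.0), (1000, 2.0))
print(p(-5))     # 10.0 -- clamped to the first y value
print(p(500))    # 6.0  -- linear interpolation between the two breakpoints
print(p(2000))   # 2.0  -- clamped to the last y value

q = p.max(4.0)   # pointwise maximum with a constant, still piecewise linear
print(q(900))    # 4.0  -- p(900) == 2.8, so the constant wins here
# ---------------------------------------------------------------------------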
- - Example: - self.dropout = ScheduledFloat((0.0, 0.2), (4000.0, 0.0), default=0.0) - - `default` is used when self.batch_count is not set or not in training mode or in - torch.jit scripting mode. - """ - - def __init__(self, *args, default: float = 0.0): - super().__init__() - # self.batch_count and self.name will be written to in the training loop. - self.batch_count = None - self.name = None - self.default = default - self.schedule = PiecewiseLinear(*args) - - def extra_repr(self) -> str: - return ( - f"batch_count={self.batch_count}, schedule={str(self.schedule.pairs[1:-1])}" - ) - - def __float__(self): - batch_count = self.batch_count - if ( - batch_count is None - or not self.training - or torch.jit.is_scripting() - or torch.jit.is_tracing() - ): - return float(self.default) - else: - ans = self.schedule(self.batch_count) - if random.random() < 0.0002: - logging.info( - f"ScheduledFloat: name={self.name}, batch_count={self.batch_count}, ans={ans}" - ) - return ans - - def __add__(self, x): - if isinstance(x, float) or isinstance(x, int): - return ScheduledFloat(self.schedule + x, default=self.default) - else: - return ScheduledFloat( - self.schedule + x.schedule, default=self.default + x.default - ) - - def max(self, x): - if isinstance(x, float) or isinstance(x, int): - return ScheduledFloat(self.schedule.max(x), default=self.default) - else: - return ScheduledFloat( - self.schedule.max(x.schedule), default=max(self.default, x.default) - ) - - -FloatLike = Union[float, ScheduledFloat] - - -def random_cast_to_half(x: Tensor, min_abs: float = 5.0e-06) -> Tensor: - """ - A randomized way of casting a floating point value to half precision. - """ - if x.dtype == torch.float16: - return x - x_abs = x.abs() - is_too_small = x_abs < min_abs - # for elements where is_too_small is true, random_val will contain +-min_abs with - # probability (x.abs() / min_abs), and 0.0 otherwise. [so this preserves expectations, - # for those elements]. - random_val = min_abs * x.sign() * (torch.rand_like(x) * min_abs < x_abs) - return torch.where(is_too_small, random_val, x).to(torch.float16) - - -class CutoffEstimator: - """ - Estimates cutoffs of an arbitrary numerical quantity such that a specified - proportion of items will be above the cutoff on average. - - p is the proportion of items that should be above the cutoff. - """ - - def __init__(self, p: float): - self.p = p - # total count of items - self.count = 0 - # total count of items that were above the cutoff - self.count_above = 0 - # initial cutoff value - self.cutoff = 0 - - def __call__(self, x: float) -> bool: - """ - Returns true if x is above the cutoff. - """ - ans = x > self.cutoff - self.count += 1 - if ans: - self.count_above += 1 - cur_p = self.count_above / self.count - delta_p = cur_p - self.p - if (delta_p > 0) == ans: - q = abs(delta_p) - self.cutoff = x * q + self.cutoff * (1 - q) - return ans - - -class SoftmaxFunction(torch.autograd.Function): - """ - Tries to handle half-precision derivatives in a randomized way that should - be more accurate for training than the default behavior. - """ - - @staticmethod - def forward(ctx, x: Tensor, dim: int): - ans = x.softmax(dim=dim) - # if x dtype is float16, x.softmax() returns a float32 because - # (presumably) that op does not support float16, and autocast - # is enabled. 
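# ---------------------------------------------------------------------------
# A small sketch (not from the original file) of how a ScheduledFloat is read
# back: `batch_count` is normally written by the training loop, so it is set
# by hand here purely to show the value that float() would produce.
from scaling import ScheduledFloat

dropout = ScheduledFloat((0.0, 0.2), (4000.0, 0.0), default=0.0)
print(float(dropout))       # 0.0 -- batch_count is unset, so the default is used

dropout.batch_count = 2000  # in training this is assigned on every such module
print(float(dropout))       # 0.1 -- interpolated between 0.2 (batch 0) and 0.0 (batch 4000)
# ---------------------------------------------------------------------------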
- if torch.is_autocast_enabled(): - ans = ans.to(torch.float16) - ctx.save_for_backward(ans) - ctx.x_dtype = x.dtype - ctx.dim = dim - return ans - - @staticmethod - def backward(ctx, ans_grad: Tensor): - (ans,) = ctx.saved_tensors - with torch.cuda.amp.autocast(enabled=False): - ans_grad = ans_grad.to(torch.float32) - ans = ans.to(torch.float32) - x_grad = ans_grad * ans - x_grad = x_grad - ans * x_grad.sum(dim=ctx.dim, keepdim=True) - return x_grad, None - - -def softmax(x: Tensor, dim: int): - if not x.requires_grad or torch.jit.is_scripting() or torch.jit.is_tracing(): - return x.softmax(dim=dim) - - return SoftmaxFunction.apply(x, dim) - - -class MaxEigLimiterFunction(torch.autograd.Function): - @staticmethod - def forward( - ctx, - x: Tensor, - coeffs: Tensor, - direction: Tensor, - channel_dim: int, - grad_scale: float, - ) -> Tensor: - ctx.channel_dim = channel_dim - ctx.grad_scale = grad_scale - ctx.save_for_backward(x.detach(), coeffs.detach(), direction.detach()) - return x - - @staticmethod - def backward(ctx, x_grad, *args): - with torch.enable_grad(): - (x_orig, coeffs, new_direction) = ctx.saved_tensors - x_orig.requires_grad = True - num_channels = x_orig.shape[ctx.channel_dim] - x = x_orig.transpose(ctx.channel_dim, -1).reshape(-1, num_channels) - new_direction.requires_grad = False - x = x - x.mean(dim=0) - x_var = (x**2).mean() - x_residual = x - coeffs * new_direction - x_residual_var = (x_residual**2).mean() - # `variance_proportion` is the proportion of the variance accounted for - # by the top eigen-direction. This is to be minimized. - variance_proportion = (x_var - x_residual_var) / (x_var + 1.0e-20) - variance_proportion.backward() - x_orig_grad = x_orig.grad - x_extra_grad = ( - x_orig.grad - * ctx.grad_scale - * x_grad.norm() - / (x_orig_grad.norm() + 1.0e-20) - ) - return x_grad + x_extra_grad.detach(), None, None, None, None - - -class BiasNormFunction(torch.autograd.Function): - # This computes: - # scales = (torch.mean((x - bias) ** 2, keepdim=True)) ** -0.5 * log_scale.exp() - # return x * scales - # (after unsqueezing the bias), but it does it in a memory-efficient way so that - # it can just store the returned value (chances are, this will also be needed for - # some other reason, related to the next operation, so we can save memory). - @staticmethod - def forward( - ctx, - x: Tensor, - bias: Tensor, - log_scale: Tensor, - channel_dim: int, - store_output_for_backprop: bool, - ) -> Tensor: - assert bias.ndim == 1 - if channel_dim < 0: - channel_dim = channel_dim + x.ndim - ctx.store_output_for_backprop = store_output_for_backprop - ctx.channel_dim = channel_dim - for _ in range(channel_dim + 1, x.ndim): - bias = bias.unsqueeze(-1) - scales = ( - torch.mean((x - bias) ** 2, dim=channel_dim, keepdim=True) ** -0.5 - ) * log_scale.exp() - ans = x * scales - ctx.save_for_backward( - ans.detach() if store_output_for_backprop else x, - scales.detach(), - bias.detach(), - log_scale.detach(), - ) - return ans - - @staticmethod - def backward(ctx, ans_grad: Tensor) -> Tensor: - ans_or_x, scales, bias, log_scale = ctx.saved_tensors - if ctx.store_output_for_backprop: - x = ans_or_x / scales - else: - x = ans_or_x - x = x.detach() - x.requires_grad = True - bias.requires_grad = True - log_scale.requires_grad = True - with torch.enable_grad(): - # recompute scales from x, bias and log_scale. 
- scales = ( - torch.mean((x - bias) ** 2, dim=ctx.channel_dim, keepdim=True) ** -0.5 - ) * log_scale.exp() - ans = x * scales - ans.backward(gradient=ans_grad) - return x.grad, bias.grad.flatten(), log_scale.grad, None, None - - -class BiasNorm(torch.nn.Module): - """ - This is intended to be a simpler, and hopefully cheaper, replacement for - LayerNorm. The observation this is based on, is that Transformer-type - networks, especially with pre-norm, sometimes seem to set one of the - feature dimensions to a large constant value (e.g. 50), which "defeats" - the LayerNorm because the output magnitude is then not strongly dependent - on the other (useful) features. Presumably the weight and bias of the - LayerNorm are required to allow it to do this. - - Instead, we give the BiasNorm a trainable bias that it can use when - computing the scale for normalization. We also give it a (scalar) - trainable scale on the output. - - - Args: - num_channels: the number of channels, e.g. 512. - channel_dim: the axis/dimension corresponding to the channel, - interpreted as an offset from the input's ndim if negative. - This is NOT the num_channels; it should typically be one of - {-2, -1, 0, 1, 2, 3}. - log_scale: the initial log-scale that we multiply the output by; this - is learnable. - log_scale_min: FloatLike, minimum allowed value of log_scale - log_scale_max: FloatLike, maximum allowed value of log_scale - store_output_for_backprop: only possibly affects memory use; recommend - to set to True if you think the output of this module is more likely - than the input of this module to be required to be stored for the - backprop. - """ - - def __init__( - self, - num_channels: int, - channel_dim: int = -1, # CAUTION: see documentation. - log_scale: float = 1.0, - log_scale_min: float = -1.5, - log_scale_max: float = 1.5, - store_output_for_backprop: bool = False, - ) -> None: - super(BiasNorm, self).__init__() - self.num_channels = num_channels - self.channel_dim = channel_dim - self.log_scale = nn.Parameter(torch.tensor(log_scale)) - self.bias = nn.Parameter(torch.zeros(num_channels)) - - self.log_scale_min = log_scale_min - self.log_scale_max = log_scale_max - - self.store_output_for_backprop = store_output_for_backprop - - def forward(self, x: Tensor) -> Tensor: - assert x.shape[self.channel_dim] == self.num_channels - - if torch.jit.is_scripting() or torch.jit.is_tracing(): - channel_dim = self.channel_dim - if channel_dim < 0: - channel_dim += x.ndim - bias = self.bias - for _ in range(channel_dim + 1, x.ndim): - bias = bias.unsqueeze(-1) - scales = ( - torch.mean((x - bias) ** 2, dim=channel_dim, keepdim=True) ** -0.5 - ) * self.log_scale.exp() - return x * scales - - log_scale = limit_param_value( - self.log_scale, - min=float(self.log_scale_min), - max=float(self.log_scale_max), - training=self.training, - ) - - return BiasNormFunction.apply( - x, self.bias, log_scale, self.channel_dim, self.store_output_for_backprop - ) - - -def ScaledLinear(*args, initial_scale: float = 1.0, **kwargs) -> nn.Linear: - """ - Behaves like a constructor of a modified version of nn.Linear - that gives an easy way to set the default initial parameter scale. - - Args: - Accepts the standard args and kwargs that nn.Linear accepts - e.g. in_features, out_features, bias=False. - - initial_scale: you can override this if you want to increase - or decrease the initial magnitude of the module's output - (affects the initialization of weight_scale and bias_scale). 
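# ---------------------------------------------------------------------------
# A shape-level sketch (not from the original file) of BiasNorm used as the
# LayerNorm replacement described above, assuming the `scaling` module in this
# directory is importable; with the default channel_dim=-1 it normalizes over
# the last dimension and preserves the input shape.
import torch
from scaling import BiasNorm

norm = BiasNorm(num_channels=512)   # channel_dim defaults to -1
x = torch.randn(8, 100, 512)        # (batch, time, channels)
y = norm(x)
assert y.shape == x.shape
# ---------------------------------------------------------------------------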
- Another option, if you want to do something like this, is - to re-initialize the parameters. - """ - ans = nn.Linear(*args, **kwargs) - with torch.no_grad(): - ans.weight[:] *= initial_scale - if ans.bias is not None: - torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, 0.1 * initial_scale) - return ans - - -def ScaledConv1d(*args, initial_scale: float = 1.0, **kwargs) -> nn.Conv1d: - """ - Behaves like a constructor of a modified version of nn.Conv1d - that gives an easy way to set the default initial parameter scale. - - Args: - Accepts the standard args and kwargs that nn.Linear accepts - e.g. in_features, out_features, bias=False. - - initial_scale: you can override this if you want to increase - or decrease the initial magnitude of the module's output - (affects the initialization of weight_scale and bias_scale). - Another option, if you want to do something like this, is - to re-initialize the parameters. - """ - ans = nn.Conv1d(*args, **kwargs) - with torch.no_grad(): - ans.weight[:] *= initial_scale - if ans.bias is not None: - torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, 0.1 * initial_scale) - return ans - - -def ScaledConv2d(*args, initial_scale: float = 1.0, **kwargs) -> nn.Conv2d: - """ - Behaves like a constructor of a modified version of nn.Conv2d - that gives an easy way to set the default initial parameter scale. - - Args: - Accepts the standard args and kwargs that nn.Linear accepts - e.g. in_features, out_features, bias=False, but: - NO PADDING-RELATED ARGS. - - initial_scale: you can override this if you want to increase - or decrease the initial magnitude of the module's output - (affects the initialization of weight_scale and bias_scale). - Another option, if you want to do something like this, is - to re-initialize the parameters. - """ - ans = nn.Conv2d(*args, **kwargs) - with torch.no_grad(): - ans.weight[:] *= initial_scale - if ans.bias is not None: - torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, 0.1 * initial_scale) - return ans - - -class ChunkCausalDepthwiseConv1d(torch.nn.Module): - """ - Behaves like a depthwise 1d convolution, except that it is causal in - a chunkwise way, as if we had a block-triangular attention mask. - The chunk size is provided at test time (it should probably be - kept in sync with the attention mask). - - This has a little more than twice the parameters of a conventional - depthwise conv1d module: we implement it by having one - depthwise convolution, of half the width, that is causal (via - right-padding); and one depthwise convolution that is applied only - within chunks, that we multiply by a scaling factor which depends - on the position within the chunk. - - Args: - Accepts the standard args and kwargs that nn.Linear accepts - e.g. in_features, out_features, bias=False. - - initial_scale: you can override this if you want to increase - or decrease the initial magnitude of the module's output - (affects the initialization of weight_scale and bias_scale). - Another option, if you want to do something like this, is - to re-initialize the parameters. - """ - - def __init__( - self, - channels: int, - kernel_size: int, - initial_scale: float = 1.0, - bias: bool = True, - ): - super().__init__() - assert kernel_size % 2 == 1 - - half_kernel_size = (kernel_size + 1) // 2 - # will pad manually, on one side. 
- self.causal_conv = nn.Conv1d( - in_channels=channels, - out_channels=channels, - groups=channels, - kernel_size=half_kernel_size, - padding=0, - bias=True, - ) - - self.chunkwise_conv = nn.Conv1d( - in_channels=channels, - out_channels=channels, - groups=channels, - kernel_size=kernel_size, - padding=kernel_size // 2, - bias=bias, - ) - - # first row is correction factors added to the scale near the left edge of the chunk, - # second row is correction factors added to the scale near the right edge of the chunk, - # both of these are added to a default scale of 1.0. - self.chunkwise_conv_scale = nn.Parameter(torch.zeros(2, channels, kernel_size)) - self.kernel_size = kernel_size - - with torch.no_grad(): - self.causal_conv.weight[:] *= initial_scale - self.chunkwise_conv.weight[:] *= initial_scale - if bias: - torch.nn.init.uniform_( - self.causal_conv.bias, -0.1 * initial_scale, 0.1 * initial_scale - ) - - def forward(self, x: Tensor, chunk_size: int = -1) -> Tensor: - """ - Forward function. Args: - x: a Tensor of shape (batch_size, channels, seq_len) - chunk_size: the chunk size, in frames; does not have to divide seq_len exactly. - """ - (batch_size, num_channels, seq_len) = x.shape - - # half_kernel_size = self.kernel_size + 1 // 2 - # left_pad is half_kernel_size - 1 where half_kernel_size is the size used - # in the causal conv. It's the amount by which we must pad on the left, - # to make the convolution causal. - left_pad = self.kernel_size // 2 - - if chunk_size < 0 or chunk_size > seq_len: - chunk_size = seq_len - right_pad = -seq_len % chunk_size - - x = torch.nn.functional.pad(x, (left_pad, right_pad)) - - x_causal = self.causal_conv(x[..., : left_pad + seq_len]) - assert x_causal.shape == (batch_size, num_channels, seq_len) - - x_chunk = x[..., left_pad:] - num_chunks = x_chunk.shape[2] // chunk_size - x_chunk = x_chunk.reshape(batch_size, num_channels, num_chunks, chunk_size) - x_chunk = x_chunk.permute(0, 2, 1, 3).reshape( - batch_size * num_chunks, num_channels, chunk_size - ) - x_chunk = self.chunkwise_conv(x_chunk) # does not change shape - - chunk_scale = self._get_chunk_scale(chunk_size) - - x_chunk = x_chunk * chunk_scale - x_chunk = x_chunk.reshape( - batch_size, num_chunks, num_channels, chunk_size - ).permute(0, 2, 1, 3) - x_chunk = x_chunk.reshape(batch_size, num_channels, num_chunks * chunk_size)[ - ..., :seq_len - ] - - return x_chunk + x_causal - - def _get_chunk_scale(self, chunk_size: int): - """Returns tensor of shape (num_channels, chunk_size) that will be used to - scale the output of self.chunkwise_conv.""" - left_edge = self.chunkwise_conv_scale[0] - right_edge = self.chunkwise_conv_scale[1] - if chunk_size < self.kernel_size: - left_edge = left_edge[:, :chunk_size] - right_edge = right_edge[:, -chunk_size:] - else: - t = chunk_size - self.kernel_size - channels = left_edge.shape[0] - pad = torch.zeros( - channels, t, device=left_edge.device, dtype=left_edge.dtype - ) - left_edge = torch.cat((left_edge, pad), dim=-1) - right_edge = torch.cat((pad, right_edge), dim=-1) - return 1.0 + (left_edge + right_edge) - - def streaming_forward( - self, - x: Tensor, - cache: Tensor, - ) -> Tuple[Tensor, Tensor]: - """Streaming Forward function. - - Args: - x: a Tensor of shape (batch_size, channels, seq_len) - cache: cached left context of shape (batch_size, channels, left_pad) - """ - (batch_size, num_channels, seq_len) = x.shape - - # left_pad is half_kernel_size - 1 where half_kernel_size is the size used - # in the causal conv. 
It's the amount by which we must pad on the left, - # to make the convolution causal. - left_pad = self.kernel_size // 2 - - # Pad cache - assert cache.shape[-1] == left_pad, (cache.shape[-1], left_pad) - x = torch.cat([cache, x], dim=2) - # Update cache - cache = x[..., -left_pad:] - - x_causal = self.causal_conv(x) - assert x_causal.shape == (batch_size, num_channels, seq_len) - - x_chunk = x[..., left_pad:] - x_chunk = self.chunkwise_conv(x_chunk) # does not change shape - - chunk_scale = self._get_chunk_scale(chunk_size=seq_len) - x_chunk = x_chunk * chunk_scale - - return x_chunk + x_causal, cache - - -class BalancerFunction(torch.autograd.Function): - @staticmethod - def forward( - ctx, - x: Tensor, - min_mean: float, - max_mean: float, - min_rms: float, - max_rms: float, - grad_scale: float, - channel_dim: int, - ) -> Tensor: - if channel_dim < 0: - channel_dim += x.ndim - ctx.channel_dim = channel_dim - ctx.save_for_backward(x) - ctx.config = (min_mean, max_mean, min_rms, max_rms, grad_scale, channel_dim) - return x - - @staticmethod - def backward(ctx, x_grad: Tensor) -> Tuple[Tensor, None, None, None, None, None]: - (x,) = ctx.saved_tensors - (min_mean, max_mean, min_rms, max_rms, grad_scale, channel_dim) = ctx.config - - try: - with torch.enable_grad(): - with torch.cuda.amp.autocast(enabled=False): - x = x.to(torch.float32) - x = x.detach() - x.requires_grad = True - mean_dims = [i for i in range(x.ndim) if i != channel_dim] - uncentered_var = (x**2).mean(dim=mean_dims, keepdim=True) - mean = x.mean(dim=mean_dims, keepdim=True) - stddev = (uncentered_var - (mean * mean)).clamp(min=1.0e-20).sqrt() - rms = uncentered_var.clamp(min=1.0e-20).sqrt() - - m = mean / stddev - # part of loss that relates to mean / stddev - m_loss = (m - m.clamp(min=min_mean, max=max_mean)).abs() - - # put a much larger scale on the RMS-max-limit loss, so that if both it and the - # m_loss are violated we fix the RMS loss first. - rms_clamped = rms.clamp(min=min_rms, max=max_rms) - r_loss = (rms_clamped / rms).log().abs() - - loss = m_loss + r_loss - - loss.backward(gradient=torch.ones_like(loss)) - loss_grad = x.grad - loss_grad_rms = ( - (loss_grad**2) - .mean(dim=mean_dims, keepdim=True) - .sqrt() - .clamp(min=1.0e-20) - ) - - loss_grad = loss_grad * (grad_scale / loss_grad_rms) - - x_grad_float = x_grad.to(torch.float32) - # scale each element of loss_grad by the absolute value of the corresponding - # element of x_grad, which we view as a noisy estimate of its magnitude for that - # (frame and dimension). later we can consider factored versions. - x_grad_mod = x_grad_float + (x_grad_float.abs() * loss_grad) - x_grad = x_grad_mod.to(x_grad.dtype) - except Exception as e: - logging.info( - f"Caught exception in Balancer backward: {e}, size={list(x_grad.shape)}, will continue." - ) - - return x_grad, None, None, None, None, None, None - - -class Balancer(torch.nn.Module): - """ - Modifies the backpropped derivatives of a function to try to encourage, for - each channel, that it is positive at least a proportion `threshold` of the - time. It does this by multiplying negative derivative values by up to - (1+max_factor), and positive derivative values by up to (1-max_factor), - interpolated from 1 at the threshold to those extremal values when none - of the inputs are positive. - - Args: - num_channels: the number of channels - channel_dim: the dimension/axis corresponding to the channel, e.g. - -1, 0, 1, 2; will be interpreted as an offset from x.ndim if negative. 
- min_positive: the minimum, per channel, of the proportion of the time - that (x > 0), below which we start to modify the derivatives. - max_positive: the maximum, per channel, of the proportion of the time - that (x > 0), above which we start to modify the derivatives. - scale_gain_factor: determines the 'gain' with which we increase the - change in gradient once the constraints on min_abs and max_abs - are violated. - min_abs: the minimum average-absolute-value difference from the mean - value per channel, which we allow, before we start to modify - the derivatives to prevent this. - max_abs: the maximum average-absolute-value difference from the mean - value per channel, which we allow, before we start to modify - the derivatives to prevent this. - prob: determines the minimum probability with which we modify the - gradients for the {min,max}_positive and {min,max}_abs constraints, - on each forward(). This is done randomly to prevent all layers - from doing it at the same time. - """ - - def __init__( - self, - num_channels: int, - channel_dim: int, - min_positive: FloatLike = 0.05, - max_positive: FloatLike = 0.95, - min_abs: FloatLike = 0.2, - max_abs: FloatLike = 100.0, - grad_scale: FloatLike = 0.04, - prob: Optional[FloatLike] = None, - ): - super().__init__() - - if prob is None: - prob = ScheduledFloat((0.0, 0.5), (8000.0, 0.125), default=0.4) - self.prob = prob - # 5% of the time we will return and do nothing because memory usage is - # too high. - self.mem_cutoff = CutoffEstimator(0.05) - - # actually self.num_channels is no longer needed except for an assertion. - self.num_channels = num_channels - self.channel_dim = channel_dim - self.min_positive = min_positive - self.max_positive = max_positive - self.min_abs = min_abs - self.max_abs = max_abs - self.grad_scale = grad_scale - - def forward(self, x: Tensor) -> Tensor: - if ( - torch.jit.is_scripting() - or not x.requires_grad - or (x.is_cuda and self.mem_cutoff(torch.cuda.memory_allocated())) - ): - return _no_op(x) - - prob = float(self.prob) - if random.random() < prob: - # The following inner-functions convert from the way we historically specified - # these limitations, as limits on the absolute value and the proportion of positive - # values, to limits on the RMS value and the (mean / stddev). - def _abs_to_rms(x): - # for normally distributed data, if the expected absolute value is x, the - # expected rms value will be sqrt(pi/2) * x. - return 1.25331413732 * x - - def _proportion_positive_to_mean(x): - def _atanh(x): - eps = 1.0e-10 - # eps is to prevent crashes if x is exactly 0 or 1. - # we'll just end up returning a fairly large value. - return (math.log(1 + x + eps) - math.log(1 - x + eps)) / 2.0 - - def _approx_inverse_erf(x): - # 1 / (sqrt(pi) * ln(2)), - # see https://math.stackexchange.com/questions/321569/approximating-the-error-function-erf-by-analytical-functions - # this approximation is extremely crude and gets progressively worse for - # x very close to -1 or +1, but we mostly care about the "middle" region - # e.g. _approx_inverse_erf(0.05) = 0.0407316414078772, - # and math.erf(0.0407316414078772) = 0.045935330944660666, - # which is pretty close to 0.05. 
- return 0.8139535143 * _atanh(x) - - # first convert x from the range 0..1 to the range -1..1 which the error - # function returns - x = -1 + (2 * x) - return _approx_inverse_erf(x) - - min_mean = _proportion_positive_to_mean(float(self.min_positive)) - max_mean = _proportion_positive_to_mean(float(self.max_positive)) - min_rms = _abs_to_rms(float(self.min_abs)) - max_rms = _abs_to_rms(float(self.max_abs)) - grad_scale = float(self.grad_scale) - - assert x.shape[self.channel_dim] == self.num_channels - - return BalancerFunction.apply( - x, min_mean, max_mean, min_rms, max_rms, grad_scale, self.channel_dim - ) - else: - return _no_op(x) - - -def penalize_abs_values_gt( - x: Tensor, limit: float, penalty: float, name: str = None -) -> Tensor: - """ - Returns x unmodified, but in backprop will put a penalty for the excess of - the absolute values of elements of x over the limit "limit". E.g. if - limit == 10.0, then if x has any values over 10 it will get a penalty. - - Caution: the value of this penalty will be affected by grad scaling used - in automatic mixed precision training. For this reasons we use this, - it shouldn't really matter, or may even be helpful; we just use this - to disallow really implausible values of scores to be given to softmax. - - The name is for randomly printed debug info. - """ - x_sign = x.sign() - over_limit = (x.abs() - limit) > 0 - # The following is a memory efficient way to penalize the absolute values of - # x that's over the limit. (The memory efficiency comes when you think - # about which items torch needs to cache for the autograd, and which ones it - # can throw away). The numerical value of aux_loss as computed here will - # actually be larger than it should be, by limit * over_limit.sum(), but it - # has the same derivative as the real aux_loss which is penalty * (x.abs() - - # limit).relu(). - aux_loss = penalty * ((x_sign * over_limit).to(torch.int8) * x) - # note: we don't do sum() here on aux)_loss, but it's as if we had done - # sum() due to how with_loss() works. - x = with_loss(x, aux_loss, name) - # you must use x for something, or this will be ineffective. - return x - - -def _diag(x: Tensor): # like .diag(), but works for tensors with 3 dims. - if x.ndim == 2: - return x.diag() - else: - (batch, dim, dim) = x.shape - x = x.reshape(batch, dim * dim) - x = x[:, :: dim + 1] - assert x.shape == (batch, dim) - return x - - -def _whitening_metric(x: Tensor, num_groups: int): - """ - Computes the "whitening metric", a value which will be 1.0 if all the eigenvalues of - of the centered feature covariance are the same within each group's covariance matrix - and also between groups. - Args: - x: a Tensor of shape (*, num_channels) - num_groups: the number of groups of channels, a number >=1 that divides num_channels - Returns: - Returns a scalar Tensor that will be 1.0 if the data is "perfectly white" and - greater than 1.0 otherwise. - """ - assert x.dtype != torch.float16 - x = x.reshape(-1, x.shape[-1]) - (num_frames, num_channels) = x.shape - assert num_channels % num_groups == 0 - channels_per_group = num_channels // num_groups - x = x.reshape(num_frames, num_groups, channels_per_group).transpose(0, 1) - # x now has shape (num_groups, num_frames, channels_per_group) - # subtract the mean so we use the centered, not uncentered, covariance. - # My experience has been that when we "mess with the gradients" like this, - # it's better not do anything that tries to move the mean around, because - # that can easily cause instability. 
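# ---------------------------------------------------------------------------
# A quick numerical sketch (not from the original file) of the behaviour the
# docstring above describes: roughly white data gives a metric near 1.0, while
# data dominated by a single direction gives a much larger value.
import torch
from scaling import _whitening_metric

white = torch.randn(1000, 64)
print(_whitening_metric(white, num_groups=1))    # close to 1.0

direction = torch.randn(64)
colored = torch.randn(1000, 64) + 10.0 * torch.randn(1000, 1) * direction
print(_whitening_metric(colored, num_groups=1))  # far above 1.0
# ---------------------------------------------------------------------------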
- x = x - x.mean(dim=1, keepdim=True) - # x_covar: (num_groups, channels_per_group, channels_per_group) - x_covar = torch.matmul(x.transpose(1, 2), x) - x_covar_mean_diag = _diag(x_covar).mean() - # the following expression is what we'd get if we took the matrix product - # of each covariance and measured the mean of its trace, i.e. - # the same as _diag(torch.matmul(x_covar, x_covar)).mean(). - x_covarsq_mean_diag = (x_covar**2).sum() / (num_groups * channels_per_group) - # this metric will be >= 1.0; the larger it is, the less 'white' the data was. - metric = x_covarsq_mean_diag / (x_covar_mean_diag**2 + 1.0e-20) - return metric - - -class WhiteningPenaltyFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, x: Tensor, module: nn.Module) -> Tensor: - ctx.save_for_backward(x) - ctx.module = module - return x - - @staticmethod - def backward(ctx, x_grad: Tensor): - (x_orig,) = ctx.saved_tensors - w = ctx.module - - try: - with torch.enable_grad(): - with torch.cuda.amp.autocast(enabled=False): - x_detached = x_orig.to(torch.float32).detach() - x_detached.requires_grad = True - - metric = _whitening_metric(x_detached, w.num_groups) - - if random.random() < 0.005 or __name__ == "__main__": - logging.info( - f"Whitening: name={w.name}, num_groups={w.num_groups}, num_channels={x_orig.shape[-1]}, " - f"metric={metric.item():.2f} vs. limit={float(w.whitening_limit)}" - ) - - if metric < float(w.whitening_limit): - w.prob = w.min_prob - return x_grad, None - else: - w.prob = w.max_prob - metric.backward() - penalty_grad = x_detached.grad - scale = w.grad_scale * ( - x_grad.to(torch.float32).norm() - / (penalty_grad.norm() + 1.0e-20) - ) - penalty_grad = penalty_grad * scale - return x_grad + penalty_grad.to(x_grad.dtype), None - except Exception as e: - logging.info( - f"Caught exception in Whiten backward: {e}, size={list(x_grad.shape)}, will continue." - ) - return x_grad, None - - -class Whiten(nn.Module): - def __init__( - self, - num_groups: int, - whitening_limit: FloatLike, - prob: Union[float, Tuple[float, float]], - grad_scale: FloatLike, - ): - """ - Args: - num_groups: the number of groups to divide the channel dim into before - whitening. We will attempt to make the feature covariance - within each group, after mean subtraction, as "white" as possible, - while having the same trace across all groups. - whitening_limit: a value greater than 1.0, that dictates how much - freedom we have to violate the constraints. 1.0 would mean perfectly - white, with exactly the same trace across groups; larger values - give more freedom. E.g. 2.0. - prob: the probability with which we apply the gradient modification - (also affects the grad scale). May be supplied as a float, - or as a pair (min_prob, max_prob) - - grad_scale: determines the scale on the gradient term from this object, - relative to the rest of the gradient on the attention weights. - E.g. 0.02 (you may want to use smaller values than this if prob is large) - """ - super(Whiten, self).__init__() - assert num_groups >= 1 - assert float(whitening_limit) >= 1 - assert grad_scale >= 0 - self.num_groups = num_groups - self.whitening_limit = whitening_limit - self.grad_scale = grad_scale - - if isinstance(prob, float): - prob = (prob, prob) - (self.min_prob, self.max_prob) = prob - assert 0 < self.min_prob <= self.max_prob <= 1 - self.prob = self.max_prob - self.name = None # will be set in training loop - - def forward(self, x: Tensor) -> Tensor: - """ - In the forward pass, this function just returns the input unmodified. 
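# ---------------------------------------------------------------------------
# A usage sketch (not from the original file) mirroring `_test_whiten` further
# down: Whiten is an identity in the forward pass and only adds a whitening
# penalty term to the gradient in backward when the metric exceeds the limit.
import torch
from scaling import Whiten

whiten = Whiten(num_groups=1, whitening_limit=5.0, prob=1.0, grad_scale=0.1)
x = torch.randn(100, 256, requires_grad=True)
y = whiten(x)                # forward: y has exactly the values of x
assert torch.equal(y, x)
(y ** 2).sum().backward()    # backward: x.grad may include the extra penalty term
# ---------------------------------------------------------------------------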
- In the backward pass, it will modify the gradients to ensure that the - distribution in each group has close to (lambda times I) as the covariance - after mean subtraction, with the same lambda across groups. - For whitening_limit > 1, there will be more freedom to violate this - constraint. - - Args: - x: the input of shape (*, num_channels) - - Returns: - x, unmodified. You should make sure - you use the returned value, or the graph will be freed - and nothing will happen in backprop. - """ - grad_scale = float(self.grad_scale) - if not x.requires_grad or random.random() > self.prob or grad_scale == 0: - return _no_op(x) - else: - return WhiteningPenaltyFunction.apply(x, self) - - -class WithLoss(torch.autograd.Function): - @staticmethod - def forward(ctx, x: Tensor, y: Tensor, name: str): - ctx.y_shape = y.shape - if random.random() < 0.002 and name is not None: - loss_sum = y.sum().item() - logging.info(f"WithLoss: name={name}, loss-sum={loss_sum:.3e}") - return x - - @staticmethod - def backward(ctx, ans_grad: Tensor): - return ( - ans_grad, - torch.ones(ctx.y_shape, dtype=ans_grad.dtype, device=ans_grad.device), - None, - ) - - -def with_loss(x, y, name): - # returns x but adds y.sum() to the loss function. - return WithLoss.apply(x, y, name) - - -class ScaleGradFunction(torch.autograd.Function): - @staticmethod - def forward(ctx, x: Tensor, alpha: float) -> Tensor: - ctx.alpha = alpha - return x - - @staticmethod - def backward(ctx, grad: Tensor): - return grad * ctx.alpha, None - - -def scale_grad(x: Tensor, alpha: float): - return ScaleGradFunction.apply(x, alpha) - - -class ScaleGrad(nn.Module): - def __init__(self, alpha: float): - super().__init__() - self.alpha = alpha - - def forward(self, x: Tensor) -> Tensor: - if torch.jit.is_scripting() or torch.jit.is_tracing() or not self.training: - return x - return scale_grad(x, self.alpha) - - -class LimitParamValue(torch.autograd.Function): - @staticmethod - def forward(ctx, x: Tensor, min: float, max: float): - ctx.save_for_backward(x) - assert max >= min - ctx.min = min - ctx.max = max - return x - - @staticmethod - def backward(ctx, x_grad: Tensor): - (x,) = ctx.saved_tensors - # where x < ctx.min, ensure all grads are negative (this will tend to make - # x more positive). - x_grad = x_grad * torch.where( - torch.logical_and(x_grad > 0, x < ctx.min), -1.0, 1.0 - ) - # where x > ctx.max, ensure all grads are positive (this will tend to make - # x more negative). - x_grad *= torch.where(torch.logical_and(x_grad < 0, x > ctx.max), -1.0, 1.0) - return x_grad, None, None - - -def limit_param_value( - x: Tensor, min: float, max: float, prob: float = 0.6, training: bool = True -): - # You apply this to (typically) an nn.Parameter during training to ensure that its - # (elements mostly) stays within a supplied range. This is done by modifying the - # gradients in backprop. - # It's not necessary to do this on every batch: do it only some of the time, - # to save a little time. 
- if training and random.random() < prob: - return LimitParamValue.apply(x, min, max) - else: - return x - - -def _no_op(x: Tensor) -> Tensor: - if torch.jit.is_scripting() or torch.jit.is_tracing(): - return x - else: - # a no-op function that will have a node in the autograd graph, - # to avoid certain bugs relating to backward hooks - return x.chunk(1, dim=-1)[0] - - -class Identity(torch.nn.Module): - def __init__(self): - super(Identity, self).__init__() - - def forward(self, x): - return _no_op(x) - - -class DoubleSwishFunction(torch.autograd.Function): - """ - double_swish(x) = x * torch.sigmoid(x-1) - - This is a definition, originally motivated by its close numerical - similarity to swish(swish(x)), where swish(x) = x * sigmoid(x). - - Memory-efficient derivative computation: - double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1) - double_swish'(x) = d/dx double_swish(x) = x * s'(x) + x' * s(x) = x * s'(x) + s(x). - Now, s'(x) = s(x) * (1-s(x)). - double_swish'(x) = x * s'(x) + s(x). - = x * s(x) * (1-s(x)) + s(x). - = double_swish(x) * (1-s(x)) + s(x) - ... so we just need to remember s(x) but not x itself. - """ - - @staticmethod - def forward(ctx, x: Tensor) -> Tensor: - requires_grad = x.requires_grad - if x.dtype == torch.float16: - x = x.to(torch.float32) - - s = torch.sigmoid(x - 1.0) - y = x * s - - if requires_grad: - deriv = y * (1 - s) + s - - # notes on derivative of x * sigmoid(x - 1): - # https://www.wolframalpha.com/input?i=d%2Fdx+%28x+*+sigmoid%28x-1%29%29 - # min \simeq -0.043638. Take floor as -0.044 so it's a lower bund - # max \simeq 1.1990. Take ceil to be 1.2 so it's an upper bound. - # the combination of "+ torch.rand_like(deriv)" and casting to torch.uint8 (which - # floors), should be expectation-preserving. - floor = -0.044 - ceil = 1.2 - d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like( - deriv - ) - if __name__ == "__main__": - # for self-testing only. - assert d_scaled.min() >= 0.0 - assert d_scaled.max() < 256.0 - d_int = d_scaled.to(torch.uint8) - ctx.save_for_backward(d_int) - if x.dtype == torch.float16 or torch.is_autocast_enabled(): - y = y.to(torch.float16) - return y - - @staticmethod - def backward(ctx, y_grad: Tensor) -> Tensor: - (d,) = ctx.saved_tensors - # the same constants as used in forward pass. - floor = -0.043637 - ceil = 1.2 - - d = d * ((ceil - floor) / 255.0) + floor - return y_grad * d - - -class DoubleSwish(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x: Tensor) -> Tensor: - """Return double-swish activation function which is an approximation to Swish(Swish(x)), - that we approximate closely with x * sigmoid(x-1). - """ - if torch.jit.is_scripting() or torch.jit.is_tracing(): - return x * torch.sigmoid(x - 1.0) - return DoubleSwishFunction.apply(x) - - -# Dropout2 is just like normal dropout, except it supports schedules on the dropout rates. -class Dropout2(nn.Module): - def __init__(self, p: FloatLike): - super().__init__() - self.p = p - - def forward(self, x: Tensor) -> Tensor: - return torch.nn.functional.dropout(x, p=float(self.p), training=self.training) - - -class MulForDropout3(torch.autograd.Function): - # returns (x * y * alpha) where alpha is a float and y doesn't require - # grad and is zero-or-one. 
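# ---------------------------------------------------------------------------
# A small check (not from the original file): the forward value of DoubleSwish
# is exactly x * sigmoid(x - 1); the uint8 trick above only quantizes the
# derivative that is cached for the backward pass.
import torch
from scaling import DoubleSwish

x = torch.randn(4, 16)
y = DoubleSwish()(x)
assert torch.allclose(y, x * torch.sigmoid(x - 1.0))
# ---------------------------------------------------------------------------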
- @staticmethod - @custom_fwd - def forward(ctx, x, y, alpha): - assert not y.requires_grad - ans = x * y * alpha - ctx.save_for_backward(ans) - ctx.alpha = alpha - return ans - - @staticmethod - @custom_bwd - def backward(ctx, ans_grad): - (ans,) = ctx.saved_tensors - x_grad = ctx.alpha * ans_grad * (ans != 0) - return x_grad, None, None - - -# Dropout3 is just like normal dropout, except it supports schedules on the dropout rates, -# and it lets you choose one dimension to share the dropout mask over -class Dropout3(nn.Module): - def __init__(self, p: FloatLike, shared_dim: int): - super().__init__() - self.p = p - self.shared_dim = shared_dim - - def forward(self, x: Tensor) -> Tensor: - p = float(self.p) - if not self.training or p == 0: - return _no_op(x) - scale = 1.0 / (1 - p) - rand_shape = list(x.shape) - rand_shape[self.shared_dim] = 1 - mask = torch.rand(*rand_shape, device=x.device) > p - ans = MulForDropout3.apply(x, mask, scale) - return ans - - -class SwooshLFunction(torch.autograd.Function): - """ - swoosh_l(x) = log(1 + exp(x-4)) - 0.08*x - 0.035 - """ - - @staticmethod - def forward(ctx, x: Tensor) -> Tensor: - requires_grad = x.requires_grad - if x.dtype == torch.float16: - x = x.to(torch.float32) - - zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) - - coeff = -0.08 - - with torch.cuda.amp.autocast(enabled=False): - with torch.enable_grad(): - x = x.detach() - x.requires_grad = True - y = torch.logaddexp(zero, x - 4.0) + coeff * x - 0.035 - - if not requires_grad: - return y - - y.backward(gradient=torch.ones_like(y)) - - grad = x.grad - floor = coeff - ceil = 1.0 + coeff + 0.005 - - d_scaled = (grad - floor) * (255.0 / (ceil - floor)) + torch.rand_like( - grad - ) - if __name__ == "__main__": - # for self-testing only. - assert d_scaled.min() >= 0.0 - assert d_scaled.max() < 256.0 - - d_int = d_scaled.to(torch.uint8) - ctx.save_for_backward(d_int) - if x.dtype == torch.float16 or torch.is_autocast_enabled(): - y = y.to(torch.float16) - return y - - @staticmethod - def backward(ctx, y_grad: Tensor) -> Tensor: - (d,) = ctx.saved_tensors - # the same constants as used in forward pass. - - coeff = -0.08 - floor = coeff - ceil = 1.0 + coeff + 0.005 - d = d * ((ceil - floor) / 255.0) + floor - return y_grad * d - - -class SwooshL(torch.nn.Module): - def forward(self, x: Tensor) -> Tensor: - """Return Swoosh-L activation.""" - if torch.jit.is_scripting() or torch.jit.is_tracing(): - zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) - return logaddexp(zero, x - 4.0) - 0.08 * x - 0.035 - if not x.requires_grad: - return k2.swoosh_l_forward(x) - else: - return k2.swoosh_l(x) - # return SwooshLFunction.apply(x) - - -class SwooshLOnnx(torch.nn.Module): - def forward(self, x: Tensor) -> Tensor: - """Return Swoosh-L activation.""" - zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) - return logaddexp_onnx(zero, x - 4.0) - 0.08 * x - 0.035 - - -class SwooshRFunction(torch.autograd.Function): - """ - swoosh_r(x) = log(1 + exp(x-1)) - 0.08*x - 0.313261687 - - derivatives are between -0.08 and 0.92. 
- """ - - @staticmethod - def forward(ctx, x: Tensor) -> Tensor: - requires_grad = x.requires_grad - - if x.dtype == torch.float16: - x = x.to(torch.float32) - - zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) - - with torch.cuda.amp.autocast(enabled=False): - with torch.enable_grad(): - x = x.detach() - x.requires_grad = True - y = torch.logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687 - - if not requires_grad: - return y - y.backward(gradient=torch.ones_like(y)) - - grad = x.grad - floor = -0.08 - ceil = 0.925 - - d_scaled = (grad - floor) * (255.0 / (ceil - floor)) + torch.rand_like( - grad - ) - if __name__ == "__main__": - # for self-testing only. - assert d_scaled.min() >= 0.0 - assert d_scaled.max() < 256.0 - - d_int = d_scaled.to(torch.uint8) - ctx.save_for_backward(d_int) - if x.dtype == torch.float16 or torch.is_autocast_enabled(): - y = y.to(torch.float16) - return y - - @staticmethod - def backward(ctx, y_grad: Tensor) -> Tensor: - (d,) = ctx.saved_tensors - # the same constants as used in forward pass. - floor = -0.08 - ceil = 0.925 - d = d * ((ceil - floor) / 255.0) + floor - return y_grad * d - - -class SwooshR(torch.nn.Module): - def forward(self, x: Tensor) -> Tensor: - """Return Swoosh-R activation.""" - if torch.jit.is_scripting() or torch.jit.is_tracing(): - zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) - return logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687 - if not x.requires_grad: - return k2.swoosh_r_forward(x) - else: - return k2.swoosh_r(x) - # return SwooshRFunction.apply(x) - - -class SwooshROnnx(torch.nn.Module): - def forward(self, x: Tensor) -> Tensor: - """Return Swoosh-R activation.""" - zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) - return logaddexp_onnx(zero, x - 1.0) - 0.08 * x - 0.313261687 - - -# simple version of SwooshL that does not redefine the backprop, used in -# ActivationDropoutAndLinearFunction. -def SwooshLForward(x: Tensor): - x_offset = x - 4.0 - log_sum = (1.0 + x_offset.exp()).log().to(x.dtype) - log_sum = torch.where(log_sum == float("inf"), x_offset, log_sum) - return log_sum - 0.08 * x - 0.035 - - -# simple version of SwooshR that does not redefine the backprop, used in -# ActivationDropoutAndLinearFunction. -def SwooshRForward(x: Tensor): - x_offset = x - 1.0 - log_sum = (1.0 + x_offset.exp()).log().to(x.dtype) - log_sum = torch.where(log_sum == float("inf"), x_offset, log_sum) - return log_sum - 0.08 * x - 0.313261687 - - -class ActivationDropoutAndLinearFunction(torch.autograd.Function): - @staticmethod - @custom_fwd - def forward( - ctx, - x: Tensor, - weight: Tensor, - bias: Optional[Tensor], - activation: str, - dropout_p: float, - dropout_shared_dim: Optional[int], - ): - if dropout_p != 0.0: - dropout_shape = list(x.shape) - if dropout_shared_dim is not None: - dropout_shape[dropout_shared_dim] = 1 - # else it won't be very memory efficient. - dropout_mask = (1.0 / (1.0 - dropout_p)) * ( - torch.rand(*dropout_shape, device=x.device, dtype=x.dtype) > dropout_p - ) - else: - dropout_mask = None - - ctx.save_for_backward(x, weight, bias, dropout_mask) - - ctx.activation = activation - - forward_activation_dict = { - "SwooshL": k2.swoosh_l_forward, - "SwooshR": k2.swoosh_r_forward, - } - # it will raise a KeyError if this fails. This will be an error. We let it - # propagate to the user. 
- activation_func = forward_activation_dict[activation] - x = activation_func(x) - if dropout_mask is not None: - x = x * dropout_mask - x = torch.nn.functional.linear(x, weight, bias) - return x - - @staticmethod - @custom_bwd - def backward(ctx, ans_grad: Tensor): - saved = ctx.saved_tensors - (x, weight, bias, dropout_mask) = saved - - forward_and_deriv_activation_dict = { - "SwooshL": k2.swoosh_l_forward_and_deriv, - "SwooshR": k2.swoosh_r_forward_and_deriv, - } - # the following lines a KeyError if the activation is unrecognized. - # This will be an error. We let it propagate to the user. - func = forward_and_deriv_activation_dict[ctx.activation] - - y, func_deriv = func(x) - if dropout_mask is not None: - y = y * dropout_mask - # now compute derivative of y w.r.t. weight and bias.. - # y: (..., in_channels), ans_grad: (..., out_channels), - (out_channels, in_channels) = weight.shape - - in_channels = y.shape[-1] - g = ans_grad.reshape(-1, out_channels) - weight_deriv = torch.matmul(g.t(), y.reshape(-1, in_channels)) - y_deriv = torch.matmul(ans_grad, weight) - bias_deriv = None if bias is None else g.sum(dim=0) - x_deriv = y_deriv * func_deriv - if dropout_mask is not None: - # order versus func_deriv does not matter - x_deriv = x_deriv * dropout_mask - - return x_deriv, weight_deriv, bias_deriv, None, None, None - - -class ActivationDropoutAndLinear(torch.nn.Module): - """ - This merges an activation function followed by dropout and then a nn.Linear module; - it does so in a memory efficient way so that it only stores the input to the whole - module. If activation == SwooshL and dropout_shared_dim != None, this will be - equivalent to: - nn.Sequential(SwooshL(), - Dropout3(dropout_p, shared_dim=dropout_shared_dim), - ScaledLinear(in_channels, out_channels, bias=bias, - initial_scale=initial_scale)) - If dropout_shared_dim is None, the dropout would be equivalent to - Dropout2(dropout_p). Note: Dropout3 will be more memory efficient as the dropout - mask is smaller. - - Args: - in_channels: number of input channels, e.g. 256 - out_channels: number of output channels, e.g. 256 - bias: if true, have a bias - activation: the activation function, for now just support SwooshL. - dropout_p: the dropout probability or schedule (happens after nonlinearity). - dropout_shared_dim: the dimension, if any, across which the dropout mask is - shared (e.g. the time dimension). If None, this may be less memory - efficient if there are modules before this one that cache the input - for their backprop (e.g. Balancer or Whiten). - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - bias: bool = True, - activation: str = "SwooshL", - dropout_p: FloatLike = 0.0, - dropout_shared_dim: Optional[int] = -1, - initial_scale: float = 1.0, - ): - super().__init__() - # create a temporary module of nn.Linear that we'll steal the - # weights and bias from - l = ScaledLinear( - in_channels, out_channels, bias=bias, initial_scale=initial_scale - ) - - self.weight = l.weight - # register_parameter properly handles making it a parameter when l.bias - # is None. I think there is some reason for doing it this way rather - # than just setting it to None but I don't know what it is, maybe - # something to do with exporting the module.. 
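# ---------------------------------------------------------------------------
# A condensed sketch (not from the original file) of the equivalence stated in
# the docstring above, following `_test_activation_dropout_and_linear` below:
# with dropout_p=0.0 and shared weights, the fused module matches the unfused
# Sequential (the self-test checks this with atol=0.02).
import torch
import torch.nn as nn
from scaling import ActivationDropoutAndLinear, ScaledLinear, SwooshL

unfused = nn.Sequential(SwooshL(), ScaledLinear(256, 192, initial_scale=0.5))
fused = ActivationDropoutAndLinear(
    256, 192, activation="SwooshL", dropout_p=0.0, initial_scale=0.5
)
with torch.no_grad():
    fused.weight[:] = unfused[1].weight
    fused.bias[:] = unfused[1].bias

x = torch.randn(10, 256)
assert torch.allclose(unfused(x), fused(x), atol=0.02)
# ---------------------------------------------------------------------------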
- self.register_parameter("bias", l.bias) - - self.activation = activation - self.dropout_p = dropout_p - self.dropout_shared_dim = dropout_shared_dim - - def forward(self, x: Tensor): - if torch.jit.is_scripting() or torch.jit.is_tracing(): - if self.activation == "SwooshL": - x = SwooshLForward(x) - elif self.activation == "SwooshR": - x = SwooshRForward(x) - else: - assert False, self.activation - return torch.nn.functional.linear(x, self.weight, self.bias) - - return ActivationDropoutAndLinearFunction.apply( - x, - self.weight, - self.bias, - self.activation, - float(self.dropout_p), - self.dropout_shared_dim, - ) - - -def convert_num_channels(x: Tensor, num_channels: int) -> Tensor: - if num_channels <= x.shape[-1]: - return x[..., :num_channels] - else: - shape = list(x.shape) - shape[-1] = num_channels - shape[-1] - zeros = torch.zeros(shape, dtype=x.dtype, device=x.device) - return torch.cat((x, zeros), dim=-1) - - -def _test_whiten(): - for proportion in [0.1, 0.5, 10.0]: - logging.info(f"_test_whiten(): proportion = {proportion}") - x = torch.randn(100, 128) - direction = torch.randn(128) - coeffs = torch.randn(100, 1) - x += proportion * direction * coeffs - - x.requires_grad = True - - m = Whiten( - 1, 5.0, prob=1.0, grad_scale=0.1 # num_groups # whitening_limit, - ) # grad_scale - - for _ in range(4): - y = m(x) - - y_grad = torch.randn_like(x) - y.backward(gradient=y_grad) - - if proportion < 0.2: - assert torch.allclose(x.grad, y_grad) - elif proportion > 1.0: - assert not torch.allclose(x.grad, y_grad) - - -def _test_balancer_sign(): - probs = torch.arange(0, 1, 0.01) - N = 1000 - x = 1.0 * ((2.0 * (torch.rand(probs.numel(), N) < probs.unsqueeze(-1))) - 1.0) - x = x.detach() - x.requires_grad = True - m = Balancer( - probs.numel(), - channel_dim=0, - min_positive=0.05, - max_positive=0.95, - min_abs=0.0, - prob=1.0, - ) - - y_grad = torch.sign(torch.randn(probs.numel(), N)) - - y = m(x) - y.backward(gradient=y_grad) - print("_test_balancer_sign: x = ", x) - print("_test_balancer_sign: y grad = ", y_grad) - print("_test_balancer_sign: x grad = ", x.grad) - - -def _test_balancer_magnitude(): - magnitudes = torch.arange(0, 1, 0.01) - N = 1000 - x = torch.sign(torch.randn(magnitudes.numel(), N)) * magnitudes.unsqueeze(-1) - x = x.detach() - x.requires_grad = True - m = Balancer( - magnitudes.numel(), - channel_dim=0, - min_positive=0.0, - max_positive=1.0, - min_abs=0.2, - max_abs=0.7, - prob=1.0, - ) - - y_grad = torch.sign(torch.randn(magnitudes.numel(), N)) - - y = m(x) - y.backward(gradient=y_grad) - print("_test_balancer_magnitude: x = ", x) - print("_test_balancer_magnitude: y grad = ", y_grad) - print("_test_balancer_magnitude: x grad = ", x.grad) - - -def _test_double_swish_deriv(): - x = torch.randn(10, 12, dtype=torch.double) * 3.0 - x.requires_grad = True - m = DoubleSwish() - - tol = (1.2 - (-0.043637)) / 255.0 - torch.autograd.gradcheck(m, x, atol=tol) - - # for self-test. - x = torch.randn(1000, 1000, dtype=torch.double) * 3.0 - x.requires_grad = True - y = m(x) - - -def _test_swooshl_deriv(): - x = torch.randn(10, 12, dtype=torch.double) * 3.0 - x.requires_grad = True - m = SwooshL() - - tol = 1.0 / 255.0 - torch.autograd.gradcheck(m, x, atol=tol, eps=0.01) - - # for self-test. 
- x = torch.randn(1000, 1000, dtype=torch.double) * 3.0 - x.requires_grad = True - y = m(x) - - -def _test_swooshr_deriv(): - x = torch.randn(10, 12, dtype=torch.double) * 3.0 - x.requires_grad = True - m = SwooshR() - - tol = 1.0 / 255.0 - torch.autograd.gradcheck(m, x, atol=tol, eps=0.01) - - # for self-test. - x = torch.randn(1000, 1000, dtype=torch.double) * 3.0 - x.requires_grad = True - y = m(x) - - -def _test_softmax(): - a = torch.randn(2, 10, dtype=torch.float64) - b = a.clone() - a.requires_grad = True - b.requires_grad = True - a.softmax(dim=1)[:, 0].sum().backward() - print("a grad = ", a.grad) - softmax(b, dim=1)[:, 0].sum().backward() - print("b grad = ", b.grad) - assert torch.allclose(a.grad, b.grad) - - -def _test_piecewise_linear(): - p = PiecewiseLinear((0, 10.0)) - for x in [-100, 0, 100]: - assert p(x) == 10.0 - p = PiecewiseLinear((0, 10.0), (1, 0.0)) - for x, y in [(-100, 10.0), (0, 10.0), (0.5, 5.0), (1, 0.0), (2, 0.0)]: - print("x, y = ", x, y) - assert p(x) == y, (x, p(x), y) - - q = PiecewiseLinear((0.5, 15.0), (0.6, 1.0)) - x_vals = [-1.0, 0.0, 0.1, 0.2, 0.5, 0.6, 0.7, 0.9, 1.0, 2.0] - pq = p.max(q) - for x in x_vals: - y1 = max(p(x), q(x)) - y2 = pq(x) - assert abs(y1 - y2) < 0.001 - pq = p.min(q) - for x in x_vals: - y1 = min(p(x), q(x)) - y2 = pq(x) - assert abs(y1 - y2) < 0.001 - pq = p + q - for x in x_vals: - y1 = p(x) + q(x) - y2 = pq(x) - assert abs(y1 - y2) < 0.001 - - -def _test_activation_dropout_and_linear(): - in_channels = 20 - out_channels = 30 - - for bias in [True, False]: - # actually we don't test for dropout_p != 0.0 because forward functions will give - # different answers. This is because we are using the k2 implementation of - # swoosh_l an swoosh_r inside SwooshL() and SwooshR(), and they call randn() - # internally, messing up the random state. - for dropout_p in [0.0]: - for activation in ["SwooshL", "SwooshR"]: - m1 = nn.Sequential( - SwooshL() if activation == "SwooshL" else SwooshR(), - Dropout3(p=dropout_p, shared_dim=-1), - ScaledLinear( - in_channels, out_channels, bias=bias, initial_scale=0.5 - ), - ) - m2 = ActivationDropoutAndLinear( - in_channels, - out_channels, - bias=bias, - initial_scale=0.5, - activation=activation, - dropout_p=dropout_p, - ) - with torch.no_grad(): - m2.weight[:] = m1[2].weight - if bias: - m2.bias[:] = m1[2].bias - # make sure forward gives same result. - x1 = torch.randn(10, in_channels) - x1.requires_grad = True - - # TEMP. - assert torch.allclose( - SwooshRFunction.apply(x1), SwooshRForward(x1), atol=1.0e-03 - ) - - x2 = x1.clone().detach() - x2.requires_grad = True - seed = 10 - torch.manual_seed(seed) - y1 = m1(x1) - y_grad = torch.randn_like(y1) - y1.backward(gradient=y_grad) - torch.manual_seed(seed) - y2 = m2(x2) - y2.backward(gradient=y_grad) - - print( - f"bias = {bias}, dropout_p = {dropout_p}, activation = {activation}" - ) - print("y1 = ", y1) - print("y2 = ", y2) - assert torch.allclose(y1, y2, atol=0.02) - assert torch.allclose(m1[2].weight.grad, m2.weight.grad, atol=1.0e-05) - if bias: - assert torch.allclose(m1[2].bias.grad, m2.bias.grad, atol=1.0e-05) - print("x1.grad = ", x1.grad) - print("x2.grad = ", x2.grad) - - def isclose(a, b): - # return true if cosine similarity is > 0.9. - return (a * b).sum() > 0.9 * ( - (a**2).sum() * (b**2).sum() - ).sqrt() - - # the SwooshL() implementation has a noisy gradient due to 1-byte - # storage of it. 
- assert isclose(x1.grad, x2.grad) - - -if __name__ == "__main__": - logging.getLogger().setLevel(logging.INFO) - torch.set_num_threads(1) - torch.set_num_interop_threads(1) - _test_piecewise_linear() - _test_softmax() - _test_whiten() - _test_balancer_sign() - _test_balancer_magnitude() - _test_double_swish_deriv() - _test_swooshr_deriv() - _test_swooshl_deriv() - _test_activation_dropout_and_linear() diff --git a/egs/librispeech/SSL/hubert/scaling.py b/egs/librispeech/SSL/hubert/scaling.py new file mode 120000 index 0000000000..e30bd99de2 --- /dev/null +++ b/egs/librispeech/SSL/hubert/scaling.py @@ -0,0 +1 @@ +../../ASR/zipformer/scaling.py \ No newline at end of file diff --git a/egs/librispeech/SSL/hubert/subsampling.py b/egs/librispeech/SSL/hubert/subsampling.py deleted file mode 100644 index b2f769d3f6..0000000000 --- a/egs/librispeech/SSL/hubert/subsampling.py +++ /dev/null @@ -1,406 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2023 Xiaomi Corp. (authors: Daniel Povey, -# Zengwei Yao) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings -from typing import Tuple - -import torch -from scaling import ( - Balancer, - BiasNorm, - Dropout3, - FloatLike, - Optional, - ScaledConv2d, - ScaleGrad, - ScheduledFloat, - SwooshL, - SwooshR, - Whiten, -) -from torch import Tensor, nn - - -class ConvNeXt(nn.Module): - """ - Our interpretation of the ConvNeXt module as used in https://arxiv.org/pdf/2206.14747.pdf - """ - - def __init__( - self, - channels: int, - hidden_ratio: int = 3, - kernel_size: Tuple[int, int] = (7, 7), - layerdrop_rate: FloatLike = None, - ): - super().__init__() - self.padding = ((kernel_size[0] - 1) // 2, (kernel_size[1] - 1) // 2) - hidden_channels = channels * hidden_ratio - if layerdrop_rate is None: - layerdrop_rate = ScheduledFloat((0.0, 0.2), (20000.0, 0.015)) - self.layerdrop_rate = layerdrop_rate - - self.depthwise_conv = nn.Conv2d( - in_channels=channels, - out_channels=channels, - groups=channels, - kernel_size=kernel_size, - padding=self.padding, - ) - - self.pointwise_conv1 = nn.Conv2d( - in_channels=channels, out_channels=hidden_channels, kernel_size=1 - ) - - self.hidden_balancer = Balancer( - hidden_channels, - channel_dim=1, - min_positive=0.3, - max_positive=1.0, - min_abs=0.75, - max_abs=5.0, - ) - - self.activation = SwooshL() - self.pointwise_conv2 = ScaledConv2d( - in_channels=hidden_channels, - out_channels=channels, - kernel_size=1, - initial_scale=0.01, - ) - - self.out_balancer = Balancer( - channels, - channel_dim=1, - min_positive=0.4, - max_positive=0.6, - min_abs=1.0, - max_abs=6.0, - ) - self.out_whiten = Whiten( - num_groups=1, - whitening_limit=5.0, - prob=(0.025, 0.25), - grad_scale=0.01, - ) - - def forward(self, x: Tensor) -> Tensor: - if torch.jit.is_scripting() or torch.jit.is_tracing() or not self.training: - return self.forward_internal(x) - layerdrop_rate = float(self.layerdrop_rate) - - if layerdrop_rate != 0.0: - batch_size = x.shape[0] - mask = ( 
- torch.rand((batch_size, 1, 1, 1), dtype=x.dtype, device=x.device) - > layerdrop_rate - ) - else: - mask = None - # turns out this caching idea does not work with --world-size > 1 - # return caching_eval(self.forward_internal, x, mask) - return self.forward_internal(x, mask) - - def forward_internal( - self, x: Tensor, layer_skip_mask: Optional[Tensor] = None - ) -> Tensor: - """ - x layout: (N, C, H, W), i.e. (batch_size, num_channels, num_frames, num_freqs) - - The returned value has the same shape as x. - """ - bypass = x - x = self.depthwise_conv(x) - x = self.pointwise_conv1(x) - x = self.hidden_balancer(x) - x = self.activation(x) - x = self.pointwise_conv2(x) - - if layer_skip_mask is not None: - x = x * layer_skip_mask - - x = bypass + x - x = self.out_balancer(x) - - if x.requires_grad: - x = x.transpose(1, 3) # (N, W, H, C); need channel dim to be last - x = self.out_whiten(x) - x = x.transpose(1, 3) # (N, C, H, W) - - return x - - def streaming_forward( - self, - x: Tensor, - cached_left_pad: Tensor, - ) -> Tuple[Tensor, Tensor]: - """ - Args: - x layout: (N, C, H, W), i.e. (batch_size, num_channels, num_frames, num_freqs) - cached_left_pad: (batch_size, num_channels, left_pad, num_freqs) - - Returns: - - The returned value has the same shape as x. - - Updated cached_left_pad. - """ - padding = self.padding - - # The length without right padding for depth-wise conv - T = x.size(2) - padding[0] - - bypass = x[:, :, :T, :] - - # Pad left side - assert cached_left_pad.size(2) == padding[0], ( - cached_left_pad.size(2), - padding[0], - ) - x = torch.cat([cached_left_pad, x], dim=2) - # Update cached left padding - cached_left_pad = x[:, :, T : padding[0] + T, :] - - # depthwise_conv - x = torch.nn.functional.conv2d( - x, - weight=self.depthwise_conv.weight, - bias=self.depthwise_conv.bias, - padding=(0, padding[1]), - groups=self.depthwise_conv.groups, - ) - x = self.pointwise_conv1(x) - x = self.hidden_balancer(x) - x = self.activation(x) - x = self.pointwise_conv2(x) - - x = bypass + x - return x, cached_left_pad - - -class Conv2dSubsampling(nn.Module): - """Convolutional 2D subsampling (to 1/2 length). - - Convert an input of shape (N, T, idim) to an output - with shape (N, T', odim), where - T' = (T-3)//2 - 2 == (T-7)//2 - - It is based on - https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/transformer/subsampling.py # noqa - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - layer1_channels: int = 8, - layer2_channels: int = 32, - layer3_channels: int = 128, - dropout: FloatLike = 0.1, - ) -> None: - """ - Args: - in_channels: - Number of channels in. The input shape is (N, T, in_channels). - Caution: It requires: T >=7, in_channels >=7 - out_channels - Output dim. The output shape is (N, (T-3)//2, out_channels) - layer1_channels: - Number of channels in layer1 - layer1_channels: - Number of channels in layer2 - bottleneck: - bottleneck dimension for 1d squeeze-excite - """ - assert in_channels >= 7 - super().__init__() - - # The ScaleGrad module is there to prevent the gradients - # w.r.t. the weight or bias of the first Conv2d module in self.conv from - # exceeding the range of fp16 when using automatic mixed precision (amp) - # training. (The second one is necessary to stop its bias from getting - # a too-large gradient). 
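# Illustrative sketch, not part of the patch: the docstring above quotes two
# equivalent closed forms for the subsampled time length, and out_width below
# is documented as (in_channels - 3) // 4.  A quick numeric check that both
# identities hold over a practical range of sizes:
for T in range(7, 2000):
    assert (T - 3) // 2 - 2 == (T - 7) // 2
for idim in range(7, 2000):
    assert (((idim - 1) // 2) - 1) // 2 == (idim - 3) // 4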
- - self.conv = nn.Sequential( - nn.Conv2d( - in_channels=1, - out_channels=layer1_channels, - kernel_size=3, - padding=(0, 1), # (time, freq) - ), - ScaleGrad(0.2), - Balancer(layer1_channels, channel_dim=1, max_abs=1.0), - SwooshR(), - nn.Conv2d( - in_channels=layer1_channels, - out_channels=layer2_channels, - kernel_size=3, - stride=2, - padding=0, - ), - Balancer(layer2_channels, channel_dim=1, max_abs=4.0), - SwooshR(), - nn.Conv2d( - in_channels=layer2_channels, - out_channels=layer3_channels, - kernel_size=3, - stride=(1, 2), # (time, freq) - ), - Balancer(layer3_channels, channel_dim=1, max_abs=4.0), - SwooshR(), - ) - - # just one convnext layer - self.convnext = ConvNeXt(layer3_channels, kernel_size=(7, 7)) - - # (in_channels-3)//4 - self.out_width = (((in_channels - 1) // 2) - 1) // 2 - self.layer3_channels = layer3_channels - - self.out = nn.Linear(self.out_width * layer3_channels, out_channels) - # use a larger than normal grad_scale on this whitening module; there is - # only one such module, so there is not a concern about adding together - # many copies of this extra gradient term. - self.out_whiten = Whiten( - num_groups=1, - whitening_limit=ScheduledFloat((0.0, 4.0), (20000.0, 8.0), default=4.0), - prob=(0.025, 0.25), - grad_scale=0.02, - ) - - # max_log_eps=0.0 is to prevent both eps and the output of self.out from - # getting large, there is an unnecessary degree of freedom. - self.out_norm = BiasNorm(out_channels) - self.dropout = Dropout3(dropout, shared_dim=1) - - def forward( - self, x: torch.Tensor, x_lens: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x: - Its shape is (N, T, idim). - x_lens: - A tensor of shape (batch_size,) containing the number of frames in - - Returns: - - a tensor of shape (N, (T-7)//2, odim) - - output lengths, of shape (batch_size,) - """ - # On entry, x is (N, T, idim) - x = x.unsqueeze(1) # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W) - # scaling x by 0.1 allows us to use a larger grad-scale in fp16 "amp" (automatic mixed precision) - # training, since the weights in the first convolution are otherwise the limiting factor for getting infinite - # gradients. - x = self.conv(x) - x = self.convnext(x) - - # Now x is of shape (N, odim, (T-7)//2, (idim-3)//4) - b, c, t, f = x.size() - - x = x.transpose(1, 2).reshape(b, t, c * f) - # now x: (N, (T-7)//2, out_width * layer3_channels)) - - x = self.out(x) - # Now x is of shape (N, (T-7)//2, odim) - x = self.out_whiten(x) - x = self.out_norm(x) - x = self.dropout(x) - - if torch.jit.is_scripting() or torch.jit.is_tracing(): - x_lens = (x_lens - 7) // 2 - else: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - x_lens = (x_lens - 7) // 2 - assert x.size(1) == x_lens.max().item(), (x.size(1), x_lens.max()) - - return x, x_lens - - def streaming_forward( - self, - x: torch.Tensor, - x_lens: torch.Tensor, - cached_left_pad: Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Subsample x. - - Args: - x: - Its shape is (N, T, idim). 
- x_lens: - A tensor of shape (batch_size,) containing the number of frames in - - Returns: - - a tensor of shape (N, (T-7)//2, odim) - - output lengths, of shape (batch_size,) - - updated cache - """ - # On entry, x is (N, T, idim) - x = x.unsqueeze(1) # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W) - - # T' = (T-7)//2 - x = self.conv(x) - - # T' = (T-7)//2-3 - x, cached_left_pad = self.convnext.streaming_forward( - x, cached_left_pad=cached_left_pad - ) - - # Now x is of shape (N, odim, T', ((idim-1)//2 - 1)//2) - b, c, t, f = x.size() - - x = x.transpose(1, 2).reshape(b, t, c * f) - # now x: (N, T', out_width * layer3_channels)) - - x = self.out(x) - # Now x is of shape (N, T', odim) - x = self.out_norm(x) - - if torch.jit.is_scripting() or torch.jit.is_tracing(): - assert self.convnext.padding[0] == 3 - # The ConvNeXt module needs 3 frames of right padding after subsampling - x_lens = (x_lens - 7) // 2 - 3 - else: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - # The ConvNeXt module needs 3 frames of right padding after subsampling - assert self.convnext.padding[0] == 3 - x_lens = (x_lens - 7) // 2 - 3 - - assert x.size(1) == x_lens.max().item(), (x.shape, x_lens.max()) - - return x, x_lens, cached_left_pad - - @torch.jit.export - def get_init_states( - self, - batch_size: int = 1, - device: torch.device = torch.device("cpu"), - ) -> Tensor: - """Get initial states for Conv2dSubsampling module. - It is the cached left padding for ConvNeXt module, - of shape (batch_size, num_channels, left_pad, num_freqs) - """ - left_pad = self.convnext.padding[0] - freq = self.out_width - channels = self.layer3_channels - cached_embed_left_pad = torch.zeros(batch_size, channels, left_pad, freq).to( - device - ) - - return cached_embed_left_pad From 802349302940bb83870724c0f70b1cf47dc081c2 Mon Sep 17 00:00:00 2001 From: Yifan Yang <64255737+yfyeung@users.noreply.github.com> Date: Mon, 1 Jan 2024 20:09:22 +0800 Subject: [PATCH 3/4] update --- egs/librispeech/SSL/hubert/dataset.py | 55 +-------------- egs/librispeech/SSL/hubert/decode.py | 37 +++++----- egs/librispeech/SSL/hubert/finetune.py | 98 +++++++++++++------------- egs/librispeech/SSL/hubert/model.py | 10 ++- 4 files changed, 77 insertions(+), 123 deletions(-) diff --git a/egs/librispeech/SSL/hubert/dataset.py b/egs/librispeech/SSL/hubert/dataset.py index c3442df51a..d97fe99459 100644 --- a/egs/librispeech/SSL/hubert/dataset.py +++ b/egs/librispeech/SSL/hubert/dataset.py @@ -25,53 +25,6 @@ from transformers import Wav2Vec2FeatureExtractor -class HubertDataset(torch.utils.data.Dataset): - """ - In this implementation, there will always be a single channel. - - Returns: - - .. 
code-block:: - - { - 'audio': (B x NumSamples) float tensor - 'audio_lens': (B, ) int tensor - } - """ - - def __init__(self, collate: bool = True) -> None: - super().__init__() - self.feature_extractor = Wav2Vec2FeatureExtractor( - feature_size=1, - sampling_rate=16000, - padding_side="right", - padding_value=0.0, - do_normalize=True, - return_attention_mask=True, - ) - - def __getitem__(self, cuts: CutSet) -> Dict[str, Any]: - self._validate(cuts) - audio, _ = read_audio_from_cuts(cuts, return_tensors=False) - audio = self.feature_extractor( - audio, - padding=True, - return_tensors="pt", - sampling_rate=16000, - ).input_values - audio_lens = torch.tensor([cut.num_samples for cut in cuts], dtype=torch.int32) - - return { - "cuts": cuts, - "audio": audio, - "audio_lens": audio_lens, - } - - def _validate(self, cuts: CutSet) -> None: - validate(cuts) - assert all(cut.has_recording for cut in cuts) - - class HubertAsrDataset(torch.utils.data.Dataset): """ In this implementation, there will always be a single channel. @@ -94,7 +47,8 @@ def __init__(self, collate: bool = True) -> None: padding_side="right", padding_value=0, do_normalize=True, - return_attention_mask=False, + return_attention_mask=True, + feature_extractor_type="Wav2Vec2FeatureExtractor", ) def __getitem__(self, cuts: CutSet) -> Dict[str, Any]: @@ -148,7 +102,4 @@ def _validate(self, cuts: CutSet) -> None: ) for batch_idx, batch in enumerate(dl): - print(batch["audio"]) - print(batch["audio_lens"]) - print(batch["supervisions"]["text"]) - print(batch["cuts"]) + break diff --git a/egs/librispeech/SSL/hubert/decode.py b/egs/librispeech/SSL/hubert/decode.py index 604d714531..7df3c29634 100644 --- a/egs/librispeech/SSL/hubert/decode.py +++ b/egs/librispeech/SSL/hubert/decode.py @@ -121,7 +121,7 @@ modified_beam_search_lm_shallow_fusion, modified_beam_search_LODR, ) -from train import add_model_arguments, get_model, get_params +from finetune import add_model_arguments, get_model, get_params from icefall import ContextGraph, LmScorer, NgramLm from icefall.checkpoint import ( @@ -425,16 +425,10 @@ def decode_one_batch( the returned dict. 
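# Illustrative sketch, not part of the patch: decode_one_batch() below builds
# audio_lens with torch.full(), i.e. every utterance is given the padded sample
# count, and lets the encoder map sample counts to frame counts via
# _get_feat_extract_output_lengths().  A sketch of that mapping, assuming the
# default HuBERT-base conv front-end (kernels (10, 3, 3, 3, 3, 2, 2), strides
# (5, 2, 2, 2, 2, 2, 2), i.e. roughly one frame per 320 samples / 20 ms):
def conv_out_length(num_samples: int) -> int:
    length = num_samples
    for kernel, stride in zip((10, 3, 3, 3, 3, 2, 2), (5, 2, 2, 2, 2, 2, 2)):
        length = (length - kernel) // stride + 1
    return length

assert conv_out_length(16000) == 49  # one second of 16 kHz audio -> 49 frames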
""" device = next(model.parameters()).device - feature = batch["inputs"] - assert feature.ndim == 3 + audio = batch["audio"].to(device) + audio_lens = torch.full(audio.shape[:1], audio.shape[1], dtype=torch.int32) - feature = feature.to(device) - # at entry, feature is (N, T, C) - - supervisions = batch["supervisions"] - feature_lens = supervisions["num_frames"].to(device) - - encoder_out, encoder_out_lens = model.forward_encoder(feature, feature_lens) + encoder_out, encoder_out_lens = model.forward_encoder(audio, audio_lens) hyps = [] @@ -665,7 +659,7 @@ def decode_dataset( results = defaultdict(list) for batch_idx, batch in enumerate(dl): texts = batch["supervisions"]["text"] - cut_ids = [cut.id for cut in batch["supervisions"]["cut"]] + cut_ids = [cut.id for cut in batch["cuts"]] hyps_dict = decode_one_batch( params=params, @@ -996,14 +990,23 @@ def main(): args.return_cuts = True librispeech = LibriSpeechAsrDataModule(args) - test_clean_cuts = librispeech.test_clean_cuts() - test_other_cuts = librispeech.test_other_cuts() + dev_clean_cuts = librispeech.dev_clean_cuts() + dev_other_cuts = librispeech.dev_other_cuts() + + dev_clean_dl = librispeech.test_dataloaders(dev_clean_cuts) + dev_other_dl = librispeech.test_dataloaders(dev_other_cuts) + + test_sets = ["dev-clean", "dev-other"] + test_dl = [dev_clean_dl, dev_other_dl] + + # test_clean_cuts = librispeech.test_clean_cuts() + # test_other_cuts = librispeech.test_other_cuts() - test_clean_dl = librispeech.test_dataloaders(test_clean_cuts) - test_other_dl = librispeech.test_dataloaders(test_other_cuts) + # test_clean_dl = librispeech.test_dataloaders(test_clean_cuts) + # test_other_dl = librispeech.test_dataloaders(test_other_cuts) - test_sets = ["test-clean", "test-other"] - test_dl = [test_clean_dl, test_other_dl] + # test_sets = ["test-clean", "test-other"] + # test_dl = [test_clean_dl, test_other_dl] for test_set, test_dl in zip(test_sets, test_dl): results_dict = decode_dataset( diff --git a/egs/librispeech/SSL/hubert/finetune.py b/egs/librispeech/SSL/hubert/finetune.py index 0c0095f9f2..ad0ae4199b 100644 --- a/egs/librispeech/SSL/hubert/finetune.py +++ b/egs/librispeech/SSL/hubert/finetune.py @@ -31,8 +31,8 @@ --start-epoch 1 \ --use-fp16 0 \ --exp-dir hubert/exp \ - --full-libri 1 \ - --max-duration 80 + --full-libri 0 \ + --max-duration 200 It supports finetuning with: - transducer loss (default), with `--use-transducer True --use-ctc False` @@ -63,7 +63,6 @@ from lhotse.utils import fix_random_seed from model import AsrModel from optim import Eden, ScaledAdam -from scaling import ScheduledFloat from torch import Tensor from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP @@ -216,17 +215,17 @@ def add_model_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--mask-feature-length", type=int, - default=10, + default=64, ) parser.add_argument( "--mask-feature-min-masks", type=int, - default=0, + default=2, ) parser.add_argument( "--mask-feature-prob", type=float, - default=0.0, + default=0.5, ) parser.add_argument( "--mask-time-length", @@ -236,12 +235,12 @@ def add_model_arguments(parser: argparse.ArgumentParser): parser.add_argument( "--mask-time-min-masks", type=int, - default=2, + default=10, ) parser.add_argument( "--mask-time-prob", type=float, - default=0.05, + default=0.65, ) parser.add_argument( "--num-attention-heads", @@ -361,7 +360,6 @@ def get_parser(): parser.add_argument( "--pretrained-dir", type=str, - default="download/hubert-base-ls960", help="""The 
pretrained model dir. It specifies the directory where the pretrained checkpoint is saved.""", ) @@ -374,7 +372,7 @@ def get_parser(): ) parser.add_argument( - "--base-lr", type=float, default=0.0005, help="The base learning rate." + "--base-lr", type=float, default=0.001, help="The base learning rate." ) parser.add_argument( @@ -608,40 +606,43 @@ def _conv_out_length(input_length, kernel_size, stride): def get_encoder_model(params: AttributeDict) -> nn.Module: - config = HubertConfig( - hidden_size=params.hidden_size, - num_hidden_layers=params.num_hidden_layers, - num_attention_heads=params.num_attention_heads, - intermediate_size=params.intermediate_size, - hidden_act=params.hidden_act, - hidden_dropout=params.hidden_dropout, - activation_dropout=params.activation_dropout, - attention_dropout=params.attention_dropout, - feat_proj_layer_norm=params.feat_proj_layer_norm, - feat_proj_dropout=params.feat_proj_dropout, - final_dropout=params.final_dropout, - layerdrop=params.layerdrop, - initializer_range=params.initializer_range, - layer_norm_eps=params.layer_norm_eps, - feat_extract_norm=params.feat_extract_norm, - feat_extract_activation=params.feat_extract_activation, - conv_dim=_to_int_tuple(params.conv_dim), - conv_stride=_to_int_tuple(params.conv_stride), - conv_kernel=_to_int_tuple(params.conv_kernel), - conv_bias=params.conv_bias, - num_conv_pos_embeddings=params.num_conv_pos_embeddings, - num_conv_pos_embedding_groups=params.num_conv_pos_embedding_groups, - do_stable_layer_norm=params.do_stable_layer_norm, - apply_spec_augment=params.apply_spec_augment, - mask_time_prob=params.mask_time_prob, - mask_time_length=params.mask_time_length, - mask_time_min_masks=params.mask_time_min_masks, - mask_feature_prob=params.mask_feature_prob, - mask_feature_length=params.mask_feature_length, - mask_feature_min_masks=params.mask_feature_min_masks, - ) - - encoder = HubertModel(config) + if hasattr(params, "pretrained_dir"): + logging.info(f"Loading {params.pretrained_dir}") + encoder = HubertModel.from_pretrained(params.pretrained_dir) + else: + config = HubertConfig( + hidden_size=params.hidden_size, + num_hidden_layers=params.num_hidden_layers, + num_attention_heads=params.num_attention_heads, + intermediate_size=params.intermediate_size, + hidden_act=params.hidden_act, + hidden_dropout=params.hidden_dropout, + activation_dropout=params.activation_dropout, + attention_dropout=params.attention_dropout, + feat_proj_layer_norm=params.feat_proj_layer_norm, + feat_proj_dropout=params.feat_proj_dropout, + final_dropout=params.final_dropout, + layerdrop=params.layerdrop, + initializer_range=params.initializer_range, + layer_norm_eps=params.layer_norm_eps, + feat_extract_norm=params.feat_extract_norm, + feat_extract_activation=params.feat_extract_activation, + conv_dim=_to_int_tuple(params.conv_dim), + conv_stride=_to_int_tuple(params.conv_stride), + conv_kernel=_to_int_tuple(params.conv_kernel), + conv_bias=params.conv_bias, + num_conv_pos_embeddings=params.num_conv_pos_embeddings, + num_conv_pos_embedding_groups=params.num_conv_pos_embedding_groups, + do_stable_layer_norm=params.do_stable_layer_norm, + apply_spec_augment=params.apply_spec_augment, + mask_time_prob=params.mask_time_prob, + mask_time_length=params.mask_time_length, + mask_time_min_masks=params.mask_time_min_masks, + mask_feature_prob=params.mask_feature_prob, + mask_feature_length=params.mask_feature_length, + mask_feature_min_masks=params.mask_feature_min_masks, + ) + encoder = HubertModel(config) return encoder @@ -731,8 +732,6 @@ 
def load_checkpoint_if_available( elif params.start_epoch > 1: filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt" else: - logging.info(f"Loading {params.pretrained_dir}") - model.encoder = HubertModel.from_pretrained(params.pretrained_dir) return None assert filename.is_file(), f"{filename} does not exist!" @@ -839,7 +838,7 @@ def compute_loss( """ device = model.device if isinstance(model, DDP) else next(model.parameters()).device audio = batch["audio"].to(device) - audio_lens = batch["audio_lens"].to(device) + audio_lens = torch.full(audio.shape[:1], audio.shape[1], dtype=torch.int32) batch_idx_train = params.batch_idx_train warm_step = params.warm_step @@ -1113,7 +1112,10 @@ def save_bad_model(suffix: str = ""): "train/grad_scale", cur_grad_scale, params.batch_idx_train ) - if batch_idx % params.valid_interval == 0 and not params.print_diagnostics: + if ( + batch_idx % (params.valid_interval * params.accum_grad) == 0 + and not params.print_diagnostics + ): logging.info("Computing validation loss") valid_info = compute_validation_loss( params=params, diff --git a/egs/librispeech/SSL/hubert/model.py b/egs/librispeech/SSL/hubert/model.py index ce203e3e0c..5484e9da54 100644 --- a/egs/librispeech/SSL/hubert/model.py +++ b/egs/librispeech/SSL/hubert/model.py @@ -32,7 +32,7 @@ def __init__( encoder, decoder: Optional[nn.Module] = None, joiner: Optional[nn.Module] = None, - encoder_dim: int = 1024, + encoder_dim: int = 768, decoder_dim: int = 512, vocab_size: int = 500, use_transducer: bool = True, @@ -111,7 +111,7 @@ def forward_encoder( A 2-D tensor of shape (N, T). x_lens: A 1-D tensor of shape (N,). It contains the number of frames in `x` - before padding. + w/wo padding. Returns: encoder_out: @@ -119,12 +119,10 @@ def forward_encoder( encoder_out_lens: Encoder output lengths, of shape (N,). """ + encoder_out = self.encoder(x).last_hidden_state encoder_out_lens = self.encoder._get_feat_extract_output_lengths(x_lens) assert torch.all(encoder_out_lens > 0), (x_lens, encoder_out_lens) - src_key_padding_mask = make_pad_mask(x_lens) - encoder_out = self.encoder(x, src_key_padding_mask).last_hidden_state - return encoder_out, encoder_out_lens def forward_ctc( @@ -278,7 +276,7 @@ def forward( A 2-D tensor of shape (N, T). x_lens: A 1-D tensor of shape (N,). It contains the number of frames in `x` - before padding. + w/wo padding. y: A ragged tensor with 2 axes [utt][label]. It contains labels of each utterance. From a232cebbc8d9d1303915f3c780ea138369b3ed4e Mon Sep 17 00:00:00 2001 From: Yifan Yang <64255737+yfyeung@users.noreply.github.com> Date: Wed, 3 Jan 2024 17:00:13 +0800 Subject: [PATCH 4/4] small fix for decode.py --- egs/librispeech/SSL/hubert/decode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/librispeech/SSL/hubert/decode.py b/egs/librispeech/SSL/hubert/decode.py index 7df3c29634..1e96224ebb 100644 --- a/egs/librispeech/SSL/hubert/decode.py +++ b/egs/librispeech/SSL/hubert/decode.py @@ -482,7 +482,7 @@ def decode_one_batch( max_contexts=params.max_contexts, max_states=params.max_states, num_paths=params.num_paths, - ref_texts=sp.encode(supervisions["text"]), + ref_texts=sp.encode(batch["supervisions"]["text"]), nbest_scale=params.nbest_scale, ) for hyp in sp.decode(hyp_tokens):
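# Illustrative sketch, not part of the patch: ref_texts passes the reference
# transcripts to the decoding routine above as BPE token ids, and sp.decode()
# maps hypothesis token ids back to text.  A minimal round-trip with
# sentencepiece; the model path here is hypothetical.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("data/lang_bpe_500/bpe.model")  # hypothetical path to a BPE model

token_ids = sp.encode(["HELLO WORLD", "HOW ARE YOU"], out_type=int)  # list of id lists
texts = sp.decode(token_ids)  # back to the original strings, given a matching model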