diff --git a/.github/ISSUE_TEMPLATE/sweep.yml b/.github/ISSUE_TEMPLATE/sweep.yml deleted file mode 100644 index 18c591b..0000000 --- a/.github/ISSUE_TEMPLATE/sweep.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: 🤖 Sweep AI -description: Write a prompt for the Sweep AI bot to create a pull request from. -title: 'Sweep: ' -labels: ["sweep"] - -body: -- type: markdown - attributes: - value: | - #### See the [Sweep AI docs](https://docs.sweep.dev/) for more information. - - For a few line fixes, you can start your PR with `Sweep (fast): `. Sweep will use GPT-3.5 to quickly create a PR for very small changes - - For larger bugs, features, refactors, and tests, you can start your PR with `Sweep (slow): `. Sweep will perform a deeper search and more self-reviews but will take longer. - - To see examples of sweep being used for tsml-eval, check out the [Sweep issues](https://github.com/time-series-machine-learning/tsml-eval/issues?q=is%3Aissue+label%3Asweep+). -- type: textarea - attributes: - label: Details - description: > - Tell Sweep where and what to edit and provide enough context for a new developer - to the codebase. - placeholder: | - Bugs: The bug might be in ... file. Here are the logs: ... - Features: the new endpoint should use the ... class from ... file because it contains ... logic. - Refactors: We are migrating this function to ... version because ... - validations: - required: true -- type: textarea - attributes: - label: Files to change - description: Optional but can improve Sweep - placeholder: | - src/main.py - tests/test.py - render: Shell diff --git a/.github/workflows/pr_opened.yml b/.github/workflows/pr_opened.yml deleted file mode 100644 index efe1cba..0000000 --- a/.github/workflows/pr_opened.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: PR Opened - -on: - pull_request_target: - types: [opened] - -permissions: - contents: read - pull-requests: write - issues: write - -jobs: - # based on the scikit-learn 1.3.1 PR labelers - labeler: - runs-on: ubuntu-20.04 - - steps: - - uses: actions/checkout@v4 - with: - sparse-checkout: build_tools - - - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - - name: Install PyGithub - run: pip install -Uq PyGithub - - - uses: actions/create-github-app-token@v1 - id: app-token - with: - app-id: ${{ vars.PR_APP_ID }} - private-key: ${{ secrets.PR_APP_KEY }} - - - name: Label pull request - id: label-pr - run: python build_tools/pr_labeler.py - env: - CONTEXT_GITHUB: ${{ toJson(github) }} - GITHUB_TOKEN: ${{ steps.app-token.outputs.token }} - - - name: Write pull request comment - run: python build_tools/pr_open_commenter.py - env: - CONTEXT_GITHUB: ${{ toJson(github) }} - GITHUB_TOKEN: ${{ steps.app-token.outputs.token }} - TITLE_LABELS: ${{ steps.label-pr.outputs.title-labels }} - TITLE_LABELS_NEW: ${{ steps.label-pr.outputs.title-labels-new }} - CONTENT_LABELS: ${{ steps.label-pr.outputs.content-labels }} - CONTENT_LABELS_STATUS: ${{ steps.label-pr.outputs.content-labels-status }} diff --git a/.github/workflows/pr_precommit.yml b/.github/workflows/pr_precommit.yml index 630e3d1..dcdc758 100644 --- a/.github/workflows/pr_precommit.yml +++ b/.github/workflows/pr_precommit.yml @@ -4,7 +4,7 @@ on: push: branches: - main - pull_request_target: + pull_request: branches: - main @@ -30,36 +30,13 @@ jobs: run: echo '${{ steps.changed-files.outputs.all_changed_files }}' # only check the full repository if PR and correctly labelled - - if: ${{ github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'full 
pre-commit') }} + - if: ${{ github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'full pre-commit') }} name: Full pre-commit uses: pre-commit/action@v3.0.1 with: extra_args: --all-files - - if: ${{ github.event_name != 'pull_request_target' || !contains(github.event.pull_request.labels.*.name, 'full pre-commit') }} + - if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'full pre-commit') }} name: Local pre-commit uses: pre-commit/action@v3.0.1 with: extra_args: --files ${{ steps.changed-files.outputs.all_changed_files }} - - # push fixes if pre-commit fails and PR is eligible - - if: ${{ failure() && github.event_name == 'pull_request_target' && !github.event.pull_request.draft && !contains(github.event.pull_request.labels.*.name, 'stop pre-commit fixes')}} - uses: actions/create-github-app-token@v1 - id: app-token - with: - app-id: ${{ vars.PR_APP_ID }} - private-key: ${{ secrets.PR_APP_KEY }} - - - if: ${{ failure() && github.event_name == 'pull_request_target' && !github.event.pull_request.draft && !contains(github.event.pull_request.labels.*.name, 'stop pre-commit fixes') }} - name: Checkout - uses: actions/checkout@v4 - with: - repository: ${{ github.event.pull_request.head.repo.full_name }} - ref: ${{ github.head_ref }} - token: ${{ steps.app-token.outputs.token }} - - - if: ${{ failure() && github.event_name == 'pull_request_target' && !github.event.pull_request.draft && !contains(github.event.pull_request.labels.*.name, 'stop pre-commit fixes')}} - name: Push pre-commit fixes - uses: stefanzweifel/git-auto-commit-action@v5 - with: - commit_message: Automatic `pre-commit` fixes - commit_user_name: tsml-actions-bot[bot] diff --git a/.github/workflows/pre_commit.yml b/.github/workflows/pre_commit.yml deleted file mode 100644 index e69de29..0000000 diff --git a/README.md b/README.md index a99fa28..fbd08e9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ [![github-actions-release](https://img.shields.io/github/actions/workflow/status/time-series-machine-learning/tsml-py/release.yml?logo=github&label=build%20%28release%29)](https://github.com/time-series-machine-learning/tsml-py/actions/workflows/release.yml) [![github-actions-main](https://img.shields.io/github/actions/workflow/status/time-series-machine-learning/tsml-py/periodic_tests.yml?logo=github&branch=main&label=build%20%28main%29)](https://github.com/time-series-machine-learning/tsml-py/actions/workflows/periodic_tests.yml) -[![codecov](https://img.shields.io/codecov/c/github/time-series-machine-learning/tsml-py?label=codecov&logo=codecov)](https://codecov.io/gh/time-series-machine-learning/tsml-py) [![pypi](https://img.shields.io/pypi/v/tsml?logo=pypi&color=blue)](https://pypi.org/project/tsml/) [![!conda](https://img.shields.io/conda/vn/conda-forge/tsml?logo=anaconda&color=blue)](https://anaconda.org/conda-forge/tsml) [![python-versions](https://img.shields.io/pypi/pyversions/tsml?logo=python)](https://www.python.org/) @@ -9,10 +8,10 @@ # tsml-py -A toolkit for time series machine learning algorithms. +A toolkit for in-development time series machine learning algorithms. Please see [`tsml_eval`](https://github.com/time-series-machine-learning/tsml-eval) and -[`aeon`](https://github.com/aeon-toolkit/aeon) for more developed packages. This package +[`aeon`](https://github.com/aeon-toolkit/aeon) for more developed and stable packages. This package is more of a sandbox for testing out new ideas and algorithms. 
It may contain some algorithms and implementations that are not available in the other toolkits. @@ -28,4 +27,5 @@ pip install tsml ## Acknowledgements -This work is supported by the UK Engineering and Physical Sciences Research Council (EPSRC) EP/W030756/1 +This work is supported by the UK Engineering and Physical Sciences Research Council +(EPSRC) EP/W030756/1 diff --git a/build_tools/pr_labeler.py b/build_tools/pr_labeler.py deleted file mode 100644 index 880c0cb..0000000 --- a/build_tools/pr_labeler.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Labels PRs based on title and change list. - -Must be run in a github action with the pull_request_target event. - -Based on the scikit-learn v1.3.1 label_title_regex.py script. -""" - -import json -import os -import re -import sys - -from github import Github - -context_dict = json.loads(os.getenv("CONTEXT_GITHUB")) - -repo = context_dict["repository"] -g = Github(os.getenv("GITHUB_TOKEN")) -repo = g.get_repo(repo) -pr_number = context_dict["event"]["number"] -pr = repo.get_pull(number=pr_number) -labels = [label.name for label in pr.get_labels()] - -if "[bot]" in pr.user.login: - sys.exit(0) - -# title labels -title = pr.title - -title_regex_to_labels = [ - (r"\benh\b", "enhancement"), - (r"\bmnt\b", "maintenance"), - (r"\bbug\b", "bug"), - (r"\bdoc\b", "documentation"), - (r"\bref\b", "refactor"), -] - -title_labels = [ - label for regex, label in title_regex_to_labels if re.search(regex, title.lower()) -] -title_labels_to_add = list(set(title_labels) - set(labels)) - -# content labels -paths = [file.filename for file in pr.get_files()] - -content_paths_to_labels = [ - ("tsml/tests/", "testing"), -] - -present_content_labels = [ - label for _, label in content_paths_to_labels if label in labels -] - -content_labels = [ - label - for package, label in content_paths_to_labels - if any([package in path for path in paths]) -] -content_labels = list(set(content_labels)) - -content_labels_to_add = content_labels -content_labels_status = "used" -if len(present_content_labels) > 0: - content_labels_to_add = [] - content_labels_status = "ignored" -if len(content_labels) > 3: - content_labels_to_add = [] - content_labels_status = ( - "large" if content_labels_status != "ignored" else "ignored+large" - ) - -# add to PR -if title_labels_to_add or content_labels_to_add: - pr.add_to_labels(*title_labels_to_add + content_labels_to_add) - -with open(os.environ["GITHUB_OUTPUT"], "a") as fh: - print(f"title-labels={title_labels}".replace(" ", ""), file=fh) # noqa: T201 - print( # noqa: T201 - f"title-labels-new={title_labels_to_add}".replace(" ", ""), file=fh - ) - print(f"content-labels={content_labels}".replace(" ", ""), file=fh) # noqa: T201 - print(f"content-labels-status={content_labels_status}", file=fh) # noqa: T201 diff --git a/build_tools/pr_open_commenter.py b/build_tools/pr_open_commenter.py deleted file mode 100644 index d307abb..0000000 --- a/build_tools/pr_open_commenter.py +++ /dev/null @@ -1,106 +0,0 @@ -"""Writes a comment on PR opening. - -Includes output from the labeler action. 
-""" - -import json -import os -import sys - -from github import Github - -context_dict = json.loads(os.getenv("CONTEXT_GITHUB")) - -repo = context_dict["repository"] -g = Github(os.getenv("GITHUB_TOKEN")) -repo = g.get_repo(repo) -pr_number = context_dict["event"]["number"] -pr = repo.get_pull(number=pr_number) - -if "[bot]" in pr.user.login: - sys.exit(0) - -title_labels = os.getenv("TITLE_LABELS")[1:-1].replace("'", "").split(",") -title_labels_new = os.getenv("TITLE_LABELS_NEW")[1:-1].replace("'", "").split(",") -content_labels = os.getenv("CONTENT_LABELS")[1:-1].replace("'", "").split(",") -content_labels_status = os.getenv("CONTENT_LABELS_STATUS") - -replacement_labels = [] -for i, label in enumerate(content_labels): - for cur_label, new_label in replacement_labels: - if label == cur_label: - content_labels[i] = new_label - -labels = [(label.name, label.color) for label in repo.get_labels()] -title_labels = [ - f"$\\color{{#{color}}}{{\\textsf{{{label}}}}}$" - for label, color in labels - if label in title_labels -] -title_labels_new = [ - f"$\\color{{#{color}}}{{\\textsf{{{label}}}}}$" - for label, color in labels - if label in title_labels_new -] -content_labels = [ - f"$\\color{{#{color}}}{{\\textsf{{{label}}}}}$" - for label, color in labels - if label in content_labels -] - -title_labels_str = "" -if len(title_labels) == 0: - title_labels_str = "I did not find any labels to add based on the title." -elif len(title_labels_new) != 0: - arr_str = str(title_labels_new).strip("[]").replace("'", "") - title_labels_str = ( - "I have added the following labels to this PR based on the title: " - f"**[ {arr_str} ]**." - ) - if len(title_labels) != len(title_labels_new): - arr_str = ( - str(set(title_labels) - set(title_labels_new)).strip("[]").replace("'", "") - ) - title_labels_str += ( - f" The following labels were already present: **[ {arr_str} ]**" - ) - -content_labels_str = "" -if len(content_labels) != 0: - if content_labels_status == "used": - arr_str = str(content_labels).strip("[]").replace("'", "") - content_labels_str = ( - "I have added the following labels to this PR based on " - f"the changes made: **[ {arr_str} ]**. Feel free " - "to change these if they do not properly represent the PR." - ) - elif content_labels_status == "ignored": - arr_str = str(content_labels).strip("[]").replace("'", "") - content_labels_str = ( - "I would have added the following labels to this PR " - f"based on the changes made: **[ {arr_str} ]**, " - "however some package labels are already present." - ) - elif content_labels_status == "large": - content_labels_str = ( - "This PR changes too many different packages (>3) for " - "automatic addition of labels, please manually add package " - "labels if relevant." - ) -elif title_labels_str == "": - content_labels_str = ( - "I did not find any labels to add that did not already " - "exist. If the content of your PR changes, make sure to " - "update the labels accordingly." - ) - -pr.create_issue_comment( - f""" -## Thank you for contributing to `tsml-py` - -{title_labels_str} -{content_labels_str} - -The [Checks](https://github.com/time-series-machine-learning/tsml-py/pull/{pr_number}/checks) tab will show the status of our automated tests. You can click on individual test runs in the tab or "Details" in the panel below to see more information if there is a failure. 
- """ # noqa -) diff --git a/pyproject.toml b/pyproject.toml index 900dff4..7679260 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,25 +38,26 @@ classifiers = [ "Programming Language :: Python :: 3.12", ] dependencies = [ - "numba>=0.55.0", - "numpy>=1.21.0", - "scipy>=1.9.0,<1.13.0", - "pandas>=1.5.3", - "scikit-learn>=1.0.0", + "numba>=0.55.0,<0.61.0", + "numpy>=1.21.0,<2.2.0", + "scipy>=1.9.0,<1.14.0", + "pandas>=1.5.3,<2.3.0", + "scikit-learn>=1.0.0,<1.4.0", "packaging>=20.0", ] [project.optional-dependencies] all_extras = [ "grailts", - "pycatch22", - "pyfftw>=0.12.0; python_version < '3.12'", # requires fftw to be installed for Windows and some other OS (see http://www.fftw.org/index.html) "scikit-fda>=0.7.0", "statsmodels>=0.12.1", "stumpy>=1.6.0", + "wildboar", ] unstable_extras = [ - "mrsqm>=0.0.7; python_version < '3.12'", # requires gcc and fftw to be installed for Windows and some other OS (see http://www.fftw.org/index.html) + "pycatch22", + "pyfftw>=0.12.0; python_version < '3.12'", # requires fftw to be installed for Windows and some other OS (see http://www.fftw.org/index.html) + "mrsqm>=0.0.7; platform_system != 'Windows' and python_version < '3.12'", # requires gcc and fftw to be installed for Windows and some other OS (see http://www.fftw.org/index.html) ] dev = [ "pre-commit", @@ -67,24 +68,7 @@ dev = [ "pytest-cov", "pytest-rerunfailures", ] -binder = [ - "notebook", - "jupyterlab", -] -docs = [ - "sphinx<7.4.0", - "sphinx-design", - "sphinx-version-warning", - "sphinx_issues", - "sphinx-copybutton", - "sphinx-remove-toctrees", - "sphinxext-opengraph", - "nbsphinx", - "numpydoc", - "myst-parser", - "jupyter", - "furo", -] + [project.urls] homepage = "https://www.timeseriesclassification.com/" diff --git a/tsml/compose/_channel_ensemble.py b/tsml/compose/_channel_ensemble.py index 7d2e8f1..1c2bfb7 100644 --- a/tsml/compose/_channel_ensemble.py +++ b/tsml/compose/_channel_ensemble.py @@ -204,11 +204,11 @@ class ChannelEnsembleClassifier(ClassifierMixin, _BaseChannelEnsemble): Examples -------- >>> from tsml.compose import ChannelEnsembleClassifier - >>> from tsml.interval_based import TSFClassifier + >>> from tsml.interval_based import IntervalForestClassifier >>> from tsml.utils.testing import generate_3d_test_data >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, random_state=0) >>> reg = ChannelEnsembleClassifier( - ... estimators=("tsf", TSFClassifier(n_estimators=2), "all-split"), + ... estimators=("tsf", IntervalForestClassifier(n_estimators=2), "all-split"), ... random_state=0, ... ) >>> reg.fit(X, y) @@ -349,12 +349,12 @@ def get_test_params( params : dict or list of dict Parameters to create testing instances of the class. """ - from tsml.interval_based import TSFClassifier + from tsml.interval_based import IntervalForestClassifier return { "estimators": [ - ("tsf1", TSFClassifier(n_estimators=2), 0), - ("tsf2", TSFClassifier(n_estimators=2), 0), + ("tsf1", IntervalForestClassifier(n_estimators=2), 0), + ("tsf2", IntervalForestClassifier(n_estimators=2), 0), ] } @@ -411,12 +411,12 @@ class ChannelEnsembleRegressor(RegressorMixin, _BaseChannelEnsemble): Examples -------- >>> from tsml.compose import ChannelEnsembleRegressor - >>> from tsml.interval_based import TSFRegressor + >>> from tsml.interval_based import IntervalForestRegressor >>> from tsml.utils.testing import generate_3d_test_data >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, ... regression_target=True, random_state=0) >>> reg = ChannelEnsembleRegressor( - ... 
estimators=("tsf", TSFRegressor(n_estimators=2), "all-split"), + ... estimators=("tsf", IntervalForestRegressor(n_estimators=2), "all-split"), ... random_state=0, ... ) >>> reg.fit(X, y) @@ -518,12 +518,12 @@ def get_test_params( params : dict or list of dict Parameters to create testing instances of the class. """ - from tsml.interval_based import TSFRegressor + from tsml.interval_based import IntervalForestRegressor return { "estimators": [ - ("tsf1", TSFRegressor(n_estimators=2), 0), - ("tsf2", TSFRegressor(n_estimators=2), 0), + ("tsf1", IntervalForestRegressor(n_estimators=2), 0), + ("tsf2", IntervalForestRegressor(n_estimators=2), 0), ] } diff --git a/tsml/compose/tests/test_channel_ensemble.py b/tsml/compose/tests/test_channel_ensemble.py index ffe8ce8..2232c75 100644 --- a/tsml/compose/tests/test_channel_ensemble.py +++ b/tsml/compose/tests/test_channel_ensemble.py @@ -9,7 +9,7 @@ _check_key_type, _get_channel, ) -from tsml.interval_based import TSFClassifier, TSFRegressor +from tsml.interval_based import IntervalForestClassifier, IntervalForestRegressor from tsml.utils.testing import generate_3d_test_data, generate_unequal_test_data @@ -18,7 +18,7 @@ def test_single_estimator(): X, y = generate_3d_test_data(n_channels=3) ens = ChannelEnsembleClassifier( - estimators=[("tsf", TSFClassifier(n_estimators=2), "all")] + estimators=[("tsf", IntervalForestClassifier(n_estimators=2), "all")] ) ens.fit(X, y) @@ -26,7 +26,7 @@ def test_single_estimator(): assert ens.predict(X).shape == (X.shape[0],) ens = ChannelEnsembleRegressor( - estimators=[("tsf", TSFRegressor(n_estimators=2), "all")] + estimators=[("tsf", IntervalForestRegressor(n_estimators=2), "all")] ) ens.fit(X, y) @@ -39,7 +39,7 @@ def test_single_estimator_split(): X, y = generate_3d_test_data(n_channels=3) ens = ChannelEnsembleClassifier( - estimators=("tsf", TSFClassifier(n_estimators=2), "all-split") + estimators=("tsf", IntervalForestClassifier(n_estimators=2), "all-split") ) ens.fit(X, y) @@ -48,7 +48,7 @@ def test_single_estimator_split(): assert ens.predict(X).shape == (X.shape[0],) ens = ChannelEnsembleRegressor( - estimators=("tsf", TSFRegressor(n_estimators=2), "all-split") + estimators=("tsf", IntervalForestRegressor(n_estimators=2), "all-split") ) ens.fit(X, y) @@ -62,8 +62,8 @@ def test_remainder(): X, y = generate_3d_test_data(n_channels=3) ens = ChannelEnsembleClassifier( - estimators=[("tsf", TSFClassifier(n_estimators=2), 0)], - remainder=TSFClassifier(n_estimators=2), + estimators=[("tsf", IntervalForestClassifier(n_estimators=2), 0)], + remainder=IntervalForestClassifier(n_estimators=2), ) ens.fit(X, y) @@ -71,8 +71,8 @@ def test_remainder(): assert ens.predict(X).shape == (X.shape[0],) ens = ChannelEnsembleRegressor( - estimators=[("tsf", TSFRegressor(n_estimators=2), 0)], - remainder=TSFRegressor(n_estimators=2), + estimators=[("tsf", IntervalForestRegressor(n_estimators=2), 0)], + remainder=IntervalForestRegressor(n_estimators=2), ) ens.fit(X, y) diff --git a/tsml/feature_based/__init__.py b/tsml/feature_based/__init__.py index e972190..66277b9 100644 --- a/tsml/feature_based/__init__.py +++ b/tsml/feature_based/__init__.py @@ -1,11 +1,8 @@ """Feature-based estimators.""" __all__ = [ - "Catch22Classifier", - "Catch22Regressor", "FPCAClassifier", "FPCARegressor", ] -from tsml.feature_based._catch22 import Catch22Classifier, Catch22Regressor from tsml.feature_based._fpca import FPCAClassifier, FPCARegressor diff --git a/tsml/feature_based/_catch22.py b/tsml/feature_based/_catch22.py deleted file mode 100644 
index aad57df..0000000 --- a/tsml/feature_based/_catch22.py +++ /dev/null @@ -1,523 +0,0 @@ -"""Catch22 Classifier. - -Pipeline estimator using the Catch22 transformer and an estimator. -""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = ["Catch22Classifier", "Catch22Regressor"] - -from typing import List, Union - -import numpy as np -from sklearn.base import ClassifierMixin, RegressorMixin -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import check_is_fitted - -from tsml.base import BaseTimeSeriesEstimator, _clone_estimator -from tsml.transformations._catch22 import Catch22Transformer -from tsml.utils.validation import _check_optional_dependency, check_n_jobs - - -class Catch22Classifier(ClassifierMixin, BaseTimeSeriesEstimator): - """Canonical Time-series Characteristics (catch22) classifier. - - This classifier simply transforms the input data using the Catch22 [1] - transformer and builds a provided estimator using the transformed data. - - Parameters - ---------- - features : int/str or List of int/str, optional, default="all" - The Catch22 features to extract by feature index, feature name as a str or as a - list of names or indices for multiple features. If "all", all features are - extracted. - Valid features are as follows: - ["DN_HistogramMode_5", "DN_HistogramMode_10", - "SB_BinaryStats_diff_longstretch0", "DN_OutlierInclude_p_001_mdrmd", - "DN_OutlierInclude_n_001_mdrmd", "CO_f1ecac", "CO_FirstMin_ac", - "SP_Summaries_welch_rect_area_5_1", "SP_Summaries_welch_rect_centroid", - "FC_LocalSimple_mean3_stderr", "CO_trev_1_num", "CO_HistogramAMI_even_2_5", - "IN_AutoMutualInfoStats_40_gaussian_fmmi", "MD_hrv_classic_pnn40", - "SB_BinaryStats_mean_longstretch1", "SB_MotifThree_quantile_hh", - "FC_LocalSimple_mean1_tauresrat", "CO_Embed2_Dist_tau_d_expfit_meandiff", - "SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1", - "SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1", - "SB_TransitionMatrix_3ac_sumdiagcov", "PD_PeriodicityWang_th0_01"] - catch24 : bool, optional, default=True - Extract the mean and standard deviation as well as the 22 Catch22 features if - true. If a List of specific features to extract is provided, "Mean" and/or - "StandardDeviation" must be added to the List to extract these features. - outlier_norm : bool, optional, default=False - Normalise each series during the two outlier Catch22 features, which can take a - while to process for large values. - replace_nans : bool, optional, default=True - Replace NaN or inf values from the Catch22 transform with 0. - use_pycatch22 : bool, optional, default=True - Wraps the C based pycatch22 implementation for tsml. - (https://github.com/DynamicsAndNeuralSystems/pycatch22). This requires the - ``pycatch22`` package to be installed if True. - estimator : sklearn classifier, optional, default=None - An sklearn estimator to be built using the transformed data. - Defaults to sklearn RandomForestClassifier(n_estimators=200) - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. 
- parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib for Catch22, - if None a 'prefer' value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. If input is a list, the length - of the first series is used. - n_classes_ : int - Number of classes. Extracted from the data. - classes_ : ndarray of shape (n_classes_) - Holds the label for each class. - class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - - See Also - -------- - Catch22Transformer - Catch22Regressor - - Notes - ----- - Authors `catch22ForestClassifier `_. - - For the Java version, see `tsml `_. - - References - ---------- - .. [1] Lubba, Carl H., et al. "catch22: Canonical time-series characteristics." - Data Mining and Knowledge Discovery 33.6 (2019): 1821-1852. - https://link.springer.com/article/10.1007/s10618-019-00647-x - - Examples - -------- - >>> from tsml.feature_based import Catch22Classifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, random_state=0) - >>> clf = Catch22Classifier(random_state=0) - >>> clf.fit(X, y) - Catch22Classifier(...) - >>> clf.predict(X) - array([0, 1, 1, 0, 0, 1, 0, 1]) - """ - - def __init__( - self, - features="all", - catch24=True, - outlier_norm=False, - replace_nans=True, - use_pycatch22=True, - estimator=None, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.features = features - self.catch24 = catch24 - self.outlier_norm = outlier_norm - self.replace_nans = replace_nans - self.use_pycatch22 = use_pycatch22 - self.estimator = estimator - self.random_state = random_state - self.n_jobs = n_jobs - self.parallel_backend = parallel_backend - - if use_pycatch22: - _check_optional_dependency("pycatch22", "pycatch22", self) - - super().__init__() - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) or - list of size (n_instances) of 2D np.ndarray (n_channels, - n_timepoints_i), where n_timepoints_i is length of series i - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. 
- """ - X, y = self._validate_data(X=X, y=y, ensure_min_samples=2) - X = self._convert_X(X) - - check_classification_targets(y) - - self.n_instances_ = len(X) - self.n_channels_, self.n_timepoints_ = X[0].shape - self.classes_ = np.unique(y) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, class_val in enumerate(self.classes_): - self.class_dictionary_[class_val] = index - - if self.n_classes_ == 1: - return self - - self._n_jobs = check_n_jobs(self.n_jobs) - - self._transformer = Catch22Transformer( - features=self.features, - catch24=self.catch24, - outlier_norm=self.outlier_norm, - replace_nans=self.replace_nans, - use_pycatch22=self.use_pycatch22, - n_jobs=self._n_jobs, - parallel_backend=self.parallel_backend, - ) - - self._estimator = _clone_estimator( - ( - RandomForestClassifier(n_estimators=200) - if self.estimator is None - else self.estimator - ), - self.random_state, - ) - - m = getattr(self._estimator, "n_jobs", None) - if m is not None: - self._estimator.n_jobs = self._n_jobs - - X_t = self._transformer.fit_transform(X, y) - self._estimator.fit(X_t, y) - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) or - list of size (n_instances) of 2D np.ndarray (n_channels, - n_timepoints_i), where n_timepoints_i is length of series i - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted class labels. - """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat(list(self.class_dictionary_.keys()), X.shape[0], axis=0) - - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - return self._estimator.predict(self._transformer.transform(X)) - - def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) or - list of size (n_instances) of 2D np.ndarray (n_channels, - n_timepoints_i), where n_timepoints_i is length of series i - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. - """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat([[1]], X.shape[0], axis=0) - - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - m = getattr(self._estimator, "predict_proba", None) - if callable(m): - return self._estimator.predict_proba(self._transformer.transform(X)) - else: - dists = np.zeros((X.shape[0], self.n_classes_)) - preds = self._estimator.predict(self._transformer.transform(X)) - for i in range(0, X.shape[0]): - dists[i, self.class_dictionary_[preds[i]]] = 1 - return dists - - def _more_tags(self) -> dict: - return { - "X_types": ["np_list", "3darray"], - "equal_length_only": False, - "optional_dependency": self.use_pycatch22, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. 
- - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "estimator": RandomForestClassifier(n_estimators=2), - "features": ( - "Mean", - "DN_HistogramMode_5", - "SB_BinaryStats_mean_longstretch1", - ), - } - - -class Catch22Regressor(RegressorMixin, BaseTimeSeriesEstimator): - """Canonical Time-series Characteristics (catch22) regressor. - - This regressor simply transforms the input data using the Catch22 [1] - transformer and builds a provided estimator using the transformed data. - - Parameters - ---------- - features : int/str or List of int/str, optional, default="all" - The Catch22 features to extract by feature index, feature name as a str or as a - list of names or indices for multiple features. If "all", all features are - extracted. - Valid features are as follows: - ["DN_HistogramMode_5", "DN_HistogramMode_10", - "SB_BinaryStats_diff_longstretch0", "DN_OutlierInclude_p_001_mdrmd", - "DN_OutlierInclude_n_001_mdrmd", "CO_f1ecac", "CO_FirstMin_ac", - "SP_Summaries_welch_rect_area_5_1", "SP_Summaries_welch_rect_centroid", - "FC_LocalSimple_mean3_stderr", "CO_trev_1_num", "CO_HistogramAMI_even_2_5", - "IN_AutoMutualInfoStats_40_gaussian_fmmi", "MD_hrv_classic_pnn40", - "SB_BinaryStats_mean_longstretch1", "SB_MotifThree_quantile_hh", - "FC_LocalSimple_mean1_tauresrat", "CO_Embed2_Dist_tau_d_expfit_meandiff", - "SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1", - "SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1", - "SB_TransitionMatrix_3ac_sumdiagcov", "PD_PeriodicityWang_th0_01"] - catch24 : bool, optional, default=True - Extract the mean and standard deviation as well as the 22 Catch22 features if - true. If a List of specific features to extract is provided, "Mean" and/or - "StandardDeviation" must be added to the List to extract these features. - outlier_norm : bool, optional, default=False - Normalise each series during the two outlier Catch22 features, which can take a - while to process for large values. - replace_nans : bool, optional, default=True - Replace NaN or inf values from the Catch22 transform with 0. - use_pycatch22 : bool, optional, default=False - Wraps the C based pycatch22 implementation for tsml. - (https://github.com/DynamicsAndNeuralSystems/pycatch22). This requires the - ``pycatch22`` package to be installed if True. - estimator : sklearn regressor, optional, default=None - An sklearn estimator to be built using the transformed data. - Defaults to sklearn RandomForestRegressor(n_estimators=200) - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib for Catch22, - if None a 'prefer' value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. 
- n_timepoints_ : int - The length of each series in the training set. - - See Also - -------- - Catch22Transformer - Catch22Classifier - - References - ---------- - .. [1] Lubba, Carl H., et al. "catch22: Canonical time-series characteristics." - Data Mining and Knowledge Discovery 33.6 (2019): 1821-1852. - https://link.springer.com/article/10.1007/s10618-019-00647-x - - Examples - -------- - >>> from tsml.feature_based import Catch22Regressor - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, - ... regression_target=True, random_state=0) - >>> reg = Catch22Regressor(random_state=0) - >>> reg.fit(X, y) - Catch22Regressor(...) - >>> reg.predict(X) - array([0.44505834, 1.28376726, 1.09799075, 0.64209462, 0.59410108, - 1.1746538 , 0.70590611, 1.13361721]) - """ - - def __init__( - self, - features="all", - catch24=True, - outlier_norm=False, - replace_nans=True, - use_pycatch22=False, - estimator=None, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.features = features - self.catch24 = catch24 - self.outlier_norm = outlier_norm - self.replace_nans = replace_nans - self.use_pycatch22 = use_pycatch22 - self.estimator = estimator - self.random_state = random_state - self.n_jobs = n_jobs - self.parallel_backend = parallel_backend - - super().__init__() - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) or - list of size (n_instances) of 2D np.ndarray (n_channels, - n_timepoints_i), where n_timepoints_i is length of series i - The training data. - y : 1D np.ndarray of shape (n_instances) - The target labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. - """ - X, y = self._validate_data(X=X, y=y, ensure_min_samples=2, y_numeric=True) - X = self._convert_X(X) - - self.n_instances_ = len(X) - self.n_channels_, self.n_timepoints_ = X[0].shape - - self._n_jobs = check_n_jobs(self.n_jobs) - - self._transformer = Catch22Transformer( - features=self.features, - catch24=self.catch24, - outlier_norm=self.outlier_norm, - replace_nans=self.replace_nans, - n_jobs=self._n_jobs, - parallel_backend=self.parallel_backend, - ) - - self._estimator = _clone_estimator( - ( - RandomForestRegressor(n_estimators=200) - if self.estimator is None - else self.estimator - ), - self.random_state, - ) - - m = getattr(self._estimator, "n_jobs", None) - if m is not None: - self._estimator.n_jobs = self._n_jobs - - X_t = self._transformer.fit_transform(X, y) - self._estimator.fit(X_t, y) - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) or - list of size (n_instances) of 2D np.ndarray (n_channels, - n_timepoints_i), where n_timepoints_i is length of series i - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted target labels. 
- """ - check_is_fitted(self) - - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - return self._estimator.predict(self._transformer.transform(X)) - - def _more_tags(self) -> dict: - return { - "X_types": ["np_list", "3darray"], - "equal_length_only": False, - "optional_dependency": self.use_pycatch22, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "estimator": RandomForestRegressor(n_estimators=2), - "features": ( - "Mean", - "DN_HistogramMode_5", - "SB_BinaryStats_mean_longstretch1", - ), - } diff --git a/tsml/hybrid/__init__.py b/tsml/hybrid/__init__.py deleted file mode 100644 index bb4ea15..0000000 --- a/tsml/hybrid/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Hybrid estimators.""" - -__all__ = ["RISTClassifier", "RISTRegressor"] - -from tsml.hybrid._rist import RISTClassifier, RISTRegressor diff --git a/tsml/hybrid/_rist.py b/tsml/hybrid/_rist.py deleted file mode 100644 index 709cc6f..0000000 --- a/tsml/hybrid/_rist.py +++ /dev/null @@ -1,618 +0,0 @@ -"""Randomised Interval-Shapelet Transformation (RIST) pipeline estimators.""" - -__author__ = ["MatthewMiddlehurst"] - -from typing import List, Union - -import numpy as np -from sklearn.base import ClassifierMixin, RegressorMixin -from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor -from sklearn.ensemble._base import _set_random_states -from sklearn.utils.validation import check_is_fitted, check_random_state - -from tsml.base import BaseTimeSeriesEstimator, _clone_estimator -from tsml.transformations import ( - ARCoefficientTransformer, - Catch22Transformer, - FunctionTransformer, - PeriodogramTransformer, - RandomDilatedShapeletTransformer, - RandomIntervalTransformer, -) -from tsml.utils.numba_functions.general import first_order_differences_3d -from tsml.utils.numba_functions.stats import ( - row_iqr, - row_mean, - row_median, - row_numba_max, - row_numba_min, - row_ppv, - row_slope, - row_std, -) -from tsml.utils.validation import _check_optional_dependency, check_n_jobs - - -class RISTClassifier(ClassifierMixin, BaseTimeSeriesEstimator): - """Randomised Interval-Shapelet Transformation (RIST) pipeline classifier. - - This classifier is a hybrid pipeline using the RandomIntervalTransformer using - Catch22 features and summary stats, and the RandomDilatedShapeletTransformer. - Both transforms extract features from different series transformations (1st Order - Differences, PeriodogramTransformer, and ARCoefficientTransformer). - An ExtraTreesClassifier with 200 trees is used as the estimator for the - concatenated feature vector output. - - Parameters - ---------- - n_intervals : int, callable or None, default=None, - The number of intervals of random length, position and dimension to be - extracted for the interval portion of the pipeline. Input should be an int or - a function that takes a 3D np.ndarray input and returns an int. Functions may - extract a different number of intervals per `series_transformer` output. 
- If None, extracts `int(np.sqrt(X.shape[2]) * np.sqrt(X.shape[1]) * 15 + 5)` - intervals where `Xt` is the series representation data. - n_shapelets : int, callable or None, default=None, - The number of shapelets of random dilation and position to be extracted for the - shapelet portion of the pipeline. Input should be an int or - a function that takes a 3D np.ndarray input and returns an int. Functions may - extract a different number of shapelets per `series_transformer` output. - If None, extracts `int(np.sqrt(Xt.shape[2]) * 200 + 5)` shapelets where `Xt` is - the series representation data. - series_transformers : TransformerMixin, list, tuple, or None, default=None - The transformers to apply to the series before extracting intervals and - shapelets. If None, use the series as is. If "default", use [None, 1st Order - Differences, PeriodogramTransformer, and ARCoefficientTransformer]. - - A list or tuple of transformers will extract intervals from - all transformations concatenate the output. Including None in the list or tuple - will use the series as is for interval extraction. - use_pycatch22 : bool, optional, default=True - Wraps the C based pycatch22 implementation for aeon. - (https://github.com/DynamicsAndNeuralSystems/pycatch22). This requires the - ``pycatch22`` package to be installed if True. - use_pyfftw : bool, default=True - Whether to use the pyfftw library for FFT calculations. Requires the pyfftw - package to be installed. - estimator : sklearn classifier, default=None - An sklearn estimator to be built using the transformed data. Defaults to an - ExtraTreesClassifier with 200 trees. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - n_classes_ : int - Number of classes. Extracted from the data. - classes_ : ndarray of shape (n_classes_) - Holds the label for each class. - class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - - See Also - -------- - RandomIntervalTransformer - RandomDilatedShapeletTransformer - RISTRegressor - - Examples - -------- - >>> from tsml.hybrid import RISTClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, random_state=0) - >>> clf = RISTClassifier(random_state=0) # doctest: +SKIP - >>> clf.fit(X, y) # doctest: +SKIP - RISTClassifier(...) 
- >>> clf.predict(X) # doctest: +SKIP - array([0, 1, 1, 0, 0, 1, 0, 1]) - """ - - def __init__( - self, - n_intervals=None, - n_shapelets=None, - series_transformers="default", - use_pycatch22=True, - use_pyfftw=True, - estimator=None, - n_jobs=1, - random_state=None, - ): - self.n_intervals = n_intervals - self.n_shapelets = n_shapelets - self.series_transformers = series_transformers - self.use_pycatch22 = use_pycatch22 - self.use_pyfftw = use_pyfftw - self.estimator = estimator - self.random_state = random_state - self.n_jobs = n_jobs - - if use_pycatch22: - _check_optional_dependency("pycatch22", "pycatch22", self) - if use_pyfftw: - _check_optional_dependency("pyfftw", "pyfftw", self) - - super().__init__() - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. - """ - X, y = self._validate_data( - X=X, y=y, ensure_min_samples=2, ensure_min_series_length=3 - ) - X = self._convert_X(X) - - self.n_instances_, self.n_channels_, self.n_timepoints_ = X.shape - self.classes_ = np.unique(y) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, class_val in enumerate(self.classes_): - self.class_dictionary_[class_val] = index - - self._n_jobs = check_n_jobs(self.n_jobs) - - self._estimator = _clone_estimator( - ( - ExtraTreesClassifier(n_estimators=200, criterion="entropy") - if self.estimator is None - else self.estimator - ), - self.random_state, - ) - - m = getattr(self._estimator, "n_jobs", None) - if m is not None: - self._estimator.n_jobs = self._n_jobs - - X_t, self._series_transformers, self._transformers = _fit_transforms( - X, - y, - self.series_transformers, - self.n_intervals, - self.n_shapelets, - self.use_pyfftw, - self.use_pycatch22, - self.random_state, - self._n_jobs, - ) - self._estimator.fit(X_t, y) - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted class labels. - """ - check_is_fitted(self) - - X = self._validate_data(X=X, reset=False, ensure_min_series_length=3) - X = self._convert_X(X) - - return self._estimator.predict( - _transform_data(X, self._series_transformers, self._transformers) - ) - - def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. 
- """ - check_is_fitted(self) - - X = self._validate_data(X=X, reset=False, ensure_min_series_length=3) - X = self._convert_X(X) - - m = getattr(self._estimator, "predict_proba", None) - if callable(m): - return self._estimator.predict_proba( - _transform_data(X, self._series_transformers, self._transformers) - ) - else: - dists = np.zeros((X.shape[0], self.n_classes_)) - preds = self._estimator.predict( - _transform_data(X, self._series_transformers, self._transformers) - ) - for i in range(0, X.shape[0]): - dists[i, self.class_dictionary_[preds[i]]] = 1 - return dists - - def _more_tags(self) -> dict: - return { - "optional_dependency": self.use_pycatch22 or self.use_pyfftw, - "non_deterministic": True, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "series_transformers": [ - None, - FunctionTransformer(func=first_order_differences_3d, validate=False), - ], - "n_intervals": 1, - "n_shapelets": 2, - "estimator": ExtraTreesClassifier(n_estimators=2, criterion="entropy"), - } - - -class RISTRegressor(RegressorMixin, BaseTimeSeriesEstimator): - """Randomised Interval-Shapelet Transformation (RIST) pipeline regressor. - - This regressor is a hybrid pipeline using the RandomIntervalTransformer using - Catch22 features and summary stats, and the RandomDilatedShapeletTransformer. - Both transforms extract features from different series transformations (1st Order - Differences, PeriodogramTransformer, and ARCoefficientTransformer). - An ExtraTreesRegressor with 200 trees is used as the estimator for the - concatenated feature vector output. - - Parameters - ---------- - n_intervals : int, callable or None, default=None, - The number of intervals of random length, position and dimension to be - extracted for the interval portion of the pipeline. Input should be an int or - a function that takes a 3D np.ndarray input and returns an int. Functions may - extract a different number of intervals per `series_transformer` output. - If None, extracts `int(np.sqrt(X.shape[2]) * np.sqrt(X.shape[1]) * 15 + 5)` - intervals where `Xt` is the series representation data. - n_shapelets : int, callable or None, default=None, - The number of shapelets of random dilation and position to be extracted for the - shapelet portion of the pipeline. Input should be an int or - a function that takes a 3D np.ndarray input and returns an int. Functions may - extract a different number of shapelets per `series_transformer` output. - If None, extracts `int(np.sqrt(Xt.shape[2]) * 200 + 5)` shapelets where `Xt` is - the series representation data. - series_transformers : TransformerMixin, list, tuple, or None, default=None - The transformers to apply to the series before extracting intervals and - shapelets. If None, use the series as is. If "default", use [None, 1st Order - Differences, PeriodogramTransformer, and ARCoefficientTransformer]. - - A list or tuple of transformers will extract intervals from - all transformations concatenate the output. Including None in the list or tuple - will use the series as is for interval extraction. 
- use_pycatch22 : bool, optional, default=True - Wraps the C based pycatch22 implementation for aeon. - (https://github.com/DynamicsAndNeuralSystems/pycatch22). This requires the - ``pycatch22`` package to be installed if True. - use_pyfftw : bool, default=True - Whether to use the pyfftw library for FFT calculations. Requires the pyfftw - package to be installed. - estimator : sklearn classifier, default=None - An sklearn estimator to be built using the transformed data. Defaults to an - ExtraTreesRegressor with 200 trees. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - - See Also - -------- - RandomIntervalTransformer - RandomDilatedShapeletTransformer - RISTClassifier - - Examples - -------- - >>> from tsml.hybrid import RISTRegressor - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, - ... regression_target=True, random_state=0) - >>> reg = RISTRegressor(random_state=0) # doctest: +SKIP - >>> reg.fit(X, y) # doctest: +SKIP - RISTRegressor(...) - >>> reg.predict(X) # doctest: +SKIP - array([0.31798318, 1.41426301, 1.06414747, 0.6924721 , 0.56660146, - 1.26538944, 0.52324808, 1.0939405 ]) - """ - - def __init__( - self, - n_intervals=None, - n_shapelets=None, - series_transformers="default", - use_pycatch22=True, - use_pyfftw=True, - estimator=None, - n_jobs=1, - random_state=None, - ): - self.n_intervals = n_intervals - self.n_shapelets = n_shapelets - self.series_transformers = series_transformers - self.use_pycatch22 = use_pycatch22 - self.use_pyfftw = use_pyfftw - self.estimator = estimator - self.random_state = random_state - self.n_jobs = n_jobs - - if use_pycatch22: - _check_optional_dependency("pycatch22", "pycatch22", self) - if use_pyfftw: - _check_optional_dependency("pyfftw", "pyfftw", self) - - super().__init__() - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The target labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. 
- """ - X, y = self._validate_data( - X=X, y=y, ensure_min_samples=2, ensure_min_series_length=3 - ) - X = self._convert_X(X) - - self.n_instances_, self.n_channels_, self.n_timepoints_ = X.shape - - self._n_jobs = check_n_jobs(self.n_jobs) - - self._estimator = _clone_estimator( - ( - ExtraTreesRegressor(n_estimators=200) - if self.estimator is None - else self.estimator - ), - self.random_state, - ) - - m = getattr(self._estimator, "n_jobs", None) - if m is not None: - self._estimator.n_jobs = self._n_jobs - - X_t, self._series_transformers, self._transformers = _fit_transforms( - X, - y, - self.series_transformers, - self.n_intervals, - self.n_shapelets, - self.use_pyfftw, - self.use_pycatch22, - self.random_state, - self._n_jobs, - ) - self._estimator.fit(X_t, y) - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted target labels. - """ - check_is_fitted(self) - - X = self._validate_data(X=X, reset=False, ensure_min_series_length=3) - X = self._convert_X(X) - - return self._estimator.predict( - _transform_data(X, self._series_transformers, self._transformers) - ) - - def _more_tags(self) -> dict: - return { - "optional_dependency": self.use_pycatch22 or self.use_pyfftw, - "non_deterministic": True, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. 
- """ - return { - "series_transformers": [ - None, - FunctionTransformer(func=first_order_differences_3d, validate=False), - ], - "n_intervals": 1, - "n_shapelets": 2, - "estimator": ExtraTreesRegressor(n_estimators=2), - } - - -def _fit_transforms( - X, - y, - series_transformers, - n_intervals, - n_shapelets, - use_pyfftw, - use_pycatch22, - random_state, - n_jobs, -): - rng = check_random_state(random_state) - - if series_transformers == "default": - series_transformers = [ - None, - FunctionTransformer(func=first_order_differences_3d, validate=False), - PeriodogramTransformer(use_pyfftw=use_pyfftw), - ARCoefficientTransformer( - replace_nan=True, order=int(12 * (X.shape[2] / 100.0) ** 0.25) - ), - ] - elif isinstance(series_transformers, (list, tuple)): - series_transformers = [ - None if st is None else _clone_estimator(st, random_state=rng) - for st in series_transformers - ] - else: - series_transformers = [ - ( - None - if series_transformers is None - else _clone_estimator(series_transformers, random_state=rng) - ) - ] - - X_t = np.empty((X.shape[0], 0)) - transformers = [] - for st in series_transformers: - if st is not None: - s = st.fit_transform(X, y) - else: - s = X - - if n_intervals is None: - n_intervals = int(np.sqrt(X.shape[2]) * np.sqrt(X.shape[1]) * 15 + 5) - elif callable(n_intervals): - n_intervals = n_intervals(s) - else: - n_intervals = n_intervals - - ct = RandomIntervalTransformer( - n_intervals=n_intervals, - features=[ - Catch22Transformer( - outlier_norm=True, replace_nans=True, use_pycatch22=use_pycatch22 - ), - row_mean, - row_std, - row_slope, - row_median, - row_iqr, - row_numba_min, - row_numba_max, - row_ppv, - ], - n_jobs=n_jobs, - ) - _set_random_states(ct, rng) - transformers.append(ct) - t = ct.fit_transform(s, y) - - X_t = np.hstack((X_t, t)) - - if n_shapelets is None: - n_shapelets = int(np.sqrt(X.shape[2]) * 200 + 5) - elif callable(n_shapelets): - n_shapelets = n_shapelets(s) - else: - n_shapelets = n_shapelets - - st = RandomDilatedShapeletTransformer(max_shapelets=n_shapelets, n_jobs=n_jobs) - _set_random_states(st, rng) - transformers.append(st) - t = st.fit_transform(s, y) - - X_t = np.hstack((X_t, t)) - - X_t = np.nan_to_num(X_t, nan=0.0, posinf=0.0, neginf=0.0) - return X_t, series_transformers, transformers - - -def _transform_data(X, series_transformers, transformers): - X_t = np.empty((X.shape[0], 0)) - for i, st in enumerate(series_transformers): - if st is not None: - s = st.transform(X) - else: - s = X - - t = transformers[i * 2].transform(s) - X_t = np.hstack((X_t, t)) - - t = transformers[i * 2 + 1].transform(s) - X_t = np.hstack((X_t, t)) - - X_t = np.nan_to_num(X_t, nan=0.0, posinf=0.0, neginf=0.0) - return X_t diff --git a/tsml/interval_based/__init__.py b/tsml/interval_based/__init__.py index 5a4cd24..3633a1f 100644 --- a/tsml/interval_based/__init__.py +++ b/tsml/interval_based/__init__.py @@ -2,30 +2,14 @@ __all__ = [ "BaseIntervalForest", - "CIFClassifier", - "CIFRegressor", - "DrCIFClassifier", - "DrCIFRegressor", "IntervalForestClassifier", "IntervalForestRegressor", "RandomIntervalClassifier", "RandomIntervalRegressor", "SupervisedIntervalClassifier", - "RISEClassifier", - "RISERegressor", - "STSFClassifier", - "RSTSFClassifier", - "TSFClassifier", - "TSFRegressor", ] from tsml.interval_based._base import BaseIntervalForest -from tsml.interval_based._cif import ( - CIFClassifier, - CIFRegressor, - DrCIFClassifier, - DrCIFRegressor, -) from tsml.interval_based._interval_forest import ( IntervalForestClassifier, 
IntervalForestRegressor, @@ -35,6 +19,3 @@ RandomIntervalRegressor, SupervisedIntervalClassifier, ) -from tsml.interval_based._rise import RISEClassifier, RISERegressor -from tsml.interval_based._stsf import RSTSFClassifier, STSFClassifier -from tsml.interval_based._tsf import TSFClassifier, TSFRegressor diff --git a/tsml/interval_based/_cif.py b/tsml/interval_based/_cif.py deleted file mode 100644 index 8738edf..0000000 --- a/tsml/interval_based/_cif.py +++ /dev/null @@ -1,979 +0,0 @@ -"""Catch22 Interval Forest (CIF) interval-based estimators.""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = ["CIFClassifier", "CIFRegressor", "DrCIFClassifier", "DrCIFRegressor"] - -from typing import List, Union - -import numpy as np -from sklearn.base import ClassifierMixin, RegressorMixin - -from tsml.interval_based._base import BaseIntervalForest -from tsml.transformations import FunctionTransformer, PeriodogramTransformer -from tsml.transformations._catch22 import Catch22Transformer -from tsml.utils.numba_functions.general import first_order_differences_3d -from tsml.utils.numba_functions.stats import ( - row_iqr, - row_mean, - row_median, - row_numba_max, - row_numba_min, - row_slope, - row_std, -) -from tsml.utils.validation import _check_optional_dependency -from tsml.vector import CITClassifier - - -class CIFClassifier(ClassifierMixin, BaseIntervalForest): - """Canonical Interval Forest (CIF) Classifier. - - Implementation of the interval-based forest making use of the catch22 feature set - on randomly selected intervals described in Middlehurst et al. (2020). [1]_ - - Overview: Input "n" series with "d" dimensions of length "m". - For each tree - - Sample n_intervals intervals of random position and length - - Subsample att_subsample_size catch22 or summary statistic attributes randomly - - Randomly select dimension for each interval - - Calculate attributes for each interval, concatenate to form new - data set - - Build a decision tree on new data set - ensemble the trees with averaged probability estimates - - Parameters - ---------- - base_estimator : BaseEstimator or None, default=None - scikit-learn BaseEstimator used to build the interval ensemble. If None, use a - simple decision tree. - n_estimators : int, default=200 - Number of estimators to build for the ensemble. - n_intervals : int, str, list or tuple, default="sqrt" - Number of intervals to extract per tree for each series_transformers series. - - An int input will extract that number of intervals from the series, while a str - input will return a function of the series length (may differ per - series_transformers output) to extract that number of intervals. - Valid str inputs are: - - "sqrt": square root of the series length. - - "sqrt-div": sqrt of series length divided by the number - of series_transformers. - - A list or tuple of ints and/or strs will extract the number of intervals using - the above rules and sum the results for the final n_intervals. i.e. [4, "sqrt"] - will extract sqrt(n_timepoints) + 4 intervals. - - Different number of intervals for each series_transformers series can be - specified using a nested list or tuple. Any list or tuple input containing - another list or tuple must be the same length as the number of - series_transformers. - - While random interval extraction will extract the n_intervals intervals total - (removing duplicates), supervised intervals will run the supervised extraction - process n_intervals times, returning more intervals than specified. 
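The n_intervals rules described above accept an int, the strings "sqrt" and "sqrt-div", or a list whose resolved entries are summed (so [4, "sqrt"] yields sqrt(n_timepoints) + 4). A hypothetical resolver sketching those rules, with resolve_n_intervals as an assumed name:

import numpy as np

def resolve_n_intervals(spec, n_timepoints, n_representations=1):
    # Hypothetical helper mirroring the docstring rules: list entries are
    # resolved individually and summed, "sqrt" uses the series length, and
    # "sqrt-div" additionally divides by the number of series_transformers.
    if isinstance(spec, (list, tuple)):
        return sum(resolve_n_intervals(s, n_timepoints, n_representations) for s in spec)
    if spec == "sqrt":
        return int(np.sqrt(n_timepoints))
    if spec == "sqrt-div":
        return int(np.sqrt(n_timepoints) / n_representations)
    return int(spec)

resolve_n_intervals([4, "sqrt"], n_timepoints=100)  # 4 + sqrt(100) = 14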
- min_interval_length : int, float, list, or tuple, default=3 - Minimum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the minimum interval length. - - Different minimum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - max_interval_length : int, float, list, or tuple, default=np.inf - Maximum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the maximum interval length. - - Different maximum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - - Ignored for supervised interval_selection_method inputs. - att_subsample_size : int, float, list, tuple or None, default=None - The number of attributes to subsample for each estimator. If None, use all - - If int, use that number of attributes for all estimators. If float, use that - proportion of attributes for all estimators. - - Different subsample sizes for each series_transformers series can be specified - using a list or tuple. Any list or tuple input must be the same length as the - number of series_transformers. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding n_estimators. - Default of 0 means n_estimators are used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - use_pycatch22 : bool, optional, default=True - Wraps the C based pycatch22 implementation for aeon. - (https://github.com/DynamicsAndNeuralSystems/pycatch22). This requires the - ``pycatch22`` package to be installed if True. - save_transformed_data : bool, default=False - Save the data transformed in fit. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - n_classes_ : int - Number of classes. Extracted from the data. - classes_ : ndarray of shape (n_classes_) - Holds the label for each class. - class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - total_intervals_ : int - Total number of intervals per tree from all representations. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - intervals_ : list of shape (n_estimators) of TransformerMixin - Stores the interval extraction transformer for all estimators. 
- transformed_data_ : list of shape (n_estimators) of ndarray with shape - (n_instances_ ,total_intervals * att_subsample_size) - The transformed dataset for all estimators. Only saved when - save_transformed_data is true. - - See Also - -------- - CIFRegressor - DrCIFClassifier - - Notes - ----- - For the Java version, see - `TSML `_. - - References - ---------- - .. [1] Matthew Middlehurst and James Large and Anthony Bagnall. "The Canonical - Interval Forest (CIF) Classifier for Time Series Classification." - IEEE International Conference on Big Data 2020 - - Examples - -------- - >>> from tsml.interval_based import CIFClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, random_state=0) - >>> clf = CIFClassifier(n_estimators=10, random_state=0) - >>> clf.fit(X, y) - CIFClassifier(...) - >>> clf.predict(X) - array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0]) - """ - - def __init__( - self, - base_estimator=None, - n_estimators=200, - n_intervals="sqrt", - min_interval_length=3, - max_interval_length=np.inf, - att_subsample_size=8, - time_limit_in_minutes=None, - contract_max_n_estimators=500, - use_pycatch22=True, - save_transformed_data=False, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.use_pycatch22 = use_pycatch22 - if use_pycatch22: - _check_optional_dependency("pycatch22", "pycatch22", self) - - if isinstance(base_estimator, CITClassifier): - replace_nan = "nan" - else: - replace_nan = 0 - - interval_features = [ - Catch22Transformer(outlier_norm=True, use_pycatch22=use_pycatch22), - row_mean, - row_std, - row_slope, - ] - - super().__init__( - base_estimator=base_estimator, - n_estimators=n_estimators, - interval_selection_method="random", - n_intervals=n_intervals, - min_interval_length=min_interval_length, - max_interval_length=max_interval_length, - interval_features=interval_features, - series_transformers=None, - att_subsample_size=att_subsample_size, - replace_nan=replace_nan, - time_limit_in_minutes=time_limit_in_minutes, - contract_max_n_estimators=contract_max_n_estimators, - save_transformed_data=save_transformed_data, - random_state=random_state, - n_jobs=n_jobs, - parallel_backend=parallel_backend, - ) - - def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. - """ - return self._predict_proba(X) - - def _more_tags(self) -> dict: - return { - "optional_dependency": self.use_pycatch22, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "n_estimators": 2, - "n_intervals": 2, - "att_subsample_size": 2, - } - - -class CIFRegressor(RegressorMixin, BaseIntervalForest): - """Canonical Interval Forest (CIF) Regressor. 
- - Implementation of the interval-based forest making use of the catch22 feature set - on randomly selected intervals described in Middlehurst et al. (2020). [1]_ - - Overview: Input "n" series with "d" dimensions of length "m". - For each tree - - Sample n_intervals intervals of random position and length - - Subsample att_subsample_size catch22 or summary statistic attributes randomly - - Randomly select dimension for each interval - - Calculate attributes for each interval, concatenate to form new - data set - - Build a decision tree on new data set - ensemble the trees with averaged label estimates - - Parameters - ---------- - base_estimator : BaseEstimator or None, default=None - scikit-learn BaseEstimator used to build the interval ensemble. If None, use a - simple decision tree. - n_estimators : int, default=200 - Number of estimators to build for the ensemble. - n_intervals : int, str, list or tuple, default="sqrt" - Number of intervals to extract per tree for each series_transformers series. - - An int input will extract that number of intervals from the series, while a str - input will return a function of the series length (may differ per - series_transformers output) to extract that number of intervals. - Valid str inputs are: - - "sqrt": square root of the series length. - - "sqrt-div": sqrt of series length divided by the number - of series_transformers. - - A list or tuple of ints and/or strs will extract the number of intervals using - the above rules and sum the results for the final n_intervals. i.e. [4, "sqrt"] - will extract sqrt(n_timepoints) + 4 intervals. - - Different number of intervals for each series_transformers series can be - specified using a nested list or tuple. Any list or tuple input containing - another list or tuple must be the same length as the number of - series_transformers. - min_interval_length : int, float, list, or tuple, default=3 - Minimum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the minimum interval length. - - Different minimum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - max_interval_length : int, float, list, or tuple, default=np.inf - Maximum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the maximum interval length. - - Different maximum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - att_subsample_size : int, float, list, tuple or None, default=None - The number of attributes to subsample for each estimator. If None, use all - - If int, use that number of attributes for all estimators. If float, use that - proportion of attributes for all estimators. - - Different subsample sizes for each series_transformers series can be specified - using a list or tuple. Any list or tuple input must be the same length as the - number of series_transformers. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding n_estimators. - Default of 0 means n_estimators are used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - use_pycatch22 : bool, optional, default=True - Wraps the C based pycatch22 implementation for aeon. 
- (https://github.com/DynamicsAndNeuralSystems/pycatch22). This requires the - ``pycatch22`` package to be installed if True. - save_transformed_data : bool, default=False - Save the data transformed in fit. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - total_intervals_ : int - Total number of intervals per tree from all representations. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - intervals_ : list of shape (n_estimators) of TransformerMixin - Stores the interval extraction transformer for all estimators. - transformed_data_ : list of shape (n_estimators) of ndarray with shape - (n_instances_ ,total_intervals * att_subsample_size) - The transformed dataset for all estimators. Only saved when - save_transformed_data is true. - - See Also - -------- - CIFClassifier - DrCIFRegressor - - References - ---------- - .. [1] Matthew Middlehurst and James Large and Anthony Bagnall. "The Canonical - Interval Forest (CIF) Classifier for Time Series Classification." - IEEE International Conference on Big Data 2020 - - Examples - -------- - >>> from tsml.interval_based import CIFRegressor - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, - ... regression_target=True, random_state=0) - >>> reg = CIFRegressor(n_estimators=10, random_state=0) - >>> reg.fit(X, y) - CIFRegressor(...) 
- >>> reg.predict(X) - array([0.7252543 , 1.50132442, 0.95608366, 1.64399016, 0.42385504, - 0.60639322, 1.01919317, 1.30157483, 1.66017354, 0.2900776 ]) - """ - - def __init__( - self, - base_estimator=None, - n_estimators=200, - n_intervals="sqrt", - min_interval_length=3, - max_interval_length=np.inf, - att_subsample_size=8, - time_limit_in_minutes=None, - contract_max_n_estimators=500, - use_pycatch22=True, - save_transformed_data=False, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.use_pycatch22 = use_pycatch22 - if use_pycatch22: - _check_optional_dependency("pycatch22", "pycatch22", self) - - interval_features = [ - Catch22Transformer(outlier_norm=True, use_pycatch22=use_pycatch22), - row_mean, - row_std, - row_slope, - ] - - super().__init__( - base_estimator=base_estimator, - n_estimators=n_estimators, - interval_selection_method="random", - n_intervals=n_intervals, - min_interval_length=min_interval_length, - max_interval_length=max_interval_length, - interval_features=interval_features, - series_transformers=None, - att_subsample_size=att_subsample_size, - replace_nan=0, - time_limit_in_minutes=time_limit_in_minutes, - contract_max_n_estimators=contract_max_n_estimators, - save_transformed_data=save_transformed_data, - random_state=random_state, - n_jobs=n_jobs, - parallel_backend=parallel_backend, - ) - - def _more_tags(self) -> dict: - return { - "optional_dependency": self.use_pycatch22, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "n_estimators": 2, - "n_intervals": 2, - "att_subsample_size": 2, - } - - -class DrCIFClassifier(ClassifierMixin, BaseIntervalForest): - """Diverse Representation Canonical Interval Forest (DrCIF) Classifier. - - Extension of the CIF algorithm using multiple representations. Implementation of the - interval-based forest making use of the catch22 feature set on randomly selected - intervals on the base series, periodogram representation and differences - representation described in the HIVE-COTE 2.0 paper Middlehurst et al (2021). [1]_ - - Overview: Input "n" series with "d" dimensions of length "m". - For each tree - - Sample n_intervals intervals per representation of random position and length - - Subsample att_subsample_size catch22 or summary statistic attributes randomly - - Randomly select dimension for each interval - - Calculate attributes for each interval from its representation, concatenate - to form new data set - - Build a decision tree on new data set - Ensemble the trees with averaged probability estimates - - Parameters - ---------- - base_estimator : BaseEstimator or None, default=None - scikit-learn BaseEstimator used to build the interval ensemble. If None, use a - simple decision tree. - n_estimators : int, default=200 - Number of estimators to build for the ensemble. - n_intervals : int, str, list or tuple, default="sqrt" - Number of intervals to extract per tree for each series_transformers series. 
- - An int input will extract that number of intervals from the series, while a str - input will return a function of the series length (may differ per - series_transformers output) to extract that number of intervals. - Valid str inputs are: - - "sqrt": square root of the series length. - - "sqrt-div": sqrt of series length divided by the number - of series_transformers. - - A list or tuple of ints and/or strs will extract the number of intervals using - the above rules and sum the results for the final n_intervals. i.e. [4, "sqrt"] - will extract sqrt(n_timepoints) + 4 intervals. - - Different number of intervals for each series_transformers series can be - specified using a nested list or tuple. Any list or tuple input containing - another list or tuple must be the same length as the number of - series_transformers. - - While random interval extraction will extract the n_intervals intervals total - (removing duplicates), supervised intervals will run the supervised extraction - process n_intervals times, returning more intervals than specified. - min_interval_length : int, float, list, or tuple, default=3 - Minimum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the minimum interval length. - - Different minimum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - max_interval_length : int, float, list, or tuple, default=np.inf - Maximum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the maximum interval length. - - Different maximum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - - Ignored for supervised interval_selection_method inputs. - att_subsample_size : int, float, list, tuple or None, default=None - The number of attributes to subsample for each estimator. If None, use all - - If int, use that number of attributes for all estimators. If float, use that - proportion of attributes for all estimators. - - Different subsample sizes for each series_transformers series can be specified - using a list or tuple. Any list or tuple input must be the same length as the - number of series_transformers. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding n_estimators. - Default of 0 means n_estimators are used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - use_pycatch22 : bool, optional, default=True - Wraps the C based pycatch22 implementation for aeon. - (https://github.com/DynamicsAndNeuralSystems/pycatch22). This requires the - ``pycatch22`` package to be installed if True. - use_pyfftw : bool, default=True - Whether to use the pyfftw library for FFT calculations. Requires the pyfftw - package to be installed. - save_transformed_data : bool, default=False - Save the data transformed in fit. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. 
- ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - n_classes_ : int - Number of classes. Extracted from the data. - classes_ : ndarray of shape (n_classes_) - Holds the label for each class. - class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - total_intervals_ : int - Total number of intervals per tree from all representations. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - intervals_ : list of shape (n_estimators) of TransformerMixin - Stores the interval extraction transformer for all estimators. - transformed_data_ : list of shape (n_estimators) of ndarray with shape - (n_instances_ ,total_intervals * att_subsample_size) - The transformed dataset for all estimators. Only saved when - save_transformed_data is true. - - See Also - -------- - DrCIFRegressor - CIFClassifier - - Notes - ----- - For the Java version, see - `TSML `_. - - References - ---------- - .. [1] Middlehurst, Matthew, James Large, Michael Flynn, Jason Lines, Aaron Bostrom, - and Anthony Bagnall. "HIVE-COTE 2.0: a new meta ensemble for time series - classification." arXiv preprint arXiv:2104.07551 (2021). - - Examples - -------- - >>> from tsml.interval_based import DrCIFClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, random_state=0) - >>> clf = DrCIFClassifier(n_estimators=10, random_state=0) # doctest: +SKIP - >>> clf.fit(X, y) # doctest: +SKIP - DrCIFClassifier(...) 
- >>> clf.predict(X) # doctest: +SKIP - array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0]) - """ - - def __init__( - self, - base_estimator=None, - n_estimators=200, - n_intervals=(4, "sqrt-div"), - min_interval_length=3, - max_interval_length=0.5, - att_subsample_size=10, - time_limit_in_minutes=None, - contract_max_n_estimators=500, - use_pycatch22=True, - use_pyfftw=True, - save_transformed_data=False, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.use_pycatch22 = use_pycatch22 - if use_pycatch22: - _check_optional_dependency("pycatch22", "pycatch22", self) - - self.use_pyfftw = use_pyfftw - if use_pyfftw: - _check_optional_dependency("pyfftw", "pyfftw", self) - - if isinstance(base_estimator, CITClassifier): - replace_nan = "nan" - else: - replace_nan = 0 - - series_transformers = [ - None, - FunctionTransformer(func=first_order_differences_3d, validate=False), - PeriodogramTransformer(use_pyfftw=use_pyfftw), - ] - - interval_features = [ - Catch22Transformer(outlier_norm=True, use_pycatch22=use_pycatch22), - row_mean, - row_std, - row_slope, - row_median, - row_iqr, - row_numba_min, - row_numba_max, - ] - - super().__init__( - base_estimator=base_estimator, - n_estimators=n_estimators, - interval_selection_method="random", - n_intervals=n_intervals, - min_interval_length=min_interval_length, - max_interval_length=max_interval_length, - interval_features=interval_features, - series_transformers=series_transformers, - att_subsample_size=att_subsample_size, - replace_nan=replace_nan, - time_limit_in_minutes=time_limit_in_minutes, - contract_max_n_estimators=contract_max_n_estimators, - save_transformed_data=save_transformed_data, - random_state=random_state, - n_jobs=n_jobs, - parallel_backend=parallel_backend, - ) - - def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. - """ - return self._predict_proba(X) - - def _more_tags(self) -> dict: - return { - "optional_dependency": self.use_pycatch22 or self.use_pyfftw, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "n_estimators": 2, - "n_intervals": 2, - "att_subsample_size": 2, - } - - -class DrCIFRegressor(RegressorMixin, BaseIntervalForest): - """Diverse Representation Canonical Interval Forest (DrCIF) Regressor. - - Extension of the CIF algorithm using multiple representations. Implementation of the - interval-based forest making use of the catch22 feature set on randomly selected - intervals on the base series, periodogram representation and differences - representation described in the HIVE-COTE 2.0 paper Middlehurst et al (2021). [1]_ - - Overview: Input "n" series with "d" dimensions of length "m". 
- For each tree - - Sample n_intervals intervals per representation of random position and length - - Subsample att_subsample_size catch22 or summary statistic attributes randomly - - Randomly select dimension for each interval - - Calculate attributes for each interval from its representation, concatenate - to form new data set - - Build a decision tree on new data set - Ensemble the trees with averaged label estimates - - Parameters - ---------- - base_estimator : BaseEstimator or None, default=None - scikit-learn BaseEstimator used to build the interval ensemble. If None, use a - simple decision tree. - n_estimators : int, default=200 - Number of estimators to build for the ensemble. - n_intervals : int, str, list or tuple, default="sqrt" - Number of intervals to extract per tree for each series_transformers series. - - An int input will extract that number of intervals from the series, while a str - input will return a function of the series length (may differ per - series_transformers output) to extract that number of intervals. - Valid str inputs are: - - "sqrt": square root of the series length. - - "sqrt-div": sqrt of series length divided by the number - of series_transformers. - - A list or tuple of ints and/or strs will extract the number of intervals using - the above rules and sum the results for the final n_intervals. i.e. [4, "sqrt"] - will extract sqrt(n_timepoints) + 4 intervals. - - Different number of intervals for each series_transformers series can be - specified using a nested list or tuple. Any list or tuple input containing - another list or tuple must be the same length as the number of - series_transformers. - min_interval_length : int, float, list, or tuple, default=3 - Minimum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the minimum interval length. - - Different minimum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - max_interval_length : int, float, list, or tuple, default=np.inf - Maximum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the maximum interval length. - - Different maximum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - att_subsample_size : int, float, list, tuple or None, default=None - The number of attributes to subsample for each estimator. If None, use all - - If int, use that number of attributes for all estimators. If float, use that - proportion of attributes for all estimators. - - Different subsample sizes for each series_transformers series can be specified - using a list or tuple. Any list or tuple input must be the same length as the - number of series_transformers. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding n_estimators. - Default of 0 means n_estimators are used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - use_pycatch22 : bool, optional, default=True - Wraps the C based pycatch22 implementation for aeon. - (https://github.com/DynamicsAndNeuralSystems/pycatch22). This requires the - ``pycatch22`` package to be installed if True. - use_pyfftw : bool, default=True - Whether to use the pyfftw library for FFT calculations. 
Requires the pyfftw - package to be installed. - save_transformed_data : bool, default=False - Save the data transformed in fit. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - total_intervals_ : int - Total number of intervals per tree from all representations. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - intervals_ : list of shape (n_estimators) of TransformerMixin - Stores the interval extraction transformer for all estimators. - transformed_data_ : list of shape (n_estimators) of ndarray with shape - (n_instances_ ,total_intervals * att_subsample_size) - The transformed dataset for all estimators. Only saved when - save_transformed_data is true. - - See Also - -------- - DrCIFClassifier - CIFRegressor - - Notes - ----- - For the Java version, see - `TSML `_. - - References - ---------- - .. [1] Middlehurst, Matthew, James Large, Michael Flynn, Jason Lines, Aaron Bostrom, - and Anthony Bagnall. "HIVE-COTE 2.0: a new meta ensemble for time series - classification." arXiv preprint arXiv:2104.07551 (2021). - - Examples - -------- - >>> from tsml.interval_based import DrCIFRegressor - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, - ... regression_target=True, random_state=0) - >>> reg = DrCIFRegressor(n_estimators=10, random_state=0) # doctest: +SKIP - >>> reg.fit(X, y) # doctest: +SKIP - DrCIFRegressor(...) 
- >>> reg.predict(X) # doctest: +SKIP - array([0.7252543 , 1.50132442, 0.95608366, 1.64399016, 0.42385504, - 0.60639322, 1.01919317, 1.30157483, 1.66017354, 0.2900776 ]) - """ - - def __init__( - self, - base_estimator=None, - n_estimators=200, - n_intervals=(4, "sqrt-div"), - min_interval_length=3, - max_interval_length=0.5, - att_subsample_size=10, - time_limit_in_minutes=None, - contract_max_n_estimators=500, - use_pycatch22=True, - use_pyfftw=True, - save_transformed_data=False, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.use_pycatch22 = use_pycatch22 - if use_pycatch22: - _check_optional_dependency("pycatch22", "pycatch22", self) - - self.use_pyfftw = use_pyfftw - if use_pyfftw: - _check_optional_dependency("pyfftw", "pyfftw", self) - - series_transformers = [ - None, - FunctionTransformer(func=first_order_differences_3d, validate=False), - PeriodogramTransformer(use_pyfftw=True), - ] - - interval_features = [ - Catch22Transformer(outlier_norm=True, use_pycatch22=use_pycatch22), - row_mean, - row_std, - row_slope, - row_median, - row_iqr, - row_numba_min, - row_numba_max, - ] - - super().__init__( - base_estimator=base_estimator, - n_estimators=n_estimators, - interval_selection_method="random", - n_intervals=n_intervals, - min_interval_length=min_interval_length, - max_interval_length=max_interval_length, - interval_features=interval_features, - series_transformers=series_transformers, - att_subsample_size=att_subsample_size, - replace_nan=0, - time_limit_in_minutes=time_limit_in_minutes, - contract_max_n_estimators=contract_max_n_estimators, - save_transformed_data=save_transformed_data, - random_state=random_state, - n_jobs=n_jobs, - parallel_backend=parallel_backend, - ) - - def _more_tags(self) -> dict: - return { - "optional_dependency": self.use_pycatch22 or self.use_pyfftw, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "n_estimators": 2, - "n_intervals": 2, - "att_subsample_size": 2, - } diff --git a/tsml/interval_based/_rise.py b/tsml/interval_based/_rise.py deleted file mode 100644 index e13f330..0000000 --- a/tsml/interval_based/_rise.py +++ /dev/null @@ -1,415 +0,0 @@ -"""Random Interval Spectral Ensemble (RISE) estimators.""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = ["RISEClassifier", "RISERegressor"] - -from typing import List, Union - -import numpy as np -from sklearn.base import ClassifierMixin, RegressorMixin - -from tsml.interval_based._base import BaseIntervalForest -from tsml.transformations import ( - AutocorrelationFunctionTransformer, - PeriodogramTransformer, -) -from tsml.utils.validation import _check_optional_dependency -from tsml.vector import CITClassifier - - -class RISEClassifier(ClassifierMixin, BaseIntervalForest): - """Random Interval Spectral Ensemble (RISE) classifier. - - Input: n series length m - For each tree - - sample a random intervals - - take the ACF and PS over this interval, and concatenate features - - build a tree on new features - Ensemble the trees through averaging probabilities. 
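A self-contained sketch of the per-tree step in the RISE overview above (sample a random interval, take the ACF and power spectrum over it, concatenate), assuming a plain-numpy autocorrelation and periodogram rather than the package's transformers:

import numpy as np

def rise_features(series, rng, min_len=16, n_lags=8):
    # One random interval, then autocorrelation (ACF) and power spectrum (PS)
    # features over that interval, concatenated (illustrative sketch only).
    start = rng.integers(0, len(series) - min_len + 1)
    length = rng.integers(min_len, len(series) - start + 1)
    interval = series[start:start + length]

    centred = interval - interval.mean()
    acf_full = np.correlate(centred, centred, mode="full")[len(centred) - 1:]
    acf = acf_full[1:n_lags + 1] / acf_full[0]      # normalised ACF terms
    ps = np.abs(np.fft.rfft(interval)) ** 2         # periodogram over the interval

    return np.concatenate([acf, ps])

rng = np.random.default_rng(0)
features = rise_features(np.sin(np.linspace(0, 10, 64)), rng)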
- - Parameters - ---------- - base_estimator : BaseEstimator or None, default=None - scikit-learn BaseEstimator used to build the interval ensemble. If None, use a - simple decision tree. - n_estimators : int, default=200 - Number of estimators to build for the ensemble. - min_interval_length : int, float, list, or tuple, default=3 - Minimum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the minimum interval length. - - Different minimum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - max_interval_length : int, float, list, or tuple, default=np.inf - Maximum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the maximum interval length. - - Different maximum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - - Ignored for supervised interval_selection_method inputs. - acf_lag : int or callable, default=100 - The maximum number of autocorrelation terms to use. If callable, the function - should take a 3D numpy array of shape (n_instances, n_channels, n_timepoints) - and return an integer. - acf_min_values : int, default=0 - Never use fewer than this number of terms to find a correlation unless the - series length is too short. This will reduce n_lags if needed. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding n_estimators. - Default of 0 means n_estimators are used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - use_pyfftw : bool, default=True - Whether to use the pyfftw library for FFT calculations. Requires the pyfftw - package to be installed. - save_transformed_data : bool, default=False - Save the data transformed in fit. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - n_classes_ : int - Number of classes. Extracted from the data. - classes_ : ndarray of shape (n_classes_) - Holds the label for each class. - class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - total_intervals_ : int - Total number of intervals per tree from all representations. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. 
- intervals_ : list of shape (n_estimators) of TransformerMixin - Stores the interval extraction transformer for all estimators. - transformed_data_ : list of shape (n_estimators) of ndarray with shape - (n_instances_ ,total_intervals * att_subsample_size) - The transformed dataset for all estimators. Only saved when - save_transformed_data is true. - - See Also - -------- - RISERegressor - - Notes - ----- - For the Java version, see - `TSML `_. - - References - ---------- - .. [1] Jason Lines, Sarah Taylor and Anthony Bagnall, "Time Series Classification - with HIVE-COTE: The Hierarchical Vote Collective of Transformation-Based - Ensembles", ACM Transactions on Knowledge and Data Engineering, 12(5): 2018 - - Examples - -------- - >>> from tsml.interval_based import RISEClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, random_state=0) - >>> clf = RISEClassifier(n_estimators=10, random_state=0) # doctest: +SKIP - >>> clf.fit(X, y) # doctest: +SKIP - RISEClassifier(...) - >>> clf.predict(X) # doctest: +SKIP - array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0]) - """ - - def __init__( - self, - base_estimator=None, - n_estimators=200, - min_interval_length=3, - max_interval_length=np.inf, - acf_lag=100, - acf_min_values=4, - time_limit_in_minutes=None, - contract_max_n_estimators=500, - use_pyfftw=True, - save_transformed_data=False, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.acf_lag = acf_lag - self.acf_min_values = acf_min_values - - self.use_pyfftw = use_pyfftw - if use_pyfftw: - _check_optional_dependency("pyfftw", "pyfftw", self) - - if isinstance(base_estimator, CITClassifier): - replace_nan = "nan" - else: - replace_nan = 0 - - interval_features = [ - PeriodogramTransformer(use_pyfftw=use_pyfftw, pad_with="mean"), - AutocorrelationFunctionTransformer( - n_lags=acf_lag, min_values=acf_min_values - ), - ] - - super().__init__( - base_estimator=base_estimator, - n_estimators=n_estimators, - interval_selection_method="random", - n_intervals=1, - min_interval_length=min_interval_length, - max_interval_length=max_interval_length, - interval_features=interval_features, - series_transformers=None, - att_subsample_size=None, - replace_nan=replace_nan, - time_limit_in_minutes=time_limit_in_minutes, - contract_max_n_estimators=contract_max_n_estimators, - save_transformed_data=save_transformed_data, - random_state=random_state, - n_jobs=n_jobs, - parallel_backend=parallel_backend, - ) - - def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. - """ - return self._predict_proba(X) - - def _more_tags(self) -> dict: - return { - "optional_dependency": self.use_pyfftw, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. 
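predict_proba above returns probabilities in the ordering of classes_, obtained by averaging the per-tree estimates as described in the overview. A small illustration of that averaging step with made-up numbers:

import numpy as np

# Three hypothetical trees, two instances, two classes: the ensemble
# probability is the mean over trees, and the predicted label is the
# argmax over the class axis (matching the ordering in classes_).
per_tree_proba = np.array([
    [[0.9, 0.1], [0.2, 0.8]],
    [[0.7, 0.3], [0.4, 0.6]],
    [[0.8, 0.2], [0.1, 0.9]],
])
ensemble_proba = per_tree_proba.mean(axis=0)   # shape (n_instances, n_classes)
predictions = ensemble_proba.argmax(axis=1)    # -> array([0, 1])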
- """ - return { - "n_estimators": 2, - "acf_lag": 10, - "min_interval_length": 5, - } - - -class RISERegressor(RegressorMixin, BaseIntervalForest): - """Random Interval Spectral Ensemble (RISE) regressor. - - Input: n series length m - For each tree - - sample a random intervals - - take the ACF and PS over this interval, and concatenate features - - build a tree on new features - Ensemble the trees through averaging predictions. - - Parameters - ---------- - base_estimator : BaseEstimator or None, default=None - scikit-learn BaseEstimator used to build the interval ensemble. If None, use a - simple decision tree. - n_estimators : int, default=200 - Number of estimators to build for the ensemble. - min_interval_length : int, float, list, or tuple, default=3 - Minimum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the minimum interval length. - - Different minimum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - max_interval_length : int, float, list, or tuple, default=np.inf - Maximum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the maximum interval length. - - Different maximum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - acf_lag : int or callable, default=100 - The maximum number of autocorrelation terms to use. If callable, the function - should take a 3D numpy array of shape (n_instances, n_channels, n_timepoints) - and return an integer. - acf_min_values : int, default=0 - Never use fewer than this number of terms to find a correlation unless the - series length is too short. This will reduce n_lags if needed. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding n_estimators. - Default of 0 means n_estimators are used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - use_pyfftw : bool, default=True - Whether to use the pyfftw library for FFT calculations. Requires the pyfftw - package to be installed. - save_transformed_data : bool, default=False - Save the data transformed in fit. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - total_intervals_ : int - Total number of intervals per tree from all representations. 
- estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - intervals_ : list of shape (n_estimators) of TransformerMixin - Stores the interval extraction transformer for all estimators. - transformed_data_ : list of shape (n_estimators) of ndarray with shape - (n_instances_ ,total_intervals * att_subsample_size) - The transformed dataset for all estimators. Only saved when - save_transformed_data is true. - - See Also - -------- - RISEClassifier - - References - ---------- - .. [1] Jason Lines, Sarah Taylor and Anthony Bagnall, "Time Series Classification - with HIVE-COTE: The Hierarchical Vote Collective of Transformation-Based - Ensembles", ACM Transactions on Knowledge and Data Engineering, 12(5): 2018 - - Examples - -------- - >>> from tsml.interval_based import RISERegressor - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, - ... regression_target=True, random_state=0) - >>> reg = RISERegressor(n_estimators=10, random_state=0) # doctest: +SKIP - >>> reg.fit(X, y) # doctest: +SKIP - RISERegressor(...) - >>> reg.predict(X) # doctest: +SKIP - array([0.7252543 , 1.50132442, 0.95608366, 1.64399016, 0.42385504, - 0.60639322, 1.01919317, 1.30157483, 1.66017354, 0.2900776 ]) - """ - - def __init__( - self, - base_estimator=None, - n_estimators=200, - min_interval_length=16, - max_interval_length=np.inf, - acf_lag=100, - acf_min_values=4, - time_limit_in_minutes=None, - contract_max_n_estimators=500, - use_pyfftw=True, - save_transformed_data=False, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.acf_lag = acf_lag - self.acf_min_values = acf_min_values - - self.use_pyfftw = use_pyfftw - if use_pyfftw: - _check_optional_dependency("pyfftw", "pyfftw", self) - - interval_features = [ - PeriodogramTransformer(use_pyfftw=use_pyfftw, pad_with="mean"), - AutocorrelationFunctionTransformer( - n_lags=acf_lag, min_values=acf_min_values - ), - ] - - super().__init__( - base_estimator=base_estimator, - n_estimators=n_estimators, - interval_selection_method="random", - n_intervals=1, - min_interval_length=min_interval_length, - max_interval_length=max_interval_length, - interval_features=interval_features, - series_transformers=None, - att_subsample_size=None, - replace_nan=0, - time_limit_in_minutes=time_limit_in_minutes, - contract_max_n_estimators=contract_max_n_estimators, - save_transformed_data=save_transformed_data, - random_state=random_state, - n_jobs=n_jobs, - parallel_backend=parallel_backend, - ) - - def _more_tags(self) -> dict: - return { - "optional_dependency": self.use_pyfftw, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. 
- """ - return { - "n_estimators": 2, - "acf_lag": 10, - "min_interval_length": 5, - } diff --git a/tsml/interval_based/_stsf.py b/tsml/interval_based/_stsf.py deleted file mode 100644 index 2f5b393..0000000 --- a/tsml/interval_based/_stsf.py +++ /dev/null @@ -1,476 +0,0 @@ -"""Supervised Time Series Forest classifiers.""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = ["STSFClassifier", "RSTSFClassifier"] - -from typing import List, Union - -import numpy as np -from sklearn.base import ClassifierMixin -from sklearn.ensemble import ExtraTreesClassifier -from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import check_is_fitted - -from tsml.base import BaseTimeSeriesEstimator -from tsml.interval_based._base import BaseIntervalForest -from tsml.transformations import ( - ARCoefficientTransformer, - FunctionTransformer, - PeriodogramTransformer, - SupervisedIntervalTransformer, -) -from tsml.utils.numba_functions.general import first_order_differences_3d -from tsml.utils.numba_functions.stats import ( - row_iqr, - row_mean, - row_median, - row_numba_max, - row_numba_min, - row_slope, - row_std, -) -from tsml.utils.validation import _check_optional_dependency, check_n_jobs - - -class STSFClassifier(ClassifierMixin, BaseIntervalForest): - """Supervised Time Series Forest (STSF). - - An ensemble of decision trees built on intervals selected through a supervised - process as described in _[1]. - Overview: Input n series length m - For each tree - - sample X using class-balanced bagging - - sample intervals for all 3 representations and 7 features using supervised - - method - - find mean, median, std, slope, iqr, min and max using their corresponding - - interval for each representation, concatenate to form new data set - - build a decision tree on new data set - Ensemble the trees with averaged probability estimates. - - Parameters - ---------- - base_estimator : BaseEstimator or None, default=None - scikit-learn BaseEstimator used to build the interval ensemble. If None, use a - simple decision tree. - n_estimators : int, default=200 - Number of estimators to build for the ensemble. - min_interval_length : int, float, list, or tuple, default=3 - Minimum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the minimum interval length. - - Different minimum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding n_estimators. - Default of 0 means n_estimators are used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - use_pyfftw : bool, default=True - Whether to use the pyfftw library for FFT calculations. Requires the pyfftw - package to be installed. - save_transformed_data : bool, default=False - Save the data transformed in fit. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. 
- parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - n_classes_ : int - Number of classes. Extracted from the data. - classes_ : ndarray of shape (n_classes_) - Holds the label for each class. - class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - total_intervals_ : int - Total number of intervals per tree from all representations. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - intervals_ : list of shape (n_estimators) of TransformerMixin - Stores the interval extraction transformer for all estimators. - transformed_data_ : list of shape (n_estimators) of ndarray with shape - (n_instances_ ,total_intervals * att_subsample_size) - The transformed dataset for all estimators. Only saved when - save_transformed_data is true. - - Notes - ----- - For the Java version, see - `TSML `_. - - References - ---------- - .. [1] Cabello, Nestor, et al. "Fast and Accurate Time Series Classification - Through Supervised Interval Search." IEEE ICDM 2020 - - Examples - -------- - >>> from tsml.interval_based import STSFClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, random_state=0) - >>> clf = STSFClassifier(n_estimators=10, random_state=0) # doctest: +SKIP - >>> clf.fit(X, y) # doctest: +SKIP - STSFClassifier(...) - >>> clf.predict(X) # doctest: +SKIP - array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0]) - """ - - def __init__( - self, - base_estimator=None, - n_estimators=200, - min_interval_length=3, - time_limit_in_minutes=None, - contract_max_n_estimators=500, - use_pyfftw=True, - save_transformed_data=False, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.use_pyfftw = use_pyfftw - if use_pyfftw: - _check_optional_dependency("pyfftw", "pyfftw", self) - - series_transformers = [ - None, - FunctionTransformer(func=first_order_differences_3d, validate=False), - PeriodogramTransformer(use_pyfftw=use_pyfftw), - ] - - interval_features = [ - row_mean, - row_std, - row_slope, - row_median, - row_iqr, - row_numba_min, - row_numba_max, - ] - - super().__init__( - base_estimator=base_estimator, - n_estimators=n_estimators, - interval_selection_method="supervised", - n_intervals=1, - min_interval_length=min_interval_length, - max_interval_length=np.inf, - interval_features=interval_features, - series_transformers=series_transformers, - att_subsample_size=None, - replace_nan=0, - time_limit_in_minutes=time_limit_in_minutes, - contract_max_n_estimators=contract_max_n_estimators, - save_transformed_data=save_transformed_data, - random_state=random_state, - n_jobs=n_jobs, - parallel_backend=parallel_backend, - ) - - def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. 
- - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. - """ - return self._predict_proba(X) - - def _more_tags(self) -> dict: - return { - "optional_dependency": self.use_pyfftw, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "n_estimators": 2, - } - - -class RSTSFClassifier(ClassifierMixin, BaseTimeSeriesEstimator): - """Random Supervised Time Series Forest (RSTSF) Classifier. - - An ensemble of decision trees built on intervals selected through a supervised - process as described in _[1]. - Overview: Input n series of length m with d dimensions - - sample X using class-balanced bagging - - sample intervals for all 4 series representations and 9 features using - supervised method - - build extra trees classifier on transformed interval data - - Parameters - ---------- - n_estimators : int, default=200 - The number of trees in the forest. - n_intervals : int, default=50 - The number of times the supervised interval selection process is run. - Each supervised extraction will output a varying amount of features based on - series length, number of dimensions and the number of features. - min_interval_length : int, default=3 - The minimum length of extracted intervals. Minimum value of 3. - use_pyfftw : bool, default=True - Whether to use pyfftw for the periodogram transformation. - random_state : None, int or instance of RandomState, default=None - Seed or RandomState object used for random number generation. - If random_state is None, use the RandomState singleton used by np.random. - If random_state is an int, use a new RandomState instance seeded with seed. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict` functions. - `-1` means using all processors. - - See Also - -------- - SupervisedIntervals - - References - ---------- - .. [1] Cabello, N., Naghizade, E., Qi, J. and Kulik, L., 2021. Fast, accurate and - interpretable time series classification through randomization. arXiv preprint - arXiv:2105.14876. - - Examples - -------- - >>> from tsml.interval_based import RSTSFClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, random_state=0) - >>> clf = RSTSFClassifier(n_estimators=10, n_intervals=5, random_state=0) # doctest: +SKIP - >>> clf.fit(X, y) # doctest: +SKIP - RSTSFClassifier(...) 
- >>> clf.predict(X) # doctest: +SKIP - array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0]) - """ - - def __init__( - self, - n_estimators=200, - n_intervals=50, - min_interval_length=3, - use_pyfftw=True, - random_state=None, - n_jobs=1, - ): - self.n_estimators = n_estimators - self.n_intervals = n_intervals - self.min_interval_length = min_interval_length - self.use_pyfftw = use_pyfftw - self.random_state = random_state - self.n_jobs = n_jobs - - if use_pyfftw: - _check_optional_dependency("pyfftw", "pyfftw", self) - _check_optional_dependency("statsmodels", "statsmodels", self) - - super().__init__() - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. - """ - X, y = self._validate_data( - X=X, y=y, ensure_min_samples=2, ensure_min_series_length=5 - ) - X = self._convert_X(X) - - check_classification_targets(y) - - self.n_instances_, self.n_dims_, self.series_length_ = X.shape - self.classes_ = np.unique(y) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, class_val in enumerate(self.classes_): - self.class_dictionary_[class_val] = index - - if self.n_classes_ == 1: - return self - - self._n_jobs = check_n_jobs(self.n_jobs) - - lags = int(12 * (X.shape[2] / 100.0) ** 0.25) - - self._series_transformers = [ - FunctionTransformer(func=first_order_differences_3d, validate=False), - PeriodogramTransformer(use_pyfftw=self.use_pyfftw), - ARCoefficientTransformer(order=lags, replace_nan=True), - ] - - transforms = [X] + [t.fit_transform(X) for t in self._series_transformers] - - Xt = np.empty((X.shape[0], 0)) - self._transformers = [] - transform_data_lengths = [] - for t in transforms: - si = SupervisedIntervalTransformer( - n_intervals=self.n_intervals, - min_interval_length=self.min_interval_length, - n_jobs=self._n_jobs, - random_state=self.random_state, - randomised_split_point=True, - ) - features = si.fit_transform(t, y) - Xt = np.hstack((Xt, features)) - self._transformers.append(si) - transform_data_lengths.append(features.shape[1]) - - self.clf_ = ExtraTreesClassifier( - n_estimators=self.n_estimators, - criterion="entropy", - class_weight="balanced", - max_features="sqrt", - n_jobs=self._n_jobs, - random_state=self.random_state, - ) - self.clf_.fit(Xt, y) - - relevant_features = [] - for tree in self.clf_.estimators_: - relevant_features.extend(tree.tree_.feature[tree.tree_.feature >= 0]) - relevant_features = np.unique(relevant_features) - - features_to_transform = [False] * Xt.shape[1] - for i in relevant_features: - features_to_transform[i] = True - - count = 0 - for r in range(len(transforms)): - self._transformers[r].set_features_to_transform( - features_to_transform[count : count + transform_data_lengths[r]], - raise_error=False, - ) - count += transform_data_lengths[r] - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted class labels. 
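# --- Editor's note: illustrative sketch only, not part of the diff above ---
# RSTSF (in the fit method above) picks the AR model order from the common
# rule of thumb lags = 12 * (m / 100) ** 0.25 truncated to an int, where m is
# the series length. A quick worked check of that formula:
series_length = 100
lags = int(12 * (series_length / 100.0) ** 0.25)
assert lags == 12  # 12 * 1.0 ** 0.25 == 12 for a length-100 series

series_length = 12  # the docstring example series length
lags = int(12 * (series_length / 100.0) ** 0.25)
assert lags == 7    # 12 * 0.12 ** 0.25 is roughly 7.06, truncated to 7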
- """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat(list(self.class_dictionary_.keys()), X.shape[0], axis=0) - - Xt = self._predict_transform(X) - return self.clf_.predict(Xt) - - def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. - """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat([[1]], X.shape[0], axis=0) - - Xt = self._predict_transform(X) - return self.clf_.predict_proba(Xt) - - def _predict_transform(self, X): - X = self._validate_data(X=X, ensure_min_series_length=5, reset=False) - X = self._convert_X(X) - - transforms = [X] + [t.transform(X) for t in self._series_transformers] - - Xt = np.empty((X.shape[0], 0)) - for i, t in enumerate(transforms): - si = self._transformers[i] - Xt = np.hstack((Xt, si.transform(t))) - - return Xt - - def _more_tags(self) -> dict: - return { - "optional_dependency": True, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "n_estimators": 2, - "n_intervals": 2, - } diff --git a/tsml/interval_based/_tsf.py b/tsml/interval_based/_tsf.py deleted file mode 100644 index ef35c5d..0000000 --- a/tsml/interval_based/_tsf.py +++ /dev/null @@ -1,394 +0,0 @@ -"""Time Series Forest (TSF) estimators. - -Interval-based TSF estimators, extracts basic summary features from random intervals. -""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = ["TSFClassifier", "TSFRegressor"] - -from typing import List, Union - -import numpy as np -from sklearn.base import ClassifierMixin, RegressorMixin - -from tsml.interval_based._base import BaseIntervalForest -from tsml.vector import CITClassifier - - -class TSFClassifier(ClassifierMixin, BaseIntervalForest): - """Time series forest (TSF) classifier. - - A time series forest is an ensemble of decision trees built on random intervals. - Overview: Input n series length m. - For each tree - - sample sqrt(m) intervals, - - find mean, std and slope for each interval, concatenate to form new - data set, - - build a decision tree on new data set. - Ensemble the trees with averaged probability estimates. - - This implementation deviates from the original in minor ways. It samples - intervals with replacement and does not use the tree splitting criteria - refinement described in [1] (this can be done with the CITClassifier base - estimator). - - Parameters - ---------- - base_estimator : BaseEstimator or None, default=None - scikit-learn BaseEstimator used to build the interval ensemble. If None, use a - simple decision tree. - n_estimators : int, default=200 - Number of estimators to build for the ensemble. 
- n_intervals : int, str, list or tuple, default="sqrt" - Number of intervals to extract per tree for each series_transformers series. - - An int input will extract that number of intervals from the series, while a str - input will return a function of the series length (may differ per - series_transformers output) to extract that number of intervals. - Valid str inputs are: - - "sqrt": square root of the series length. - - "sqrt-div": sqrt of series length divided by the number - of series_transformers. - - A list or tuple of ints and/or strs will extract the number of intervals using - the above rules and sum the results for the final n_intervals. i.e. [4, "sqrt"] - will extract sqrt(n_timepoints) + 4 intervals. - - Different number of intervals for each series_transformers series can be - specified using a nested list or tuple. Any list or tuple input containing - another list or tuple must be the same length as the number of - series_transformers. - - While random interval extraction will extract the n_intervals intervals total - (removing duplicates), supervised intervals will run the supervised extraction - process n_intervals times, returning more intervals than specified. - min_interval_length : int, float, list, or tuple, default=3 - Minimum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the minimum interval length. - - Different minimum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - max_interval_length : int, float, list, or tuple, default=np.inf - Maximum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the maximum interval length. - - Different maximum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - - Ignored for supervised interval_selection_method inputs. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding n_estimators. - Default of 0 means n_estimators are used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - save_transformed_data : bool, default=False - Save the data transformed in fit. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - n_classes_ : int - Number of classes. Extracted from the data. 
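# --- Editor's note: illustrative sketch only, not part of the diff above ---
# The n_intervals options documented above compose additively: an int adds that
# many intervals, "sqrt" adds the square root of the series length, and a list
# sums its entries. A hedged worked example (how BaseIntervalForest rounds the
# square root internally may differ; this only illustrates the stated rule):
import math

n_timepoints = 144
n_intervals_spec = [4, "sqrt"]
total = sum(
    spec if isinstance(spec, int) else int(math.sqrt(n_timepoints))
    for spec in n_intervals_spec
)
assert total == 16  # sqrt(144) + 4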
- classes_ : ndarray of shape (n_classes_) - Holds the label for each class. - class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - total_intervals_ : int - Total number of intervals per tree from all representations. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - intervals_ : list of shape (n_estimators) of TransformerMixin - Stores the interval extraction transformer for all estimators. - transformed_data_ : list of shape (n_estimators) of ndarray with shape - (n_instances_ ,total_intervals * att_subsample_size) - The transformed dataset for all estimators. Only saved when - save_transformed_data is true. - - Notes - ----- - For the Java version, see - `TSML `_. - - References - ---------- - .. [1] H.Deng, G.Runger, E.Tuv and M.Vladimir, "A time series forest for - classification and feature extraction", Information Sciences, 239, 2013 - - Examples - -------- - >>> from tsml.interval_based import TSFClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, random_state=0) - >>> clf = TSFClassifier(n_estimators=10, random_state=0) - >>> clf.fit(X, y) - TSFClassifier(...) - >>> clf.predict(X) - array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0]) - """ - - def __init__( - self, - base_estimator=None, - n_estimators=200, - n_intervals="sqrt", - min_interval_length=3, - max_interval_length=np.inf, - time_limit_in_minutes=None, - contract_max_n_estimators=500, - save_transformed_data=False, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - if isinstance(base_estimator, CITClassifier): - replace_nan = "nan" - else: - replace_nan = 0 - - super().__init__( - base_estimator=base_estimator, - n_estimators=n_estimators, - interval_selection_method="random", - n_intervals=n_intervals, - min_interval_length=min_interval_length, - max_interval_length=max_interval_length, - interval_features=None, - series_transformers=None, - att_subsample_size=None, - replace_nan=replace_nan, - time_limit_in_minutes=time_limit_in_minutes, - contract_max_n_estimators=contract_max_n_estimators, - save_transformed_data=save_transformed_data, - random_state=random_state, - n_jobs=n_jobs, - parallel_backend=parallel_backend, - ) - - def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. - """ - return self._predict_proba(X) - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "n_estimators": 2, - "n_intervals": 2, - } - - -class TSFRegressor(RegressorMixin, BaseIntervalForest): - """Time series forest (TSF) regressor. - - A time series forest is an ensemble of decision trees built on random intervals. - Overview: Input n series length m. 
- For each tree - - sample sqrt(m) intervals, - - find mean, std and slope for each interval, concatenate to form new - data set, - - build a decision tree on new data set. - Ensemble the trees with averaged predictions. - - This implementation deviates from the original in minor ways. It samples - intervals with replacement and does not use the tree splitting criteria - refinement described in [1]. - - Parameters - ---------- - base_estimator : BaseEstimator or None, default=None - scikit-learn BaseEstimator used to build the interval ensemble. If None, use a - simple decision tree. - n_estimators : int, default=200 - Number of estimators to build for the ensemble. - n_intervals : int, str, list or tuple, default="sqrt" - Number of intervals to extract per tree for each series_transformers series. - - An int input will extract that number of intervals from the series, while a str - input will return a function of the series length (may differ per - series_transformers output) to extract that number of intervals. - Valid str inputs are: - - "sqrt": square root of the series length. - - "sqrt-div": sqrt of series length divided by the number - of series_transformers. - - A list or tuple of ints and/or strs will extract the number of intervals using - the above rules and sum the results for the final n_intervals. i.e. [4, "sqrt"] - will extract sqrt(n_timepoints) + 4 intervals. - - Different number of intervals for each series_transformers series can be - specified using a nested list or tuple. Any list or tuple input containing - another list or tuple must be the same length as the number of - series_transformers. - min_interval_length : int, float, list, or tuple, default=3 - Minimum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the minimum interval length. - - Different minimum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - max_interval_length : int, float, list, or tuple, default=np.inf - Maximum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the maximum interval length. - - Different maximum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding n_estimators. - Default of 0 means n_estimators are used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - save_transformed_data : bool, default=False - Save the data transformed in fit. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. 
- See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - total_intervals_ : int - Total number of intervals per tree from all representations. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - intervals_ : list of shape (n_estimators) of TransformerMixin - Stores the interval extraction transformer for all estimators. - transformed_data_ : list of shape (n_estimators) of ndarray with shape - (n_instances_ ,total_intervals * att_subsample_size) - The transformed dataset for all estimators. Only saved when - save_transformed_data is true. - - References - ---------- - .. [1] H.Deng, G.Runger, E.Tuv and M.Vladimir, "A time series forest for - classification and feature extraction", Information Sciences, 239, 2013 - - Examples - -------- - >>> from tsml.interval_based import TSFRegressor - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, - ... regression_target=True, random_state=0) - >>> reg = TSFRegressor(n_estimators=10, random_state=0) - >>> reg.fit(X, y) - TSFRegressor(...) - >>> reg.predict(X) - array([0.7252543 , 1.50132442, 0.95608366, 1.64399016, 0.42385504, - 0.60639322, 1.01919317, 1.30157483, 1.66017354, 0.2900776 ]) - """ - - def __init__( - self, - base_estimator=None, - n_estimators=200, - n_intervals="sqrt", - min_interval_length=3, - max_interval_length=np.inf, - time_limit_in_minutes=None, - contract_max_n_estimators=500, - save_transformed_data=False, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - super().__init__( - base_estimator=base_estimator, - n_estimators=n_estimators, - interval_selection_method="random", - n_intervals=n_intervals, - min_interval_length=min_interval_length, - max_interval_length=max_interval_length, - interval_features=None, - series_transformers=None, - att_subsample_size=None, - replace_nan=0, - time_limit_in_minutes=time_limit_in_minutes, - contract_max_n_estimators=contract_max_n_estimators, - save_transformed_data=save_transformed_data, - random_state=random_state, - n_jobs=n_jobs, - parallel_backend=parallel_backend, - ) - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. 
- """ - return { - "n_estimators": 2, - "n_intervals": 2, - } diff --git a/tsml/interval_based/tests/test_interval_forest.py b/tsml/interval_based/tests/test_interval_forest.py index f790c57..3a92417 100644 --- a/tsml/interval_based/tests/test_interval_forest.py +++ b/tsml/interval_based/tests/test_interval_forest.py @@ -15,6 +15,7 @@ ) from tsml.utils.numba_functions.stats import row_mean, row_numba_min from tsml.utils.testing import generate_3d_test_data +from tsml.utils.validation import _check_optional_dependency from tsml.vector import CITClassifier @@ -109,15 +110,22 @@ def test_interval_forest_n_intervals(n_intervals, n_intervals_len): assert data[0].shape[1] == n_intervals_len -att_subsample_c22 = Catch22Transformer( - features=[ - "DN_HistogramMode_5", - "DN_HistogramMode_10", - "SB_BinaryStats_diff_longstretch0", - ] -) +if _check_optional_dependency("pycatch22", "pycatch22", None, raise_error=False): + att_subsample_c22 = Catch22Transformer( + features=[ + "DN_HistogramMode_5", + "DN_HistogramMode_10", + "SB_BinaryStats_diff_longstretch0", + ] + ) +else: + att_subsample_c22 = SevenNumberSummaryTransformer() +@pytest.mark.skipif( + not _check_optional_dependency("pycatch22", "pycatch22", None, raise_error=False), + reason="pycatch22 not installed", +) @pytest.mark.parametrize( "features,output_len", [ @@ -127,9 +135,7 @@ def test_interval_forest_n_intervals(n_intervals, n_intervals_len): ( [ row_mean, - Catch22Transformer( - features=["DN_HistogramMode_5", "DN_HistogramMode_10"] - ), + _clone_estimator(att_subsample_c22), row_numba_min, ], 4, diff --git a/tsml/shapelet_based/__init__.py b/tsml/shapelet_based/__init__.py index bd53d01..dba6f01 100644 --- a/tsml/shapelet_based/__init__.py +++ b/tsml/shapelet_based/__init__.py @@ -2,17 +2,12 @@ __all__ = [ "MrSQMClassifier", - "RDSTClassifier", - "RDSTRegressor", "RandomShapeletForestClassifier", "RandomShapeletForestRegressor", - "ShapeletTransformClassifier", ] from tsml.shapelet_based._mrsqm import MrSQMClassifier -from tsml.shapelet_based._rdst import RDSTClassifier, RDSTRegressor from tsml.shapelet_based._rsf import ( RandomShapeletForestClassifier, RandomShapeletForestRegressor, ) -from tsml.shapelet_based._stc import ShapeletTransformClassifier diff --git a/tsml/shapelet_based/_rdst.py b/tsml/shapelet_based/_rdst.py deleted file mode 100644 index c1d75ee..0000000 --- a/tsml/shapelet_based/_rdst.py +++ /dev/null @@ -1,545 +0,0 @@ -"""Random Dilated Shapelet Transform (RDST) estimators. - -Random Dilated Shapelet Transform estimator pipelines that simply perform a random -shapelet dilated transform and build (by default) a RidgeCV on the output. -""" - -__author__ = ["MatthewMiddlehurst", "baraline"] -__all__ = ["RDSTClassifier", "RDSTRegressor"] - -import warnings -from typing import List, Union - -import numpy as np -from sklearn.base import ClassifierMixin, RegressorMixin -from sklearn.linear_model import RidgeClassifierCV, RidgeCV -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import check_is_fitted - -from tsml.base import BaseTimeSeriesEstimator, _clone_estimator -from tsml.transformations import RandomDilatedShapeletTransformer -from tsml.utils.validation import check_n_jobs - - -class RDSTClassifier(ClassifierMixin, BaseTimeSeriesEstimator): - """A random dilated shapelet transform (RDST) classifier. 
- - Implementation of the random dilated shapelet transform classifier pipeline - along the lines of [1][2]. Transforms the data using the - `RandomDilatedShapeletTransform` and then builds a `RidgeClassifierCV` classifier - with standard scalling. - - This is a duplicate of the original implementation in aeon, adapted for bugfixing - and experimentation. All credit to the original author @baraline for the - implementation. - - Parameters - ---------- - estimator : BaseEstimator or None, default=None - Base estimator for the ensemble, can be supplied a sklearn `BaseEstimator`. If - `None` a default `RidgeClassifierCV` classifier is used with standard scalling. - max_shapelets : int, default=10000 - The maximum number of shapelet to keep for the final transformation. - A lower number of shapelets can be kept if alpha similarity have discarded the - whole dataset. - shapelet_lengths : array, default=None - The set of possible length for shapelets. Each shapelet length is uniformly - drawn from this set. If None, the shapelets length will be equal to - min(max(2,series_length//2),11). - proba_normalization : float, default=0.8 - This probability (between 0 and 1) indicate the chance of each shapelet to be - initialized such as it will use a z-normalized distance, inducing either scale - sensitivity or invariance. A value of 1 would mean that all shapelets will use - a z-normalized distance. - threshold_percentiles : array, default=None - The two perceniles used to select the threshold used to compute the Shapelet - Occurrence feature. If None, the 5th and the 10th percentiles (i.e. [5,10]) - will be used. - alpha_similarity : float, default=0.5 - The strenght of the alpha similarity pruning. The higher the value, the lower - the allowed number of common indexes with previously sampled shapelets - when sampling a new candidate with the same dilation parameter. - It can cause the number of sampled shapelets to be lower than max_shapelets if - the whole search space has been covered. The default is 0.5, and the maximum is - 1. Value above it have no effect for now. - use_prime_dilations : bool, default=False - If True, restrict the value of the shapelet dilation parameter to be prime - values. This can greatly speed-up the algorithm for long time series and/or - short shapelet length, possibly at the cost of some accuracy. - save_transformed_data : bool, default=False - Save the data transformed in fit in ``transformed_data_``. - n_jobs : int, default=1 - The number of jobs to run in parallel for both ``fit`` and ``predict``. - `-1` means using all processors. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - n_classes_ : int - Number of classes. Extracted from the data. - classes_ : ndarray of shape (n_classes_) - Holds the label for each class. - class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - transformed_data_ : list of shape (n_estimators) of ndarray - The transformed training dataset for all classifiers. 
Only saved when - ``save_transformed_data`` is `True`. - - See Also - -------- - RDSTRegressor - RandomDilatedShapeletTransformer - - References - ---------- - .. [1] Antoine Guillaume et al. "Random Dilated Shapelet Transform: A New Approach - for Time Series Shapelets", Pattern Recognition and Artificial Intelligence. - ICPRAI 2022. - .. [2] Antoine Guillaume, "Time series classification with shapelets: Application - to predictive maintenance on event logs", PhD Thesis, University of Orléans, - 2023. - - Examples - -------- - >>> from tsml.shapelet_based import RDSTClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, random_state=0) - >>> clf = RDSTClassifier(random_state=0) - >>> clf.fit(X, y) - RDSTClassifier(...) - >>> pred = clf.predict(X) - """ - - def __init__( - self, - max_shapelets=10000, - shapelet_lengths=None, - proba_normalization=0.8, - threshold_percentiles=None, - alpha_similarity=0.5, - use_prime_dilations=False, - estimator=None, - save_transformed_data=False, - n_jobs=1, - random_state=None, - ): - self.max_shapelets = max_shapelets - self.shapelet_lengths = shapelet_lengths - self.proba_normalization = proba_normalization - self.threshold_percentiles = threshold_percentiles - self.alpha_similarity = alpha_similarity - self.use_prime_dilations = use_prime_dilations - self.estimator = estimator - self.save_transformed_data = save_transformed_data - self.n_jobs = n_jobs - self.random_state = random_state - - super().__init__() - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. 
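# --- Editor's note: illustrative sketch only, not part of the diff above ---
# When shapelet_lengths is None, the docstring above states the shapelet length
# defaults to min(max(2, series_length // 2), 11). A quick check of that rule
# (the helper name below is hypothetical, for illustration only):
def default_shapelet_length(series_length):
    return min(max(2, series_length // 2), 11)

assert default_shapelet_length(4) == 2     # very short series clamp up to 2
assert default_shapelet_length(10) == 5    # half the series length
assert default_shapelet_length(100) == 11  # long series cap at 11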
- """ - X, y = self._validate_data(X=X, y=y, ensure_min_samples=2) - X = self._convert_X(X) - - check_classification_targets(y) - - self.n_instances_, self.n_channels_, self.n_timepoints_ = X.shape - self.classes_ = np.unique(y) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, class_val in enumerate(self.classes_): - self.class_dictionary_[class_val] = index - - if self.n_classes_ == 1: - return self - - self._n_jobs = check_n_jobs(self.n_jobs) - - self._transformer = RandomDilatedShapeletTransformer( - max_shapelets=self.max_shapelets, - shapelet_lengths=self.shapelet_lengths, - proba_normalization=self.proba_normalization, - threshold_percentiles=self.threshold_percentiles, - alpha_similarity=self.alpha_similarity, - use_prime_dilations=self.use_prime_dilations, - random_state=self.random_state, - n_jobs=self._n_jobs, - ) - - self._estimator = _clone_estimator( - ( - make_pipeline( - StandardScaler(with_mean=False), - RidgeClassifierCV(alphas=np.logspace(-4, 4, 20)), - ) - if self.estimator is None - else self.estimator - ), - self.random_state, - ) - - m = getattr(self._estimator, "n_jobs", None) - if m is not None: - self._estimator.n_jobs = self._n_jobs - - X_t = self._transformer.fit_transform(X, y) - X_t = np.nan_to_num(X_t, nan=0.0, posinf=0.0, neginf=0.0) - - if self.save_transformed_data: - self.transformed_data_ = X_t - - if X_t.shape[1] == 0: - warnings.warn("No shapelets found in training data.", stacklevel=2) - self._no_atts = True - self._majority_class = np.argmax(np.unique(y, return_counts=True)[1]) - else: - self._no_atts = False - self._estimator.fit(X_t, y) - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted class labels. - """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat(list(self.class_dictionary_.keys()), X.shape[0], axis=0) - - if self._no_atts: - return np.repeat([self.classes_[self._majority_class]], X.shape[0], axis=0) - - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - X_t = self._transformer.transform(X) - X_t = np.nan_to_num(X_t, nan=0.0, posinf=0.0, neginf=0.0) - - return self._estimator.predict(X_t) - - def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. 
- """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat([[1]], X.shape[0], axis=0) - - if self._no_atts: - p = np.zeros((X.shape[0], self.n_classes_)) - p[:, self._majority_class] = 1 - return p - - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - X_t = self._transformer.transform(X) - X_t = np.nan_to_num(X_t, nan=0.0, posinf=0.0, neginf=0.0) - - m = getattr(self._estimator, "predict_proba", None) - if callable(m): - return self._estimator.predict_proba(X_t) - else: - dists = np.zeros((X.shape[0], self.n_classes_)) - preds = self._estimator.predict(X_t) - for i in range(0, X.shape[0]): - dists[i, self.class_dictionary_[preds[i]]] = 1 - return dists - - def _more_tags(self) -> dict: - return {"non_deterministic": True} - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "max_shapelets": 10, - } - - -class RDSTRegressor(RegressorMixin, BaseTimeSeriesEstimator): - """A random dilated shapelet transform (RDST) regressor. - - Implementation of the random dilated shapelet transform pipeline - along the lines of [1][2]. Transforms the data using the - `RandomDilatedShapeletTransform` and then builds a `RidgeCV` regressor - with standard scalling. - - Parameters - ---------- - estimator : BaseEstimator or None, default=None - Base estimator for the ensemble, can be supplied a sklearn `BaseEstimator`. If - `None` a default `RidgeCV` regressor is used with standard scalling. - max_shapelets : int, default=10000 - The maximum number of shapelet to keep for the final transformation. - A lower number of shapelets can be kept if alpha similarity have discarded the - whole dataset. - shapelet_lengths : array, default=None - The set of possible length for shapelets. Each shapelet length is uniformly - drawn from this set. If None, the shapelets length will be equal to - min(max(2,series_length//2),11). - proba_normalization : float, default=0.8 - This probability (between 0 and 1) indicate the chance of each shapelet to be - initialized such as it will use a z-normalized distance, inducing either scale - sensitivity or invariance. A value of 1 would mean that all shapelets will use - a z-normalized distance. - threshold_percentiles : array, default=None - The two perceniles used to select the threshold used to compute the Shapelet - Occurrence feature. If None, the 5th and the 10th percentiles (i.e. [5,10]) - will be used. - alpha_similarity : float, default=0.5 - The strenght of the alpha similarity pruning. The higher the value, the lower - the allowed number of common indexes with previously sampled shapelets - when sampling a new candidate with the same dilation parameter. - It can cause the number of sampled shapelets to be lower than max_shapelets if - the whole search space has been covered. The default is 0.5, and the maximum is - 1. Value above it have no effect for now. - use_prime_dilations : bool, default=False - If True, restrict the value of the shapelet dilation parameter to be prime - values. 
This can greatly speed-up the algorithm for long time series and/or - short shapelet length, possibly at the cost of some accuracy. - save_transformed_data : bool, default=False - Save the data transformed in fit in ``transformed_data_``. - n_jobs : int, default=1 - The number of jobs to run in parallel for both ``fit`` and ``predict``. - `-1` means using all processors. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - transformed_data_ : list of shape (n_estimators) of ndarray - The transformed training dataset for all regressors. Only saved when - ``save_transformed_data`` is `True`. - - See Also - -------- - RDSTClassifier - RandomDilatedShapeletTransformer - - References - ---------- - .. [1] Antoine Guillaume et al. "Random Dilated Shapelet Transform: A New Approach - for Time Series Shapelets", Pattern Recognition and Artificial Intelligence. - ICPRAI 2022. - .. [2] Antoine Guillaume, "Time series classification with shapelets: Application - to predictive maintenance on event logs", PhD Thesis, University of Orléans, - 2023. - - Examples - -------- - >>> from tsml.shapelet_based import RDSTRegressor - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, - ... regression_target=True, random_state=0) - >>> reg = RDSTRegressor(random_state=0) - >>> reg.fit(X, y) - RDSTRegressor(...) - >>> pred = reg.predict(X) - """ - - def __init__( - self, - max_shapelets=10000, - shapelet_lengths=None, - proba_normalization=0.8, - threshold_percentiles=None, - alpha_similarity=0.5, - use_prime_dilations=False, - estimator=None, - n_jobs=1, - random_state=None, - ): - self.max_shapelets = max_shapelets - self.shapelet_lengths = shapelet_lengths - self.proba_normalization = proba_normalization - self.threshold_percentiles = threshold_percentiles - self.alpha_similarity = alpha_similarity - self.use_prime_dilations = use_prime_dilations - self.estimator = estimator - self.n_jobs = n_jobs - self.random_state = random_state - - super().__init__() - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The target labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. 
- """ - X, y = self._validate_data(X=X, y=y, ensure_min_samples=2, y_numeric=True) - X = self._convert_X(X) - - self.n_instances_, self.n_channels_, self.n_timepoints_ = X.shape - - self._n_jobs = check_n_jobs(self.n_jobs) - - self._transformer = RandomDilatedShapeletTransformer( - max_shapelets=self.max_shapelets, - shapelet_lengths=self.shapelet_lengths, - proba_normalization=self.proba_normalization, - threshold_percentiles=self.threshold_percentiles, - alpha_similarity=self.alpha_similarity, - use_prime_dilations=self.use_prime_dilations, - random_state=self.random_state, - n_jobs=self._n_jobs, - ) - - self._estimator = _clone_estimator( - ( - make_pipeline( - StandardScaler(with_mean=False), - RidgeCV(alphas=np.logspace(-4, 4, 20)), - ) - if self.estimator is None - else self.estimator - ), - self.random_state, - ) - - m = getattr(self._estimator, "n_jobs", None) - if m is not None: - self._estimator.n_jobs = self._n_jobs - - X_t = self._transformer.fit_transform(X, y) - X_t = np.nan_to_num(X_t, nan=0.0, posinf=0.0, neginf=0.0) - - if X_t.shape[1] == 0: - warnings.warn("No shapelets found in training data.", stacklevel=2) - self._no_atts = True - self._y_mean = np.mean(y) - else: - self._no_atts = False - self._estimator.fit(X_t, y) - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted target labels. - """ - check_is_fitted(self) - - if self._no_atts: - return np.full(X.shape[0], self._y_mean) - - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - X_t = self._transformer.transform(X) - X_t = np.nan_to_num(X_t, nan=0.0, posinf=0.0, neginf=0.0) - - return self._estimator.predict(X_t) - - def _more_tags(self) -> dict: - return {"non_deterministic": True} - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "max_shapelets": 10, - } diff --git a/tsml/shapelet_based/_stc.py b/tsml/shapelet_based/_stc.py deleted file mode 100644 index aa9ccbc..0000000 --- a/tsml/shapelet_based/_stc.py +++ /dev/null @@ -1,311 +0,0 @@ -"""A shapelet transform classifier (STC). - -Shapelet transform classifier pipeline that simply performs a (configurable) shapelet -transform then builds (by default) a rotation forest classifier on the output. 
-""" - -__author__ = ["TonyBagnall", "MatthewMiddlehurst"] -__all__ = ["ShapeletTransformClassifier"] - -from typing import List, Union - -import numpy as np -from sklearn.base import ClassifierMixin -from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import check_is_fitted - -from tsml.base import BaseTimeSeriesEstimator, _clone_estimator -from tsml.transformations._shapelet_transform import RandomShapeletTransformer -from tsml.utils.validation import check_n_jobs -from tsml.vector import RotationForestClassifier - - -class ShapeletTransformClassifier(ClassifierMixin, BaseTimeSeriesEstimator): - """A shapelet transform classifier (STC). - - Implementation of the binary shapelet transform classifier pipeline along the lines - of [1][2] but with random shapelet sampling. Transforms the data using the - configurable `RandomShapeletTransformer` and then builds a - `RotationForestClassifier` classifier. - - As some implementations and applications contract the transformation solely, - contracting is available for the transform only and both classifier and transform. - - Parameters - ---------- - n_shapelet_samples : int, default=10000 - The number of candidate shapelets to be considered for the final transform. - Filtered down to ``<= max_shapelets``, keeping the shapelets with the most - information gain. - max_shapelets : int or None, default=None - Max number of shapelets to keep for the final transform. Each class value will - have its own max, set to ``n_classes_ / max_shapelets``. If `None`, uses the - minimum between ``10 * n_instances_`` and `1000`. - max_shapelet_length : int or None, default=None - Lower bound on candidate shapelet lengths for the transform. If ``None``, no - max length is used - estimator : BaseEstimator or None, default=None - Base estimator for the ensemble, can be supplied a sklearn `BaseEstimator`. If - `None` a default `RotationForest` classifier is used. - transform_limit_in_minutes : int, default=0 - Time contract to limit transform time in minutes for the shapelet transform, - overriding `n_shapelet_samples`. A value of `0` means ``n_shapelet_samples`` - is used. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding ``n_shapelet_samples`` - and ``transform_limit_in_minutes``. The ``estimator`` will only be contracted if - a ``time_limit_in_minutes parameter`` is present. Default of `0` means - ``n_shapelet_samples`` or ``transform_limit_in_minutes`` is used. - contract_max_n_shapelet_samples : int, default=np.inf - Max number of shapelets to extract when contracting the transform with - ``transform_limit_in_minutes`` or ``time_limit_in_minutes``. - save_transformed_data : bool, default=False - Save the data transformed in fit in ``transformed_data_``. - n_jobs : int, default=1 - The number of jobs to run in parallel for both ``fit`` and ``predict``. - `-1` means using all processors. - batch_size : int or None, default=100 - Number of shapelet candidates processed before being merged into the set of best - shapelets in the transform. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. 
- n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - n_classes_ : int - Number of classes. Extracted from the data. - classes_ : ndarray of shape (n_classes_) - Holds the label for each class. - class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - transformed_data_ : list of shape (n_estimators) of ndarray - The transformed training dataset for all classifiers. Only saved when - ``save_transformed_data`` is `True`. - - See Also - -------- - RandomShapeletTransformer - RotationForestClassifier - - Notes - ----- - For the Java version, see - `tsml `_. - - References - ---------- - .. [1] Jon Hills et al., "Classification of time series by shapelet transformation", - Data Mining and Knowledge Discovery, 28(4), 851--881, 2014. - .. [2] A. Bostrom and A. Bagnall, "Binary Shapelet Transform for Multiclass Time - Series Classification", Transactions on Large-Scale Data and Knowledge Centered - Systems, 32, 2017. - - Examples - -------- - >>> from tsml.shapelet_based import ShapeletTransformClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, random_state=0) - >>> clf = ShapeletTransformClassifier(random_state=0) - >>> clf.fit(X, y) - ShapeletTransformClassifier(...) - >>> clf.predict(X) - array([0, 1, 1, 0, 0, 1, 0, 1]) - """ - - def __init__( - self, - n_shapelet_samples=10000, - max_shapelets=None, - max_shapelet_length=None, - estimator=None, - transform_limit_in_minutes=0, - time_limit_in_minutes=0, - contract_max_n_shapelet_samples=np.inf, - save_transformed_data=False, - n_jobs=1, - batch_size=100, - random_state=None, - ): - self.n_shapelet_samples = n_shapelet_samples - self.max_shapelets = max_shapelets - self.max_shapelet_length = max_shapelet_length - self.estimator = estimator - - self.transform_limit_in_minutes = transform_limit_in_minutes - self.time_limit_in_minutes = time_limit_in_minutes - self.contract_max_n_shapelet_samples = contract_max_n_shapelet_samples - self.save_transformed_data = save_transformed_data - - self.random_state = random_state - self.batch_size = batch_size - self.n_jobs = n_jobs - - super().__init__() - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. 
- """ - X, y = self._validate_data(X=X, y=y, ensure_min_samples=2) - X = self._convert_X(X) - - check_classification_targets(y) - - self.n_instances_, self.n_channels_, self.n_timepoints_ = X.shape - self.classes_ = np.unique(y) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, class_val in enumerate(self.classes_): - self.class_dictionary_[class_val] = index - - if self.n_classes_ == 1: - return self - - self._n_jobs = check_n_jobs(self.n_jobs) - - self._transform_limit_in_minutes = 0 - if self.time_limit_in_minutes > 0: - # contracting 2/3 transform (with 1/5 of that taken away for final - # transform), 1/3 classifier - third = self.time_limit_in_minutes / 3 - self._classifier_limit_in_minutes = third - self._transform_limit_in_minutes = (third * 2) / 5 * 4 - elif self.transform_limit_in_minutes > 0: - self._transform_limit_in_minutes = self.transform_limit_in_minutes - - self._transformer = RandomShapeletTransformer( - n_shapelet_samples=self.n_shapelet_samples, - max_shapelets=self.max_shapelets, - max_shapelet_length=self.max_shapelet_length, - time_limit_in_minutes=self._transform_limit_in_minutes, - contract_max_n_shapelet_samples=self.contract_max_n_shapelet_samples, - n_jobs=self.n_jobs, - batch_size=self.batch_size, - random_state=self.random_state, - ) - - self._estimator = _clone_estimator( - RotationForestClassifier() if self.estimator is None else self.estimator, - self.random_state, - ) - - if isinstance(self._estimator, RotationForestClassifier): - self._estimator.save_transformed_data = self.save_transformed_data - - m = getattr(self._estimator, "n_jobs", None) - if m is not None: - self._estimator.n_jobs = self._n_jobs - - m = getattr(self._estimator, "time_limit_in_minutes", None) - if m is not None and self.time_limit_in_minutes > 0: - self._estimator.time_limit_in_minutes = self._classifier_limit_in_minutes - - X_t = self._transformer.fit_transform(X, y) - - if self.save_transformed_data: - self.transformed_data_ = X_t - - self._estimator.fit(X_t, y) - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted class labels. - """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat(list(self.class_dictionary_.keys()), X.shape[0], axis=0) - - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - return self._estimator.predict(self._transformer.transform(X)) - - def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. 
- """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat([[1]], X.shape[0], axis=0) - - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - m = getattr(self._estimator, "predict_proba", None) - if callable(m): - return self._estimator.predict_proba(self._transformer.transform(X)) - else: - dists = np.zeros((X.shape[0], self.n_classes_)) - preds = self._estimator.predict(self._transformer.transform(X)) - for i in range(0, X.shape[0]): - dists[i, self.class_dictionary_[preds[i]]] = 1 - return dists - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "estimator": RotationForestClassifier(n_estimators=2), - "n_shapelet_samples": 10, - "max_shapelets": 3, - "batch_size": 5, - } diff --git a/tsml/transformations/__init__.py b/tsml/transformations/__init__.py index 5765685..5186259 100644 --- a/tsml/transformations/__init__.py +++ b/tsml/transformations/__init__.py @@ -11,9 +11,6 @@ # "FixedIntervalTransformer", "PeriodogramTransformer", # "QuantileTransformer", - # "SFATransformer", - "RandomShapeletTransformer", - "RandomDilatedShapeletTransformer", "SevenNumberSummaryTransformer", "TransformerConcatenator", ] @@ -28,9 +25,5 @@ SupervisedIntervalTransformer, ) from tsml.transformations._periodogram import PeriodogramTransformer -from tsml.transformations._shapelet_transform import ( - RandomDilatedShapeletTransformer, - RandomShapeletTransformer, -) from tsml.transformations._summary_features import SevenNumberSummaryTransformer from tsml.transformations._transform_concatenator import TransformerConcatenator diff --git a/tsml/transformations/_catch22.py b/tsml/transformations/_catch22.py index 6aee5f3..932f4a8 100644 --- a/tsml/transformations/_catch22.py +++ b/tsml/transformations/_catch22.py @@ -6,20 +6,14 @@ __author__ = ["MatthewMiddlehurst"] __all__ = ["Catch22Transformer"] -import math import numpy as np from joblib import Parallel -from numba import njit from sklearn.base import TransformerMixin from sklearn.utils.parallel import delayed from tsml.base import BaseTimeSeriesEstimator -from tsml.utils.numba_functions.general import ( - z_normalise_series, - z_normalise_series_with_mean, -) -from tsml.utils.numba_functions.stats import mean, numba_max, numba_min +from tsml.utils.numba_functions.general import z_normalise_series from tsml.utils.validation import _check_optional_dependency, check_n_jobs feature_names = [ @@ -116,22 +110,6 @@ class Catch22Transformer(TransformerMixin, BaseTimeSeriesEstimator): .. [2] Fulcher, B. D., Little, M. A., & Jones, N. S. (2013). Highly comparative time-series analysis: the empirical structure of time series and their methods. Journal of the Royal Society Interface, 10(83), 20130048. - - Examples - -------- - >>> from tsml.transformations import Catch22Transformer - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, _ = generate_3d_test_data(n_samples=4, series_length=10, random_state=0) - >>> tnf = Catch22Transformer(replace_nans=True) - >>> tnf.fit(X) - Catch22Transformer(...) 
- >>> print(tnf.transform(X)[0]) - [6.27596874e-02 3.53871087e-01 4.00000000e+00 7.00000000e-01 - 2.00000000e-01 5.66227710e-01 2.00000000e+00 3.08148791e-34 - 1.96349541e+00 9.99913411e-01 1.39251594e+00 3.89048349e-01 - 2.00000000e+00 1.00000000e+00 3.00000000e+00 2.04319187e+00 - 1.00000000e+00 2.44474814e-01 0.00000000e+00 0.00000000e+00 - 8.23045267e-03 0.00000000e+00] """ def __init__( @@ -140,7 +118,6 @@ def __init__( catch24=False, outlier_norm=False, replace_nans=False, - use_pycatch22=True, n_jobs=1, parallel_backend=None, ): @@ -148,12 +125,10 @@ def __init__( self.catch24 = catch24 self.outlier_norm = outlier_norm self.replace_nans = replace_nans - self.use_pycatch22 = use_pycatch22 self.n_jobs = n_jobs self.parallel_backend = parallel_backend - if use_pycatch22: - _check_optional_dependency("pycatch22", "pycatch22", self) + _check_optional_dependency("pycatch22", "pycatch22", self) super().__init__() @@ -187,67 +162,37 @@ def transform(self, X, y=None): threads_to_use = check_n_jobs(self.n_jobs) - if self.use_pycatch22: - import pycatch22 - - features = [ - pycatch22.DN_HistogramMode_5, - pycatch22.DN_HistogramMode_10, - pycatch22.SB_BinaryStats_diff_longstretch0, - pycatch22.DN_OutlierInclude_p_001_mdrmd, - pycatch22.DN_OutlierInclude_n_001_mdrmd, - pycatch22.CO_f1ecac, - pycatch22.CO_FirstMin_ac, - pycatch22.SP_Summaries_welch_rect_area_5_1, - pycatch22.SP_Summaries_welch_rect_centroid, - pycatch22.FC_LocalSimple_mean3_stderr, - pycatch22.CO_trev_1_num, - pycatch22.CO_HistogramAMI_even_2_5, - pycatch22.IN_AutoMutualInfoStats_40_gaussian_fmmi, - pycatch22.MD_hrv_classic_pnn40, - pycatch22.SB_BinaryStats_mean_longstretch1, - pycatch22.SB_MotifThree_quantile_hh, - pycatch22.FC_LocalSimple_mean1_tauresrat, - pycatch22.CO_Embed2_Dist_tau_d_expfit_meandiff, - pycatch22.SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1, - pycatch22.SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1, - pycatch22.SB_TransitionMatrix_3ac_sumdiagcov, - pycatch22.PD_PeriodicityWang_th0_01, - ] - else: - features = [ - Catch22Transformer._DN_HistogramMode_5, - Catch22Transformer._DN_HistogramMode_10, - Catch22Transformer._SB_BinaryStats_diff_longstretch0, - Catch22Transformer._DN_OutlierInclude_p_001_mdrmd, - Catch22Transformer._DN_OutlierInclude_n_001_mdrmd, - Catch22Transformer._CO_f1ecac, - Catch22Transformer._CO_FirstMin_ac, - Catch22Transformer._SP_Summaries_welch_rect_area_5_1, - Catch22Transformer._SP_Summaries_welch_rect_centroid, - Catch22Transformer._FC_LocalSimple_mean3_stderr, - Catch22Transformer._CO_trev_1_num, - Catch22Transformer._CO_HistogramAMI_even_2_5, - Catch22Transformer._IN_AutoMutualInfoStats_40_gaussian_fmmi, - Catch22Transformer._MD_hrv_classic_pnn40, - Catch22Transformer._SB_BinaryStats_mean_longstretch1, - Catch22Transformer._SB_MotifThree_quantile_hh, - Catch22Transformer._FC_LocalSimple_mean1_tauresrat, - Catch22Transformer._CO_Embed2_Dist_tau_d_expfit_meandiff, - Catch22Transformer._SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1, - Catch22Transformer._SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1, - Catch22Transformer._SB_TransitionMatrix_3ac_sumdiagcov, - Catch22Transformer._PD_PeriodicityWang_th0_01, - ] + import pycatch22 + + features = [ + pycatch22.DN_HistogramMode_5, + pycatch22.DN_HistogramMode_10, + pycatch22.SB_BinaryStats_diff_longstretch0, + pycatch22.DN_OutlierInclude_p_001_mdrmd, + pycatch22.DN_OutlierInclude_n_001_mdrmd, + pycatch22.CO_f1ecac, + pycatch22.CO_FirstMin_ac, + pycatch22.SP_Summaries_welch_rect_area_5_1, + pycatch22.SP_Summaries_welch_rect_centroid, + 
pycatch22.FC_LocalSimple_mean3_stderr, + pycatch22.CO_trev_1_num, + pycatch22.CO_HistogramAMI_even_2_5, + pycatch22.IN_AutoMutualInfoStats_40_gaussian_fmmi, + pycatch22.MD_hrv_classic_pnn40, + pycatch22.SB_BinaryStats_mean_longstretch1, + pycatch22.SB_MotifThree_quantile_hh, + pycatch22.FC_LocalSimple_mean1_tauresrat, + pycatch22.CO_Embed2_Dist_tau_d_expfit_meandiff, + pycatch22.SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1, + pycatch22.SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1, + pycatch22.SB_TransitionMatrix_3ac_sumdiagcov, + pycatch22.PD_PeriodicityWang_th0_01, + ] c22_list = Parallel( n_jobs=threads_to_use, backend=self.parallel_backend, prefer="threads" )( - delayed( - self._transform_case_pycatch22 - if self.use_pycatch22 - else self._transform_case - )( + delayed(self._transform_case_pycatch22)( X[i], f_idx, features, @@ -260,96 +205,6 @@ def transform(self, X, y=None): return np.array(c22_list) - def _transform_case(self, X, f_idx, features): - c22 = np.zeros(len(f_idx) * len(X)) - - if hasattr(self, "_transform_features") and len( - self._transform_features - ) == len(c22): - transform_feature = self._transform_features - else: - transform_feature = [True] * len(c22) - - f_count = -1 - for i, series in enumerate(X): - dim = i * len(f_idx) - outlier_series = None - smin = None - smax = None - smean = None - fft = None - ac = None - acfz = None - - for n, feature in enumerate(f_idx): - f_count += 1 - if not transform_feature[f_count]: - continue - - args = [series] - - if feature == 0 or feature == 1 or feature == 11: - if smin is None: - smin = numba_min(series) - if smax is None: - smax = numba_max(series) - args = [series, smin, smax] - elif feature == 2 or feature == 22: - if smean is None: - smean = mean(series) - args = [series, smean] - elif feature == 3 or feature == 4: - if self.outlier_norm: - if smean is None: - smean = mean(series) - if outlier_series is None: - outlier_series = z_normalise_series_with_mean(series, smean) - args = [outlier_series] - else: - args = [series] - elif feature == 7 or feature == 8: - if smean is None: - smean = mean(series) - if fft is None: - nfft = int( - np.power(2, np.ceil(np.log(len(series)) / np.log(2))) - ) - fft = np.fft.fft(series - smean, n=nfft) - args = [series, fft] - elif feature == 5 or feature == 6 or feature == 12: - if smean is None: - smean = mean(series) - if fft is None: - nfft = int( - np.power(2, np.ceil(np.log(len(series)) / np.log(2))) - ) - fft = np.fft.fft(series - smean, n=nfft) - if ac is None: - ac = _autocorr(series, fft) - args = [ac] - elif feature == 16 or feature == 17 or feature == 20: - if smean is None: - smean = mean(series) - if fft is None: - nfft = int( - np.power(2, np.ceil(np.log(len(series)) / np.log(2))) - ) - fft = np.fft.fft(series - smean, n=nfft) - if ac is None: - ac = _autocorr(series, fft) - if acfz is None: - acfz = _ac_first_zero(ac) - args = [series, acfz] - - if feature == 22: - c22[dim + n] = smean - elif feature == 23: - c22[dim + n] = np.std(series) - else: - c22[dim + n] = features[feature](*args) - - return c22 - def _transform_case_pycatch22(self, X, f_idx, features): c22 = np.zeros(len(f_idx) * len(X)) @@ -401,7 +256,7 @@ def _more_tags(self) -> dict: return { "X_types": ["np_list", "3darray"], "requires_fit": False, - "optional_dependency": self.use_pycatch22, + "optional_dependency": True, } @classmethod @@ -422,875 +277,7 @@ def get_test_params(cls, parameter_set="default"): `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. 
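With the numba code path removed, the transformer always delegates to the pycatch22 package. A minimal usage sketch, assuming pycatch22 is installed:

from tsml.transformations import Catch22Transformer
from tsml.utils.testing import generate_3d_test_data

X, _ = generate_3d_test_data(n_samples=4, series_length=10, random_state=0)
X_t = Catch22Transformer(replace_nans=True).fit_transform(X)
# one row per instance, 22 features per channel (24 per channel with catch24=True)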
`create_test_instance` uses the first (or only) dictionary in `params` """ - return { - "use_pycatch22": False, - } - - @staticmethod - def _DN_HistogramMode_5(X, smin, smax): - # Mode of z-scored distribution (5-bin histogram). - return _histogram_mode(X, 5, smin, smax) - - @staticmethod - def _DN_HistogramMode_10(X, smin, smax): - # Mode of z-scored distribution (10-bin histogram). - return _histogram_mode(X, 10, smin, smax) - - @staticmethod - @njit(fastmath=True, cache=True) - def _SB_BinaryStats_diff_longstretch0(X, smean): - # Longest period of consecutive values above the mean. - mean_binary = np.zeros(len(X)) - for i in range(len(X)): - if X[i] - smean > 0: - mean_binary[i] = 1 - - return _long_stretch(mean_binary, 1) - - @staticmethod - def _DN_OutlierInclude_p_001_mdrmd(X): - # Time intervals between successive extreme events above the mean. - return _outlier_include(X) - - @staticmethod - @njit(fastmath=True, cache=True) - def _DN_OutlierInclude_n_001_mdrmd(X): - # Time intervals between successive extreme events below the mean. - return _outlier_include(-X) - - @staticmethod - @njit(fastmath=True, cache=True) - def _CO_f1ecac(X_ac): - # First 1/e crossing of autocorrelation function. - threshold = 0.36787944117144233 # 1 / np.exp(1) - for i in range(1, len(X_ac)): - if (X_ac[i - 1] - threshold) * (X_ac[i] - threshold) < 0: - return i - return len(X_ac) - - @staticmethod - @njit(fastmath=True, cache=True) - def _CO_FirstMin_ac(X_ac): - # First minimum of autocorrelation function. - for i in range(1, len(X_ac) - 1): - if X_ac[i] < X_ac[i - 1] and X_ac[i] < X_ac[i + 1]: - return i - return len(X_ac) - - @staticmethod - def _SP_Summaries_welch_rect_area_5_1(X, X_fft): - # Total power in lowest fifth of frequencies in the Fourier power spectrum. - return _summaries_welch_rect(X, False, X_fft) - - @staticmethod - def _SP_Summaries_welch_rect_centroid(X, X_fft): - # Centroid of the Fourier power spectrum. - return _summaries_welch_rect(X, True, X_fft) - - @staticmethod - @njit(fastmath=True, cache=True) - def _FC_LocalSimple_mean3_stderr(X): - # Mean error from a rolling 3-sample mean forecasting. - if len(X) - 3 < 3: - return 0 - res = _local_simple_mean(X, 3) - return np.std(res) - - @staticmethod - @njit(fastmath=True, cache=True) - def _CO_trev_1_num(X): - # Time-reversibility statistic, ((x_t+1 − x_t)^3)_t. - y = np.zeros(len(X) - 1) - for i in range(len(y)): - y[i] = np.power(X[i + 1] - X[i], 3) - return np.mean(y) - - @staticmethod - @njit(fastmath=True, cache=True) - def _CO_HistogramAMI_even_2_5(X, smin, smax): - # Automutual information, m = 2, τ = 5. - new_min = smin - 0.1 - new_max = smax + 0.1 - bin_width = (new_max - new_min) / 5 - - histogram = np.zeros((5, 5)) - sumx = np.zeros(5) - sumy = np.zeros(5) - v = 1.0 / (len(X) - 2) if len(X) > 2 else 0.0 - for i in range(len(X) - 2): - idx1 = int((X[i] - new_min) / bin_width) - idx2 = int((X[i + 2] - new_min) / bin_width) - - histogram[idx1][idx2] += v - sumx[idx1] += v - sumy[idx2] += v - - nsum = 0 - for i in range(5): - for n in range(5): - if histogram[i][n] > 0: - nsum += histogram[i][n] * np.log( - histogram[i][n] / sumx[i] / sumy[n] - ) - - return nsum - - @staticmethod - @njit(fastmath=True, cache=True) - def _IN_AutoMutualInfoStats_40_gaussian_fmmi(X_ac): - # First minimum of the automutual information function. 
- tau = int(min(40, np.ceil(len(X_ac) / 2))) - - diffs = np.zeros(tau - 1) - prev = -0.5 * np.log(1 - np.power(X_ac[1], 2)) - for i in range(len(diffs)): - corr = -0.5 * np.log(1 - np.power(X_ac[i + 2], 2)) - diffs[i] = corr - prev - prev = corr - - for i in range(len(diffs) - 1): - if diffs[i] * diffs[i + 1] < 0 and diffs[i] < 0: - return i + 1 - - return tau - - @staticmethod - @njit(fastmath=True, cache=True) - def _MD_hrv_classic_pnn40(X): - # Proportion of successive differences exceeding 0.04σ (Mietus 2002). - diffs = np.zeros(len(X) - 1) - for i in range(len(diffs)): - diffs[i] = np.abs(X[i + 1] - X[i]) * 1000 - - nsum = 0 - for diff in diffs: - if diff > 40: - nsum += 1 - - return nsum / len(diffs) - - @staticmethod - @njit(fastmath=True, cache=True) - def _SB_BinaryStats_mean_longstretch1(X): - # Longest period of successive incremental decreases. - diff_binary = np.zeros(len(X) - 1) - for i in range(len(diff_binary)): - if X[i + 1] - X[i] >= 0: - diff_binary[i] = 1 - - return _long_stretch(diff_binary, 0) - - @staticmethod - @njit(fastmath=True, cache=True) - def _SB_MotifThree_quantile_hh(X): - # Shannon entropy of two successive letters in equiprobable 3-letter - # symbolization. - indicies = np.argsort(X) - bins = np.zeros(len(X)) - q1 = int(len(X) / 3) - q2 = q1 * 2 - l1 = np.zeros(q1, dtype=np.int_) - for i in range(q1): - l1[i] = indicies[i] - l2 = np.zeros(q1, dtype=np.int_) - c1 = 0 - for i in range(q1, q2): - bins[indicies[i]] = 1 - l2[c1] = indicies[i] - c1 += 1 - l3 = np.zeros(len(indicies) - q2, dtype=np.int_) - c2 = 0 - for i in range(q2, len(indicies)): - bins[indicies[i]] = 2 - l3[c2] = indicies[i] - c2 += 1 - - found_last = False - nsum = 0 - for i in range(3): - if i == 0: - o = l1 - elif i == 1: - o = l2 - else: - o = l3 - - if not found_last: - for n in range(len(o)): - if o[n] == len(X) - 1: - o = np.delete(o, n) - break - - for n in range(3): - nsum2 = 0 - - for v in o: - if bins[v + 1] == n: - nsum2 += 1 - - if nsum2 > 0: - nsum2 /= len(X) - 1 - nsum += nsum2 * np.log(nsum2) - - return -nsum - - @staticmethod - def _FC_LocalSimple_mean1_tauresrat(X, acfz): - # Change in correlation length after iterative differencing. - if len(X) < 2: - return 0 - res = _local_simple_mean(X, 1) - mean = np.mean(res) - - nfft = int(np.power(2, np.ceil(np.log(len(res)) / np.log(2)))) - fft = np.fft.fft(res - mean, n=nfft) - ac = _autocorr(res, fft) - - return _ac_first_zero(ac) / acfz - - @staticmethod - @njit(fastmath=True, cache=True) - def _CO_Embed2_Dist_tau_d_expfit_meandiff(X, acfz): - # Exponential fit to successive distances in 2-d embedding space. 
- tau = acfz - if tau > len(X) / 10: - tau = int(len(X) / 10) - - d = np.zeros(len(X) - tau - 1) - d_mean = 0 - for i in range(len(d)): - n = np.sqrt( - np.power(X[i + 1] - X[i], 2) + np.power(X[i + tau + 1] - X[i + tau], 2) - ) - d[i] = n - d_mean += n - d_mean /= len(X) - tau - 1 - - smin = np.min(d) - smax = np.max(d) - srange = smax - smin - std = np.std(d) - - if std == 0: - return np.nan - - num_bins = int( - np.ceil(srange / (3.5 * np.std(d) / np.power(len(d), 0.3333333333333333))) - ) - - if num_bins == 0: - return np.nan - bin_width = srange / num_bins - - histogram = np.zeros(num_bins) - for val in d: - idx = int((val - smin) / bin_width) - if idx >= num_bins: - idx = num_bins - 1 - histogram[idx] += 1 - - sum = 0 - for i in range(num_bins): - center = ((smin + bin_width * i) * 2 + bin_width) / 2 - n = np.exp(-center / d_mean) / d_mean - if n < 0: - n = 0 - - sum += np.abs(histogram[i] / len(d) - n) - - return sum / num_bins - - @staticmethod - @njit(fastmath=True, cache=True) - def _SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1(X): - # Proportion of slower timescale fluctuations that scale with DFA (50% - # sampling). - cs = np.zeros(int(len(X) / 2)) - cs[0] = X[0] - for i in range(1, len(cs)): - cs[i] = cs[i - 1] + X[i * 2] - - return _fluct_prop(cs, len(X), True) - - @staticmethod - @njit(fastmath=True, cache=True) - def _SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1(X): - # Proportion of slower timescale fluctuations that scale with linearly rescaled - # range fits. - cs = np.zeros(len(X)) - cs[0] = X[0] - for i in range(1, len(X)): - cs[i] = cs[i - 1] + X[i] - - return _fluct_prop(cs, len(X), False) - - @staticmethod - @njit(fastmath=True, cache=True) - def _SB_TransitionMatrix_3ac_sumdiagcov(X, acfz): - # Trace of covariance of transition matrix between symbols in 3-letter - # alphabet. - ds = np.zeros(int((len(X) - 1) / acfz + 1)) - for i in range(len(ds)): - ds[i] = X[i * acfz] - indicies = np.argsort(ds) - - bins = np.zeros(len(ds), dtype=np.int32) - q1 = int(len(ds) / 3) - q2 = q1 * 2 - for i in range(q1 + 1, q2 + 1): - bins[indicies[i]] = 1 - for i in range(q2 + 1, len(indicies)): - bins[indicies[i]] = 2 - - t = np.zeros((3, 3)) - for i in range(len(ds) - 1): - t[bins[i + 1]][bins[i]] += 1 - t /= len(ds) - 1 - - means = np.zeros(3) - for i in range(3): - means[i] = np.mean(t[i]) - - cov = np.zeros((3, 3)) - for i in range(3): - for n in range(3): - covariance = 0 - for j in range(3): - covariance += (t[i][j] - means[i]) * (t[n][j] - means[n]) - covariance /= 2 - - cov[i][n] = covariance - cov[n][i] = covariance - - ssum = 0 - for i in range(3): - ssum += cov[i][i] - - return ssum - - @staticmethod - @njit(fastmath=True, cache=True) - def _PD_PeriodicityWang_th0_01(X): - # Periodicity measure of (Wang et al. 2007). 
- y_spline = _spline_fit(X) - - y_sub = np.zeros(len(X)) - for i in range(len(X)): - y_sub[i] = X[i] - y_spline[i] - - acmax = int(np.ceil(len(X) / 3.0)) - acf = np.zeros(acmax) - for tau in range(1, acmax + 1): - covariance = 0 - for i in range(len(X) - tau): - covariance += y_sub[i] * y_sub[i + tau] - acf[tau - 1] = covariance / (len(X) - tau) - - troughs = np.zeros(acmax, dtype=np.int32) - peaks = np.zeros(acmax, dtype=np.int32) - n_troughs = 0 - n_peaks = 0 - for i in range(1, acmax - 1): - slope_in = acf[i] - acf[i - 1] - slope_out = acf[i + 1] - acf[i] - - if slope_in < 0 and slope_out > 0: - troughs[n_troughs] = i - n_troughs += 1 - elif slope_in > 0 and slope_out < 0: - peaks[n_peaks] = i - n_peaks += 1 - - out = 0 - for i in range(n_peaks): - j = -1 - while troughs[j + 1] < peaks[i] and j + 1 < n_troughs: - j += 1 - - if j == -1 or acf[peaks[i]] - acf[troughs[j]] < 0.01 or acf[peaks[i]] < 0: - continue - - out = peaks[i] - break - - return out - - -@njit(fastmath=True, cache=True) -def _histogram_mode(X, num_bins, smin, smax): - bin_width = (smax - smin) / num_bins - - if bin_width == 0: - return np.nan - - histogram = np.zeros(num_bins) - for val in X: - idx = int((val - smin) / bin_width) - idx = num_bins - 1 if idx >= num_bins else idx - histogram[idx] += 1 - - edges = np.zeros(num_bins + 1, dtype=np.float32) - for i in range(len(edges)): - edges[i] = i * bin_width + smin - - max_count = 0 - num_maxs = 1 - max_sum = 0 - for i in range(num_bins): - v = (edges[i] + edges[i + 1]) / 2 - if histogram[i] > max_count: - max_count = histogram[i] - num_maxs = 1 - max_sum = v - elif histogram[i] == max_count: - num_maxs += 1 - max_sum += v - - return max_sum / num_maxs - - -@njit(fastmath=True, cache=True) -def _long_stretch(X_binary, val): - last_val = 0 - max_stretch = 0 - for i in range(len(X_binary)): - if X_binary[i] != val or i == len(X_binary) - 1: - stretch = i - last_val - if stretch > max_stretch: - max_stretch = stretch - last_val = i - - return max_stretch - - -@njit(fastmath=True, cache=True) -def _outlier_include(X): - total = 0 - threshold = 0 - for v in X: - if v >= 0: - total += 1 - if v > threshold: - threshold = v - - if threshold < 0.01: - return 0 - - num_thresholds = int(threshold / 0.01) + 1 - means = np.zeros(num_thresholds) - dists = np.zeros(num_thresholds) - medians = np.zeros(num_thresholds) - for i in range(num_thresholds): - d = i * 0.01 - - count = 0 - r = np.zeros(len(X)) - for n in range(len(X)): - if X[n] >= d: - r[count] = n + 1 - count += 1 - - if count == 0: - continue - - diff = np.zeros(count - 1) - for n in range(len(diff)): - diff[n] = r[n + 1] - r[n] - - means[i] = np.mean(diff) if len(diff) > 0 else 9999999999 - dists[i] = len(diff) * 100 / total - medians[i] = np.median(r[:count]) / (len(X) / 2) - 1 - - mj = 0 - fbi = num_thresholds - 1 - for i in range(num_thresholds): - if dists[i] > 2: - mj = i - if means[i] == 9999999999: - fbi = num_thresholds - 1 - i - - trim_limit = max(mj, fbi) - - return np.median(medians[: trim_limit + 1]) - - -def _autocorr(X, X_fft): - ca = np.fft.ifft(_multiply_complex_arr(X_fft)) - return _get_acf(X, ca) - - -@njit(fastmath=True, cache=True) -def _multiply_complex_arr(X_fft): - c = np.zeros(len(X_fft), dtype=np.complex128) - for i, n in enumerate(X_fft): - c[i] = n * (n.real + 1j * -n.imag) - return c - - -@njit(fastmath=True, cache=True) -def _get_acf(X, ca): - acf = np.zeros(len(X)) - if ca[0].real != 0: - for i in range(len(X)): - acf[i] = ca[i].real / ca[0].real - return acf - - -@njit(fastmath=True, 
cache=True) -def _summaries_welch_rect(X, centroid, X_fft): - new_length = int(len(X_fft) / 2) + 1 - p = np.zeros(new_length) - pi2 = 2 * math.pi - p[0] = (np.power(_complex_magnitude(X_fft[0]), 2) / len(X)) / pi2 - for i in range(1, new_length - 1): - p[i] = ((np.power(_complex_magnitude(X_fft[i]), 2) / len(X)) * 2) / pi2 - p[new_length - 1] = ( - np.power(_complex_magnitude(X_fft[new_length - 1]), 2) / len(X) - ) / pi2 - - w = np.zeros(new_length) - a = 1.0 / len(X_fft) - for i in range(0, new_length): - w[i] = i * a * math.pi * 2 - - if centroid: - cs = np.zeros(new_length) - cs[0] = p[0] - for i in range(1, new_length): - cs[i] = cs[i - 1] + p[i] - - threshold = cs[new_length - 1] / 2 - for i in range(1, new_length): - if cs[i] > threshold: - return w[i] - return np.nan - else: - tau = int(np.floor(new_length / 5)) - nsum = 0 - for i in range(tau): - nsum += p[i] - - return nsum * (w[1] - w[0]) - - -@njit(fastmath=True, cache=True) -def _complex_magnitude(c): - return np.sqrt(c.real * c.real + c.imag * c.imag) - - -@njit(fastmath=True, cache=True) -def _local_simple_mean(X, train_length): - res = np.zeros(len(X) - train_length) - for i in range(len(res)): - nsum = 0 - for n in range(train_length): - nsum += X[i + n] - res[i] = X[i + train_length] - nsum / train_length - return res - - -@njit(fastmath=True, cache=True) -def _ac_first_zero(X_ac): - for i in range(1, len(X_ac)): - if X_ac[i] <= 0: - return i - - return len(X_ac) - - -@njit(fastmath=True, cache=True) -def _fluct_prop(X, og_length, dfa): - a = np.zeros(50, dtype=np.int_) - a[0] = 5 - n_tau = 1 - smin = 1.6094379124341003 # Math.log(5); - smax = np.log(og_length / 2) - inc = (smax - smin) / 49 - for i in range(1, 50): - val = int(np.round(np.exp(smin + inc * i) + 0.000000000001)) - if val != a[n_tau - 1]: - a[n_tau] = val - n_tau += 1 - - if n_tau < 12: - return np.nan - - f = np.zeros(n_tau) - for i in range(n_tau): - tau = a[i] - buff_size = int(len(X) / tau) - lag = 0 - if buff_size == 0: - buff_size = 1 - lag = 1 - - buffer = np.zeros((buff_size, tau)) - count = 0 - for n in range(buff_size): - for j in range(tau - lag): - buffer[n][j] = X[count] - count += 1 - - d = np.zeros(tau) - for n in range(tau): - d[n] = n + 1 - - for n in range(buff_size): - c1, c2 = _linear_regression(d, buffer[n], tau, 0) - - for j in range(tau): - buffer[n][j] = buffer[n][j] - (c1 * (j + 1) + c2) - - if dfa: - for j in range(tau): - f[i] += buffer[n][j] * buffer[n][j] - else: - f[i] += np.power(np.max(buffer[n]) - np.min(buffer[n]), 2) - - if dfa: - f[i] = np.sqrt(f[i] / (buff_size * tau)) - else: - f[i] = np.sqrt(f[i] / buff_size) - - log_a = np.zeros(n_tau) - log_f = np.zeros(n_tau) - for i in range(n_tau): - log_a[i] = np.log(a[i]) - log_f[i] = np.log(f[i]) - - sserr = np.zeros(n_tau - 11) - for i in range(6, n_tau - 5): - c1_1, c1_2 = _linear_regression(log_a, log_f, i, 0) - c2_1, c2_2 = _linear_regression(log_a, log_f, n_tau - i + 1, i - 1) - - sum1 = 0 - for n in range(i): - sum1 += np.power(log_a[n] * c1_1 + c1_2 - log_f[n], 2) - sserr[i - 6] += np.sqrt(sum1) - - sum2 = 0 - for n in range(n_tau - i + 1): - sum2 += np.power(log_a[n + i - 1] * c2_1 + c2_2 - log_f[n + i - 1], 2) - sserr[i - 6] += np.sqrt(sum2) - - return (np.argmin(sserr) + 6) / n_tau - - -@njit(fastmath=True, cache=True) -def _linear_regression(X, y, n, lag): - sumx = 0 - sumx2 = 0 - sumxy = 0 - sumy = 0 - for i in range(lag, n + lag): - sumx += X[i] - sumx2 += X[i] * X[i] - sumxy += X[i] * y[i] - sumy += y[i] - - denom = n * sumx2 - sumx * sumx - if denom == 0: - return 
0, 0 - - return (n * sumxy - sumx * sumy) / denom, (sumy * sumx2 - sumx * sumxy) / denom - - -@njit(fastmath=True, cache=True) -def _spline_fit(X): - breaks = np.array([0, len(X) / 2 - 1, len(X) - 1]) - h0 = np.array([breaks[1] - breaks[0], breaks[2] - breaks[1]]) - h_copy = np.array([h0[0], h0[1], h0[0], h0[1]]) - hl = np.array([h_copy[3], h_copy[2], h_copy[1]]) - hr = np.array([h_copy[0], h_copy[1], h_copy[2]]) - - hlCS = np.zeros(3) - hlCS[0] = hl[0] - for i in range(1, 3): - hlCS[i] = hlCS[i - 1] + hl[i] - - bl = np.zeros(3) - for i in range(3): - bl[i] = breaks[0] - hlCS[i] - - hrCS = np.zeros(3) - hrCS[0] = hr[0] - for i in range(1, 3): - hrCS[i] = hrCS[i - 1] + hr[i] - - br = np.zeros(3) - for i in range(3): - br[i] = breaks[2] - hrCS[i] - - breaksExt = np.zeros(9) - for i in range(3): - breaksExt[i] = bl[2 - i] - breaksExt[i + 3] = breaks[i] - breaksExt[i + 6] = br[i] - - hExt = np.zeros(8) - for i in range(8): - hExt[i] = breaksExt[i + 1] - breaksExt[i] - - coeffs = np.zeros((32, 4)) - for i in range(0, 32, 4): - coeffs[i][0] = 1 - - ii = np.zeros((4, 8), dtype=np.int32) - for i in range(8): - ii[0][i] = i - ii[1][i] = min(1 + i, 7) - ii[2][i] = min(2 + i, 7) - ii[3][i] = min(3 + i, 7) - - H = np.zeros(32) - for i in range(32): - H[i] = hExt[ii[i % 4][int(i / 4)]] - - for k in range(1, 4): - for j in range(k): - for u in range(32): - coeffs[u][j] *= H[u] / (k - j) - - Q = np.zeros((4, 8)) - for u in range(32): - for m in range(4): - Q[u % 4][int(u / 4)] += coeffs[u][m] - - for u in range(8): - for m in range(1, 4): - Q[m][u] += Q[m - 1][u] - - for u in range(32): - if u % 4 > 0: - coeffs[u][k] = Q[u % 4 - 1][int(u / 4)] - - fmax = np.zeros(32) - for i in range(8): - for j in range(4): - fmax[i * 4 + j] = Q[3][i] - - for j in range(k + 1): - for u in range(32): - coeffs[u][j] /= max(1e-16, fmax[u]) - - for i in range(29): - for j in range(k + 1): - coeffs[i][j] -= coeffs[3 + i][j] - - for i in range(0, 32, 4): - coeffs[i][k] = 0 - - scale = np.ones(32) - for k in range(3): - for i in range(32): - scale[i] /= max(1e-16, H[i]) - - for i in range(32): - coeffs[i][3 - (k + 1)] *= scale[i] - - jj = np.zeros((4, 2), dtype=np.int32) - for i in range(4): - for j in range(2): - if i == 0: - jj[i][j] = 4 * (1 + j) - else: - jj[i][j] = 3 - - for i in range(1, 4): - for j in range(2): - jj[i][j] += jj[i - 1][j] - - coeffs_out = np.zeros((8, 4)) - for i in range(8): - coeffs_out[i] = coeffs[jj[i % 4][int(i / 4)] - 1] - - xsB = np.zeros(len(X) * 4) - indexB = np.zeros(len(xsB), dtype=np.int32) - breakInd = 1 - for i in range(len(X)): - if i >= breaks[1] and breakInd < 2: - breakInd += 1 - - for j in range(4): - xsB[i * 4 + j] = i - breaks[breakInd - 1] - indexB[i * 4 + j] = j + (breakInd - 1) * 4 - - vB = np.zeros(len(xsB)) - for i in range(len(xsB)): - vB[i] = coeffs_out[indexB[i]][0] - - for i in range(1, 4): - for j in range(len(xsB)): - vB[j] = vB[j] * xsB[j] + coeffs_out[indexB[j]][i] - - A = np.zeros(len(X) * 5) - breakInd = 0 - for i in range(len(xsB)): - if i / 4 >= breaks[1]: - breakInd = 1 - A[i % 4 + breakInd + int(i / 4) * 5] = vB[i] - - AT = np.zeros(len(A)) - ATA = np.zeros(25) - ATb = np.zeros(5) - for i in range(len(X)): - for j in range(5): - AT[j * len(X) + i] = A[i * 5 + j] - - for i in range(5): - for j in range(5): - for k in range(len(X)): - ATA[i * 5 + j] += AT[i * len(X) + k] * A[k * 5 + j] - - for i in range(5): - for k in range(len(X)): - ATb[i] += AT[i * len(X) + k] * X[k] - - AElim = np.zeros((5, 5)) - for i in range(5): - n = i * 5 - AElim[i] = ATA[n : n + 5] - - for 
i in range(5): - for j in range(i + 1, 5): - factor = AElim[j][i] / max(1e-16, AElim[i][i]) - ATb[j] = ATb[j] - factor * ATb[i] - - for k in range(i, 5): - AElim[j][k] = AElim[j][k] - factor * AElim[i][k] - - x = np.zeros(5) - for i in range(4, -1, -1): - bMinusATemp = ATb[i] - for j in range(i + 1, 5): - bMinusATemp -= x[j] * AElim[i][j] - - x[i] = bMinusATemp / max(1e-16, AElim[i][i]) - - C = np.zeros((5, 8)) - for i in range(32): - C[int(i % 4 + int(i / 4) % 2)][int(i / 4)] = coeffs_out[i % 8][int(i / 8)] - - coeffs_spline = np.zeros((2, 4)) - for j in range(8): - coefc = int(j / 2) - coefr = j % 2 - for i in range(5): - coeffs_spline[coefr][coefc] += C[i][j] * x[i] - - y_out = np.zeros(len(X)) - for i in range(len(X)): - second_half = 0 if i < breaks[1] else 1 - y_out[i] = coeffs_spline[second_half][0] - - for i in range(1, 4): - for j in range(len(X)): - second_half = 0 if j < breaks[1] else 1 - y_out[j] = ( - y_out[j] * (j - breaks[1] * second_half) + coeffs_spline[second_half][i] - ) - - return y_out + return {} def _verify_features(features, catch24): diff --git a/tsml/transformations/_fpca.py b/tsml/transformations/_fpca.py index 207912f..71bc12a 100644 --- a/tsml/transformations/_fpca.py +++ b/tsml/transformations/_fpca.py @@ -163,6 +163,9 @@ def transform(self, X): return X_t + def _more_tags(self) -> dict: + return {"optional_dependency": True} + @classmethod def get_test_params(cls, parameter_set="default"): """Return testing parameter settings for the estimator. diff --git a/tsml/transformations/_periodogram.py b/tsml/transformations/_periodogram.py index 91131a6..c230180 100644 --- a/tsml/transformations/_periodogram.py +++ b/tsml/transformations/_periodogram.py @@ -27,7 +27,7 @@ class PeriodogramTransformer(TransformerMixin, BaseTimeSeriesEstimator): options. By default, the series will be padded with zeros. constant_value : int, default=0 The value to use when padding with a constant value. - use_pyfftw : bool, default=True + use_pyfftw : bool, default=False Whether to use the pyfftw library for FFT calculations. Requires the pyfftw package to be installed. n_jobs : int, default=1 @@ -57,7 +57,7 @@ def __init__( pad_series=True, pad_with="constant", constant_value=0, - use_pyfftw=True, + use_pyfftw=False, n_jobs=1, ): self.use_pyfftw = use_pyfftw diff --git a/tsml/transformations/_sfa.py b/tsml/transformations/_sfa.py deleted file mode 100644 index da7c269..0000000 --- a/tsml/transformations/_sfa.py +++ /dev/null @@ -1,1193 +0,0 @@ -"""Symbolic Fourier Approximation (SFA) Transformer. -Configurable SFA transform for discretising time series into words. -""" - -__author__ = ["patrickzib", "MatthewMiddlehurst"] -__all__ = ["SFATransformer"] - -import math -import sys - -import numpy as np -from numba import njit, objmode, prange -from numba.core import types -from numba.typed import Dict -from scipy.sparse import csr_matrix, hstack -from sklearn.base import TransformerMixin -from sklearn.feature_selection import chi2, f_classif -from sklearn.preprocessing import KBinsDiscretizer -from sklearn.tree import DecisionTreeClassifier -from sklearn.utils import check_random_state - -from tsml.base import BaseTimeSeriesEstimator - -binning_methods = {"equi-depth", "equi-width", "information-gain", "kmeans", "quantile"} - - -class SFATransformer(TransformerMixin, BaseTimeSeriesEstimator): - """Symbolic Fourier Approximation (SFA) Transformer. 
- Overview: for each series: - run a sliding window across the series - for each window - shorten the series with DFT - discretise the shortened series into bins set by MFC - form a word from these discrete values - by default SFA produces a single word per series (window_size=0) - if a window is used, it forms a histogram of counts of words. - - Parameters - ---------- - word_length: int, default = 8 - length of word to shorten window to (using PAA) - alphabet_size: int, default = 4 - number of values to discretise each value to - window_size: int, default = 12 - size of window for sliding. Input series - length for whole series transform - norm: boolean, default = False - mean normalise words by dropping first fourier coefficient - binning_method: {"equi-depth", "equi-width", "information-gain", "kmeans", - "quantile"}, - default="equi-depth" - the binning method used to derive the breakpoints. - anova: boolean, default = False - If True, the Fourier coefficient selection is done via a one-way - ANOVA test. If False, the first Fourier coefficients are selected. - Only applicable if labels are given - variance: boolean, default = False - If True, the Fourier coefficient selection is done via the largest - variance. If False, the first Fourier coefficients are selected. - Only applicable if labels are given - save_words: boolean, default = False - whether to save the words generated for each series (default False) - bigrams: boolean, default = False - whether to create bigrams of SFA words - feature_selection: {"chi2", "none", "random"}, default: chi2 - Sets the feature selections strategy to be used. Chi2 reduces the number - of words significantly and is thus much faster (preferred). Random also - reduces the number significantly. None applies not feature selectiona and - yields large bag of words, e.g. much memory may be needed. - p_threshold: int, default=0.05 (disabled by default) - If feature_selection=chi2 is chosen, feature selection is applied based on - the chi-squared test. This is the p-value threshold to use for chi-squared - test on bag-of-words (lower means more strict). 1 indicates that the test - should not be performed. - max_feature_count: int, default=256 - If feature_selection=random is chosen, this parameter defines the number of - randomly chosen unique words used. - skip_grams: boolean, default = False - whether to create skip-grams of SFA words - remove_repeat_words: boolean, default = False - whether to use numerosity reduction (default False) - return_sparse: boolean, default=True - if set to true, a scipy sparse matrix will be returned as BOP model. - If set to false a dense array will be returned as BOP model. Sparse - arrays are much more compact. - n_jobs: int, optional, default = 1 - The number of jobs to run in parallel for both `transform`. - ``-1`` means using all processors. - return_pandas_data_series: boolean, default = False - set to true to return Pandas Series as a result of transform. - setting to true reduces speed significantly but is required for - automatic test. - - Attributes - ---------- - breakpoints: = [] - num_insts = 0 - num_atts = 0 - - References - ---------- - .. [1] Schäfer, Patrick, and Mikael Högqvist. "SFA: a symbolic fourier approximation - and index for similarity search in high dimensional datasets." Proceedings of the - 15th international conference on extending database technology. 2012. 
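The overview above boils down to: slide a window, truncate its DFT to word_length values, bin each value against per-coefficient breakpoints, and pack the resulting letters into an integer word. A rough sketch of that pipeline for a single window, assuming breakpoints is a (word_length, alphabet_size) array learned during fitting:

import numpy as np

def sfa_word(window, breakpoints, word_length=8, letter_bits=2):
    # keep the first word_length Fourier values, interleaving real and imaginary parts
    fft = np.fft.rfft(window)
    coeffs = np.empty(word_length)
    coeffs[0::2] = fft.real[: word_length // 2]
    coeffs[1::2] = fft.imag[: word_length // 2]
    # discretise each coefficient against its own breakpoints and pack the
    # letters into one unsigned integer, letter_bits bits per letter
    word = 0
    for i in range(word_length):
        letter = int(np.digitize(coeffs[i], breakpoints[i], right=True))
        word = (word << letter_bits) | letter
    return word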
- """ - - def __init__( - self, - word_length=8, - alphabet_size=4, - window_size=12, - norm=False, - binning_method="equi-depth", - anova=False, - variance=False, - bigrams=False, - skip_grams=False, - remove_repeat_words=False, - lower_bounding=True, - save_words=False, - dilation=1, - first_difference=False, - feature_selection="none", - sections=1, - max_feature_count=256, - p_threshold=0.05, - random_state=None, - return_sparse=True, - return_pandas_data_series=False, - n_jobs=1, - ): - self.words = [] - self.breakpoints = [] - - # we cannot select more than window_size many letters in a word - self.word_length = word_length - - self.alphabet_size = alphabet_size - self.window_size = window_size - - self.norm = norm - self.lower_bounding = lower_bounding - self.inverse_sqrt_win_size = ( - 1.0 / math.sqrt(window_size) if not lower_bounding else 1.0 - ) - - self.remove_repeat_words = remove_repeat_words - - self.save_words = save_words - - self.binning_method = binning_method - self.anova = anova - self.variance = variance - - self.bigrams = bigrams - self.skip_grams = skip_grams - self.n_jobs = n_jobs - self.sections = sections - - self.n_instances = 0 - self.series_length = 0 - self.letter_bits = 0 - - self.dilation = dilation - self.first_difference = first_difference - - # Feature selection part - self.feature_selection = feature_selection - self.max_feature_count = max_feature_count - self.feature_count = 0 - self.relevant_features = None - - # feature selection is applied based on the chi-squared test. - self.p_threshold = p_threshold - - self.return_sparse = return_sparse - self.return_pandas_data_series = return_pandas_data_series - - self.random_state = random_state - super().__init__() - - if not return_pandas_data_series: - self._output_convert = "off" - - def fit_transform(self, X, y=None): - """Fit to data, then transform it.""" - if self.alphabet_size < 2: - raise ValueError("Alphabet size must be an integer greater than 2") - - if self.binning_method == "information-gain" and y is None: - raise ValueError( - "Class values must be provided for information gain binning" - ) - - if self.variance and self.anova: - raise ValueError( - "Please set either variance or anova Fourier coefficient" " selection" - ) - - if self.binning_method not in binning_methods: - raise TypeError("binning_method must be one of: ", binning_methods) - - offset = 2 if self.norm else 0 - self.word_length_actual = min(self.window_size - offset, self.word_length) - self.dft_length = ( - self.window_size - offset - if (self.anova or self.variance) is True - else self.word_length_actual - ) - # make dft_length an even number (same number of reals and imags) - self.dft_length = self.dft_length + self.dft_length % 2 - self.word_length_actual = self.word_length_actual + self.word_length_actual % 2 - - self.support = np.arange(self.word_length_actual) - self.letter_bits = np.uint32(math.ceil(math.log2(self.alphabet_size))) - # self.word_bits = self.word_length_actual * self.letter_bits - - X = check_X(X, enforce_univariate=True, coerce_to_numpy=True) - X = X.squeeze(1) - - X2, self.X_index = _dilation(X, self.dilation, self.first_difference) - self.n_instances, self.series_length = X2.shape - self.breakpoints = self._binning(X2, y) - self._is_fitted = True - - words, dfts = _transform_case( - X2, - self.window_size, - self.dft_length, - self.word_length_actual, - self.norm, - self.remove_repeat_words, - self.support, - self.anova, - self.variance, - self.breakpoints, - self.letter_bits, - self.bigrams, - 
self.skip_grams, - self.inverse_sqrt_win_size, - self.lower_bounding, - ) - - if self.remove_repeat_words: - words = remove_repeating_words(words) - - if self.save_words: - self.words = words - - # fitting: learns the feature selection strategy, too - bag = self.transform_to_bag(words, self.word_length_actual, y) - - # transform: applies the feature selection strategy - # bag = self.add_level(bag, words) - # bag[bag == 0] = -10 - return bag - - def add_level(self, bag, words): - """Add one pyramid level.""" - empty_dict = Dict.empty( - key_type=types.uint32, - value_type=types.uint32, - ) - - idx = self.X_index[: words.shape[1]] < len(self.X_index) // 2 - - bag_lvl2_l = create_bag_transform( - self.X_index, - self.feature_count, - self.feature_selection, - self.relevant_features if self.relevant_features else empty_dict, - words[:, idx], - self.remove_repeat_words, - self.sections, - )[0] - bag_lvl2_r = create_bag_transform( - self.X_index, - self.feature_count, - self.feature_selection, - self.relevant_features if self.relevant_features else empty_dict, - words[:, ~idx], - self.remove_repeat_words, - self.sections, - )[0] - if type(bag) is np.ndarray: - return np.concatenate([bag, bag_lvl2_l, bag_lvl2_r], axis=1) - else: - return hstack([bag, bag_lvl2_l, bag_lvl2_r]) - - def fit(self, X, y=None): - """Calculate word breakpoints using MCB or IGB. - - Parameters - ---------- - X : pandas DataFrame or 3d numpy array, input time series. - y : array_like, target values (optional, ignored). - - Returns - ------- - self: object - """ - # with parallel_backend("loky", inner_max_num_threads=n_jobs): - self.fit_transform(X, y) - return self - - def transform(self, X, y=None): - """Transform data into SFA words. - - Parameters - ---------- - X : pandas DataFrame or 3d numpy array, input time series. - y : array_like, target values (optional, ignored). 
- - Returns - ------- - List of dictionaries containing SFA words - """ - self.check_is_fitted() - X = check_X(X, enforce_univariate=True, coerce_to_numpy=True) - X = X.squeeze(1) - - X2, self.X_index = _dilation(X, self.dilation, self.first_difference) - words, dfts = _transform_case( - X2, - self.window_size, - self.dft_length, - self.word_length_actual, - self.norm, - self.remove_repeat_words, - self.support, - self.anova, - self.variance, - self.breakpoints, - self.letter_bits, - self.bigrams, - self.skip_grams, - self.inverse_sqrt_win_size, - self.lower_bounding, - ) - - # only save at fit - # if self.save_words: - # self.words = words - - # transform: applies the feature selection strategy - empty_dict = Dict.empty( - key_type=types.uint32, - value_type=types.uint32, - ) - - # transform - bags = create_bag_transform( - self.X_index, - self.feature_count, - self.feature_selection, - self.relevant_features if self.relevant_features else empty_dict, - words, - self.remove_repeat_words, - self.sections, - )[0] - - # bags = self.add_level(bags, words) - - if self.return_pandas_data_series: - bb = pd.DataFrame() - bb[0] = [pd.Series(bag) for bag in bags] - return bb - elif self.return_sparse: - bags = csr_matrix(bags, dtype=np.uint32) - - # bags[bags==0] = -10 - return bags - - def transform_to_bag(self, words, word_len, y=None): - """Transform words to bag-of-pattern and apply feature selection.""" - bag_of_words = None - rng = check_random_state(self.random_state) - - if self.feature_selection == "none" and ( - self.breakpoints.shape[1] <= 2 - and not self.bigrams - and self.word_length <= 8 - ): - bag_of_words = create_bag_none( - self.X_index, - self.breakpoints, - words.shape[0], - words, - word_len, # self.word_length_actual, - self.remove_repeat_words, - sections=self.sections, - ) - else: - feature_names = create_feature_names(words) - - if self.feature_selection == "none": - feature_count = len(list(feature_names)) - relevant_features_idx = np.arange(feature_count, dtype=np.uint32) - bag_of_words, self.relevant_features = create_bag_feature_selection( - self.X_index, - words.shape[0], - relevant_features_idx, - np.array(list(feature_names)), - words, - self.remove_repeat_words, - self.sections, - ) - - # Random feature selection - elif self.feature_selection == "random": - feature_count = min(self.max_feature_count, len(feature_names)) - relevant_features_idx = rng.choice( - len(feature_names), replace=False, size=feature_count - ) - bag_of_words, self.relevant_features = create_bag_feature_selection( - self.X_index, - words.shape[0], - relevant_features_idx, - np.array(list(feature_names)), - words, - self.remove_repeat_words, - self.sections, - ) - - # Chi-squared feature selection - elif self.feature_selection == "chi2": - feature_names_array = np.array(list(feature_names)) - feature_count = len(feature_names_array) - relevant_features_idx = np.arange(feature_count, dtype=np.uint32) - bag_of_words, _ = create_bag_feature_selection( - self.X_index, - words.shape[0], - relevant_features_idx, - feature_names_array, - words, - self.remove_repeat_words, - self.sections, - ) - - chi2_statistics, p = chi2(bag_of_words, y) - relevant_features_idx = np.argsort(p)[: self.max_feature_count] - # relevant_features_idx = np.where(p <= self.p_threshold)[0] - - # select subset of features - bag_of_words = bag_of_words[:, relevant_features_idx] - - relevant_features_idx = relevant_features_idx[ - relevant_features_idx < len(feature_names_array) - ] - self.relevant_features = Dict.empty( - 
key_type=types.uint32, value_type=types.uint32 - ) - for k, v in zip( - feature_names_array[relevant_features_idx], - np.arange(len(relevant_features_idx)), - ): - self.relevant_features[k] = v - - self.feature_count = bag_of_words.shape[1] - - if self.return_pandas_data_series: - bb = pd.DataFrame() - bb[0] = [pd.Series(bag) for bag in bag_of_words] - return bb - elif self.return_sparse: - bag_of_words = csr_matrix(bag_of_words, dtype=np.uint32) - return bag_of_words - - def _binning(self, X, y=None): - dft = _binning_dft( - X, - self.window_size, - self.series_length, - self.dft_length, - self.norm, - self.inverse_sqrt_win_size, - self.lower_bounding, - ) - - if y is not None: - y = np.repeat(y, dft.shape[0] / len(y)) - - if self.variance and y is not None: - # determine variance - dft_variance = np.var(dft, axis=0) - - # select word-length-many indices with the largest variance - self.support = np.argsort(-dft_variance)[: self.word_length_actual] - - # sort remaining indices - self.support = np.sort(self.support) - - # select the Fourier coefficients with highest f-score - dft = dft[:, self.support] - self.dft_length = np.max(self.support) + 1 - self.dft_length = self.dft_length + self.dft_length % 2 # even - - if self.anova and y is not None: - non_constant = np.where( - ~np.isclose(dft.var(axis=0), np.zeros_like(dft.shape[1])) - )[0] - - # select word-length many indices with best f-score - if self.word_length_actual <= non_constant.size: - f, _ = f_classif(dft[:, non_constant], y) - self.support = non_constant[np.argsort(-f)][: self.word_length_actual] - - # sort remaining indices - self.support = np.sort(self.support) - - # select the Fourier coefficients with highest f-score - dft = dft[:, self.support] - self.dft_length = np.max(self.support) + 1 - self.dft_length = self.dft_length + self.dft_length % 2 # even - - if self.binning_method == "information-gain": - return self._igb(dft, y) - elif self.binning_method == "kmeans" or self.binning_method == "quantile": - return self._k_bins_discretizer(dft) - else: - return self._mcb(dft) - - def _k_bins_discretizer(self, dft): - encoder = KBinsDiscretizer( - n_bins=self.alphabet_size, strategy=self.binning_method - ) - encoder.fit(dft) - if encoder.bin_edges_.ndim == 1: - breaks = encoder.bin_edges_.reshape((-1, 1)) - else: - breaks = encoder.bin_edges_ - breakpoints = np.zeros((self.word_length_actual, self.alphabet_size)) - - for letter in range(self.word_length_actual): - for bp in range(1, len(breaks[letter]) - 1): - breakpoints[letter, bp - 1] = breaks[letter, bp] - - breakpoints[:, self.alphabet_size - 1] = sys.float_info.max - return breakpoints - - def _mcb(self, dft): - breakpoints = np.zeros((self.word_length_actual, self.alphabet_size)) - - dft = np.round(dft, 2) - for letter in range(self.word_length_actual): - column = np.sort(dft[:, letter]) - bin_index = 0 - - # use equi-depth binning - if self.binning_method == "equi-depth": - target_bin_depth = len(dft) / self.alphabet_size - - for bp in range(self.alphabet_size - 1): - bin_index += target_bin_depth - breakpoints[letter, bp] = column[int(bin_index)] - - # use equi-width binning aka equi-frequency binning - elif self.binning_method == "equi-width": - target_bin_width = (column[-1] - column[0]) / self.alphabet_size - - for bp in range(self.alphabet_size - 1): - breakpoints[letter, bp] = (bp + 1) * target_bin_width + column[0] - - breakpoints[:, self.alphabet_size - 1] = sys.float_info.max - return breakpoints - - def _igb(self, dft, y): - breakpoints = 
np.zeros((self.word_length_actual, self.alphabet_size)) - clf = DecisionTreeClassifier( - criterion="entropy", - max_depth=np.uint32(np.log2(self.alphabet_size)), - max_leaf_nodes=self.alphabet_size, - random_state=1, - ) - - for i in range(self.word_length_actual): - clf.fit(dft[:, i][:, None], y) - threshold = clf.tree_.threshold[clf.tree_.children_left != -1] - for bp in range(len(threshold)): - breakpoints[i, bp] = threshold[bp] - for bp in range(len(threshold), self.alphabet_size): - breakpoints[i, bp] = np.inf - - return np.sort(breakpoints, axis=1) - - def _shorten_bags(self, word_len, y): - if self.save_words is False: - raise ValueError( - "Words from transform must be saved using save_word to shorten bags." - ) - if self.bigrams: - raise ValueError("Bigrams are currently not supported.") - if self.variance or self.anova: - raise ValueError( - "Variance or Anova based feature selection is currently not supported." - ) - - # determine the new word-length - new_len = min(word_len, self.word_length_actual) - - # the difference in word-length to shorten the words to - new_len_diff = self.word_length_actual - new_len - - if new_len_diff > 0: - new_words = shorten_words(self.words, new_len_diff, self.letter_bits) - else: - new_words = self.words - - # retrain feature selection-strategy - return self.transform_to_bag(new_words, new_len, y) - - @classmethod - def get_test_params(cls, parameter_set="default"): - """Return testing parameter settings for the estimator. - - Parameters - ---------- - parameter_set : str, default="default" - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict, default = {} - Parameters to create testing instances of the class - Each dict are parameters to construct an "interesting" test instance, i.e., - `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. 
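The default equi-depth strategy used by the MCB binning above places the cut points so that each symbol covers roughly the same number of training values, with the final edge left open-ended. A simplified sketch for a single Fourier coefficient, assuming column holds that coefficient's training values:

import sys
import numpy as np

def equi_depth_breakpoints(column, alphabet_size=4):
    # sort the training values for this coefficient and cut at equal counts
    column = np.sort(column)
    step = len(column) / alphabet_size
    bps = [column[int(step * (bp + 1))] for bp in range(alphabet_size - 1)]
    bps.append(sys.float_info.max)  # last bin is open-ended
    return np.array(bps)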
- `create_test_instance` uses the first (or only) dictionary in `params` - """ - # small window size for testing - params = { - "word_length": 2, - "window_size": 6, - "return_sparse": True, - "return_pandas_data_series": True, - "feature_selection": "chi2", - "alphabet_size": 2, - } - return params - - def set_fitted(self): - """Whether `fit` has been called.""" - self._is_fitted = True - - def __getstate__(self): - """Return state as dictionary for pickling, required for typed Dict objects.""" - state = self.__dict__.copy() - - if type(state["relevant_features"]) == Dict: - state["relevant_features"] = dict(state["relevant_features"]) - return state - - def __setstate__(self, state): - """Set current state using input pickling, required for typed Dict objects.""" - self.__dict__.update(state) - if type(self.relevant_features) == dict: - typed_dict = Dict.empty(key_type=types.uint32, value_type=types.uint32) - for key, value in self.relevant_features.items(): - typed_dict[key] = value - self.relevant_features = typed_dict - - -@njit(fastmath=True, cache=True) -def _binning_dft( - X, - window_size, - series_length, - dft_length, - norm, - inverse_sqrt_win_size, - lower_bounding, -): - num_windows_per_inst = math.ceil(series_length / window_size) - - # Splits individual time series into windows and returns the DFT for each - data = np.zeros((len(X), num_windows_per_inst, window_size)) - for i in prange(len(X)): - for j in range(num_windows_per_inst - 1): - data[i, j] = X[i, window_size * j : window_size * (j + 1)] - - start = series_length - window_size - data[i, -1] = X[i, start:series_length] - - dft = np.zeros((len(X), num_windows_per_inst, dft_length)) - for i in prange(len(X)): - return_val = _fast_fourier_transform( - data[i], norm, dft_length, inverse_sqrt_win_size - ) - dft[i] = return_val - - if lower_bounding: - dft[:, :, 1::2] = dft[:, :, 1::2] * -1 # lower bounding - - return dft.reshape(dft.shape[0] * dft.shape[1], dft_length) - - -@njit(fastmath=True, cache=True) -def _fast_fourier_transform(X, norm, dft_length, inverse_sqrt_win_size): - """Perform a discrete fourier transform using the fast fourier transform. - if self.norm is True, then the first term of the DFT is ignored - Input - ------- - X : The training input samples. 
array-like or sparse matrix of - shape = [n_samps, num_atts] - - Returns - ------- - 1D array of fourier term, real_0,imag_0, real_1, imag_1 etc, length - num_atts or - num_atts-2 if if self.norm is True - """ - # first two are real and imaginary parts - start = 2 if norm else 0 - length = start + dft_length - dft = np.zeros((len(X), length)) # , dtype=np.float64 - - stds = np.zeros(len(X)) - for i in range(len(stds)): - stds[i] = np.std(X[i]) - # stds = np.std(X, axis=1) # not available in numba - stds = np.where(stds < 1e-8, 1, stds) - - with objmode(X_ffts="complex128[:,:]"): - X_ffts = np.fft.rfft(X, axis=1) # complex128 - reals = np.real(X_ffts) # float64[] - imags = np.imag(X_ffts) # float64[] - dft[:, 0::2] = reals[:, 0 : length // 2] - dft[:, 1::2] = imags[:, 0 : length // 2] - dft /= stds.reshape(-1, 1) - dft *= inverse_sqrt_win_size - - return dft[:, start:] - - -@njit(fastmath=True, cache=True) -def _transform_case( - X, - window_size, - dft_length, - word_length, - norm, - remove_repeat_words, - support, - anova, - variance, - breakpoints, - letter_bits, - bigrams, - skip_grams, - inverse_sqrt_win_size, - lower_bounding, -): - dfts = _mft( - X, - window_size, - dft_length, - norm, - support, - anova, - variance, - inverse_sqrt_win_size, - lower_bounding, - ) - - words = generate_words( - dfts, - bigrams, - skip_grams, - window_size, - breakpoints, - word_length, - letter_bits, - ) - - if remove_repeat_words: - words = remove_repeating_words(words) - - return words, dfts - - -@njit(fastmath=True, cache=True) -def remove_repeating_words(words): - for i in range(words.shape[0]): - last_word = 0 - for j in range(words.shape[1]): - if last_word == words[i, j]: - # We encode the repeated words as 0 and remove them - # This is implementged using np.nonzero in numba. 
Thus must be 0 - words[i, j] = 0 - last_word = words[i, j] - - return words - - -@njit(fastmath=True, cache=True) -def _calc_incremental_mean_std(series, end, window_size): - stds = np.zeros(end) - window = series[0:window_size] - series_sum = np.sum(window) - square_sum = np.sum(np.multiply(window, window)) - - r_window_length = 1.0 / window_size - mean = series_sum * r_window_length - buf = math.sqrt(max(square_sum * r_window_length - mean * mean, 0.0)) - stds[0] = buf if buf > 1e-8 else 1 - - for w in range(1, end): - series_sum += series[w + window_size - 1] - series[w - 1] - mean = series_sum * r_window_length - square_sum += ( - series[w + window_size - 1] * series[w + window_size - 1] - - series[w - 1] * series[w - 1] - ) - buf = math.sqrt(max(square_sum * r_window_length - mean * mean, 0.0)) - stds[w] = buf if buf > 1e-8 else 1 - - return stds - - -@njit(fastmath=True, cache=True) -def _get_phis(window_size, length): - phis = np.zeros(length) - i = np.arange(length // 2) - const = 2 * np.pi / window_size - phis[0::2] = np.cos((-i) * const) - phis[1::2] = -np.sin((-i) * const) - return phis - - -@njit(fastmath=True, cache=True) -def generate_words( - dfts, bigrams, skip_grams, window_size, breakpoints, word_length, letter_bits -): - needed_size = dfts.shape[1] - if bigrams: - # allocate memory for bigrams - needed_size += max(0, dfts.shape[1] - window_size) - if skip_grams: - # allocate memory for 2- and 3-skip-grams - needed_size += max(0, 2 * dfts.shape[1] - 5 * window_size) - - words = np.zeros((dfts.shape[0], needed_size), dtype=np.uint32) - - letter_bits = np.uint32(letter_bits) - word_bits = word_length * letter_bits # dfts.shape[2] * letter_bits - - # special case: binary breakpoints - if breakpoints.shape[1] == 2: - vector = np.zeros((breakpoints.shape[0]), dtype=np.float32) - for i in range(breakpoints.shape[0]): - vector[i] = breakpoints.shape[1] ** i - - for a in prange(dfts.shape[0]): - match = (dfts[a] <= breakpoints[:, 0]).astype(np.float32) - words[a, : dfts.shape[1]] = np.dot(match, vector).astype(np.uint32) - - # general case: alphabet-size many breakpoints - else: - for a in prange(dfts.shape[0]): - for i in range(word_length): # range(dfts.shape[2]): - words[a, : dfts.shape[1]] = ( - words[a, : dfts.shape[1]] << letter_bits - ) | np.digitize(dfts[a, :, i], breakpoints[i], right=True) - - # add bigrams - if bigrams: - for a in prange(0, dfts.shape[1] - window_size): - first_word = words[:, a] - second_word = words[:, a + window_size] - words[:, dfts.shape[1] + a] = (first_word << word_bits) | second_word - - # # add 2,3-skip-grams - # if skip_grams: - # for s in range(2, 4): - # for a in range(0, dfts.shape[1] - s * window_size): - # first_word = words[:, a] - # second_word = words[:, a + s * window_size] - # words[:, dfts.shape[1] + a] = (first_word << word_bits) | second_word - - return words - - -@njit(fastmath=True, cache=True) -def _mft( - X, - window_size, - dft_length, - norm, - support, - anova, - variance, - inverse_sqrt_win_size, - lower_bounding, -): - start_offset = 2 if norm else 0 - length = dft_length + start_offset + dft_length % 2 - end = max(1, len(X[0]) - window_size + 1) - - # compute mask for only those indices needed and not all indices - if anova or variance: - support = support + start_offset - indices = np.full(length, False) - mask = np.full(length, False) - for s in support: - indices[s] = True - mask[s] = True - if (s % 2) == 0: # even - indices[s + 1] = True - else: # uneven - indices[s - 1] = True - mask = mask[indices] - else: - 
indices = np.full(length, True) - - phis = _get_phis(window_size, length) - transformed = np.zeros((X.shape[0], end, length)) - - # 1. First run using DFT - with objmode(X_ffts="complex128[:,:]"): - X_ffts = np.fft.rfft(X[:, :window_size], axis=1) # complex128 - reals = np.real(X_ffts) # float64[] - imags = np.imag(X_ffts) # float64[] - transformed[:, 0, 0::2] = reals[:, 0 : length // 2] - transformed[:, 0, 1::2] = imags[:, 0 : length // 2] - - # 2. Other runs using MFT - # X2 = X.reshape(X.shape[0], X.shape[1], 1) - # Bugfix to allow for slices on original X like in TEASER - X2 = X.copy().reshape(X.shape[0], X.shape[1], 1) - - # compute only those indices needed and not all - phis2 = phis[indices] - transformed2 = transformed[:, :, indices] - for i in range(1, end): - reals = transformed2[:, i - 1, 0::2] + X2[:, i + window_size - 1] - X2[:, i - 1] - imags = transformed2[:, i - 1, 1::2] - transformed2[:, i, 0::2] = ( - reals * phis2[:length:2] - imags * phis2[1 : (length + 1) : 2] - ) - transformed2[:, i, 1::2] = ( - reals * phis2[1 : (length + 1) : 2] + phis2[:length:2] * imags - ) - - transformed2 = transformed2 * inverse_sqrt_win_size - - if lower_bounding: - transformed2[:, :, 1::2] = transformed2[:, :, 1::2] * -1 - - # compute STDs - stds = np.zeros((X.shape[0], end)) - for a in range(X.shape[0]): - stds[a] = _calc_incremental_mean_std(X[a], end, window_size) - - # divide all by stds and use only the best indices - if anova or variance: - return transformed2[:, :, mask] / stds.reshape(stds.shape[0], stds.shape[1], 1) - else: - return (transformed2 / stds.reshape(stds.shape[0], stds.shape[1], 1))[ - :, :, start_offset: - ] - - -def _dilation(X, d, first_difference): - padding = np.zeros((len(X), 10)) - X = np.concatenate((padding, X, padding), axis=1) - - # adding first order differences - if first_difference: - X = np.diff(X, axis=1, prepend=0) - # X = np.concatenate((X, X2), axis=1) - - # adding dilation - X_dilated = _dilation2(X, d) - - return ( - X_dilated, - _dilation2(np.arange(X_dilated.shape[1], dtype=np.float_).reshape(1, -1), d)[0], - ) - - -@njit(cache=True, fastmath=True) -def _dilation2(X, d): - # dilation on actual data - if d > 1: - start = 0 - data = np.zeros(X.shape, dtype=np.float_) - for i in range(0, d): - curr = X[:, i::d] - end = curr.shape[1] - data[:, start : start + end] = curr - start += end - return data - else: - return X.astype(np.float_) - - -@njit(cache=True, fastmath=True) -def create_feature_names(sfa_words): - feature_names = set() - for t_words in sfa_words: - for t_word in t_words: - feature_names.add(t_word) - return feature_names - - -@njit(fastmath=True, cache=True) -def create_bag_none( - X_index, - breakpoints, - n_instances, - sfa_words, - word_length, - remove_repeat_words, - sections, -): - feature_count = np.uint32(breakpoints.shape[1] ** word_length) - needed_size = feature_count - if sections > 1: - needed_size = 2 * feature_count - all_win_words = np.zeros((n_instances, needed_size), dtype=np.int32) - - for j in prange(sfa_words.shape[0]): - # this mask is used to encode the repeated words - if remove_repeat_words: - masked = np.nonzero(sfa_words[j]) - all_win_words[j, :feature_count] = np.bincount( - sfa_words[j][masked], minlength=feature_count - ) - else: - all_win_words[j, :feature_count] = np.bincount( - sfa_words[j], minlength=feature_count - ) - - # count number of sections the word is present - if sections > 1: - section_count = np.zeros( - (n_instances, feature_count, sections), dtype=np.uint32 - ) - max_index = max(X_index) + 1 
- for j in range(sfa_words.shape[0]): - for i in range(sfa_words.shape[1]): - section_count[ - j, sfa_words[j, i], int(X_index[i] / max_index * sections) - ] = 1 - # all_win_words[j, feature_count + sfa_words[j, i]] = max( - # X_index[i], all_win_words[j, feature_count + sfa_words[j, i]] - # ) - all_win_words[:, feature_count:] = section_count.sum(axis=-1) - - return all_win_words - - -@njit(fastmath=True, cache=True) -def create_bag_feature_selection( - X_index, - n_instances, - relevant_features_idx, - feature_names, - sfa_words, - remove_repeat_words, - sections, -): - relevant_features = Dict.empty(key_type=types.uint32, value_type=types.uint32) - for k, v in zip( - feature_names[relevant_features_idx], - np.arange(len(relevant_features_idx), dtype=np.uint32), - ): - relevant_features[k] = v - - if remove_repeat_words: - if 0 in relevant_features: - del relevant_features[0] - - needed_size = len(relevant_features_idx) - # if sections > 1: - # needed_size = 2 * needed_size - - all_win_words = np.zeros((n_instances, needed_size), dtype=np.int32) - for j in range(sfa_words.shape[0]): - for key in sfa_words[j]: - if key in relevant_features: - all_win_words[j, relevant_features[key]] += 1 - - # count number of sections the word is present - # if sections > 1: - # section_count = np.zeros( - # (sfa_words.shape[0], len(relevant_features_idx), sections), - # dtype=np.uint32 - # ) - # max_index = max(X_index) + 1 - # for j in range(sfa_words.shape[0]): - # for i, key in enumerate(sfa_words[j]): - # if key in relevant_features: - # section_count[ - # j, - # relevant_features[key], - # int(X_index[i] / max_index * sections), - # ] = 1 - # - # all_win_words[:, len(relevant_features_idx) :] = section_count.sum(axis=-1) - - return all_win_words, relevant_features - - -@njit(fastmath=True, cache=True) -def create_bag_transform( - X_index, - feature_count, - feature_selection, - relevant_features, - sfa_words, - remove_repeat_words, - sections, -): - all_win_words = np.zeros((len(sfa_words), feature_count), np.int32) - for j in prange(sfa_words.shape[0]): - if len(relevant_features) == 0 and feature_selection == "none": - # this mask is used to encode the repeated words - if remove_repeat_words: - masked = np.nonzero(sfa_words[j]) - all_win_words[j, :feature_count] = np.bincount( - sfa_words[j][masked], minlength=feature_count - ) - else: - all_win_words[j, :feature_count] = np.bincount( - sfa_words[j], minlength=feature_count - ) - else: - if remove_repeat_words: - if 0 in relevant_features: - del relevant_features[0] - - for _, key in enumerate(sfa_words[j]): - if key in relevant_features: - o = relevant_features[key] - all_win_words[j, o] += 1 - - # count number of sections the word is present - if sections > 1: - if len(relevant_features) == 0 and feature_selection == "none": - section_count = np.zeros( - (sfa_words.shape[0], feature_count // 2, sections), dtype=np.uint32 - ) - max_index = max(X_index) + 1 - for j in range(sfa_words.shape[0]): - for i in range(sfa_words.shape[1]): - section_count[ - j, sfa_words[j, i], int(X_index[i] / max_index * sections) - ] = 1 - - # all_win_words[j, feature_count // 2 + sfa_words[j, i]] = max( - # X_index[i], - # all_win_words[j, feature_count // 2 + sfa_words[j, i]], - # ) - - all_win_words[:, feature_count // 2 :] = section_count.sum(axis=-1) - # else: - # section_count = np.zeros( - # (sfa_words.shape[0], feature_count // 2, sections), dtype=np.uint32 - # ) - # max_index = max(X_index) + 1 - # for j in range(sfa_words.shape[0]): - # for i, key in 
enumerate(sfa_words[j]): - # if key in relevant_features: - # section_count[ - # j, - # relevant_features[key], - # int(X_index[i] / max_index * sections), - # ] = 1 - # - # all_win_words[:, feature_count // 2 :] = section_count.sum(axis=-1) - - return all_win_words, all_win_words.shape[1] - - -@njit(fastmath=True, cache=True) -def shorten_words(words, amount, letter_bits): - new_words = np.zeros((words.shape[0], words.shape[1]), dtype=np.uint32) - - # Unigrams - shift_len = amount * letter_bits - for j in prange(words.shape[1]): - # shorten a word by set amount of letters - new_words[:, j] = words[:, j] >> shift_len - - # TODO Bigrams - # if bigrams: - # for a in range(0, n_instances): - # first_word = new_words[:, a] - # second_word = new_words[:, a + window_size] - # words[:, n_instances + a] = (first_word << word_bits) | second_word - - return new_words diff --git a/tsml/transformations/_shapelet_transform.py b/tsml/transformations/_shapelet_transform.py deleted file mode 100644 index 7548145..0000000 --- a/tsml/transformations/_shapelet_transform.py +++ /dev/null @@ -1,1532 +0,0 @@ -"""Shapelet transformers. - -A transformer from the time domain into the shapelet domain. -""" - -__author__ = ["MatthewMiddlehurst", "baraline"] -__all__ = ["RandomShapeletTransformer", "RandomDilatedShapeletTransformer"] - -import heapq -import math -import time -import warnings - -import numpy as np -from joblib import Parallel -from numba import njit, prange, set_num_threads -from numba.typed.typedlist import List -from sklearn import preprocessing -from sklearn.base import TransformerMixin -from sklearn.preprocessing import LabelEncoder -from sklearn.utils import check_random_state -from sklearn.utils.parallel import delayed -from sklearn.utils.validation import check_is_fitted - -from tsml.base import BaseTimeSeriesEstimator -from tsml.distances import manhattan_distance -from tsml.utils.numba_functions.general import ( - choice_log, - combinations_1d, - get_subsequence, - get_subsequence_with_mean_std, - set_numba_random_seed, - sliding_mean_std_one_series, - z_normalise_series, -) -from tsml.utils.numba_functions.stats import prime_up_to -from tsml.utils.validation import check_n_jobs - - -class RandomShapeletTransformer(TransformerMixin, BaseTimeSeriesEstimator): - """Random Shapelet Transform. - - Implementation of the binary shapelet transform along the lines of [1]_[2]_, with - randomly extracted shapelets. - - Overview: Input "n" series with "d" dimensions of length "m". Continuously extract - candidate shapelets and filter them in batches. - For each candidate shapelet - - Extract a shapelet from an instance with random length, position and - dimension - - Using its distance to train cases, calculate the shapelets information - gain - - Abandon evaluating the shapelet if it is impossible to obtain a higher - information gain than the current worst - For each shapelet batch - - Add each candidate to its classes shapelet heap, removing the lowest - information gain shapelet if the max number of shapelets has been met - - Remove self-similar shapelets from the heap - Using the final set of filtered shapelets, transform the data into a vector of - of distances from a series to each shapelet. - - Parameters - ---------- - n_shapelet_samples : int, default=10000 - The number of candidate shapelets to be considered for the final transform. - Filtered down to <= max_shapelets, keeping the shapelets with the most - information gain. 
- max_shapelets : int or None, default=None - Max number of shapelets to keep for the final transform. Each class value will - have its own max, set to n_classes / max_shapelets. If None uses the min between - 10 * n_instances and 1000 - min_shapelet_length : int, default=3 - Lower bound on candidate shapelet lengths. - max_shapelet_length : int or None, default= None - Upper bound on candidate shapelet lengths. If None no max length is used. - remove_self_similar : boolean, default=True - Remove overlapping "self-similar" shapelets when merging candidate shapelets. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding n_shapelet_samples. - Default of 0 means n_shapelet_samples is used. - contract_max_n_shapelet_samples : int, default=np.inf - Max number of shapelets to extract when time_limit_in_minutes is set. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `transform`. - ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - batch_size : int or None, default=100 - Number of shapelet candidates processed before being merged into the set of best - shapelets. - random_state : int or None, default=None - Seed for random number generation. - - Attributes - ---------- - n_classes_ : int - The number of classes. - n_instances_ : int - The number of train cases. - n_dims_ : int - The number of dimensions per case. - series_length_ : int - The length of each series. - classes_ : list - The classes labels. - shapelets_ : list - The stored shapelets and relating information after a dataset has been - processed. - Each item in the list is a tuple containing the following 7 items: - (shapelet information gain, shapelet length, start position the shapelet was - extracted from, shapelet dimension, index of the instance the shapelet was - extracted from in fit, class value of the shapelet, The z-normalised shapelet - array) - - See Also - -------- - ShapeletTransformClassifier - - Notes - ----- - For the Java version, see - `TSML `_. - - References - ---------- - .. [1] Jon Hills et al., "Classification of time series by shapelet transformation", - Data Mining and Knowledge Discovery, 28(4), 851--881, 2014. - .. [2] A. Bostrom and A. Bagnall, "Binary Shapelet Transform for Multiclass Time - Series Classification", Transactions on Large-Scale Data and Knowledge Centered - Systems, 32, 2017. - - Examples - -------- - >>> from tsml.transformations._shapelet_transform import RandomShapeletTransformer - >>> from tsml.datasets import load_minimal_chinatown - >>> X_train, y_train = load_minimal_chinatown(split="train") - >>> t = RandomShapeletTransformer( - ... n_shapelet_samples=500, - ... max_shapelets=10, - ... batch_size=100, - ... ) - >>> t.fit(X_train, y_train) - RandomShapeletTransformer(...) 
- >>> X_t = t.transform(X_train) - """ - - def __init__( - self, - n_shapelet_samples=10000, - max_shapelets=None, - min_shapelet_length=3, - max_shapelet_length=None, - remove_self_similar=True, - time_limit_in_minutes=0.0, - contract_max_n_shapelet_samples=np.inf, - n_jobs=1, - parallel_backend=None, - batch_size=100, - random_state=None, - ): - self.n_shapelet_samples = n_shapelet_samples - self.max_shapelets = max_shapelets - self.min_shapelet_length = min_shapelet_length - self.max_shapelet_length = max_shapelet_length - self.remove_self_similar = remove_self_similar - - self.time_limit_in_minutes = time_limit_in_minutes - self.contract_max_n_shapelet_samples = contract_max_n_shapelet_samples - - self.n_jobs = n_jobs - self.parallel_backend = parallel_backend - self.batch_size = batch_size - self.random_state = random_state - - super().__init__() - - def fit(self, X, y=None): - """Fit the shapelet transform to a specified X and y. - - Parameters - ---------- - X: pandas DataFrame or np.ndarray - The training input samples. - y: array-like or list - The class values for X. - - Returns - ------- - self : RandomShapeletTransformer - This estimator. - """ - X, y = self._validate_data(X=X, y=y, ensure_min_samples=2) - X = self._convert_X(X) - - self.n_instances_, self.n_dims_, self.series_length_ = X.shape - self.classes_, self._class_counts = np.unique(y, return_counts=True) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, class_val in enumerate(self.classes_): - self.class_dictionary_[class_val] = index - - self._n_jobs = check_n_jobs(self.n_jobs) - - le = preprocessing.LabelEncoder() - y = le.fit_transform(y) - - self._max_shapelets = self.max_shapelets - if self.max_shapelets is None: - self._max_shapelets = min(10 * self.n_instances_, 1000) - if self._max_shapelets < self.n_classes_: - self._max_shapelets = self.n_classes_ - - self._max_shapelet_length = self.max_shapelet_length - if self.max_shapelet_length is None: - self._max_shapelet_length = self.series_length_ - - time_limit = self.time_limit_in_minutes * 60 - start_time = time.time() - fit_time = 0 - - max_shapelets_per_class = int(self._max_shapelets / self.n_classes_) - if max_shapelets_per_class < 1: - max_shapelets_per_class = 1 - - shapelets = List( - [List([(-1.0, -1, -1, -1, -1, -1)]) for _ in range(self.n_classes_)] - ) - n_shapelets_extracted = 0 - - if time_limit > 0: - while ( - fit_time < time_limit - and n_shapelets_extracted < self.contract_max_n_shapelet_samples - ): - candidate_shapelets = Parallel( - n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" - )( - delayed(self._extract_random_shapelet)( - X, - y, - n_shapelets_extracted + i, - shapelets, - max_shapelets_per_class, - ) - for i in range(self.batch_size) - ) - - for i, heap in enumerate(shapelets): - self._merge_shapelets( - heap, - List(candidate_shapelets), - max_shapelets_per_class, - i, - ) - - if self.remove_self_similar: - for i, heap in enumerate(shapelets): - to_keep = self._remove_self_similar_shapelets(heap) - shapelets[i] = List([n for (n, b) in zip(heap, to_keep) if b]) - - n_shapelets_extracted += self.batch_size - fit_time = time.time() - start_time - else: - while n_shapelets_extracted < self.n_shapelet_samples: - n_shapelets_to_extract = ( - self.batch_size - if n_shapelets_extracted + self.batch_size - <= self.n_shapelet_samples - else self.n_shapelet_samples - n_shapelets_extracted - ) - - candidate_shapelets = Parallel( - n_jobs=self._n_jobs, backend=self.parallel_backend, 
prefer="threads" - )( - delayed(self._extract_random_shapelet)( - X, - y, - n_shapelets_extracted + i, - shapelets, - max_shapelets_per_class, - ) - for i in range(n_shapelets_to_extract) - ) - - for i, heap in enumerate(shapelets): - self._merge_shapelets( - heap, - List(candidate_shapelets), - max_shapelets_per_class, - i, - ) - - if self.remove_self_similar: - for i, heap in enumerate(shapelets): - to_keep = self._remove_self_similar_shapelets(heap) - shapelets[i] = List([n for (n, b) in zip(heap, to_keep) if b]) - - n_shapelets_extracted += n_shapelets_to_extract - - self.shapelets_ = [ - ( - s[0], - s[1], - s[2], - s[3], - s[4], - self.classes_[s[5]], - z_normalise_series(X[s[4], s[3], s[2] : s[2] + s[1]]), - ) - for class_shapelets in shapelets - for s in class_shapelets - if s[0] > 0 - ] - self.shapelets_.sort(reverse=True, key=lambda s: (s[0], s[1], s[2], s[3], s[4])) - - to_keep = self._remove_identical_shapelets(List(self.shapelets_)) - self.shapelets_ = [n for (n, b) in zip(self.shapelets_, to_keep) if b] - - self._sorted_indicies = [] - for s in self.shapelets_: - sabs = np.abs(s[6]) - self._sorted_indicies.append( - np.array( - sorted(range(s[1]), reverse=True, key=lambda j, sabs=sabs: sabs[j]) - ) - ) - - return self - - def transform(self, X, y=None): - """Transform X according to the extracted shapelets. - - Parameters - ---------- - X : pandas DataFrame or np.ndarray - The input data to transform. - - Returns - ------- - output : pandas DataFrame - The transformed dataframe in tabular format. - """ - check_is_fitted(self) - - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - output = np.zeros((len(X), len(self.shapelets_))) - - for i, series in enumerate(X): - dists = Parallel( - n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" - )( - delayed(_online_shapelet_distance)( - series[shapelet[3]], - shapelet[6], - self._sorted_indicies[n], - shapelet[2], - shapelet[1], - ) - for n, shapelet in enumerate(self.shapelets_) - ) - - output[i] = dists - - return output - - def _extract_random_shapelet(self, X, y, i, shapelets, max_shapelets_per_class): - rs = 255 if self.random_state == 0 else self.random_state - rs = ( - None - if self.random_state is None - else (rs * 37 * (i + 1)) % np.iinfo(np.int32).max - ) - rng = check_random_state(rs) - - inst_idx = i % self.n_instances_ - cls_idx = int(y[inst_idx]) - worst_quality = ( - shapelets[cls_idx][0][0] - if len(shapelets[cls_idx]) == max_shapelets_per_class - else -1 - ) - - length = ( - rng.randint(0, self._max_shapelet_length - self.min_shapelet_length) - + self.min_shapelet_length - ) - position = rng.randint(0, self.series_length_ - length) - dim = rng.randint(0, self.n_dims_) - - shapelet = z_normalise_series(X[inst_idx, dim, position : position + length]) - sabs = np.abs(shapelet) - sorted_indicies = np.array( - sorted(range(length), reverse=True, key=lambda j: sabs[j]) - ) - - quality = self._find_shapelet_quality( - X, - y, - shapelet, - sorted_indicies, - position, - length, - dim, - inst_idx, - self._class_counts[cls_idx], - self.n_instances_ - self._class_counts[cls_idx], - worst_quality, - ) - - return quality, length, position, dim, inst_idx, cls_idx - - @staticmethod - @njit(fastmath=True, cache=True) - def _find_shapelet_quality( - X, - y, - shapelet, - sorted_indicies, - position, - length, - dim, - inst_idx, - this_cls_count, - other_cls_count, - worst_quality, - ): - # todo optimise this more, we spend 99% of time here - orderline = [] - this_cls_traversed = 0 - 
other_cls_traversed = 0 - - for i, series in enumerate(X): - if i != inst_idx: - distance = _online_shapelet_distance( - series[dim], shapelet, sorted_indicies, position, length - ) - else: - distance = 0 - - if y[i] == y[inst_idx]: - cls = 1 - this_cls_traversed += 1 - else: - cls = -1 - other_cls_traversed += 1 - - orderline.append((distance, cls)) - orderline.sort() - - if worst_quality > 0: - quality = _calc_early_binary_ig( - orderline, - this_cls_traversed, - other_cls_traversed, - this_cls_count - this_cls_traversed, - other_cls_count - other_cls_traversed, - worst_quality, - ) - - if quality <= worst_quality: - return -1 - - quality = _calc_binary_ig(orderline, this_cls_count, other_cls_count) - - return round(quality, 12) - - @staticmethod - @njit(fastmath=True, cache=True) - def _merge_shapelets( - shapelet_heap, candidate_shapelets, max_shapelets_per_class, cls_idx - ): - for shapelet in candidate_shapelets: - if shapelet[5] == cls_idx and shapelet[0] > 0: - if ( - len(shapelet_heap) == max_shapelets_per_class - and shapelet[0] < shapelet_heap[0][0] - ): - continue - - heapq.heappush(shapelet_heap, shapelet) - - if len(shapelet_heap) > max_shapelets_per_class: - heapq.heappop(shapelet_heap) - - @staticmethod - @njit(fastmath=True, cache=True) - def _remove_self_similar_shapelets(shapelet_heap): - to_keep = [True] * len(shapelet_heap) - - for i in range(len(shapelet_heap)): - if to_keep[i] is False: - continue - - for n in range(i + 1, len(shapelet_heap)): - if to_keep[n] and _is_self_similar(shapelet_heap[i], shapelet_heap[n]): - if (shapelet_heap[i][0], shapelet_heap[i][1]) >= ( - shapelet_heap[n][0], - shapelet_heap[n][1], - ): - to_keep[n] = False - else: - to_keep[i] = False - break - - return to_keep - - @staticmethod - @njit(fastmath=True, cache=True) - def _remove_identical_shapelets(shapelets): - to_keep = [True] * len(shapelets) - - for i in range(len(shapelets)): - for n in range(i + 1, len(shapelets)): - if ( - to_keep[n] - and shapelets[i][1] == shapelets[n][1] - and np.array_equal(shapelets[i][6], shapelets[n][6]) - ): - to_keep[n] = False - - return to_keep - - def _more_tags(self) -> dict: - return {"requires_y": True} - - @classmethod - def get_test_params(cls, parameter_set="default"): - """Return testing parameter settings for the estimator. - - Parameters - ---------- - parameter_set : str, default="default" - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict, default = {} - Parameters to create testing instances of the class - Each dict are parameters to construct an "interesting" test instance, i.e., - `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. 
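_merge_shapelets above keeps at most max_shapelets_per_class candidates per class by treating each class's list as a min-heap keyed on information gain, so the weakest shapelet sits at the root and is evicted first. A small standalone sketch of that pattern on toy (quality, id) tuples rather than the real shapelet records:

import heapq

max_per_class = 2
candidates = [(0.42, "s1"), (0.10, "s2"), (0.77, "s3"), (0.55, "s4")]

heap = []
for cand in candidates:
    heapq.heappush(heap, cand)
    if len(heap) > max_per_class:
        heapq.heappop(heap)  # evict the lowest-quality candidate at the root

# only the two strongest candidates survive
assert sorted(heap, reverse=True) == [(0.77, "s3"), (0.55, "s4")]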
- `create_test_instance` uses the first (or only) dictionary in `params` - """ - return {"max_shapelets": 5, "n_shapelet_samples": 50, "batch_size": 20} - - -@njit(fastmath=True, cache=True) -def _online_shapelet_distance(series, shapelet, sorted_indicies, position, length): - subseq = series[position : position + length] - - sum = 0.0 - sum2 = 0.0 - for i in subseq: - sum += i - sum2 += i * i - - mean = sum / length - std = (sum2 - mean * mean * length) / length - if std > 0: - subseq = (subseq - mean) / std - else: - subseq = np.zeros(length) - - best_dist = 0 - for i, n in zip(shapelet, subseq): - temp = i - n - best_dist += temp * temp - - i = 1 - traverse = [True, True] - sums = [sum, sum] - sums2 = [sum2, sum2] - - while traverse[0] or traverse[1]: - for n in range(2): - mod = -1 if n == 0 else 1 - pos = position + mod * i - traverse[n] = pos >= 0 if n == 0 else pos <= len(series) - length - - if not traverse[n]: - continue - - start = series[pos - n] - end = series[pos - n + length] - - sums[n] += mod * end - mod * start - sums2[n] += mod * end * end - mod * start * start - - mean = sums[n] / length - std = math.sqrt((sums2[n] - mean * mean * length) / length) - - dist = 0 - use_std = std != 0 - for j in range(length): - val = (series[pos + sorted_indicies[j]] - mean) / std if use_std else 0 - temp = shapelet[sorted_indicies[j]] - val - dist += temp * temp - - if dist > best_dist: - break - - if dist < best_dist: - best_dist = dist - - i += 1 - - return best_dist if best_dist == 0 else 1 / length * best_dist - - -@njit(fastmath=True, cache=True) -def _calc_early_binary_ig( - orderline, - c1_traversed, - c2_traversed, - c1_to_add, - c2_to_add, - worst_quality, -): - initial_ent = _binary_entropy( - c1_traversed + c1_to_add, - c2_traversed + c2_to_add, - ) - - total_all = c1_traversed + c2_traversed + c1_to_add + c2_to_add - - bsf_ig = 0 - # actual observations in orderline - c1_count = 0 - c2_count = 0 - - # evaluate each split point - for split in range(len(orderline)): - next_class = orderline[split][1] # +1 if this class, -1 if other - if next_class > 0: - c1_count += 1 - else: - c2_count += 1 - - # optimistically add this class to left side first and other to right - left_prop = (split + 1 + c1_to_add) / total_all - ent_left = _binary_entropy(c1_count + c1_to_add, c2_count) - - # because right side must optimistically contain everything else - right_prop = 1 - left_prop - - ent_right = _binary_entropy( - c1_traversed - c1_count, - c2_traversed - c2_count + c2_to_add, - ) - - ig = initial_ent - left_prop * ent_left - right_prop * ent_right - bsf_ig = max(ig, bsf_ig) - - # now optimistically add this class to right, other to left - left_prop = (split + 1 + c2_to_add) / total_all - ent_left = _binary_entropy(c1_count, c2_count + c2_to_add) - - # because right side must optimistically contain everything else - right_prop = 1 - left_prop - - ent_right = _binary_entropy( - c1_traversed - c1_count + c1_to_add, - c2_traversed - c2_count, - ) - - ig = initial_ent - left_prop * ent_left - right_prop * ent_right - bsf_ig = max(ig, bsf_ig) - - if bsf_ig > worst_quality: - return bsf_ig - - return bsf_ig - - -@njit(fastmath=True, cache=True) -def _calc_binary_ig(orderline, c1, c2): - initial_ent = _binary_entropy(c1, c2) - - total_all = c1 + c2 - - bsf_ig = 0 - c1_count = 0 - c2_count = 0 - - # evaluate each split point - for split in range(len(orderline)): - next_class = orderline[split][1] # +1 if this class, -1 if other - if next_class > 0: - c1_count += 1 - else: - c2_count += 1 - - 
left_prop = (split + 1) / total_all - ent_left = _binary_entropy(c1_count, c2_count) - - right_prop = 1 - left_prop - ent_right = _binary_entropy( - c1 - c1_count, - c2 - c2_count, - ) - - ig = initial_ent - left_prop * ent_left - right_prop * ent_right - bsf_ig = max(ig, bsf_ig) - - return bsf_ig - - -@njit(fastmath=True, cache=True) -def _binary_entropy(c1, c2): - ent = 0 - if c1 != 0: - ent -= c1 / (c1 + c2) * np.log2(c1 / (c1 + c2)) - if c2 != 0: - ent -= c2 / (c1 + c2) * np.log2(c2 / (c1 + c2)) - return ent - - -@njit(fastmath=True, cache=True) -def _is_self_similar(s1, s2): - # not self similar if from different series or dimension - if s1[4] == s2[4] and s1[3] == s2[3]: - if s2[2] <= s1[2] <= s2[2] + s2[1]: - return True - if s1[2] <= s2[2] <= s1[2] + s1[1]: - return True - - return False - - -class RandomDilatedShapeletTransformer(TransformerMixin, BaseTimeSeriesEstimator): - """Random Dilated Shapelet Transform (RDST) as described in [1]_[2]_. - - Overview: The input is n series with d channels of length m. First step is to - extract candidate shapelets from the inputs. This is done randomly, and for - each candidate shapelet: - - Length is randomly selected from shapelet_lengths parameter - - Dilation is sampled as a function the shapelet length and time series length - - Normalization is chosen randomly given the probability given as parameter - - Value is sampled randomly from an input time series given the length and - dilation parameter. - - Threshold is randomly chosen between two percentiles of the distribution - of the distance vector between the shapelet and another time series. This time - serie is drawn from the same class if classes are given during fit. Otherwise, - a random sample will be used. If there is only one sample per class, the same - sample will be used. - Then, once the set of shapelets have been initialized, we extract the shapelet - features from each pair of shapelets and input series. Three features are extracted: - - min d(S,X): the minimum value of the distance vector between a shapelet S and - a time series X. - - argmin d(S,X): the location of the minumum. - - SO(d(S,X), threshold): The number of point in the distance vector that are - bellow the threshold parameter of the shapelet. - - This is a duplicate of the original implementation in aeon, adapted for bugfixing - and experimentation. All credit to the original author @baraline for the - implementation. - - Parameters - ---------- - max_shapelets : int, default=10000 - The maximum number of shapelets to keep for the final transformation. - A lower number of shapelets can be kept if alpha similarity have discarded the - whole dataset. - shapelet_lengths : array, default=None - The set of possible length for shapelets. Each shapelet length is uniformly - drawn from this set. If None, the shapelets length will be equal to - min(max(2,series_length//2),11). - proba_normalization : float, default=0.8 - This probability (between 0 and 1) indicate the chance of each shapelet to be - initialized such as it will use a z-normalized distance, inducing either scale - sensitivity or invariance. A value of 1 would mean that all shapelets will use - a z-normalized distance. - distance_function: function - A distance function defined as a numba function with signature as - (x: np.ndarray, y: np.ndarray) -> float. The default distance function is the - manhattan distance. 
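A small worked sketch of the quality measure computed by the _binary_entropy and _calc_binary_ig helpers above: the information gain of one split of the orderline, on toy counts (the numbers are illustrative only):

import numpy as np

def binary_entropy(c1, c2):
    # entropy of a two-class distribution with counts c1 and c2
    ent = 0.0
    if c1 != 0:
        ent -= c1 / (c1 + c2) * np.log2(c1 / (c1 + c2))
    if c2 != 0:
        ent -= c2 / (c1 + c2) * np.log2(c2 / (c1 + c2))
    return ent

# 4 cases of the shapelet's class and 4 others; a split point that places
# 3 of the shapelet's class and 1 other case on the left of the orderline
initial_ent = binary_entropy(4, 4)                       # 1.0
left, right = binary_entropy(3, 1), binary_entropy(1, 3)
ig = initial_ent - 0.5 * left - 0.5 * right              # ~0.19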
- threshold_percentiles : array, default=None - The two percentiles used to select the threshold used to compute the Shapelet - Occurrence feature. If None, the 5th and the 10th percentiles (i.e. [5,10]) - will be used. - alpha_similarity : float, default=0.5 - The strength of the alpha similarity pruning. The higher the value, the lower - the allowed number of common indexes with previously sampled shapelets - when sampling a new candidate with the same dilation parameter. - It can cause the number of sampled shapelets to be lower than max_shapelets if - the whole search space has been covered. The default is 0.5, and the maximum is - 1. Value above it have no effect for now. - use_prime_dilations : bool, default=False - If True, restrict the value of the shapelet dilation parameter to be prime - values. This can greatly speed up the algorithm for long time series and/or - short shapelet length, possibly at the cost of some accuracy. - n_jobs : int, default=1 - The number of threads used for both `fit` and `transform`. - random_state : int or None, default=None - Seed for random number generation. - - Attributes - ---------- - shapelets : list - The stored shapelets. Each item in the list is a tuple containing: - - shapelet values - - length parameter - - dilation parameter - - threshold parameter - - normalization parameter - - mean parameter - - standard deviation parameter - - Notes - ----- - This implementation use all the features for multivariate shapelets, without - affecting a random feature subsets to each shapelet as done in the original - implementation. See `convst - https://github.com/baraline/convst/blob/main/convst/transformers/rdst.py`_. - - References - ---------- - .. [1] Antoine Guillaume et al. "Random Dilated Shapelet Transform: A New Approach - for Time Series Shapelets", Pattern Recognition and Artificial Intelligence. - ICPRAI 2022. - .. [2] Antoine Guillaume, "Time series classification with shapelets: Application - to predictive maintenance on event logs", PhD Thesis, University of Orléans, - 2023. - """ - - _tags = { - "scitype:transform-output": "Primitives", - "fit_is_empty": False, - "univariate-only": False, - "X_inner_mtype": "numpy3D", - "y_inner_mtype": "numpy1D", - "requires_y": False, - "capability:inverse_transform": False, - "handles-missing-data": False, - } - - def __init__( - self, - max_shapelets=10000, - shapelet_lengths=None, - proba_normalization=0.8, - threshold_percentiles=None, - alpha_similarity=0.5, - use_prime_dilations=False, - random_state=None, - n_jobs=1, - ): - self.max_shapelets = max_shapelets - self.shapelet_lengths = shapelet_lengths - self.proba_normalization = proba_normalization - self.threshold_percentiles = threshold_percentiles - self.alpha_similarity = alpha_similarity - self.use_prime_dilations = use_prime_dilations - self.random_state = random_state - self.n_jobs = n_jobs - - super().__init__() - - def fit(self, X, y=None): - """Fit the random dilated shapelet transform to a specified X and y. - - Parameters - ---------- - X: np.ndarray shape (n_instances, n_channels, series_length) - The training input samples. - y: array-like or list, default=None - The class values for X. If not specified, a random sample (i.e. not of the - same class) will be used when computing the threshold for the Shapelet - Occurence feature. - - Returns - ------- - self : RandomDilatedShapeletTransform - This estimator. 
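A minimal usage sketch of the class removed in this hunk, following the fit/transform contract documented above; it only runs on revisions before this deletion, the data is random toy input, and the exact number of output columns depends on how many shapelets survive the alpha-similarity pruning:

import numpy as np
from tsml.transformations._shapelet_transform import RandomDilatedShapeletTransformer

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 1, 50))  # 20 univariate series of length 50
y = np.array([0, 1] * 10)

rdst = RandomDilatedShapeletTransformer(max_shapelets=10, random_state=0)
X_t = rdst.fit_transform(X, y)
# three features (min, argmin, shapelet occurrence) per retained shapelet
print(X_t.shape)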
- """ - X, y = self._validate_data(X=X, y=y, ensure_min_samples=2) - X = self._convert_X(X) - - self._random_state = ( - np.int32(self.random_state) if isinstance(self.random_state, int) else None - ) - - self.n_instances_, self.n_channels_, self.series_length_ = X.shape - - self._check_input_params() - - self._n_jobs = check_n_jobs(self.n_jobs) - set_num_threads(self._n_jobs) - - if y is None: - y = np.zeros(self.n_instances_) - else: - y = LabelEncoder().fit_transform(y) - - if any(self.shapelet_lengths_ > self.series_length_): - raise ValueError( - "Shapelets lengths can't be superior to input length,", - f"but got shapelets_lengths = {self.shapelet_lengths_} ", - f"with input length = {self.series_length_}", - ) - - self.shapelets_ = _random_dilated_shapelet_extraction( - X, - y, - self.max_shapelets, - self.shapelet_lengths_, - self.proba_normalization, - self.threshold_percentiles_, - self.alpha_similarity, - self.use_prime_dilations, - self._random_state, - ) - - return self - - def transform(self, X, y=None): - """Transform X according to the extracted shapelets. - - Parameters - ---------- - X : np.ndarray shape (n_instances, n_channels, series_length) - The input data to transform. - - Returns - ------- - X_new : 2D np.array of shape = (n_instances, 3*n_shapelets) - The transformed data. - """ - check_is_fitted(self) - - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - X_new = _dilated_shapelet_transform(X, self.shapelets_) - return X_new - - def _check_input_params(self): - if isinstance(self.max_shapelets, bool): - raise TypeError( - f"'max_shapelets' must be an integer, got {self.max_shapelets}." - ) - - if not isinstance(self.max_shapelets, (int, np.integer)): - raise TypeError( - f"'max_shapelets' must be an integer, got {self.max_shapelets}." - ) - self.shapelet_lengths_ = self.shapelet_lengths - if self.shapelet_lengths_ is None: - self.shapelet_lengths_ = np.array( - [min(max(2, self.series_length_ // 2), 11)] - ) - else: - if not isinstance(self.shapelet_lengths_, (list, tuple, np.ndarray)): - raise TypeError( - "'shapelet_lengths' must be a list, a tuple or " - "an array (got {}).".format(self.shapelet_lengths_) - ) - - self.shapelet_lengths_ = np.array(self.shapelet_lengths_, dtype=np.int32) - if not np.all(self.shapelet_lengths_ >= 2): - warnings.warn( - "Some values in 'shapelet_lengths' are inferior to 2." - "These values will be ignored.", - stacklevel=2, - ) - self.shapelet_lengths_ = self.shapelet_lengths[ - self.shapelet_lengths_ >= 2 - ] - - if not np.all(self.shapelet_lengths_ <= self.series_length_): - warnings.warn( - "All the values in 'shapelet_lengths' must be lower or equal to" - + "the series length. Shapelet lengths above it will be ignored.", - stacklevel=2, - ) - self.shapelet_lengths_ = self.shapelet_lengths_[ - self.shapelet_lengths_ <= self.series_length_ - ] - - if len(self.shapelet_lengths_) == 0: - raise ValueError( - "Shapelet lengths array is empty, did you give shapelets lengths" - " superior to the size of the series ?" 
- ) - - self.threshold_percentiles_ = self.threshold_percentiles - if self.threshold_percentiles_ is None: - self.threshold_percentiles_ = np.array([5, 10]) - else: - if not isinstance(self.threshold_percentiles_, (list, tuple, np.ndarray)): - raise TypeError( - "Expected a list, numpy array or tuple for threshold_percentiles" - ) - if len(self.threshold_percentiles_) != 2: - raise ValueError( - "The threshold_percentiles param should be an array of size 2" - ) - self.threshold_percentiles_ = np.asarray(self.threshold_percentiles_) - - def _more_tags(self) -> dict: - return { - "requires_y": True, - "non_deterministic": True, - } - - @classmethod - def get_test_params(cls, parameter_set="default"): - """Return testing parameter settings for the estimator. - - Parameters - ---------- - parameter_set : str, default="default" - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - There are currently no reserved values for transformers. - - Returns - ------- - params : dict or list of dict, default = {} - Parameters to create testing instances of the class - Each dict are parameters to construct an "interesting" test instance, i.e., - `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. - `create_test_instance` uses the first (or only) dictionary in `params` - """ - return {"max_shapelets": 10} - - -@njit(fastmath=True, cache=True) -def _init_random_shapelet_params( - max_shapelets, - shapelet_lengths, - proba_normalization, - use_prime_dilations, - n_channels, - series_length, -): - """Randomly initialize the parameters of the shapelets. - - Parameters - ---------- - max_shapelets : int - The maximum number of shapelet to keep for the final transformation. - A lower number of shapelets can be kept if alpha similarity have discarded the - whole dataset. - shapelet_lengths : array - The set of possible length for shapelets. Each shapelet length is uniformly - drawn from this set. - proba_normalization : float - This probability (between 0 and 1) indicate the chance of each shapelet to be - initialized such as it will use a z-normalized distance, inducing either scale - sensitivity or invariance. A value of 1 would mean that all shapelets will use - a z-normalized distance. - use_prime_dilations : bool - If True, restrict the value of the shapelet dilation parameter to be prime - values. This can greatly speed up the algorithm for long time series and/or - short shapelet length, possibly at the cost of some accuracy. - n_channels : int - Number of channels of the input time series. - series_length : int - Size of the input time series. 
- - Returns - ------- - values : array, shape (max_shapelets, n_channels, max(shapelet_lengths)) - An initialized (empty) value array for each shapelet - lengths : array, shape (max_shapelets) - The randomly initialized length of each shapelet - dilations : array, shape (max_shapelets) - The randomly initialized dilation of each shapelet - threshold : array, shape (max_shapelets) - An initialized (empty) value array for each shapelet - normalize : array, shape (max_shapelets) - The randomly initialized normalization indicator of each shapelet - means : array, shape (max_shapelets, n_channels) - Means of the shapelets - stds : array, shape (max_shapelets, n_channels) - Standard deviation of the shapelets - - """ - # Lengths of the shapelets - # test dtypes correctness - lengths = np.random.choice(shapelet_lengths, size=max_shapelets).astype(np.int32) - # Upper bound values for dilations - dilations = np.zeros(max_shapelets, dtype=np.int32) - upper_bounds = np.log2(np.floor_divide(series_length - 1, lengths - 1)) - - if use_prime_dilations: - _primes = prime_up_to(np.int32(2 ** upper_bounds.max())) - # 1 is not prime, but it is still a valid dilation for the "prime" scheme - primes = np.zeros((_primes.shape[0] + 1), dtype=np.int32) - primes[0] = 1 - primes[1:] = _primes - for i in prange(max_shapelets): - shp_primes = primes[primes <= np.int32(2 ** upper_bounds[i])] - dilations[i] = shp_primes[choice_log(shp_primes.shape[0], 1)[0]] - else: - for i in prange(max_shapelets): - dilations[i] = np.int32(2 ** np.random.uniform(0, upper_bounds[i])) - - # Init threshold array - threshold = np.zeros(max_shapelets, dtype=np.float64) - - # Init values array - values = np.zeros( - (max_shapelets, n_channels, max(shapelet_lengths)), dtype=np.float64 - ) - - # Is shapelet using z-normalization ? - normalize = np.random.random(size=max_shapelets) - normalize = normalize < proba_normalization - - means = np.zeros((max_shapelets, n_channels), dtype=np.float64) - stds = np.zeros((max_shapelets, n_channels), dtype=np.float64) - - return values, lengths, dilations, threshold, normalize, means, stds - - -@njit(fastmath=True, cache=True, parallel=True) -def _random_dilated_shapelet_extraction( - X, - y, - max_shapelets, - shapelet_lengths, - proba_normalization, - threshold_percentiles, - alpha_similarity, - use_prime_dilations, - seed, -): - """Randomly generate a set of shapelets given the input parameters. - - Parameters - ---------- - X : array, shape (n_instances, n_channels, series_length) - Time series dataset - y : array, shape (n_instances) - Class of each input time series - max_shapelets : int - The maximum number of shapelet to keep for the final transformation. - A lower number of shapelets can be kept if alpha similarity have discarded the - whole dataset. - shapelet_lengths : array - The set of possible length for shapelets. Each shapelet length is uniformly - drawn from this set. - proba_normalization : float - This probability (between 0 and 1) indicate the chance of each shapelet to be - initialized such as it will use a z-normalized distance, inducing either scale - sensitivity or invariance. A value of 1 would mean that all shapelets will use - a z-normalized distance. - threshold_percentiles : array - The two percentiles used to select the threshold used to compute the Shapelet - Occurrence feature. - alpha_similarity : float - The strength of the alpha similarity pruning. 
The higher the value, the lower - the allowed number of common indexes with previously sampled shapelets - when sampling a new candidate with the same dilation parameter. - It can cause the number of sampled shapelets to be lower than max_shapelets if - the whole search space has been covered. The default is 0.5. - use_prime_dilations : bool - If True, restrict the value of the shapelet dilation parameter to be prime - values. This can greatly speed up the algorithm for long time series and/or - short shapelet length, possibly at the cost of some accuracy. - seed : int - Seed for random number generation. - - Returns - ------- - Shapelets : tuple - The returned tuple contains 7 arrays describing the shapelets parameters: - - values : array, shape (max_shapelets, n_channels, max(shapelet_lengths)) - Values of the shapelets. - - lengths : array, shape (max_shapelets) - Length parameter of the shapelets - - dilations : array, shape (max_shapelets) - Dilation parameter of the shapelets - - threshold : array, shape (max_shapelets) - Threshold parameter of the shapelets - - normalize : array, shape (max_shapelets) - Normalization indicator of the shapelets - - means : array, shape (max_shapelets, n_channels) - Means of the shapelets - - stds : array, shape (max_shapelets, n_channels) - Standard deviation of the shapelets - """ - n_instances, n_channels, series_length = X.shape - - # Fix the random seed - set_numba_random_seed(seed) - - # Initialize shapelets - ( - values, - lengths, - dilations, - threshold, - normalize, - means, - stds, - ) = _init_random_shapelet_params( - max_shapelets, - shapelet_lengths, - proba_normalization, - use_prime_dilations, - n_channels, - series_length, - ) - - # Get unique dilations to loop over - unique_dil = np.unique(dilations) - n_dilations = unique_dil.shape[0] - - # For each dilation, we can do in parallel - for i_dilation in prange(n_dilations): - # (2, _, _): Mask is different for normalized and non-normalized shapelets - alpha_mask = np.ones((2, n_instances, series_length), dtype=np.bool_) - id_shps = np.where(dilations == unique_dil[i_dilation])[0] - min_len = min(lengths[id_shps]) - # For each shapelet id with this dilation - for i_shp in id_shps: - # Get shapelet params - dilation = dilations[i_shp] - length = lengths[i_shp] - norm = np.int32(normalize[i_shp]) - dist_vect_shape = series_length - (length - 1) * dilation - - # Possible sampling points given self similarity mask - current_mask = alpha_mask[norm, :, :dist_vect_shape] - idx_mask = np.where(current_mask) - - n_admissible_points = idx_mask[0].shape[0] - if n_admissible_points > 0: - # Choose a sample and a timestamp - idx_choice = np.random.choice(n_admissible_points) - idx_sample = idx_mask[0][idx_choice] - idx_timestamp = idx_mask[1][idx_choice] - - # Update the mask in two directions from the sampling point - alpha_size = length - int(max(1, (1 - alpha_similarity) * min_len)) - for j in range(alpha_size): - alpha_mask[norm, idx_sample, (idx_timestamp - (j * dilation))] = ( - False - ) - alpha_mask[norm, idx_sample, (idx_timestamp + (j * dilation))] = ( - False - ) - - # Extract the values of shapelet - if norm: - _val, _means, _stds = get_subsequence_with_mean_std( - X[idx_sample], idx_timestamp, length, dilation - ) - for i_channel in prange(_val.shape[0]): - if _stds[i_channel] > 0: - _val[i_channel] = ( - _val[i_channel] - _means[i_channel] - ) / _stds[i_channel] - else: - _val[i_channel] = _val[i_channel] - _means[i_channel] - else: - _val = get_subsequence( - X[idx_sample], 
idx_timestamp, length, dilation - ) - - # Select another sample of the same class as the sample used to - loc_others = np.where(y == y[idx_sample])[0] - if loc_others.shape[0] > 1: - loc_others = loc_others[loc_others != idx_sample] - id_test = np.random.choice(loc_others) - else: - id_test = idx_sample - - # Compute distance vector, first get the subsequences - X_subs = _get_all_subsequences(X[id_test], length, dilation) - if norm: - # Normalize them if needed - X_means, X_stds = sliding_mean_std_one_series( - X[id_test], length, dilation - ) - X_subs = _normalize_subsequences(X_subs, X_means, X_stds) - - x_dist = _compute_shapelet_dist_vector(X_subs, _val, length) - - lower_bound = np.percentile(x_dist, threshold_percentiles[0]) - upper_bound = np.percentile(x_dist, threshold_percentiles[1]) - - threshold[i_shp] = np.random.uniform(lower_bound, upper_bound) - values[i_shp, :, :length] = _val - if norm: - means[i_shp] = _means - stds[i_shp] = _stds - - mask_values = np.ones(max_shapelets, dtype=np.bool_) - for i in prange(max_shapelets): - if threshold[i] == 0 and np.all(values[i] == 0): - mask_values[i] = False - - return ( - values[mask_values], - lengths[mask_values], - dilations[mask_values], - threshold[mask_values], - normalize[mask_values], - means[mask_values], - stds[mask_values], - ) - - -@njit(fastmath=True, cache=True, parallel=True) -def _dilated_shapelet_transform(X, shapelets): - """Perform the shapelet transform with a set of shapelets and a set of time series. - - Parameters - ---------- - X : array, shape (n_instances, n_channels, series_length) - Time series dataset - shapelets : tuple - The returned tuple contains 7 arrays describing the shapelets parameters: - - values : array, shape (n_shapelets, n_channels, max(shapelet_lengths)) - Values of the shapelets. - - lengths : array, shape (n_shapelets) - Length parameter of the shapelets - - dilations : array, shape (n_shapelets) - Dilation parameter of the shapelets - - threshold : array, shape (n_shapelets) - Threshold parameter of the shapelets - - normalize : array, shape (n_shapelets) - Normalization indicator of the shapelets - - means : array, shape (n_shapelets, n_channels) - Means of the shapelets - - stds : array, shape (n_shapelets, n_channels) - Standard deviation of the shapelets - - Returns - ------- - X_new : array, shape=(n_instances, 3*n_shapelets) - The transformed input time series with each shapelet extracting 3 - feature from the distance vector computed on each time series. 
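The three per-shapelet features described in the Returns section above, sketched on a toy distance vector (numbers are illustrative):

import numpy as np

# toy distance vector between one shapelet and every subsequence of a series
d = np.array([3.2, 1.4, 0.6, 2.1, 0.9])
threshold = 1.0

min_dist = d.min()                          # 0.6, the best match
argmin = float(d.argmin())                  # 2.0, the location of the best match
occurrence = float((d < threshold).sum())   # 2.0, points below the threshold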
- - """ - (values, lengths, dilations, threshold, normalize, means, stds) = shapelets - n_shapelets = len(lengths) - n_instances, n_channels, series_length = X.shape - - n_ft = 3 - - # (u_l * u_d , 2) - params_shp = combinations_1d(lengths, dilations) - - X_new = np.zeros((n_instances, n_ft * n_shapelets)) - for i_params in prange(params_shp.shape[0]): - length = params_shp[i_params, 0] - dilation = params_shp[i_params, 1] - id_shps = np.where((lengths == length) & (dilations == dilation))[0] - - for i_x in prange(n_instances): - X_subs = _get_all_subsequences(X[i_x], length, dilation) - idx_no_norm = id_shps[np.where(~normalize[id_shps])[0]] - for i_shp in idx_no_norm: - X_new[i_x, (n_ft * i_shp) : (n_ft * i_shp + n_ft)] = ( - _compute_shapelet_features( - X_subs, - values[i_shp], - length, - threshold[i_shp], - ) - ) - - idx_norm = id_shps[np.where(normalize[id_shps])[0]] - if len(idx_norm) > 0: - X_means, X_stds = sliding_mean_std_one_series(X[i_x], length, dilation) - X_subs = _normalize_subsequences(X_subs, X_means, X_stds) - for i_shp in idx_norm: - X_new[i_x, (n_ft * i_shp) : (n_ft * i_shp + n_ft)] = ( - _compute_shapelet_features( - X_subs, - values[i_shp], - length, - threshold[i_shp], - ) - ) - return X_new - - -@njit(fastmath=True, cache=True) -def _normalize_subsequences(X_subs, X_means, X_stds): - """ - Generate subsequences from a time series given the length and dilation parameters. - - Parameters - ---------- - X_subs : array, shape (n_timestamps-(length-1)*dilation, n_channels, length) - The subsequences of an input time series given the length and dilation parameter - X_means : array, shape (n_channels, n_timestamps-(length-1)*dilation) - Length of the subsequences to generate. - X_stds : array, shape (n_channels, n_timestamps-(length-1)*dilation) - Dilation parameter to apply when generating the strides. - - Returns - ------- - array, shape = (n_timestamps-(length-1)*dilation, n_channels, length) - Subsequences of the input time series. - """ - n_subsequences, n_channels, length = X_subs.shape - X_new = np.zeros((n_subsequences, n_channels, length)) - for i_sub in prange(n_subsequences): - for i_channel in prange(n_channels): - if X_stds[i_channel, i_sub] > 0: - X_new[i_sub, i_channel] = ( - X_subs[i_sub, i_channel] - X_means[i_channel, i_sub] - ) / X_stds[i_channel, i_sub] - # else it gives 0, the default value - return X_new - - -@njit(fastmath=True, cache=True) -def _get_all_subsequences(X, length, dilation): - """ - Generate subsequences from a time series given the length and dilation parameters. - - Parameters - ---------- - X : array, shape = (n_channels, n_timestamps) - An input time series as (n_channels, n_timestamps). - length : int - Length of the subsequences to generate. - dilation : int - Dilation parameter to apply when generating the strides. - - Returns - ------- - array, shape = (n_timestamps-(length-1)*dilation, n_channels, length) - Subsequences of the input time series. - """ - n_channels, n_timestamps = X.shape - n_subsequences = n_timestamps - (length - 1) * dilation - X_subs = np.zeros((n_subsequences, n_channels, length)) - for i_sub in prange(n_subsequences): - for i_channel in prange(n_channels): - for i_length in prange(length): - X_subs[i_sub, i_channel, i_length] = X[ - i_channel, i_sub + (i_length * dilation) - ] - return X_subs - - -@njit(fastmath=True, cache=True) -def _compute_shapelet_features(X_subs, values, length, threshold): - """Extract the features from a shapelet distance vector. 
- - Given a shapelet and a time series, extract three features from the resulting - distance vector: - - min - - argmin - - Shapelet Occurence : number of point in the distance vector inferior to the - threshold parameter - - Parameters - ---------- - X_subs : array, shape (n_timestamps-(length-1)*dilation, n_channels, length) - The subsequences of an input time series given the length and dilation parameter - values : array, shape (n_channels, length) - The value array of the shapelet - length : int - Length of the shapelet - values : array, shape (n_channels, length) - The resulting subsequence - threshold : float - The threshold parameter of the shapelet - distance_function: function - A distance function defined as a numba function with signature as - (x: np.ndarray, y: np.ndarray) -> float. The default distance function is the - manhattan distance. - - Returns - ------- - min, argmin, shapelet occurence - The three computed features as float dtypes - """ - _min = np.inf - _argmin = np.inf - _SO = 0 - - n_subsequences = X_subs.shape[0] - - for i_sub in prange(n_subsequences): - _dist = manhattan_distance(X_subs[i_sub], values[:, :length]) - - if _dist < _min: - _min = _dist - _argmin = i_sub - if _dist < threshold: - _SO += 1 - - return np.float64(_min), np.float64(_argmin), np.float64(_SO) - - -@njit(fastmath=True, cache=True) -def _compute_shapelet_dist_vector(X_subs, values, length): - """Extract the features from a shapelet distance vector. - - Given a shapelet and a time series, extract three features from the resulting - distance vector: - - min - - argmin - - Shapelet Occurence : number of point in the distance vector inferior to the - threshold parameter - - Parameters - ---------- - X_subs : array, shape (n_timestamps-(length-1)*dilation, n_channels, length) - The subsequences of an input time series given the length and dilation parameter - values : array, shape (n_channels, length) - The value array of the shapelet - length : int - Dilation of the shapelet - - Returns - ------- - min, argmin, shapelet occurence - The three computed features as float dtypes - """ - n_subsequences = X_subs.shape[0] - dist_vector = np.zeros(n_subsequences) - for i_sub in prange(n_subsequences): - dist_vector[i_sub] = manhattan_distance(X_subs[i_sub], values[:, :length]) - return dist_vector diff --git a/tsml/transformations/_transform_concatenator.py b/tsml/transformations/_transform_concatenator.py index a33b516..f3413bd 100644 --- a/tsml/transformations/_transform_concatenator.py +++ b/tsml/transformations/_transform_concatenator.py @@ -1,4 +1,4 @@ -"""""" +"""TransformerConcatenator.""" __author__ = ["MatthewMiddlehurst"] __all__ = ["TransformerConcatenator"] @@ -12,7 +12,7 @@ class TransformerConcatenator(TransformerMixin, BaseTimeSeriesEstimator): - """ """ + """TransformerConcatenator.""" def __init__( self, @@ -128,13 +128,11 @@ def get_test_params(cls, parameter_set="default"): `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. 
`create_test_instance` uses the first (or only) dictionary in `params` """ - from tsml.transformations import Catch22Transformer + from tsml.transformations import SevenNumberSummaryTransformer return { "transformers": [ - Catch22Transformer( - features=["DN_HistogramMode_5", "DN_HistogramMode_10"] - ), - Catch22Transformer(features="CO_f1ecac"), + SevenNumberSummaryTransformer(), + SevenNumberSummaryTransformer(), ], } diff --git a/tsml/transformations/tests/test_interval_extraction.py b/tsml/transformations/tests/test_interval_extraction.py index 198e2a5..147d4df 100644 --- a/tsml/transformations/tests/test_interval_extraction.py +++ b/tsml/transformations/tests/test_interval_extraction.py @@ -1,5 +1,7 @@ """Interval extraction test code.""" +import pytest + from tsml.transformations import ( Catch22Transformer, RandomIntervalTransformer, @@ -8,9 +10,11 @@ ) from tsml.utils.numba_functions.stats import row_mean, row_median from tsml.utils.testing import generate_3d_test_data +from tsml.utils.validation import _check_optional_dependency def test_interval_prune(): + """Test RandomIntervalTransformer duplicate pruning.""" X, y = generate_3d_test_data(random_state=0, n_channels=2, series_length=10) rit = RandomIntervalTransformer( @@ -25,6 +29,7 @@ def test_interval_prune(): def test_random_interval_transformer(): + """Test RandomIntervalTransformer.""" X, y = generate_3d_test_data(random_state=0, n_channels=2, series_length=10) rit = RandomIntervalTransformer( @@ -38,7 +43,12 @@ def test_random_interval_transformer(): assert rit.transform(X).shape == (10, 35) +@pytest.mark.skipif( + not _check_optional_dependency("pycatch22", "pycatch22", None, raise_error=False), + reason="pycatch22 not installed", +) def test_supervised_transformers(): + """Test SupervisedIntervalTransformer.""" X, y = generate_3d_test_data(random_state=0) sit = SupervisedIntervalTransformer( diff --git a/tsml/transformations/tests/test_transform_concatenator.py b/tsml/transformations/tests/test_transform_concatenator.py index b56002b..4d3e28b 100644 --- a/tsml/transformations/tests/test_transform_concatenator.py +++ b/tsml/transformations/tests/test_transform_concatenator.py @@ -1,5 +1,6 @@ +"""Tests for the TransformerConcatenator class.""" + from tsml.transformations import ( - Catch22Transformer, FunctionTransformer, SevenNumberSummaryTransformer, TransformerConcatenator, @@ -9,19 +10,21 @@ def test_concatenate_features(): + """Test TransformerConcatenator on features.""" X, y = generate_3d_test_data() concat = TransformerConcatenator( transformers=[ - Catch22Transformer(features=["DN_HistogramMode_5", "DN_HistogramMode_10"]), + SevenNumberSummaryTransformer(), SevenNumberSummaryTransformer(), ] ) - assert concat.fit_transform(X).shape == (X.shape[0], 9) + assert concat.fit_transform(X).shape == (X.shape[0], 14) def test_concatenate_series(): + """Test TransformerConcatenator on series.""" X, y = generate_3d_test_data() concat = TransformerConcatenator( diff --git a/tsml/utils/discovery.py b/tsml/utils/discovery.py index 13c87e0..d2233eb 100644 --- a/tsml/utils/discovery.py +++ b/tsml/utils/discovery.py @@ -109,9 +109,7 @@ def is_abstract(c): "classifier": ClassifierMixin, "regressor": RegressorMixin, "transformer": TransformerMixin, - # accept both clusterer inputs "clusterer": ClusterMixin, - "cluster": ClusterMixin, } for name, mixin in filters.items(): if name in type_filter: @@ -123,7 +121,7 @@ def is_abstract(c): if type_filter: raise ValueError( "Parameter type_filter must be 'classifier', " - "'regressor', 
'transformer', 'cluster' or " + "'regressor', 'transformer', 'clusterer' or " f"None, got {repr(type_filter)}." ) diff --git a/tsml/utils/testing.py b/tsml/utils/testing.py index 8a715d1..7e0c98e 100644 --- a/tsml/utils/testing.py +++ b/tsml/utils/testing.py @@ -100,10 +100,10 @@ def parametrize_with_checks(estimators: List[BaseEstimator]) -> Callable: Examples -------- >>> from tsml.utils.testing import parametrize_with_checks - >>> from tsml.interval_based import TSFRegressor + >>> from tsml.interval_based import IntervalForestRegressor >>> from tsml.vector import RotationForestClassifier >>> @parametrize_with_checks( - ... [TSFRegressor(), RotationForestClassifier()] + ... [IntervalForestRegressor(), RotationForestClassifier()] ... ) ... def test_tsml_compatible_estimator(estimator, check): ... check(estimator) diff --git a/tsml/utils/validation.py b/tsml/utils/validation.py index 252375c..b57c232 100644 --- a/tsml/utils/validation.py +++ b/tsml/utils/validation.py @@ -609,9 +609,10 @@ def _check_optional_dependency( # package cannot be imported if raise_error: raise ModuleNotFoundError( - f'{source_name} has an optional dependency and requires "{package_name}" ' - f'to be installed. Run: "pip install {package_name}" or "pip install ' - f'tsml[extras]" to install all optional dependencies.' + f"{source_name} has an optional dependency and requires " + f'"{package_name}" to be installed. Run: "pip install {package_name}" ' + f'or "pip install tsml[all_extras]" to install all optional ' + f"dependencies." ) from e else: return False
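The reworded message above covers the raise_error=True path of _check_optional_dependency; the raise_error=False path is what the skipif guards added to the tests earlier in this diff rely on. A sketch of that guard pattern, with the call copied verbatim from the test_interval_extraction.py change above:

import pytest

from tsml.utils.validation import _check_optional_dependency


@pytest.mark.skipif(
    not _check_optional_dependency("pycatch22", "pycatch22", None, raise_error=False),
    reason="pycatch22 not installed",
)
def test_something_needing_pycatch22():
    ...  # body omitted; runs only when the optional dependency is importable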