diff --git a/src/jabs/classifier/__init__.py b/src/jabs/classifier/__init__.py
index b726dd11..1fc9e980 100644
--- a/src/jabs/classifier/__init__.py
+++ b/src/jabs/classifier/__init__.py
@@ -5,12 +5,8 @@
-Gradient Boosting, and XGBoost), utilities for feature management, data
+and XGBoost), utilities for feature management, data
 splitting, model evaluation, and serialization.
 """
-import pathlib
-
 from .classifier import Classifier
 
-HYPERPARAMETER_PATH = pathlib.Path(__file__).parent / "hyperparameters.json"
-
 __all__ = [
     "Classifier",
 ]
diff --git a/src/jabs/classifier/classifier.py b/src/jabs/classifier/classifier.py
index 0819991b..1010d3e1 100644
--- a/src/jabs/classifier/classifier.py
+++ b/src/jabs/classifier/classifier.py
@@ -1,3 +1,4 @@
+import logging
 import random
 import re
 import typing
@@ -8,7 +9,7 @@
 import joblib
 import numpy as np
 import pandas as pd
-from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.exceptions import InconsistentVersionWarning
 from sklearn.metrics import (
     accuracy_score,
@@ -21,28 +22,53 @@
 from jabs.types import ClassifierType
 from jabs.utils import hash_file
 
-_VERSION = 9
-
-_classifier_choices = [ClassifierType.RANDOM_FOREST, ClassifierType.GRADIENT_BOOSTING]
+_VERSION = 10
 
 try:
     _xgboost = import_module("xgboost")
-    # we were able to import xgboost, make it available as an option:
-    _classifier_choices.append(ClassifierType.XGBOOST)
-except Exception:
+except ImportError:
     # we were unable to import the xgboost module. It's either not
-    # installed (it should be if the user used our requirements-old.txt)
+    # installed (it should be if the user installed JABS as a package)
     # or it may have been unable to be imported due to a missing
     # libomp. Either way, we won't add it to the available choices and
     # we can otherwise ignore this exception
     _xgboost = None
+    logging.warning(
+        "Unable to import xgboost. XGBoost support will be unavailable. "
+        "You may need to install xgboost and/or libomp."
+    )
+
+
+# Classifier factory helpers and mapping
+def _make_random_forest(n_jobs: int, random_seed: int | None):
+    """Factory function to construct a RandomForest classifier."""
+    return RandomForestClassifier(n_jobs=n_jobs, random_state=random_seed)
+
+
+def _make_xgboost(n_jobs: int, random_seed: int | None):
+    """Factory function to construct an XGBoost classifier."""
+    if _xgboost is None:
+        raise RuntimeError(
+            "XGBoost classifier requested but 'xgboost' is not available in this environment."
+        )
+    return _xgboost.XGBClassifier(n_jobs=n_jobs, random_state=random_seed)
+
+
+# _CLASSIFIER_FACTORIES is the single source of truth for the classifiers supported
+# by the current JABS environment, as well as the mapping from each ClassifierType
+# to a factory function that produces an instantiated classifier of that type
+_CLASSIFIER_FACTORIES: dict[ClassifierType, typing.Callable[[int, int | None], typing.Any]] = {
+    ClassifierType.RANDOM_FOREST: _make_random_forest,
+}
+if _xgboost is not None:
+    _CLASSIFIER_FACTORIES[ClassifierType.XGBOOST] = _make_xgboost
 
 
 class Classifier:
     """A machine learning classifier for behavior classification tasks.
 
     This class supports training, evaluating, saving, and loading classifiers
-    for behavioral data using Random Forest, Gradient Boosting, or XGBoost algorithms.
+    for behavioral data using Random Forest or XGBoost algorithms.
     It provides utilities for data splitting, balancing, augmentation, and feature
     management.
 
     Attributes:
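The factory registry above is the load-bearing piece of this refactor: a back end only appears in `_CLASSIFIER_FACTORIES` if its import succeeded, so registry membership doubles as the availability check. A minimal standalone sketch of the same pattern, assuming only scikit-learn is installed (names such as `ModelKind`, `FACTORIES`, and `_make_rf` are illustrative, not JABS APIs):

```python
from enum import Enum
from importlib import import_module
from typing import Any, Callable

class ModelKind(str, Enum):
    RANDOM_FOREST = "Random Forest"
    XGBOOST = "XGBoost"

try:
    _xgb = import_module("xgboost")
except ImportError:
    _xgb = None  # optional dependency missing; it never enters the registry

def _make_rf(n_jobs: int, seed: int | None) -> Any:
    from sklearn.ensemble import RandomForestClassifier
    return RandomForestClassifier(n_jobs=n_jobs, random_state=seed)

# Membership in the registry *is* the "supported back ends" check; there is no
# separate choices list that can drift out of sync with what actually imports.
FACTORIES: dict[ModelKind, Callable[[int, int | None], Any]] = {
    ModelKind.RANDOM_FOREST: _make_rf,
}
if _xgb is not None:
    FACTORIES[ModelKind.XGBOOST] = lambda n_jobs, seed: _xgb.XGBClassifier(
        n_jobs=n_jobs, random_state=seed
    )

model = FACTORIES[ModelKind.RANDOM_FOREST](4, 42)  # KeyError means "unsupported here"
```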
@@ -51,13 +77,7 @@
 
     LABEL_THRESHOLD = 20
 
-    _CLASSIFIER_NAMES: typing.ClassVar[dict] = {
-        ClassifierType.RANDOM_FOREST: "Random Forest",
-        ClassifierType.GRADIENT_BOOSTING: "Gradient Boosting",
-        ClassifierType.XGBOOST: "XGBoost",
-    }
-
-    def __init__(self, classifier=ClassifierType.RANDOM_FOREST, n_jobs=1):
+    def __init__(self, classifier: ClassifierType = ClassifierType.RANDOM_FOREST, n_jobs: int = 1):
         self._classifier_type = classifier
         self._classifier = None
         self._project_settings = None
@@ -69,9 +89,10 @@ def __init__(self, classifier=ClassifierType.RANDOM_FOREST, n_jobs=1):
         self._classifier_file = None
         self._classifier_hash = None
         self._classifier_source = None
+        self._supported_classifiers = self._supported_classifier_choices()
 
         # make sure the value passed for the classifier parameter is valid
-        if classifier not in _classifier_choices:
+        if classifier not in self._supported_classifiers:
             raise ValueError("Invalid classifier type")
 
     @classmethod
@@ -93,10 +114,10 @@ def from_training_file(cls, path: Path):
         classifier.behavior_name = behavior
         classifier.set_dict_settings(loaded_training_data["settings"])
         classifier_type = ClassifierType(loaded_training_data["classifier_type"])
-        if classifier_type in classifier.classifier_choices():
+        if classifier_type in classifier._supported_classifiers:
             classifier.set_classifier(classifier_type)
         else:
-            print(
+            logging.warning(
                 f"Specified classifier type {classifier_type.name} is unavailable, using default: {classifier.classifier_type.name}"
             )
         training_features = classifier.combine_data(
@@ -119,7 +140,7 @@
     @property
     def classifier_name(self) -> str:
         """return the name of the classifier used as a string"""
-        return self._CLASSIFIER_NAMES[self._classifier_type]
+        return self._classifier_type.value
 
     @property
     def classifier_type(self) -> ClassifierType:
@@ -148,7 +169,7 @@
         return {}
 
     @property
-    def behavior_name(self) -> str:
+    def behavior_name(self) -> str | None:
         """return the behavior name property"""
         return self._behavior
 
@@ -163,7 +184,7 @@
         return self._version
 
     @property
-    def feature_names(self) -> list:
+    def feature_names(self) -> list[str] | None:
         """returns the list of feature names used when training this classifier"""
         return self._feature_names
 
@@ -299,9 +320,8 @@
         selected_samples = []
         for cur_label in label_states:
             idxs = np.where(labels == cur_label)[0]
-            if random_seed is not None:
-                np.random.seed(random_seed)
-            sampled_idxs = np.random.choice(idxs, max_examples_per_class, replace=False)
+            rng = np.random.default_rng(random_seed)
+            sampled_idxs = rng.choice(idxs, max_examples_per_class, replace=False)
             selected_samples.append(sampled_idxs)
         selected_samples = np.sort(np.concatenate(selected_samples))
         features = features.iloc[selected_samples]
@@ -346,9 +366,9 @@
             # print(str(lowercase_features[idx]) + ' -> ' + str(reflected_feature_names[idx]))
 
         return features, labels
 
-    def set_classifier(self, classifier):
+    def set_classifier(self, classifier: ClassifierType):
         """change the type of the classifier being used"""
-        if classifier not in _classifier_choices:
+        if classifier not in self._supported_classifiers:
             raise ValueError("Invalid Classifier Type")
         self._classifier_type = classifier
 
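The `downsample_balance` hunk above replaces global-state seeding (`np.random.seed`) with a local `numpy.random.Generator`. A hedged sketch of the resulting behavior on toy labels (the sketch hoists the generator out of the loop for brevity; like the diff, it is deterministic for a fixed seed):

```python
import numpy as np

labels = np.array([0, 0, 0, 0, 0, 1, 1, 1])
max_per_class = np.bincount(labels).min()  # 3, the size of the rarest class

# default_rng(seed) is reproducible without mutating numpy's global RNG state,
# so unrelated library code and other callers are unaffected by the seeding.
rng = np.random.default_rng(42)
keep = []
for cls in np.unique(labels):
    idxs = np.where(labels == cls)[0]  # all row indices for this class
    keep.append(rng.choice(idxs, max_per_class, replace=False))
keep = np.sort(np.concatenate(keep))
print(keep)  # same seed -> identical selection on every run
```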
@@ -380,15 +400,17 @@
 
         Returns:
             dict where keys are ClassifierType enum values, and the
-            values are string names for the classifiers. example:
-
-            {
-                <ClassifierType.RANDOM_FOREST: 1>: 'Random Forest',
-                <ClassifierType.GRADIENT_BOOSTING: 2>: 'Gradient Boosting',
-                <ClassifierType.XGBOOST: 3>: 'XGBoost'
-            }
+            values are string names for the classifiers.
         """
-        return {d: self._CLASSIFIER_NAMES[d] for d in _classifier_choices}
+        return {t: t.value for t in sorted(self._supported_classifiers, key=lambda t: t.value)}
+
+    def _create_classifier(self, random_seed: int | None = None):
+        """Instantiate the underlying classifier for the current classifier type."""
+        try:
+            factory = _CLASSIFIER_FACTORIES[self._classifier_type]
+        except KeyError:
+            raise ValueError(f"Unsupported classifier type: {self._classifier_type!r}") from None
+        return factory(self._n_jobs, random_seed)
 
     def train(self, data, random_seed: int | None = None):
         """train the classifier
@@ -421,16 +443,23 @@
         if self._project_settings.get("balance_labels", False):
             features, labels = self.downsample_balance(features, labels, random_seed)
 
-        if self._classifier_type == ClassifierType.RANDOM_FOREST:
-            self._classifier = self._fit_random_forest(features, labels, random_seed=random_seed)
-        elif self._classifier_type == ClassifierType.GRADIENT_BOOSTING:
-            self._classifier = self._fit_gradient_boost(features, labels, random_seed=random_seed)
-        elif _xgboost is not None and self._classifier_type == ClassifierType.XGBOOST:
+        classifier = self._create_classifier(random_seed=random_seed)
+
+        if self._classifier_type == ClassifierType.XGBOOST:
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore", category=FutureWarning)
-                self._classifier = self._fit_xgboost(features, labels, random_seed=random_seed)
+                # XGBoost natively supports NaN as a marker for missing values and handles
+                # them during tree construction. For XGBoost we therefore convert infinite
+                # values to NaN and leave them as missing, instead of imputing them with 0.
+                # This differs from the Random Forest (and other sklearn) path below, where
+                # both infinities and NaN are replaced with 0.
+                cleaned_features = features.replace([np.inf, -np.inf], np.nan)
+                self._classifier = classifier.fit(cleaned_features, labels)
         else:
-            raise ValueError("Unsupported classifier")
+            # RandomForestClassifier (and most other sklearn estimators) does not natively
+            # support NaN values, so we replace infinite values and NaNs with 0 before fitting.
+            cleaned_features = features.replace([np.inf, -np.inf], 0).fillna(0)
+            self._classifier = classifier.fit(cleaned_features, labels)
 
         # Classifier may have been re-used from a prior training, blank the logging attributes
         self._classifier_file = None
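The two cleaning branches in `train()` are intentionally different, as the inline comments explain. A toy illustration of what each estimator actually receives (illustrative frame, not JABS feature data):

```python
import numpy as np
import pandas as pd

features = pd.DataFrame({"f0": [1.0, np.inf, np.nan], "f1": [-np.inf, 2.0, 3.0]})

# XGBoost branch: +/-inf collapse to NaN, and XGBoost treats NaN as "missing",
# learning a default split direction for missing values during tree construction.
xgb_input = features.replace([np.inf, -np.inf], np.nan)

# sklearn branch: RandomForestClassifier rejects NaN/inf input outright, so both
# are imputed with 0 before fitting.
sk_input = features.replace([np.inf, -np.inf], 0).fillna(0)

print(xgb_input)
print(sk_input)
```

Imputing with 0 conflates "missing" with a legitimate feature value of 0, which is why the XGBoost branch prefers to leave the values missing.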
@@ -513,7 +542,7 @@
             )
 
         # make sure the value passed for the classifier parameter is valid
-        if c._classifier_type not in _classifier_choices:
+        if c._classifier_type not in self._supported_classifiers:
             raise ValueError("Invalid classifier type")
 
         self._classifier = c._classifier
@@ -529,16 +558,6 @@
         self._classifier_hash = hash_file(Path(path))
         self._classifier_source = "pickle"
 
-    def _update_classifier_type(self):
-        # we may need to update the classifier type based
-        # on the type of the loaded object
-        if isinstance(self._classifier, RandomForestClassifier):
-            self._classifier_type = ClassifierType.RANDOM_FOREST
-        elif isinstance(self._classifier, GradientBoostingClassifier):
-            self._classifier_type = ClassifierType.GRADIENT_BOOSTING
-        else:
-            self._classifier_type = ClassifierType.XGBOOST
-
     @staticmethod
     def accuracy_score(truth, predictions):
         """return accuracy score"""
@@ -567,19 +586,6 @@
         """
         return pd.concat([per_frame, window], axis=1)
 
-    def _fit_random_forest(self, features, labels, random_seed: int | None = None):
-        classifier = RandomForestClassifier(n_jobs=self._n_jobs, random_state=random_seed)
-        return classifier.fit(features.replace([np.inf, -np.inf], 0).fillna(0), labels)
-
-    def _fit_gradient_boost(self, features, labels, random_seed: int | None = None):
-        classifier = GradientBoostingClassifier(random_state=random_seed)
-        return classifier.fit(features.replace([np.inf, -np.inf], 0).fillna(0), labels)
-
-    def _fit_xgboost(self, features, labels, random_seed: int | None = None):
-        classifier = _xgboost.XGBClassifier(n_jobs=self._n_jobs, random_state=random_seed)
-        classifier.fit(features.replace([np.inf, -np.inf]), labels)
-        return classifier
-
     def print_feature_importance(self, feature_list, limit=20):
         """print the most important features and their importance
 
@@ -665,3 +671,8 @@
         """
         group_count = Classifier.count_label_threshold(all_counts)
         return 1 < group_count >= min_groups
+
+    @staticmethod
+    def _supported_classifier_choices() -> set[ClassifierType]:
+        """Determine the set of supported classifier types in the current JABS environment."""
+        return set(_CLASSIFIER_FACTORIES.keys())
diff --git a/src/jabs/project/export_training.py b/src/jabs/project/export_training.py
index 3565454d..f0ffa1c6 100644
--- a/src/jabs/project/export_training.py
+++ b/src/jabs/project/export_training.py
@@ -62,7 +62,7 @@
     out_h5.attrs["min_pose_version"] = pose_version
     out_h5.attrs["behavior"] = behavior
     write_project_settings(out_h5, project.settings_manager.get_behavior(behavior), "settings")
-    out_h5.attrs["classifier_type"] = classifier_type.value
+    out_h5.attrs["classifier_type"] = str(classifier_type.value)
     out_h5.attrs["training_seed"] = training_seed
     feature_group = out_h5.create_group("features")
     for feature, data in features["per_frame"].items():
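A subtlety behind the export hunk: for a `str, Enum` mixin like the new `ClassifierType`, `str(member)` renders as `ClassName.MEMBER`, not as the member's value, and would therefore not round-trip through `ClassifierType(...)` when the training file is loaded back in `from_training_file`. That is why the attribute is written from the member's `.value`. A quick demonstration using the enum as defined in this PR:

```python
from enum import Enum

class ClassifierType(str, Enum):
    RANDOM_FOREST = "Random Forest"
    XGBOOST = "XGBoost"

print(str(ClassifierType.RANDOM_FOREST))   # ClassifierType.RANDOM_FOREST
print(ClassifierType.RANDOM_FOREST.value)  # Random Forest

# Only the value form round-trips through the enum constructor:
assert ClassifierType("Random Forest") is ClassifierType.RANDOM_FOREST
```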
diff --git a/src/jabs/resources/docs/user_guide/user_guide.md b/src/jabs/resources/docs/user_guide/user_guide.md
index 60f13de9..568f892d 100644
--- a/src/jabs/resources/docs/user_guide/user_guide.md
+++ b/src/jabs/resources/docs/user_guide/user_guide.md
@@ -643,7 +643,7 @@ See `jabs-classify COMMAND --help` for information on a
 specific command.
 ```
 
 ```text
-usage: jabs-classify classify [-h] [--random-forest | --gradient-boosting | --xgboost]
+usage: jabs-classify classify [-h] [--random-forest | --xgboost]
                               (--training TRAINING | --classifier CLASSIFIER)
                               --input-pose INPUT_POSE --out-dir OUT_DIR [--fps FPS]
                               [--feature-dir FEATURE_DIR]
@@ -664,7 +664,6 @@ optionally override the classifier specified in the training file:
   Ignored if trained classifier passed with --classifier option.
   (the following options are mutually exclusive):
   --random-forest       Use Random Forest
-  --gradient-boosting   Use Gradient Boosting
   --xgboost             Use XGBoost
 
 Classifier Input (one of the following is required):
@@ -674,7 +673,7 @@
 ```
 
 ```text
-usage: jabs-classify train [-h] [--random-forest | --gradient-boosting | --xgboost]
+usage: jabs-classify train [-h] [--random-forest | --xgboost]
                            training_file out_file
 
 positional arguments:
@@ -687,7 +686,6 @@ optional arguments:
 optionally override the classifier specified in the training file:
   (the following options are mutually exclusive):
   --random-forest       Use Random Forest
-  --gradient-boosting   Use Gradient Boosting
   --xgboost             Use XGBoost
 ```
diff --git a/src/jabs/scripts/classify.py b/src/jabs/scripts/classify.py
index cfd38c1e..c17d0f5d 100755
--- a/src/jabs/scripts/classify.py
+++ b/src/jabs/scripts/classify.py
@@ -263,7 +263,7 @@ def classify_main():
     required_args.add_argument(
         "--input-pose",
-        help="input HDF5 pose file (v2, v3, v4, or v5).",
+        help="input HDF5 pose file.",
         required=True,
     )
     required_args.add_argument(
diff --git a/src/jabs/types/classifier_types.py b/src/jabs/types/classifier_types.py
index 6a29d008..4c447ed7 100644
--- a/src/jabs/types/classifier_types.py
+++ b/src/jabs/types/classifier_types.py
@@ -1,9 +1,8 @@
-from enum import IntEnum
+from enum import Enum
 
 
-class ClassifierType(IntEnum):
+class ClassifierType(str, Enum):
     """Classifier type for the project."""
 
-    RANDOM_FOREST = 1
-    GRADIENT_BOOSTING = 2
-    XGBOOST = 3
+    RANDOM_FOREST = "Random Forest"
+    XGBOOST = "XGBoost"
diff --git a/src/jabs/ui/main_control_widget.py b/src/jabs/ui/main_control_widget.py
index 3e27b86e..fcb291f9 100644
--- a/src/jabs/ui/main_control_widget.py
+++ b/src/jabs/ui/main_control_widget.py
@@ -16,6 +16,7 @@
 from PySide6.QtGui import QIcon, QPainter, QPixmap
 
 from jabs.classifier import Classifier
+from jabs.types import ClassifierType
 from jabs.ui.ear_tag_icons import EarTagIconManager
 
 from .colors import (
@@ -335,7 +336,12 @@ def classify_button_enabled(self, enabled: bool):
     @property
     def classifier_type(self):
         """return the selected classifier type"""
-        return self._classifier_selection.currentData()
+        data = self._classifier_selection.currentData()
+        # QComboBox may return a string instead of the enum due to serialization,
+        # so we need to convert it back to ClassifierType if it's a string
+        if isinstance(data, str):
+            return ClassifierType(data)
+        return data
 
     @property
     def use_balance_labels(self):
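The defensive conversion in `classifier_type` works because an `Enum` subclass is also a constructor-based lookup: calling it with a member's value returns the matching member, and calling it with a member returns that same member. A minimal Qt-free sketch (`normalize` is an illustrative name, not a JABS helper):

```python
from enum import Enum

class ClassifierType(str, Enum):
    RANDOM_FOREST = "Random Forest"
    XGBOOST = "XGBoost"

def normalize(data: str | ClassifierType) -> ClassifierType:
    # Works whether the widget hands back the serialized string value or the
    # enum member itself; both resolve to the canonical member object.
    return ClassifierType(data)

assert normalize("XGBoost") is ClassifierType.XGBOOST
assert normalize(ClassifierType.XGBOOST) is ClassifierType.XGBOOST
```

Since the members themselves subclass `str`, the `isinstance(data, str)` guard in the widget also matches real enum members; in that case the constructor call simply returns the member unchanged.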