diff --git a/docs/tutorials.rst b/docs/tutorials.rst index aae1ab989f..8f7900f709 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -13,6 +13,8 @@ The following tutorials highlight what one can do with the ``ProgLearn`` package tutorials/random_class_exp tutorials/rotation_cifar tutorials/spiral_exp + tutorials/sporf_datasets + tutorials/sporf_decision_boundaries tutorials/uncertaintyforest_running_example tutorials/uncertaintyforest_posteriorestimates tutorials/uncertaintyforest_conditionalentropyestimates diff --git a/docs/tutorials/functions/sporf_datasets_functions.py b/docs/tutorials/functions/sporf_datasets_functions.py new file mode 100644 index 0000000000..cbee97c497 --- /dev/null +++ b/docs/tutorials/functions/sporf_datasets_functions.py @@ -0,0 +1,90 @@ +import sys +import numpy as np +import pandas as pd +import csv +from numpy import genfromtxt + +from proglearn.progressive_learner import ProgressiveLearner +from proglearn.voters import TreeClassificationVoter +from proglearn.transformers import TreeClassificationTransformer +from proglearn.transformers import ObliqueTreeClassificationTransformer +from proglearn.deciders import SimpleArgmaxAverage + +from sklearn.model_selection import train_test_split, cross_val_score + +def load_simulated_data(file): + data = genfromtxt(file, delimiter=',') + X = data[:, :-1] + y = data[:, -1] + + return X, y + +def load_data(data_file, task_num): + if "Hill_Valley" in data_file: + df = pd.read_csv(data_file) + X = df[df.columns[:-1]].to_numpy() + y = df[df.columns[-1]].to_numpy() + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, stratify=y) + + if "acute" in data_file: + + df = pd.read_table(data_file, encoding='utf-16') + df[df == "no"] = 0 + df[df == "yes"] = 1 + + data = df.to_numpy() + temps = data[:, 0] + + temperature = [] + for i in range(len(temps)): + temp_str = temps[i] + temp_str = temp_str.replace(",", ".") + temperature.append(float(temp_str)) + + 
data[:, 0] = np.array(temperature) + + X = np.array(data[:, :5], dtype=float) + + # 6 for task 1, 7 for task 2 + if task_num == 1: + y = np.array(data[:, 6], dtype=float) + else: + y = np.array(data[:, 7], dtype=float) + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, stratify=y) + + return X_train, X_test, y_train, y_test, len(np.unique(y)) + + +def test(data_file, reps, n_trees, task_num, + default_transformer_class, default_transformer_kwargs): + default_voter_class = TreeClassificationVoter + default_voter_kwargs = {} + + default_decider_class = SimpleArgmaxAverage + + kappa = np.zeros(reps) + for i in range(reps): + X_train, X_test, y_train, y_test, n_classes = load_data(data_file, task_num) + default_decider_kwargs = {"classes": np.arange(n_classes)} + + pl = ProgressiveLearner( + default_transformer_class=default_transformer_class, + default_transformer_kwargs=default_transformer_kwargs, + default_voter_class=default_voter_class, + default_voter_kwargs=default_voter_kwargs, + default_decider_class=default_decider_class, + default_decider_kwargs=default_decider_kwargs) + + pl.add_task(X_train, y_train, num_transformers=n_trees) + + y_hat = pl.predict(X_test, task_id=0) + + acc = np.sum(y_test == y_hat) / len(y_test) + print("Accuracy after iteration ", i, ": ", acc) + + chance_pred = 1 / n_classes + kappa[i] = (acc - chance_pred) / (1 - chance_pred) + + return np.mean(kappa) * 100, (np.std(kappa) * 100) / np.sqrt(reps) \ No newline at end of file diff --git a/docs/tutorials/functions/sporf_decision_boundaries_functions.py b/docs/tutorials/functions/sporf_decision_boundaries_functions.py new file mode 100644 index 0000000000..baef1cfc89 --- /dev/null +++ b/docs/tutorials/functions/sporf_decision_boundaries_functions.py @@ -0,0 +1,96 @@ +from rerf.rerfClassifier import rerfClassifier + +import numpy as np +np.random.seed(42) + +import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap +from 
sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.datasets import make_moons, make_circles, make_classification +from sklearn.ensemble import RandomForestClassifier + +from proglearn.forest import LifelongClassificationForest +from proglearn.voters import TreeClassificationVoter +from proglearn.transformers import TreeClassificationTransformer +from proglearn.transformers import ObliqueTreeClassificationTransformer +from proglearn.deciders import SimpleArgmaxAverage + +def test(NT, h, names, classifiers, datasets): + i = 1 + # iterate over datasets + for ds_cnt, ds in enumerate(datasets): + # preprocess dataset, split into training and test part + X, y = ds + X = StandardScaler().fit_transform(X) + X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) + + x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 + y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + + # just plot the dataset first + cm = plt.cm.RdBu + cm_bright = ListedColormap(['#FF0000', '#0000FF']) + ax = plt.subplot(len(datasets), len(classifiers) + 1, i) + if ds_cnt == 0: + ax.set_title("Input data") + # Plot the training points + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, + edgecolors='k') + # Plot the testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, + edgecolors='k') + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + i += 1 + + # iterate over classifiers + for name, clf in zip(names, classifiers): + ax = plt.subplot(len(datasets), len(classifiers) + 1, i) + + if "Proglearn" in name: + + clf = LifelongClassificationForest(oblique=True, + default_feature_combinations=1, default_density=0.5) + clf.add_task(X_train, y_train, n_estimators=NT) + y_hat = clf.predict(X_test, task_id=0) + score = 
np.sum(y_hat == y_test) / len(y_test) + + else: + clf.fit(X_train, y_train) + score = clf.score(X_test, y_test) + + # Plot the decision boundary. For that, we will assign a color to each + # point in the mesh [x_min, x_max]x[y_min, y_max]. + if hasattr(clf, "decision_function"): + Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) + elif "Proglearn" in name: + Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()], task_id=0)[:, 1] + else: + Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] + + # Put the result into a color plot + Z = Z.reshape(xx.shape) + ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) + + # Plot the training points + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, + edgecolors='k') + # Plot the testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, + edgecolors='k', alpha=0.6) + + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + if ds_cnt == 0: + ax.set_title(name) + ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'), + size=15, horizontalalignment='right') + i += 1 diff --git a/docs/tutorials/sporf_datasets.ipynb b/docs/tutorials/sporf_datasets.ipynb new file mode 100644 index 0000000000..31ab947477 --- /dev/null +++ b/docs/tutorials/sporf_datasets.ipynb @@ -0,0 +1,331 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SPORF Tutorial" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The purpose of this tutorial is to prove that this pure python implementation of SPORF is identical, in terms of functionality, to the one used in the SPORF paper (Tomita, Tyler M., et al. \"Sparse projection oblique randomer forests.\" Journal of Machine Learning Research 21.104 (2020): 1-39.). In order to do this, this notebook runs this implementation of SPORF on 3 different data sets: hill valley, acute inflammation task 1, and acute inflammation task 2. 
Cohen's Kappa (fractional decrease in error rate over the chance error rate) is the metric that is being used to compare the implementations. If this implementation has the same kappa values (for the same data sets) as the one in the SPORF paper, we can say with confidence that this implementation is accurate. The datasets used in this notebook all had kappa values of 100 ± 0 in the SPORF paper implementation, which is also what is found when run on this SPORF implementation, as seen below. Thus, we can say with confidence that this implementation of SPORF is accurate." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import required packages" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from proglearn.progressive_learner import ProgressiveLearner\n", + "from proglearn.forest import LifelongClassificationForest\n", + "from proglearn.voters import TreeClassificationVoter\n", + "from proglearn.transformers import TreeClassificationTransformer\n", + "from proglearn.transformers import ObliqueTreeClassificationTransformer\n", + "from proglearn.deciders import SimpleArgmaxAverage\n", + "\n", + "from sklearn.model_selection import train_test_split, cross_val_score\n", + "\n", + "from functions.sporf_datasets_functions import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SPORF\n", + "\n", + "## Set parameters and run on hill valley without noise data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy after iteration 0 : 1.0\n", + "Accuracy after iteration 1 : 1.0\n", + "Accuracy after iteration 2 : 1.0\n", + "Accuracy after iteration 3 : 1.0\n", + "Accuracy after iteration 4 : 1.0\n", + "kappa: 100.0 , error: 0.0\n" + ] + } + ], + "source": [ + 
"max_depth = 10\n", + "feature_combinations = 2\n", + "density = 0.01\n", + "reps = 5\n", + "n_trees = 10\n", + "task_num = 1\n", + "\n", + "kwargs = {\"kwargs\" : {\"max_depth\" : max_depth, \"feature_combinations\" : feature_combinations, \"density\" : density}}\n", + "\n", + "kappa, err = test(\"https://archive.ics.uci.edu/ml/machine-learning-databases/hill-valley/Hill_Valley_without_noise_Training.data\", reps, n_trees, task_num,\n", + " ObliqueTreeClassificationTransformer,\n", + " kwargs)\n", + "\n", + "print(\"kappa: \", kappa, \", error:\", err)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set parameters and run on acute inflammation task 1 data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy after iteration 0 : 1.0\n", + "Accuracy after iteration 1 : 1.0\n", + "Accuracy after iteration 2 : 1.0\n", + "Accuracy after iteration 3 : 1.0\n", + "Accuracy after iteration 4 : 1.0\n", + "kappa: 100.0 , error: 0.0\n" + ] + } + ], + "source": [ + "max_depth = 10\n", + "feature_combinations = 1.5\n", + "density = 0.5\n", + "reps = 5\n", + "n_trees = 10\n", + "task_num = 1\n", + "\n", + "kwargs = {\"kwargs\" : {\"max_depth\" : max_depth, \"feature_combinations\" : feature_combinations, \"density\" : density}}\n", + "\n", + "kappa, err = test(\"https://archive.ics.uci.edu/ml/machine-learning-databases/acute/diagnosis.data\", reps, n_trees, task_num,\n", + " ObliqueTreeClassificationTransformer,\n", + " kwargs)\n", + "\n", + "print(\"kappa: \", kappa, \", error:\", err)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set parameters and run on acute inflammation task 2 data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy after iteration 0 : 1.0\n", + "Accuracy after 
iteration 1 : 1.0\n", + "Accuracy after iteration 2 : 1.0\n", + "Accuracy after iteration 3 : 1.0\n", + "Accuracy after iteration 4 : 1.0\n", + "kappa: 100.0 , error: 0.0\n" + ] + } + ], + "source": [ + "max_depth = 10\n", + "feature_combinations = 1.5\n", + "density = 0.5\n", + "reps = 5\n", + "n_trees = 10\n", + "task_num = 2\n", + "\n", + "kwargs = {\"kwargs\" : {\"max_depth\" : max_depth, \"feature_combinations\" : feature_combinations, \"density\" : density}}\n", + "\n", + "kappa, err = test(\"https://archive.ics.uci.edu/ml/machine-learning-databases/acute/diagnosis.data\", reps, n_trees, task_num,\n", + " ObliqueTreeClassificationTransformer,\n", + " kwargs)\n", + "\n", + "print(\"kappa: \", kappa, \", error:\", err)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Random Forest (RF)\n", + "\n", + "Now we will run the same datasets on a baseline random forest. The goal is to show that SPORF performs at least as well as, and sometimes clearly better than, the Random Forest algorithm. As the results below show, SPORF has a much higher kappa value than RF on the hill valley without noise data and the same value on the acute inflammation data sets. A high kappa value is desirable because, as mentioned above, it measures the fractional decrease in error rate relative to the chance error rate."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set parameters and run on hill valley without noise data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy after iteration 0 : 0.5409836065573771\n", + "Accuracy after iteration 1 : 0.5901639344262295\n", + "Accuracy after iteration 2 : 0.5901639344262295\n", + "Accuracy after iteration 3 : 0.6885245901639344\n", + "Accuracy after iteration 4 : 0.5245901639344263\n", + "kappa: 17.37704918032787 , error: 5.1130724431784715\n" + ] + } + ], + "source": [ + "max_depth = 10\n", + "feature_combinations = 2\n", + "density = 0.01\n", + "reps = 5\n", + "n_trees = 10\n", + "task_num = 1\n", + "\n", + "kwargs = {\"kwargs\" : {\"max_depth\" : max_depth} }\n", + "\n", + "kappa, err = test(\"https://archive.ics.uci.edu/ml/machine-learning-databases/hill-valley/Hill_Valley_without_noise_Training.data\", reps, n_trees, task_num,\n", + " TreeClassificationTransformer,\n", + " kwargs)\n", + "\n", + "print(\"kappa: \", kappa, \", error:\", err)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set parameters and run on acute inflammation task 1 data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy after iteration 0 : 1.0\n", + "Accuracy after iteration 1 : 1.0\n", + "Accuracy after iteration 2 : 1.0\n", + "Accuracy after iteration 3 : 1.0\n", + "Accuracy after iteration 4 : 1.0\n", + "kappa: 100.0 , error: 0.0\n" + ] + } + ], + "source": [ + "max_depth = 10\n", + "feature_combinations = 1.5\n", + "density = 0.5\n", + "reps = 5\n", + "n_trees = 10\n", + "task_num = 1\n", + "\n", + "kwargs = {\"kwargs\" : {\"max_depth\" : max_depth} }\n", + "\n", + "kappa, err = 
test(\"https://archive.ics.uci.edu/ml/machine-learning-databases/acute/diagnosis.data\", reps, n_trees, task_num,\n", + " TreeClassificationTransformer,\n", + " kwargs)\n", + "\n", + "print(\"kappa: \", kappa, \", error:\", err)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set parameters and run on acute inflammation task 2 data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy after iteration 0 : 1.0\n", + "Accuracy after iteration 1 : 1.0\n", + "Accuracy after iteration 2 : 1.0\n", + "Accuracy after iteration 3 : 1.0\n", + "Accuracy after iteration 4 : 1.0\n", + "kappa: 100.0 , error: 0.0\n" + ] + } + ], + "source": [ + "max_depth = 10\n", + "feature_combinations = 1.5\n", + "density = 0.5\n", + "reps = 5\n", + "n_trees = 10\n", + "task_num = 2\n", + "\n", + "kwargs = {\"kwargs\" : {\"max_depth\" : max_depth} }\n", + "\n", + "kappa, err = test(\"https://archive.ics.uci.edu/ml/machine-learning-databases/acute/diagnosis.data\", reps, n_trees, task_num,\n", + " TreeClassificationTransformer,\n", + " kwargs)\n", + "\n", + "print(\"kappa: \", kappa, \", error:\", err)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/sporf_decision_boundaries.ipynb b/docs/tutorials/sporf_decision_boundaries.ipynb new file mode 100644 index 0000000000..70b10f2802 --- /dev/null +++ b/docs/tutorials/sporf_decision_boundaries.ipynb @@ -0,0 +1,138 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Use SPORF to 
Draw Decision Boundaries" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The purpose of this notebook is to show that the oblique tree in ProgLearn is correct and can accurately determine oblique splits." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import required packages" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from rerf.rerfClassifier import rerfClassifier\n", + "\n", + "import numpy as np\n", + "np.random.seed(42)\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib.colors import ListedColormap\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.datasets import make_moons, make_circles, make_classification\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "from proglearn.forest import LifelongClassificationForest\n", + "from proglearn.voters import TreeClassificationVoter\n", + "from proglearn.transformers import TreeClassificationTransformer\n", + "from proglearn.transformers import ObliqueTreeClassificationTransformer\n", + "from proglearn.deciders import SimpleArgmaxAverage\n", + "\n", + "from functions.sporf_decision_boundaries_functions import test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set parameters, initialize datasets, and initialize classifiers" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "NT = 10\n", + "\n", + "h = .1 # step size in the mesh\n", + "\n", + "names = [\"RF\", \"RerF\", \"Proglearn-SPORF\"]\n", + "\n", + "classifiers = [\n", + " RandomForestClassifier(max_depth=5, n_estimators=NT, max_features=1),\n", + " rerfClassifier(n_estimators = NT, feature_combinations=1.5, max_features=2),\n", + " LifelongClassificationForest(oblique=True, default_feature_combinations=1, 
default_density=0.5)]\n", + "\n", + "X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,\n", + " random_state=1, n_clusters_per_class=1)\n", + "rng = np.random.RandomState(2)\n", + "X += 2 * rng.uniform(size=X.shape)\n", + "linearly_separable = (X, y)\n", + "\n", + "datasets = [make_moons(noise=0.3, random_state=0),\n", + " make_circles(noise=0.2, factor=0.5, random_state=1),\n", + " linearly_separable\n", + " ]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run on all datasets for all models" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "figure = plt.figure(figsize=(15, 9))\n", + "test(NT, h, names, classifiers, datasets)\n", + "plt.tight_layout()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/proglearn/forest.py b/proglearn/forest.py index 3539eb87e2..c464529272 100644 --- a/proglearn/forest.py +++ b/proglearn/forest.py @@ -3,7 +3,10 @@ Corresponding Email: levinewill@icloud.com """ from .progressive_learner import ClassificationProgressiveLearner -from .transformers import TreeClassificationTransformer +from .transformers import ( + TreeClassificationTransformer, + ObliqueTreeClassificationTransformer, +) from .voters import TreeClassificationVoter from .deciders import SimpleArgmaxAverage @@ -35,6 +38,15 @@ class LifelongClassificationForest(ClassificationProgressiveLearner): The maximum depth of a tree in the Lifelong Classification Forest. This is used if 'max_depth' is not fed to add_task. + oblique : bool, default=False + Specifies if an oblique tree should used for the classifier or not. + + feature_combinations : float, default=1.5 + The feature combinations to use for the oblique split. + + density : float, default=0.5 + Density estimate. 
+ Attributes ---------- pl_ : ClassificationProgressiveLearner @@ -48,13 +60,26 @@ def __init__( default_tree_construction_proportion=0.67, default_kappa=np.inf, default_max_depth=30, + oblique=False, + default_feature_combinations=1.5, + default_density=0.5, ): self.default_n_estimators = default_n_estimators self.default_tree_construction_proportion = default_tree_construction_proportion self.default_kappa = default_kappa self.default_max_depth = default_max_depth + self.oblique = oblique + + if oblique: + default_transformer_class = ObliqueTreeClassificationTransformer + self.default_feature_combinations = default_feature_combinations + self.default_density = default_density + + else: + default_transformer_class = TreeClassificationTransformer + self.pl_ = ClassificationProgressiveLearner( - default_transformer_class=TreeClassificationTransformer, + default_transformer_class=default_transformer_class, default_transformer_kwargs={}, default_voter_class=TreeClassificationVoter, default_voter_kwargs={"kappa": default_kappa}, @@ -71,6 +96,8 @@ def add_task( tree_construction_proportion="default", kappa="default", max_depth="default", + feature_combinations="default", + density="default", ): """ adds a task with id task_id, max tree depth max_depth, given input data matrix X @@ -106,6 +133,12 @@ def add_task( The maximum depth of a tree in the Lifelong Classification Forest. The default is used if 'default' is provided. + feature_combinations : float, default='default' + The feature combinations to use for the oblique split. + + density : float, default='default' + Density estimate. 
+ Returns ------- self : LifelongClassificationForest @@ -120,6 +153,23 @@ def add_task( if max_depth == "default": max_depth = self.default_max_depth + if self.oblique: + if feature_combinations == "default": + feature_combinations = self.default_feature_combinations + if density == "default": + density = self.default_density + + transformer_kwargs = { + "kwargs": { + "max_depth": max_depth, + "feature_combinations": feature_combinations, + "density": density, + } + } + + else: + transformer_kwargs = {"kwargs": {"max_depth": max_depth}} + X, y = check_X_y(X, y) return self.pl_.add_task( X, @@ -131,7 +181,7 @@ def add_task( 0, ], num_transformers=n_estimators, - transformer_kwargs={"kwargs": {"max_depth": max_depth}}, + transformer_kwargs=transformer_kwargs, voter_kwargs={ "classes": np.unique(y), "kappa": kappa, diff --git a/proglearn/tests/test_forest.py b/proglearn/tests/test_forest.py index acf7e9976b..b03a4d1323 100644 --- a/proglearn/tests/test_forest.py +++ b/proglearn/tests/test_forest.py @@ -4,7 +4,10 @@ import random from proglearn.forest import LifelongClassificationForest -from proglearn.transformers import TreeClassificationTransformer +from proglearn.transformers import ( + TreeClassificationTransformer, + ObliqueTreeClassificationTransformer, +) from proglearn.voters import TreeClassificationVoter from proglearn.deciders import SimpleArgmaxAverage @@ -47,3 +50,10 @@ def test_correct_default_n_estimators(self): def test_correct_true_initilization_finite_sample_correction(self): l2f = LifelongClassificationForest(default_kappa=np.inf) assert l2f.pl_.default_voter_kwargs == {"kappa": np.inf} + + def test_oblique_transformer(self): + l2f = LifelongClassificationForest(oblique=True) + assert l2f.pl_.default_transformer_class == ObliqueTreeClassificationTransformer + assert l2f.default_feature_combinations == 1.5 + assert l2f.default_density == 0.5 + assert l2f.pl_.default_transformer_kwargs == {} diff --git a/proglearn/tests/test_transformer.py
b/proglearn/tests/test_transformer.py index f7d66f2b39..a4de499ef3 100644 --- a/proglearn/tests/test_transformer.py +++ b/proglearn/tests/test_transformer.py @@ -1,9 +1,16 @@ import pytest import numpy as np -from numpy.testing import assert_allclose +from numpy.testing import ( + assert_almost_equal, + assert_warns, + assert_raises, + assert_allclose, +) +from numpy import random as rng +from sklearn.datasets import load_iris from sklearn.exceptions import NotFittedError -from proglearn.transformers import TreeClassificationTransformer +from proglearn.transformers import * class TestTreeClassificationTransformer: @@ -31,3 +38,151 @@ def test_correct_transformation(self): u1 = trt.transform(np.array([0]).reshape(1, -1)) u2 = trt.transform(np.array([1]).reshape(1, -1)) assert u1 != u2 + + +class TestObliqueSplitter: + def test_sample_projmat(self): + + random_state = 0 + rng.seed(random_state) + + X = rng.rand(100, 100) + y = np.zeros(100) + + density = 0.5 + proj_dims = [10, 20, 40, 60, 80] + sample_inds = [ + np.linspace(0, 9, 10, dtype=int), + np.linspace(0, 19, 20, dtype=int), + np.linspace(0, 39, 40, dtype=int), + np.linspace(0, 59, 60, dtype=int), + np.linspace(0, 79, 80, dtype=int), + ] + + n_sample_inds = [10, 20, 40, 60, 80] + + for pd in proj_dims: + splitter = ObliqueSplitter(X, y, pd, density, random_state) + + for i in range(len(n_sample_inds)): + si = sample_inds[i] + n = n_sample_inds[i] + + proj_X, projmat = splitter.sample_proj_mat(si) + assert n == proj_X.shape[0] + assert pd == proj_X.shape[1] + + def test_score(self): + + random_state = 0 + rng.seed(random_state) + + X = rng.rand(11, 11) + + density = 0.5 + proj_dims = 5 + + y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) + + score = splitter.score(y, 6) + assert 0 == score + + score = splitter.score(y, 1) + assert_almost_equal(5 / 11, score) + + def test_impurity(self): + + random_state = 0 + rng.seed(random_state) + + X = 
rng.rand(100, 100) + + density = 0.5 + proj_dims = 50 + + y = np.zeros(100) + for i in range(10): + for j in range(10): + y[10 * i + j] = i + + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) + + # Impurity of one thing should be 0 + impurity = splitter.impurity([0]) + assert 0 == impurity + + # Impurity of one class should be 0 + impurity = splitter.impurity(np.linspace(0, 9, 10, dtype=int)) + assert 0 == impurity + + # Impurity of two different classes with equal number should be 0.5 + impurity = splitter.impurity(np.linspace(0, 19, 20, dtype=int)) + assert 0.5 == impurity + + # Impurity of all classes should be 10 * (1/10)(9/10) = 9/10 + impurity = splitter.impurity(np.linspace(0, 99, 100, dtype=int)) + assert_almost_equal(0.9, impurity) + + def test_split(self): + + random_state = 0 + rng.seed(random_state) + + X = rng.rand(100, 100) + + density = 0.5 + proj_dims = 50 + + y = np.zeros(100) + for i in range(10): + for j in range(10): + y[10 * i + j] = i + + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) + + split_info = splitter.split(np.array([i for i in range(100)])) + + +class TestObliqueTree: + def test_add_node(self): + + # Add a root node + tree = ObliqueTree(None, 0, 0, 0, 0, 0) + + tree.add_node(0, False, 0, 0, False, 0, 0, None, 0, 0) + + # Add a regular node + tree.add_node(0, False, 0, 0, False, 0, 0, None, 0, 0) + + # Add a leaf node + tree.add_node(1, False, 0, 0, True, 0, 0, None, 0, 0) + + assert 3 == len(tree.nodes) + assert 3 == tree.node_count + + def test_fit(self): + + data = load_iris() + clf = ObliqueTreeClassifier() + clf.fit(data.data, data.target) + + def test_predict(self): + Xtrain = np.random.rand(6, 5) + ytrain = np.array([0, 0, 1, 1, 0, 1]) + tree = ObliqueTreeClassifier() + tree.fit(Xtrain, ytrain) + Xtest = np.random.rand(3, 5) + preds = tree.predict(Xtest) + + assert len(preds) == len(Xtest) + + preds_proba = tree.predict_proba(Xtest) + preds_log_proba = tree.predict_log_proba(Xtest) + + 
assert len(preds_proba) == len(Xtest) + assert len(preds_log_proba) == len(Xtest) + + bool_inc = np.all(np.log(preds_proba) == preds_log_proba) + + assert bool_inc diff --git a/proglearn/transformers.py b/proglearn/transformers.py old mode 100755 new mode 100644 index 3ce1f63e17..93546fc116 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -5,6 +5,9 @@ import numpy as np from sklearn.tree import DecisionTreeClassifier +from sklearn.base import BaseEstimator +from sklearn.random_projection import SparseRandomProjection + from sklearn.utils.validation import ( check_X_y, @@ -194,3 +197,932 @@ def transform(self, X): check_is_fitted(self) X = check_array(X) return self.transformer_.apply(X) + + +class ObliqueTreeClassificationTransformer(BaseTransformer): + """ + A class used to transform data from a category to a specialized representation. + + Parameters + ---------- + kwargs : dict, default={} + A dictionary to contain parameters of the tree. + + Attributes + ---------- + transformer_ : ObliqueTreeClassifier + an sklearn compliant oblique decision tree (SPORF) + """ + + def __init__(self, kwargs={}): + self.kwargs = kwargs + + def fit(self, X, y): + """ + Fits the transformer to data X with labels y. + + Parameters + ---------- + X : ndarray + Input data matrix. + y : ndarray + Output (i.e. response data matrix). + + Returns + ------- + self : ObliqueTreeClassificationTransformer + The object itself. + """ + X, y = check_X_y(X, y) + self.transformer_ = ObliqueTreeClassifier(**self.kwargs).fit(X, y) + return self + + def transform(self, X): + """ + Performs inference using the transformer. + + Parameters + ---------- + X : ndarray + Input data matrix. + + Returns + ------- + X_transformed : ndarray + The transformed input. + + Raises + ------ + NotFittedError + When the model is not fitted.
+ """ + check_is_fitted(self) + X = check_array(X) + return self.transformer_.apply(X) + + +""" +Authors: Parth Vora and Jay Mandavilli + +Oblique Decision Tree (SPORF) +""" +# -------------------------------------------------------------------------- +class SplitInfo: + """ + A class used to store information about a certain split. + + Parameters + ---------- + feature : int + The feature which is used for the particular split. + threshold : float + The feature value which defines the split, if an example has a value less + than this threshold for the feature of this split then it will go to the + left child, otherwise it wil go the right child where these children are + the children nodes of the node for which this split defines. + proj_mat : array of shape [n_components, n_features] + The sparse random projection matrix for this split. + left_impurity : float + This is Gini impurity of left side of the split. + left_idx : array of shape [left_n_samples] + This is the indices of the nodes that are in the left side of this split. + left_n_samples : int + The number of samples in the left side of this split. + right_impurity : float + This is Gini impurity of right side of the split. + right_idx : array of shape [right_n_samples] + This is the indices of the nodes that are in the right side of this split. + right_n_samples : int + The number of samples in the right side of this split. + no_split : bool + A boolean specifying if there is a valid split or not. Here an invalid + split means all of the samples would go to one side. + improvement : float + A metric to determine if the split improves the decision tree. 
class ObliqueSplitter:
    """
    A class used to represent an oblique splitter, where splits are done on
    the linear combination of the features.

    Parameters
    ----------
    X : array of shape [n_samples, n_features]
        The input data X is a matrix of the examples and their respective feature
        values for each of the features.
    y : array of shape [n_samples]
        The labels for each of the examples in X.
    proj_dims : int
        The dimensionality of the target projection space.
    density : float
        Ratio of non-zero component in the random projection matrix in the range '(0, 1]'.
    random_state : int, numpy.random.RandomState or None
        Controls the pseudo random number generator used to generate the
        projection matrices. An integer seeds one generator that is shared by
        all nodes, so results are reproducible while each node still draws its
        own projection. ``None`` is passed through to the projection unchanged.

    Methods
    -------
    sample_proj_mat(sample_inds)
        Draws a projection and applies it to the samples of interest.
    leaf_label_proba(idx)
        Calculates the label and the probability of that label for a leaf node.
    score(y_sort, t)
        Finds the weighted Gini impurity of a split.
    impurity(idx)
        Finds the impurity for a certain set of samples.
    split(sample_inds)
        Determines the best possible split for the given set of samples.
    """

    def __init__(self, X, y, proj_dims, density, random_state):

        self.X = X
        self.y = y

        self.classes = np.array(np.unique(y), dtype=int)
        self.n_classes = len(self.classes)
        self.indices = np.indices(y.shape)[0]

        self.n_samples = X.shape[0]

        self.proj_dims = proj_dims
        self.density = density
        self.random_state = random_state

        # Passing the same integer seed to every node's projection would give
        # every node the identical projection matrix. Turn a seed into a single
        # generator once, so successive nodes keep drawing from its stream.
        if random_state is None or isinstance(random_state, np.random.RandomState):
            self._rng = random_state
        else:
            self._rng = np.random.RandomState(random_state)

    def sample_proj_mat(self, sample_inds):
        """
        Draws a sparse random projection and applies it to the selected samples.

        Parameters
        ----------
        sample_inds : array of shape [n_node_samples]
            Indices (rows of X) of the samples to project.

        Returns
        -------
        proj_X : array of shape [n_node_samples, proj_dims]
            The projected samples.
        proj_mat : SparseRandomProjection
            The fitted projection, needed later to project unseen samples.
        """

        proj_mat = SparseRandomProjection(
            density=self.density,
            n_components=self.proj_dims,
            random_state=self._rng,
        )

        proj_X = proj_mat.fit_transform(self.X[sample_inds, :])
        return proj_X, proj_mat

    def leaf_label_proba(self, idx):
        """
        Finds the most common label among the samples of a leaf and the
        fraction of the leaf's samples carrying that label.

        Parameters
        ----------
        idx : array of shape [n_node_samples]
            The indices of the samples that are at the leaf node.

        Returns
        -------
        label : int
            The label for any sample that is predicted to be at this node.
        proba : float
            The fraction of the leaf's samples that have this label.
        """

        samples = self.y[idx]
        n = len(samples)
        labels, count = np.unique(samples, return_counts=True)
        most = np.argmax(count)

        label = labels[most]
        proba = count[most] / n

        return label, proba

    def score(self, y_sort, t):
        """
        Finds the weighted Gini impurity of splitting ``y_sort`` at position ``t``.

        Parameters
        ----------
        y_sort : array of shape [n_node_samples]
            Labels of the node's samples, ordered by the feature being split on.
        t : int
            Split position; ``y_sort[:t]`` goes left and ``y_sort[t:]`` goes
            right. Expects ``0 < t < len(y_sort)``.

        Returns
        -------
        gini : float
            Gini impurity of the two sides, each weighted by its share of the
            node's samples.
        """

        left = y_sort[:t]
        right = y_sort[t:]

        n_left = len(left)
        n_right = len(right)
        # Weight by the size of *this node*, not of the whole training set, so
        # the result is directly comparable with the node's own impurity.
        n_node = n_left + n_right

        _, left_counts = np.unique(left, return_counts=True)
        _, right_counts = np.unique(right, return_counts=True)

        left_gini = 1 - np.sum(np.power(left_counts / n_left, 2))
        right_gini = 1 - np.sum(np.power(right_counts / n_right, 2))

        return (n_left / n_node) * left_gini + (n_right / n_node) * right_gini

    def impurity(self, idx):
        """
        Finds the Gini impurity for a set of samples.

        Parameters
        ----------
        idx : array of shape [n_node_samples]
            The indices of the samples for which the impurity is calculated.
            Must not be None.

        Returns
        -------
        impurity : float
            Gini impurity of the set; 0 for an empty set.
        """

        samples = self.y[idx]
        n = len(samples)

        if n == 0:
            return 0

        _, count = np.unique(samples, return_counts=True)
        count = count / n
        gini = np.sum(np.power(count, 2))

        return 1 - gini

    def _score_matrix(self, proj_X, y_sample, node_impurity):
        """
        Scores every split position of every projected feature.

        Parameters
        ----------
        proj_X : array of shape [n_node_samples, n_proj_features]
            Projected samples of the node.
        y_sample : array of shape [n_node_samples]
            Labels of the node's samples, in the same order as ``proj_X``.
        node_impurity : float
            Impurity of the unsplit node.

        Returns
        -------
        Q : array of shape [n_node_samples, n_proj_features]
            ``Q[t, j]`` is the score of sending the ``t`` smallest samples of
            feature ``j`` to the left. Row 0 ("no split") holds the node
            impurity; positions that no threshold can realise hold ``inf``.
        """

        n_samples = len(y_sample)
        Q = np.full((n_samples, proj_X.shape[1]), np.inf)
        Q[0, :] = node_impurity

        for j in range(proj_X.shape[1]):
            order = np.argsort(proj_X[:, j])
            feat_sorted = proj_X[order, j]
            y_sort = y_sample[order]

            # Position t separates sorted samples [0, t) from [t, n). Later
            # routing uses "value < threshold", so t is only usable when the
            # value strictly increases across the boundary; tied values (e.g.
            # an all-zero projection column) cannot be separated.
            usable = np.flatnonzero(feat_sorted[1:] > feat_sorted[:-1]) + 1
            for t in usable:
                Q[t, j] = self.score(y_sort, t)

        return Q

    # This needs to be parallelized; it is a major bottleneck
    def split(self, sample_inds):
        """
        Finds the optimal split for a set of samples.
        Note that the code for this method needs to be parallelized. This is a
        major bottleneck in integration with scikit-learn.

        Parameters
        ----------
        sample_inds : array of shape [n_node_samples]
            The indices of the samples for which the best split is found.

        Returns
        -------
        split_info : SplitInfo
            Class holding information about the split. ``no_split`` is True
            when no candidate beats leaving the node unsplit.
        """

        # Project the data
        proj_X, proj_mat = self.sample_proj_mat(sample_inds)
        y_sample = self.y[sample_inds]

        # Score matrix; the "no split" score is just the node impurity
        node_impurity = self.impurity(sample_inds)
        Q = self._score_matrix(proj_X, y_sample, node_impurity)

        # Identify best split feature, minimum gini impurity
        best_split_ind = np.argmin(Q)
        thresh_i, feature = np.unravel_index(best_split_ind, Q.shape)
        best_gini = Q[thresh_i, feature]

        # Sort samples by the split feature
        feat_vec = proj_X[:, feature]
        idx = np.argsort(feat_vec)

        feat_vec = feat_vec[idx]
        sample_inds = sample_inds[idx]

        # Get the threshold, split samples into left and right
        threshold = feat_vec[thresh_i]
        left_idx = sample_inds[:thresh_i]
        right_idx = sample_inds[thresh_i:]

        left_n_samples = len(left_idx)
        right_n_samples = len(right_idx)

        # See if we have no split
        no_split = left_n_samples == 0 or right_n_samples == 0

        # Evaluate improvement
        improvement = node_impurity - best_gini

        # Evaluate impurities for left and right children
        left_impurity = self.impurity(left_idx)
        right_impurity = self.impurity(right_idx)

        split_info = SplitInfo(
            feature,
            threshold,
            proj_mat,
            left_impurity,
            left_idx,
            left_n_samples,
            right_impurity,
            right_idx,
            right_n_samples,
            no_split,
            improvement,
        )

        return split_info
# --------------------------------------------------------------------------


class Node:
    """
    A class used to represent an oblique node.

    Every attribute starts as None and is filled in by ``ObliqueTree.add_node``.
    Internal nodes carry ``feature``, ``threshold`` and ``proj_mat``; leaves
    carry ``label`` and ``proba``.

    Parameters
    ----------
    None

    Methods
    -------
    None
    """

    def __init__(self):
        self.node_id = None
        self.is_leaf = None
        self.parent = None
        self.left_child = None
        self.right_child = None

        self.feature = None
        self.threshold = None
        self.impurity = None
        self.n_samples = None

        self.proj_mat = None
        self.label = None
        self.proba = None


class StackRecord:
    """
    A class used to keep track of a node's parent and other information about the node and its split.

    Parameters
    ----------
    parent : int
        The index of the parent node.
    depth : int
        The depth at which this node is.
    is_left : bool
        Represents if the node is a left child or not.
    impurity : float
        This is Gini impurity of this node.
    sample_idx : array of shape [n_samples]
        This is the indices of the samples that are in this node.
    n_samples : int
        The number of samples in this node.

    Methods
    -------
    None
    """

    def __init__(self, parent, depth, is_left, impurity, sample_idx, n_samples):

        self.parent = parent
        self.depth = depth
        self.is_left = is_left
        self.impurity = impurity
        self.sample_idx = sample_idx
        self.n_samples = n_samples


class ObliqueTree:
    """
    A class used to represent a tree with oblique splits.

    Parameters
    ----------
    splitter : class
        The type of splitter for this tree, should be an ObliqueSplitter.
    min_samples_split : int
        A node with fewer samples than this becomes a leaf.
    min_samples_leaf : int
        A node with fewer than ``2 * min_samples_leaf`` samples becomes a leaf.
    max_depth : int
        Maximum depth allowed for the tree (the root is at depth 1).
    min_impurity_split : float
        A node whose Gini impurity is at or below this value becomes a leaf.
    min_impurity_decrease : float
        A split whose improvement is at or below this value is rejected and
        the node becomes a leaf.

    Methods
    -------
    add_node(parent, is_left, impurity, n_samples, is_leaf, feature, threshold, proj_mat, label, proba)
        Adds a node to the existing tree
    build()
        This is what is initially called on to completely build the oblique tree.
    predict(X)
        Finds the final node for each input sample as it passes through the decision tree.
    """

    def __init__(
        self,
        splitter,
        min_samples_split,
        min_samples_leaf,
        max_depth,
        min_impurity_split,
        min_impurity_decrease,
    ):

        # Tree state
        self.depth = 0
        self.node_count = 0
        self.nodes = []

        # Build parameters
        self.splitter = splitter
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_depth = max_depth
        self.min_impurity_split = min_impurity_split
        self.min_impurity_decrease = min_impurity_decrease

    def add_node(
        self,
        parent,
        is_left,
        impurity,
        n_samples,
        is_leaf,
        feature,
        threshold,
        proj_mat,
        label,
        proba,
    ):
        """
        Adds a node to the existing oblique tree.

        Parameters
        ----------
        parent : int
            The index of the parent node for the new node being added. Ignored
            for the first node added, which is the root.
        is_left : bool
            Determines if this new node being added is a left or right child.
        impurity : float
            Impurity of this new node.
        n_samples : int
            Number of samples at this new node.
        is_leaf : bool
            Determines if this new node is a leaf of the tree or an internal node.
        feature : int
            Index of feature on which the split occurs at this node.
        threshold : float
            The threshold feature value for this node determining if a sample will go
            to this node's left or right child. If a sample has a value less than the
            threshold (for the feature of this node) it will go to the left child,
            otherwise it will go to the right child.
        proj_mat : object
            Projection for this new node; ``predict`` calls its ``transform``.
        label : int
            The label a sample will be given if it is predicted to be at this node.
        proba : float
            The probability a predicted sample has of being the node's label.

        Returns
        -------
        node_id : int
            Index of the new node just added.
        """

        node = Node()
        node.node_id = self.node_count
        node.impurity = impurity
        node.n_samples = n_samples

        # If not the root node, link the new node to its parent
        if self.node_count > 0:
            node.parent = parent
            if is_left:
                self.nodes[parent].left_child = node.node_id
            else:
                self.nodes[parent].right_child = node.node_id

        # Leaves only store a prediction, internal nodes only store a split
        if is_leaf:
            node.is_leaf = True
            node.label = label
            node.proba = proba
        else:
            node.is_leaf = False
            node.feature = feature
            node.threshold = threshold
            node.proj_mat = proj_mat

        self.node_count += 1
        self.nodes.append(node)

        return node.node_id

    def build(self):
        """
        Builds the oblique tree depth-first using an explicit stack.

        Parameters
        ----------
        None

        Returns
        -------
        None
        """

        # Initialize, add root node (depth 1, parent index unused)
        stack = []
        root = StackRecord(
            0,
            1,
            False,
            self.splitter.impurity(self.splitter.indices),
            self.splitter.indices,
            self.splitter.n_samples,
        )
        stack.append(root)

        # Build tree
        while len(stack) > 0:

            # Pop a record off the stack
            cur = stack.pop()

            # Evaluate if it is a leaf from the stopping criteria alone
            is_leaf = (
                cur.depth >= self.max_depth
                or cur.n_samples < self.min_samples_split
                or cur.n_samples < 2 * self.min_samples_leaf
                or cur.impurity <= self.min_impurity_split
            )

            # Otherwise try to split; a useless split also makes it a leaf
            if not is_leaf:
                split = self.splitter.split(cur.sample_idx)

                is_leaf = (
                    split.no_split
                    or split.improvement <= self.min_impurity_decrease
                )

            # Add the node to the tree
            if is_leaf:

                label, proba = self.splitter.leaf_label_proba(cur.sample_idx)

                node_id = self.add_node(
                    cur.parent,
                    cur.is_left,
                    cur.impurity,
                    cur.n_samples,
                    is_leaf,
                    None,
                    None,
                    None,
                    label,
                    proba,
                )

            else:
                node_id = self.add_node(
                    cur.parent,
                    cur.is_left,
                    cur.impurity,
                    cur.n_samples,
                    is_leaf,
                    split.feature,
                    split.threshold,
                    split.proj_mat,
                    None,
                    None,
                )

            # Push the right and left children to the stack if applicable
            if not is_leaf:

                right_child = StackRecord(
                    node_id,
                    cur.depth + 1,
                    False,
                    split.right_impurity,
                    split.right_idx,
                    split.right_n_samples,
                )
                stack.append(right_child)

                left_child = StackRecord(
                    node_id,
                    cur.depth + 1,
                    True,
                    split.left_impurity,
                    split.left_idx,
                    split.left_n_samples,
                )
                stack.append(left_child)

            if cur.depth > self.depth:
                self.depth = cur.depth

    def predict(self, X):
        """
        Predicts final nodes of samples given.

        Samples are routed through the tree in groups: each internal node
        projects only the rows that actually reach it, once, instead of
        re-projecting the whole input for every sample.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input array for which predictions are made.

        Returns
        -------
        predictions : array of shape [n_samples]
            Array of the final node index for each input prediction sample.
        """

        n_rows = X.shape[0]
        predictions = np.zeros(n_rows)
        if n_rows == 0:
            return predictions

        # Each entry is (node index, rows of X currently sitting at that node)
        pending = [(0, np.arange(n_rows))]
        while pending:
            node_id, rows = pending.pop()
            node = self.nodes[node_id]

            if node.is_leaf:
                predictions[rows] = node.node_id
                continue

            # NOTE(review): assumes transform returns a dense 2-D array, as the
            # original element-wise indexing did - confirm for sparse input.
            projected = node.proj_mat.transform(X[rows])
            goes_left = projected[:, node.feature] < node.threshold

            left_rows = rows[goes_left]
            right_rows = rows[~goes_left]
            # Empty groups are dropped so transform never sees zero rows
            if left_rows.size:
                pending.append((node.left_child, left_rows))
            if right_rows.size:
                pending.append((node.right_child, right_rows))

        return predictions
# --------------------------------------------------------------------------

""" Class for Oblique Tree """


class ObliqueTreeClassifier(BaseEstimator):
    """
    A class used to represent a classifier that uses an oblique decision tree.

    Parameters
    ----------
    max_depth : int
        Maximum depth allowed for oblique tree.
    min_samples_split : int
        Minimum number of samples a node needs in order to be split.
    min_samples_leaf : int
        A node with fewer than twice this many samples is not split.
    random_state : int, RandomState instance or None
        Controls the pseudo random number generator used to generate the
        projection matrices.
    min_impurity_decrease : float
        Minimum amount Gini impurity value must decrease by for a split to be valid.
    min_impurity_split : float
        A node whose Gini impurity is at or below this value is not split.
    feature_combinations : float
        The number of input features is divided by this value (and rounded
        up) to obtain the number of projected features tried at each node.
    density : float
        Ratio of non-zero components in the random projection matrix.

    Methods
    -------
    fit(X,y)
        Fits the oblique tree to the training samples.
    apply(X)
        Calls on the predict function from the oblique tree for the test samples.
    predict(X)
        Gets the prediction labels for the test samples.
    predict_proba(X)
        Gets the probability of the prediction labels for the test samples.
    predict_log_proba(X)
        Gets the log of the probability of the prediction labels for the test samples.
    """

    def __init__(
        self,
        *,
        max_depth=np.inf,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=None,
        min_impurity_decrease=0,
        min_impurity_split=0,
        feature_combinations=1.5,
        density=0.5
    ):
        # NOTE: the following scikit-learn tree parameters are not supported
        # yet: criterion, splitter, min_weight_fraction_leaf, max_features,
        # max_leaf_nodes, class_weight, ccp_alpha.
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split

        # Parameters specific to oblique splits
        self.feature_combinations = feature_combinations
        self.density = density

    def fit(self, X, y):
        """
        Builds the oblique tree from the training samples.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The training samples.
        y : array of shape [n_samples]
            Labels for the training samples.

        Returns
        -------
        ObliqueTreeClassifier
            The fit classifier.
        """

        X = np.asarray(X)
        y = np.asarray(y)

        # Round the ratio up (not the integer feature count), so that there is
        # always at least one projected feature, even for a single input
        # feature.
        self.proj_dims = int(np.ceil(X.shape[1] / self.feature_combinations))
        splitter = ObliqueSplitter(
            X, y, self.proj_dims, self.density, self.random_state
        )

        self.tree = ObliqueTree(
            splitter,
            self.min_samples_split,
            self.min_samples_leaf,
            self.max_depth,
            self.min_impurity_split,
            self.min_impurity_decrease,
        )
        self.tree.build()
        return self

    def apply(self, X):
        """
        Gets predictions from the oblique tree for the test samples.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The testing samples.

        Returns
        -------
        pred_nodes : array of shape[n_samples]
            The indices for each test sample's final node in the oblique tree.
        """

        return self.tree.predict(X).astype(int)

    def predict(self, X):
        """
        Determines final label predictions for each sample in the test data.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The testing samples.

        Returns
        -------
        preds : array of shape[n_samples]
            The predictions (labels) for each testing sample, as floats.
        """

        nodes = self.tree.nodes
        return np.array(
            [nodes[node_id].label for node_id in self.apply(X)], dtype=float
        )

    def predict_proba(self, X):
        """
        Determines probabilities of the final label predictions for each sample in the test data.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The testing samples.

        Returns
        -------
        preds : array of shape[n_samples]
            For each testing sample, the probability of its predicted label
            (one value per sample, not one column per class).
        """

        nodes = self.tree.nodes
        return np.array(
            [nodes[node_id].proba for node_id in self.apply(X)], dtype=float
        )

    def predict_log_proba(self, X):
        """
        Determines log of the probabilities of the final label predictions for each sample in the test data.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The testing samples.

        Returns
        -------
        preds : array of shape[n_samples]
            The log of the probabilities of the predictions (labels) for each testing sample.
        """

        return np.log(self.predict_proba(X))