From 660eab2a879b63f0a86dab6ff14ccbc2e42900bc Mon Sep 17 00:00:00 2001 From: parthgvora Date: Thu, 12 Nov 2020 20:55:10 -0500 Subject: [PATCH 01/17] staging branch finally --- proglearn/oblique_tree.py | 564 +++++++++++++++++++++++++++ proglearn/tests/oblique_tree_test.py | 125 ++++++ proglearn/transformers.py | 62 +++ 3 files changed, 751 insertions(+) create mode 100644 proglearn/oblique_tree.py create mode 100644 proglearn/tests/oblique_tree_test.py diff --git a/proglearn/oblique_tree.py b/proglearn/oblique_tree.py new file mode 100644 index 0000000000..e5eedcfec3 --- /dev/null +++ b/proglearn/oblique_tree.py @@ -0,0 +1,564 @@ +""" +Parth Vora + +Oblique Decision Tree (SPORF) +""" +import numpy as np +from sklearn.base import BaseEstimator +from sklearn.random_projection import SparseRandomProjection + +# debugging +import sys +# Parallelize things later +from joblib import Parallel, delayed + +#-------------------------------------------------------------------------- +class SplitInfo: + """ + A class used to store information about a certain split. + + Parameters: + --- + feature : int + The feature which is used for the particular split. + threshold : float + The feature value which defines the split, if an example has a value less + than this threshold for the feature of this split then it will go to the + left child, otherwise it wil go the right child where these children are + the children nodes of the node for which this split defines. + proj_mat : array of shape [n_components, n_features] + The sparse random projection matrix for this split. + left_impurity : float + This is Gini impurity of left side of the split. + left_idx : array of shape [left_n_samples] + This is the indices of the nodes that are in the left side of this split. + left_n_samples : int + The number of samples in the left side of this split. + right_impurity : float + This is Gini impurity of right side of the split. 
+ right_idx : array of shape [right_n_samples] + This is the indices of the nodes that are in the right side of this split. + right_n_samples : int + The number of samples in the right side of this split. + no_split : bool + A boolean specifying if there is a valid split or not. Here an invalid + split means all of the samples would go to one side. + improvement : float + A metric to determine if the split improves the decision tree. + """ + + def __init__(self, feature, threshold, proj_mat, + left_impurity, left_idx, left_n_samples, + right_impurity, right_idx, right_n_samples, + no_split, improvement): + + self.feature = feature + self.threshold = threshold + self.proj_mat = proj_mat + self.left_impurity = left_impurity + self.left_idx = left_idx + self.left_n_samples = left_n_samples + self.right_impurity = right_impurity + self.right_idx = right_idx + self.right_n_samples = right_n_samples + self.no_split = no_split + self.improvement = improvement + +class ObliqueSplitter: + """ + A class used to represent an oblique splitter, where splits are done on + the linear combination of the features. + + Parameters: + --- + X : array of shape [n_samples, n_features] + The input data X is a matrix of the examples and their respective feature + values for each of the features. + y : array of shape [n_samples] + The labels for each of the examples in X. + proj_dims : int + The dimensionality of the target projection space. + density : float + Ratio of non-zero component in the random projection matrix in the range '(0, 1]'. + random_state : int + Controls the pseudo random number generator used to generate the projection matrix. + + Methods + --- + sample_proj_mat(sample_inds) + This gets the projection matrix and it fits the transform to the samples of interest. + leaf_label_proba(idx) + This calculates the label and the probability for that label for a particular leaf + node. + score(y_sort, t) + Finds the Gini impurity for a split. 
+ impurity(idx) + Finds the impurity for a certain set of samples. + split(sample_inds) + Determines the best possible split for the given set of samples. + """ + + def __init__(self, X, y, proj_dims, density, random_state): + + self.X = X + self.y = y + + self.classes = np.array(np.unique(y), dtype=int) + self.n_classes = len(self.classes) + self.indices = np.indices(y.shape)[0] + + self.n_samples = X.shape[0] + + self.proj_dims = proj_dims + self.density = density + self.random_state = random_state + + def sample_proj_mat(self, sample_inds): + """ + Gets the projection matrix and it fits the transform to the samples of interest. + + Parameters + --- + sample_inds : array of shape [n_samples] + The data we are transforming. + """ + + proj_mat = SparseRandomProjection(density=self.density, + n_components=self.proj_dims, + random_state=self.random_state) + + proj_X = proj_mat.fit_transform(self.X[sample_inds, :]) + return proj_X, proj_mat + + def leaf_label_proba(self, idx): + """ + Finds the most common label and probability of this label from the samples at + the leaf node for which this is used on. + + Parameters + --- + idx : array of shape [n_samples] + The indeces of the samples that are at the leaf node for which the label + and probability need to be found. + """ + + samples = self.y[idx] + n = len(samples) + labels, count = np.unique(samples, return_counts=True) + most = np.argmax(count) + + label = labels[most] + proba = count[most] / n + + return label, proba + + # Returns gini impurity for split + # Expects 0 < t < n + def score(self, y_sort, t): + """ + Finds the Gini impurity for the split of interest + + Parameters + --- + y_sort : array of shape [n_samples] + A sorted array of labels for the examples for which the Gini impurity + is being calculated. + t : float + The threshold determining where to split y_sort. 
+ """ + + left = y_sort[:t] + right = y_sort[t:] + + n_left = len(left) + n_right = len(right) + + left_unique, left_counts = np.unique(left, return_counts=True) + right_unique, right_counts = np.unique(right, return_counts=True) + + left_counts = left_counts / n_left + right_counts = right_counts / n_right + + left_gini = 1 - np.sum(np.power(left_counts, 2)) + right_gini = 1 - np.sum(np.power(right_counts, 2)) + + gini = (n_left / self.n_samples) * left_gini + (n_right / self.n_samples) * right_gini + return gini + + # Returns impurity for a group of examples + # expects idx not None + def impurity(self, idx): + """ + Finds the actual impurity for a set of samples + + Parameters + --- + idx : array of shape [n_samples] + The indices of the nodes in the set for which the impurity is being calculated. + """ + + samples = self.y[idx] + n = len(samples) + + if n == 0: + return 0 + + unique, count = np.unique(samples, return_counts=True) + count = count / n + gini = np.sum(np.power(count, 2)) + + return 1 - gini + + # Finds the best split + # This needs to be parallelized; its a major bottleneck + def split(self, sample_inds): + """ + Finds the optimal split for a set of samples. + Note that the code for this method needs to be parallelized. This is a major + bottleneck in integration with scikit-learn. + + Parameters + --- + sample_inds : array of shape [n_samples] + The indices of the nodes in the set for which the best split is found. 
+ """ + + # Project the data + proj_X, proj_mat = self.sample_proj_mat(sample_inds) + y_sample = self.y[sample_inds] + n_samples = len(sample_inds) + + # Score matrix + # No split score is just node impurity + Q = np.zeros((n_samples, self.proj_dims)) + node_impurity = self.impurity(sample_inds) + Q[0, :] = node_impurity + Q[-1, :] = node_impurity + + # Loop through projected features and examples to find best split + # This can be parallelized for sure + for j in range(self.proj_dims): + + # Sort labels by the jth feature + idx = np.argsort(proj_X[:, j]) + y_sort = y_sample[idx] + + Q[1:-1, j] = np.array([self.score(y_sort, i) for i in range(1, n_samples - 1)]) + + # Identify best split feature, minimum gini impurity + best_split_ind = np.argmin(Q) + thresh_i, feature = np.unravel_index(best_split_ind, Q.shape) + best_gini = Q[thresh_i, feature] + + # Sort samples by the split feature + feat_vec = proj_X[:, feature] + idx = np.argsort(feat_vec) + + feat_vec = feat_vec[idx] + sample_inds = sample_inds[idx] + + # Get the threshold, split samples into left and right + threshold = feat_vec[thresh_i] + left_idx = sample_inds[:thresh_i] + right_idx = sample_inds[thresh_i:] + + left_n_samples = len(left_idx) + right_n_samples = len(right_idx) + + # See if we have no split + no_split = (left_n_samples == 0 or + right_n_samples == 0) + + # Evaluate improvement + improvement = node_impurity - best_gini + + # Evaluate impurities for left and right children + left_impurity = self.impurity(left_idx) + right_impurity = self.impurity(right_idx) + + split_info = SplitInfo(feature, threshold, proj_mat, + left_impurity, left_idx, left_n_samples, + right_impurity, right_idx, right_n_samples, + no_split, improvement) + + return split_info +#-------------------------------------------------------------------------- + +class Node: + def __init__(self): + + self.node_id = None + self.is_leaf = None + self.parent = None + self.left_child = None + self.right_child = None + + self.feature 
= None + self.threshold = None + self.impurity = None + self.n_samples = None + + self.proj_mat = None + self.label = None + self.proba = None + +class StackRecord: + def __init__(self, parent, depth, is_left, + impurity, sample_idx, n_samples): + + self.parent = parent + self.depth = depth + self.is_left = is_left + self.impurity = impurity + self.sample_idx = sample_idx + self.n_samples = n_samples + +class ObliqueTree: + + def __init__(self, splitter, min_samples_split, min_samples_leaf, + max_depth, min_impurity_split, min_impurity_decrease): + + # Tree parameters + #self.n_samples = n_samples + #self.n_features = n_features + #self.n_classes = n_classes + self.depth = 0 + self.node_count = 0 + self.nodes = [] + + # Build parameters + self.splitter = splitter + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.max_depth = max_depth + self.min_impurity_split = min_impurity_split + self.min_impurity_decrease = min_impurity_decrease + + + + def add_node(self, parent, is_left, + impurity, n_samples, is_leaf, + feature, threshold, proj_mat, + label, proba): + + node = Node() + node.node_id = self.node_count + node.impurity = impurity + node.n_samples = n_samples + + # If not the root node, set parents + if self.node_count > 0: + node.parent = parent + if is_left: + self.nodes[parent].left_child = node.node_id + else: + self.nodes[parent].right_child = node.node_id + + # Set node parameters + if is_leaf: + node.is_leaf = True + node.label = label + node.proba = proba + else: + node.is_leaf = False + node.feature = feature + node.threshold = threshold + node.proj_mat = proj_mat + + self.node_count += 1 + self.nodes.append(node) + + return node.node_id + + def build(self): + + # Initialize, add root node + stack = [] + root = StackRecord(0, 1, False, + self.splitter.impurity(self.splitter.indices), + self.splitter.indices, + self.splitter.n_samples) + stack.append(root) + + + # Build tree + while len(stack) > 0: + + # Pop a 
record off the stack + cur = stack.pop() + + + # Evaluate if it is a leaf + is_leaf = (cur.depth >= self.max_depth or + cur.n_samples < self.min_samples_split or + cur.n_samples < 2 * self.min_samples_leaf or + cur.impurity <= self.min_impurity_split) + + # Split if not + if not is_leaf: + split = self.splitter.split(cur.sample_idx) + + is_leaf = (is_leaf or + split.no_split or + split.improvement <= self.min_impurity_decrease) + + # Add the node to the tree + if is_leaf: + + label, proba = self.splitter.leaf_label_proba(cur.sample_idx) + + node_id = self.add_node(cur.parent, + cur.is_left, + cur.impurity, + cur.n_samples, + is_leaf, + None, + None, + None, + label, + proba) + + else: + node_id = self.add_node(cur.parent, + cur.is_left, + cur.impurity, + cur.n_samples, + is_leaf, + split.feature, + split.threshold, + split.proj_mat, + None, + None) + + + # Push the right and left children to the stack if applicable + if not is_leaf: + + right_child = StackRecord(node_id, + cur.depth + 1, + False, + split.right_impurity, + split.right_idx, + split.right_n_samples) + stack.append(right_child) + + left_child = StackRecord(node_id, + cur.depth + 1, + True, + split.left_impurity, + split.left_idx, + split.left_n_samples) + stack.append(left_child) + + if cur.depth > self.depth: + self.depth = cur.depth + + def predict(self, X): + predictions = np.zeros(X.shape[0]) + for i in range(X.shape[0]): + cur = self.nodes[0] + while not cur is None and not cur.is_leaf: + proj_X = cur.proj_mat.transform(X) + if proj_X[i, cur.feature] < cur.threshold: + id = cur.left_child + cur = self.nodes[id] + else: + id = cur.right_child + cur = self.nodes[id] + + predictions[i] = cur.node_id + + return predictions + +#-------------------------------------------------------------------------- + +""" Class for Oblique Tree """ +class ObliqueTreeClassifier(BaseEstimator): + + def __init__(self, *, + + #criterion="gini", + #splitter=None, + max_depth=np.inf, + min_samples_split=2, + 
min_samples_leaf=1, + #min_weight_fraction_leaf=0, + #max_features="auto", + #max_leaf_nodes=None, + random_state=None, + min_impurity_decrease=0, + min_impurity_split=0, + #class_weight=None, + #ccp_alpha=0.0, + + #New args + feature_combinations=1.2, + density=0.7 + + ): + + #self.criterion=criterion + self.max_depth=max_depth + self.min_samples_split=min_samples_split + self.min_samples_leaf=min_samples_leaf + #self.min_weight_fraction_leaf=min_weight_fraction_leaf + #self.max_features=max_features + #self.max_leaf_nodes=max_leaf_nodes + self.random_state=random_state + self.min_impurity_decrease=min_impurity_decrease + self.min_impurity_split=min_impurity_split + #self.class_weight=class_weight + #self.ccp_alpha=ccp_alpha + + self.feature_combinations=feature_combinations + self.density=density + + def fit(self, X, y): + + self.proj_dims = int(np.ceil(X.shape[1]) / self.feature_combinations) + splitter = ObliqueSplitter(X, y, + self.proj_dims, + self.density, + self.random_state) + + self.tree = ObliqueTree(splitter, + self.min_samples_split, + self.min_samples_leaf, + self.max_depth, + self.min_impurity_split, + self.min_impurity_decrease) + self.tree.build() + return self + + def apply(self, X): + pred_nodes = self.tree.predict(X).astype(int) + return pred_nodes + + def predict(self, X): + preds = np.zeros(X.shape[0]) + pred_nodes = self.tree.predict(X).astype(int) + for k in range(len(pred_nodes)): + id = pred_nodes[k] + preds[k] = self.tree.nodes[id].label + + return preds + + + def predict_proba(self, X): + preds = np.zeros(X.shape[0]) + pred_nodes = self.tree.predict(X).astype(int) + for k in range(len(preds)): + id = pred_nodes[k] + preds[k] = self.tree.nodes[id].proba + + return preds + + def predict_log_proba(self, X): + proba = self.predict_proba(X) + + for k in range(len(preds)): + proba[k] = np.log(proba[k]) + + return proba + + diff --git a/proglearn/tests/oblique_tree_test.py b/proglearn/tests/oblique_tree_test.py new file mode 100644 index 
0000000000..6e9155ef38 --- /dev/null +++ b/proglearn/tests/oblique_tree_test.py @@ -0,0 +1,125 @@ +import pytest +import numpy as np +from numpy import random as rng +from numpy.testing import assert_almost_equal, assert_warns, assert_raises + +import sys +sys.path.append("../") +from oblique_tree import * + + +class TestObliqueSplitter: + + def test_sample_projmat(self): + + random_state = 0 + rng.seed(random_state) + + X = rng.rand(100, 100) + y = np.zeros(100) + + density = 0.5 + proj_dims = [10, 20, 40, 60, 80] + sample_inds = [np.linspace(0, 9, 10, dtype=int), + np.linspace(0, 19, 20, dtype=int), + np.linspace(0, 39, 40, dtype=int), + np.linspace(0, 59, 60, dtype=int), + np.linspace(0, 79, 80, dtype=int)] + + n_sample_inds = [10, 20, 40, 60, 80] + + for pd in proj_dims: + splitter = ObliqueSplitter(X, y, pd, density, random_state) + + for i in range(len(n_sample_inds)): + si = sample_inds[i] + n = n_sample_inds[i] + + proj_X, projmat = splitter.sample_proj_mat(si) + assert n == proj_X.shape[0] + assert pd == proj_X.shape[1] + + def test_score(self): + + random_state = 0 + rng.seed(random_state) + + X = rng.rand(11, 11) + + density = 0.5 + proj_dims = 5 + + y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) + + score = splitter.score(y, 6) + assert 0 == score + + score = splitter.score(y, 1) + assert_almost_equal(5/11, score) + + def test_impurity(self): + + random_state = 0 + rng.seed(random_state) + + X = rng.rand(100, 100) + + density = 0.5 + proj_dims = 50 + + y = np.zeros(100) + for i in range(10): + for j in range(10): + y[10*i + j] = i + + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) + + # Impurity of empty thing should be throw exception + + # Impurity of one thing should be 0 + impurity = splitter.impurity([0]) + assert 0 == impurity + + # Impurity of one class should be 0 + impurity = splitter.impurity(np.linspace(0, 9, 10, dtype=int)) + assert 0 == impurity + + # 
Impurity of two different classes with equal number should be 0.5 + impurity = splitter.impurity(np.linspace(0, 19, 20, dtype=int)) + assert 0.5 == impurity + + # Impurity of all classes should be 10 * (1/10)(9/10) = 9/10 + impurity = splitter.impurity(np.linspace(0, 99, 100, dtype=int)) + assert_almost_equal(0.9, impurity) + + def test_split(self): + + pass + + + +class TestObliqueTree: + + def test_add_node(self): + + + + pass + + + def test_build(self): + + pass + + def test_predict(self): + Xtrain = np.random.rand(6, 5) + ytrain = np.array([0, 0, 1, 1, 0, 1]) + tree = ObliqueTreeClassifier() + tree.fit(Xtrain, ytrain) + Xtest = np.random.rand(3, 5) + preds = tree.predict(Xtest) + + # Tried testing on Xtrain but didn't get 100% accuracy + + assert len(preds) == 3 diff --git a/proglearn/transformers.py b/proglearn/transformers.py index 3ce1f63e17..3c13846794 100755 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -15,6 +15,7 @@ import keras as keras from .base import BaseTransformer +from .oblique_tree import ObliqueTreeClassifier class NeuralClassificationTransformer(BaseTransformer): @@ -194,3 +195,64 @@ def transform(self, X): check_is_fitted(self) X = check_array(X) return self.transformer_.apply(X) + +class ObliqueTreeClassificationTransformer(BaseTransformer): + """ + A class used to transform data from a category to a specialized representation. + + Parameters + ---------- + kwargs : dict, default={} + A dictionary to contain parameters of the tree. + + Attributes + ---------- + transformer : sklearn.tree.DecisionTreeClassifier + an internal sklearn DecisionTreeClassifier + """ + + def __init__(self, kwargs={}): + self.kwargs = kwargs + + def fit(self, X, y): + """ + Fits the transformer to data X with labels y. + + Parameters + ---------- + X : ndarray + Input data matrix. + y : ndarray + Output (i.e. response data matrix). + + Returns + ------- + self : TreeClassificationTransformer + The object itself. 
+ """ + X, y = check_X_y(X, y) + self.transformer_ = ObliqueTreeClassifier(**self.kwargs).fit(X, y) + return self + + def transform(self, X): + """ + Performs inference using the transformer. + + Parameters + ---------- + X : ndarray + Input data matrix. + + Returns + ------- + X_transformed : ndarray + The transformed input. + + Raises + ------ + NotFittedError + When the model is not fitted. + """ + check_is_fitted(self) + X = check_array(X) + return self.transformer_.apply(X) From 273f96093c1f214e644d423bb172cd3b8f1475b1 Mon Sep 17 00:00:00 2001 From: parthgvora Date: Tue, 17 Nov 2020 20:48:31 -0500 Subject: [PATCH 02/17] all tests pass --- proglearn/tests/oblique_tree_test.py | 45 ++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/proglearn/tests/oblique_tree_test.py b/proglearn/tests/oblique_tree_test.py index 6e9155ef38..0ff93e0281 100644 --- a/proglearn/tests/oblique_tree_test.py +++ b/proglearn/tests/oblique_tree_test.py @@ -2,6 +2,7 @@ import numpy as np from numpy import random as rng from numpy.testing import assert_almost_equal, assert_warns, assert_raises +from sklearn.datasets import load_iris import sys sys.path.append("../") @@ -75,8 +76,6 @@ def test_impurity(self): splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) - # Impurity of empty thing should be throw exception - # Impurity of one thing should be 0 impurity = splitter.impurity([0]) assert 0 == impurity @@ -95,22 +94,56 @@ def test_impurity(self): def test_split(self): - pass + random_state = 0 + rng.seed(random_state) + + X = rng.rand(100, 100) + density = 0.5 + proj_dims = 50 + y = np.zeros(100) + for i in range(10): + for j in range(10): + y[10*i + j] = i + + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) + + split_info = splitter.split(np.array([i for i in range(100)])) class TestObliqueTree: def test_add_node(self): + + # Add a root node + tree = ObliqueTree(None, 0, 0, 0, 0, 0) + + tree.add_node(0, False, + 0, 
0, False, + 0, 0, None, + 0, 0) + # Add a regular node + tree.add_node(0, False, + 0, 0, False, + 0, 0, None, + 0, 0) + # Add a leaf node + tree.add_node(1, False, + 0, 0, True, + 0, 0, None, + 0, 0) - pass + assert 3 == len(tree.nodes) + assert 3 == tree.node_count - def test_build(self): + def test_fit(self): - pass + data = load_iris() + clf = ObliqueTreeClassifier() + clf.fit(data.data, data.target) def test_predict(self): Xtrain = np.random.rand(6, 5) From fb8cb0f096f7bd2dc863a1c141ee3541ee1c336a Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 20 Nov 2020 00:56:26 -0700 Subject: [PATCH 03/17] finished documentation --- proglearn/oblique_tree.py | 264 +++++++++++++++++++++++++++++++++++++- 1 file changed, 259 insertions(+), 5 deletions(-) diff --git a/proglearn/oblique_tree.py b/proglearn/oblique_tree.py index e5eedcfec3..417b945ddf 100644 --- a/proglearn/oblique_tree.py +++ b/proglearn/oblique_tree.py @@ -121,6 +121,13 @@ def sample_proj_mat(self, sample_inds): --- sample_inds : array of shape [n_samples] The data we are transforming. + + Returns + --- + proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features) + The generated sparse random matrix. + proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features) + Projected matrix. """ proj_mat = SparseRandomProjection(density=self.density, @@ -138,8 +145,15 @@ def leaf_label_proba(self, idx): Parameters --- idx : array of shape [n_samples] - The indeces of the samples that are at the leaf node for which the label + The indices of the samples that are at the leaf node for which the label and probability need to be found. + + Returns + --- + label : int + The label for any sample that is predicted to be at this node. + proba : float + The probability of the predicted sample to have this node's label. """ samples = self.y[idx] @@ -165,6 +179,11 @@ def score(self, y_sort, t): is being calculated. t : float The threshold determining where to split y_sort. 
+ + Returns + --- + gini : float + The Gini impurity of the split. """ left = y_sort[:t] @@ -195,6 +214,11 @@ def impurity(self, idx): --- idx : array of shape [n_samples] The indices of the nodes in the set for which the impurity is being calculated. + + Returns + --- + impurity : float + Actual impurity of split. """ samples = self.y[idx] @@ -221,6 +245,11 @@ def split(self, sample_inds): --- sample_inds : array of shape [n_samples] The indices of the nodes in the set for which the best split is found. + + Returns + --- + split_info : SplitInfo + Class holding information about the split. """ # Project the data @@ -285,8 +314,18 @@ def split(self, sample_inds): #-------------------------------------------------------------------------- class Node: - def __init__(self): - + """ + A class used to represent an oblique node. + + Parameters: + --- + None + + Methods + --- + None + """ + def __init__(self): self.node_id = None self.is_leaf = None self.parent = None @@ -303,6 +342,29 @@ def __init__(self): self.proba = None class StackRecord: + """ + A class used to keep track of a node's parent and other information about the node and its split. + + Parameters: + --- + parent : int + The index of the parent node. + depth : int + The depth at which this node is. + is_left : bool + Represents if the node is a left child or not. + impurity : float + This is Gini impurity of this node. + sample_idx : array of shape [n_samples] + This is the indices of the nodes that are in this node. + n_samples : int + The number of samples in this node. + + Methods + --- + None + """ + def __init__(self, parent, depth, is_left, impurity, sample_idx, n_samples): @@ -314,7 +376,34 @@ def __init__(self, parent, depth, is_left, self.n_samples = n_samples class ObliqueTree: - + """ + A class used to represent a tree with oblique splits. + + Parameters: + --- + splitter : class + The type of splitter for this tree, should be an ObliqueSplitter. 
+ min_samples_split : int + Minimum number of samples possible at a node. + min_samples_leaf : int + Minimum number of samples possible at a leaf. + max_depth : int + Maximum depth allowed for the tree. + min_impurity_split : float + Minimum Gini impurity value that must be achieved for a split to occur on the node. + min_impurity_decrease : float + Minimum amount Gini impurity value must decrease by for a split to be valid. + + Methods + --- + add_node(parent, is_left, impurity, n_samples, is_leaf, feature, threshold, proj_mat, label, proba) + Adds a node to the existing tree + build() + This is what is initially called on to completely build the oblique tree. + predict(X) + Finds the final node for each input sample as it passes through the decision tree. + """ + def __init__(self, splitter, min_samples_split, min_samples_leaf, max_depth, min_impurity_split, min_impurity_decrease): @@ -340,7 +429,41 @@ def add_node(self, parent, is_left, impurity, n_samples, is_leaf, feature, threshold, proj_mat, label, proba): - + """ + Adds a node to the existing oblique tree. + + Parameters + --- + parent : int + The index of the parent node for the new node being added. + is_left : bool + Determines if this new node being added is a left or right child. + impurity : float + Impurity of this new node. + n_samples : int + Number of samples at this new node. + is_leaf : bool + Determines if this new node is a leaf of the tree or an internal node. + feature : int + Index of feature on which the split occurs at this node. + threshold : float + The threshold feature value for this node determining if a sample will go + to this node's left of right child. If a sample has a value less than the + threshold (for the feature of this node) it will go to the left childe, + otherwise it will go the right child. + proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features) + Projection matrix for this new node. 
+ label : int + The label a sample will be given if it is predicted to be at this node. + proba : float + The probability a predicted sample has of being the node's label. + + Returns + --- + node_id : int + Index of the new node just added. + """ + node = Node() node.node_id = self.node_count node.impurity = impurity @@ -371,6 +494,17 @@ def add_node(self, parent, is_left, return node.node_id def build(self): + """ + Builds the oblique tree. + + Parameters + --- + None + + Returns + --- + None + """ # Initialize, add root node stack = [] @@ -454,6 +588,20 @@ def build(self): self.depth = cur.depth def predict(self, X): + """ + Predicts final nodes of samples given. + + Parameters + --- + X : array of shape [n_samples, n_features] + The input array for which predictions are made. + + Returns + --- + predictions : array of shape [n_samples] + Array of the final node index for each input prediction sample. + """ + predictions = np.zeros(X.shape[0]) for i in range(X.shape[0]): cur = self.nodes[0] @@ -474,6 +622,41 @@ def predict(self, X): """ Class for Oblique Tree """ class ObliqueTreeClassifier(BaseEstimator): + """ + A class used to represent a classifier that uses an oblique decision tree. + + Parameters: + --- + max_depth : int + Maximum depth allowed for oblique tree. + min_samples_split : int + Minimum number of samples possible at a node. + min_samples_leaf : int + Minimum number of samples possible at a leaf. + random_state : int + Maximum depth allowed for the tree. + min_impurity_decrease : float + Minimum amount Gini impurity value must decrease by for a split to be valid. + min_impurity_split : float + Minimum Gini impurity value that must be achieved for a split to occur on the node. + feature_combinations : float + The feature combinations to use for the oblique split. + density : float + Density estimate. + + Methods + --- + fit(X,y) + Fits the oblique tree to the training samples. 
+ apply(X) + Calls on the predict function from the oblique tree for the test samples. + predict(X) + Gets the prediction labels for the test samples. + predict_proba(X) + Gets the probability of the prediction labels for the test samples. + predict_log_proba(X) + Gets the log of the probability of the prediction labels for the test samples. + """ def __init__(self, *, @@ -514,6 +697,21 @@ def __init__(self, *, self.density=density def fit(self, X, y): + """ + Predicts final nodes of samples given. + + Parameters + --- + X : array of shape [n_samples, n_features] + The training samples. + y : array of shape [n_samples] + Labels for the training samples. + + Returns + --- + ObliqueTreeClassifier + The fit classifier. + """ self.proj_dims = int(np.ceil(X.shape[1]) / self.feature_combinations) splitter = ObliqueSplitter(X, y, @@ -531,10 +729,38 @@ def fit(self, X, y): return self def apply(self, X): + """ + Gets predictions form the oblique tree for the test samples. + + Parameters + --- + X : array of shape [n_samples, n_features] + The testing samples. + + Returns + --- + pred_nodes : array of shape[n_samples] + The indices for each test sample's final node in the oblique tree. + """ + pred_nodes = self.tree.predict(X).astype(int) return pred_nodes def predict(self, X): + """ + Determines final label predictions for each sample in the test data. + + Parameters + --- + X : array of shape [n_samples, n_features] + The testing samples. + + Returns + --- + preds : array of shape[n_samples] + The predictions (labels) for each testing sample. + """ + preds = np.zeros(X.shape[0]) pred_nodes = self.tree.predict(X).astype(int) for k in range(len(pred_nodes)): @@ -545,6 +771,20 @@ def predict(self, X): def predict_proba(self, X): + """ + Determines probabilities of the final label predictions for each sample in the test data. + + Parameters + --- + X : array of shape [n_samples, n_features] + The testing samples. 
+ + Returns + --- + preds : array of shape[n_samples] + The probabilities of the predictions (labels) for each testing sample. + """ + preds = np.zeros(X.shape[0]) pred_nodes = self.tree.predict(X).astype(int) for k in range(len(preds)): @@ -554,6 +794,20 @@ def predict_proba(self, X): return preds def predict_log_proba(self, X): + """ + Determines log of the probabilities of the final label predictions for each sample in the test data. + + Parameters + --- + X : array of shape [n_samples, n_features] + The testing samples. + + Returns + --- + preds : array of shape[n_samples] + The log of the probabilities of the predictions (labels) for each testing sample. + """ + proba = self.predict_proba(X) for k in range(len(preds)): From c4cef8bfbb48fc6657acd60f32a48c3d8499ac50 Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 20 Nov 2020 01:16:54 -0700 Subject: [PATCH 04/17] fixed some bugs and finished predict tests --- proglearn/oblique_tree.py | 7 +++---- proglearn/tests/oblique_tree_test.py | 13 +++++++++++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/proglearn/oblique_tree.py b/proglearn/oblique_tree.py index 417b945ddf..97c48ca9b2 100644 --- a/proglearn/oblique_tree.py +++ b/proglearn/oblique_tree.py @@ -762,7 +762,7 @@ def predict(self, X): """ preds = np.zeros(X.shape[0]) - pred_nodes = self.tree.predict(X).astype(int) + pred_nodes = self.apply(X) for k in range(len(pred_nodes)): id = pred_nodes[k] preds[k] = self.tree.nodes[id].label @@ -786,7 +786,7 @@ def predict_proba(self, X): """ preds = np.zeros(X.shape[0]) - pred_nodes = self.tree.predict(X).astype(int) + pred_nodes = self.apply(X) for k in range(len(preds)): id = pred_nodes[k] preds[k] = self.tree.nodes[id].proba @@ -809,8 +809,7 @@ def predict_log_proba(self, X): """ proba = self.predict_proba(X) - - for k in range(len(preds)): + for k in range(len(proba)): proba[k] = np.log(proba[k]) return proba diff --git a/proglearn/tests/oblique_tree_test.py b/proglearn/tests/oblique_tree_test.py 
index 0ff93e0281..96323b565a 100644 --- a/proglearn/tests/oblique_tree_test.py +++ b/proglearn/tests/oblique_tree_test.py @@ -153,6 +153,15 @@ def test_predict(self): Xtest = np.random.rand(3, 5) preds = tree.predict(Xtest) - # Tried testing on Xtrain but didn't get 100% accuracy + assert len(preds) == len(Xtest) + + preds_proba = tree.predict_proba(Xtest) + preds_log_proba = tree.predict_log_proba(Xtest) + + assert len(preds_proba) == len(Xtest) + assert len(preds_log_proba) == len(Xtest) + + bool_inc = np.all(np.log(preds_proba) == preds_log_proba) + + assert bool_inc - assert len(preds) == 3 From 2481e918e2e3abbed7315a8c01468b46f0c30912 Mon Sep 17 00:00:00 2001 From: parthgvora <58313394+parthgvora@users.noreply.github.com> Date: Fri, 20 Nov 2020 11:45:45 -0500 Subject: [PATCH 05/17] Fixed header, unnecessary imports --- proglearn/oblique_tree.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/proglearn/oblique_tree.py b/proglearn/oblique_tree.py index 97c48ca9b2..57458c79ca 100644 --- a/proglearn/oblique_tree.py +++ b/proglearn/oblique_tree.py @@ -1,5 +1,5 @@ """ -Parth Vora +Authors: Parth Vora and Jay Mandavilli Oblique Decision Tree (SPORF) """ @@ -7,11 +7,6 @@ from sklearn.base import BaseEstimator from sklearn.random_projection import SparseRandomProjection -# debugging -import sys -# Parallelize things later -from joblib import Parallel, delayed - #-------------------------------------------------------------------------- class SplitInfo: """ From cf1b017b378e7d5ed175a0e1cfaddc4ad7646381 Mon Sep 17 00:00:00 2001 From: parthgvora <58313394+parthgvora@users.noreply.github.com> Date: Fri, 20 Nov 2020 13:52:02 -0500 Subject: [PATCH 06/17] Including oblique_tree in proglearn --- proglearn/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/proglearn/__init__.py b/proglearn/__init__.py index 37657e3dda..323e4439d7 100755 --- a/proglearn/__init__.py +++ b/proglearn/__init__.py @@ -1,4 +1,5 @@ from .forest import * from .network 
import * +from .oblique_tree import * __version__ = "0.0.3" From 83c9f13241c06528d919f112e700c62b6f29bbb4 Mon Sep 17 00:00:00 2001 From: parthgvora Date: Fri, 20 Nov 2020 16:06:30 -0500 Subject: [PATCH 07/17] fixing importing issues --- proglearn/tests/oblique_tree_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/proglearn/tests/oblique_tree_test.py b/proglearn/tests/oblique_tree_test.py index 96323b565a..294ce9784e 100644 --- a/proglearn/tests/oblique_tree_test.py +++ b/proglearn/tests/oblique_tree_test.py @@ -4,9 +4,7 @@ from numpy.testing import assert_almost_equal, assert_warns, assert_raises from sklearn.datasets import load_iris -import sys -sys.path.append("../") -from oblique_tree import * +from proglearn.oblique_tree import * class TestObliqueSplitter: From a7c31f0bcc718e75bbf2a79025f868276f05ebc3 Mon Sep 17 00:00:00 2001 From: parthgvora Date: Sat, 21 Nov 2020 15:02:51 -0500 Subject: [PATCH 08/17] black formatting --- proglearn/oblique_tree.py | 361 ++++++++++++++++----------- proglearn/tests/oblique_tree_test.py | 58 ++--- proglearn/transformers.py | 1 + 3 files changed, 234 insertions(+), 186 deletions(-) diff --git a/proglearn/oblique_tree.py b/proglearn/oblique_tree.py index 57458c79ca..60bfe84765 100644 --- a/proglearn/oblique_tree.py +++ b/proglearn/oblique_tree.py @@ -7,7 +7,7 @@ from sklearn.base import BaseEstimator from sklearn.random_projection import SparseRandomProjection -#-------------------------------------------------------------------------- +# -------------------------------------------------------------------------- class SplitInfo: """ A class used to store information about a certain split. @@ -42,10 +42,20 @@ class SplitInfo: A metric to determine if the split improves the decision tree. 
""" - def __init__(self, feature, threshold, proj_mat, - left_impurity, left_idx, left_n_samples, - right_impurity, right_idx, right_n_samples, - no_split, improvement): + def __init__( + self, + feature, + threshold, + proj_mat, + left_impurity, + left_idx, + left_n_samples, + right_impurity, + right_idx, + right_n_samples, + no_split, + improvement, + ): self.feature = feature self.threshold = threshold @@ -59,6 +69,7 @@ def __init__(self, feature, threshold, proj_mat, self.no_split = no_split self.improvement = improvement + class ObliqueSplitter: """ A class used to represent an oblique splitter, where splits are done on @@ -125,30 +136,32 @@ def sample_proj_mat(self, sample_inds): Projected matrix. """ - proj_mat = SparseRandomProjection(density=self.density, - n_components=self.proj_dims, - random_state=self.random_state) - + proj_mat = SparseRandomProjection( + density=self.density, + n_components=self.proj_dims, + random_state=self.random_state, + ) + proj_X = proj_mat.fit_transform(self.X[sample_inds, :]) return proj_X, proj_mat def leaf_label_proba(self, idx): """ - Finds the most common label and probability of this label from the samples at - the leaf node for which this is used on. + Finds the most common label and probability of this label from the samples at + the leaf node for which this is used on. - Parameters - --- - idx : array of shape [n_samples] - The indices of the samples that are at the leaf node for which the label - and probability need to be found. + Parameters + --- + idx : array of shape [n_samples] + The indices of the samples that are at the leaf node for which the label + and probability need to be found. - Returns - --- - label : int - The label for any sample that is predicted to be at this node. - proba : float - The probability of the predicted sample to have this node's label. + Returns + --- + label : int + The label for any sample that is predicted to be at this node. 
+ proba : float + The probability of the predicted sample to have this node's label. """ samples = self.y[idx] @@ -189,14 +202,16 @@ def score(self, y_sort, t): left_unique, left_counts = np.unique(left, return_counts=True) right_unique, right_counts = np.unique(right, return_counts=True) - + left_counts = left_counts / n_left right_counts = right_counts / n_right left_gini = 1 - np.sum(np.power(left_counts, 2)) right_gini = 1 - np.sum(np.power(right_counts, 2)) - gini = (n_left / self.n_samples) * left_gini + (n_right / self.n_samples) * right_gini + gini = (n_left / self.n_samples) * left_gini + ( + n_right / self.n_samples + ) * right_gini return gini # Returns impurity for a group of examples @@ -227,7 +242,7 @@ def impurity(self, idx): gini = np.sum(np.power(count, 2)) return 1 - gini - + # Finds the best split # This needs to be parallelized; its a major bottleneck def split(self, sample_inds): @@ -267,7 +282,9 @@ def split(self, sample_inds): idx = np.argsort(proj_X[:, j]) y_sort = y_sample[idx] - Q[1:-1, j] = np.array([self.score(y_sort, i) for i in range(1, n_samples - 1)]) + Q[1:-1, j] = np.array( + [self.score(y_sort, i) for i in range(1, n_samples - 1)] + ) # Identify best split feature, minimum gini impurity best_split_ind = np.argmin(Q) @@ -277,7 +294,7 @@ def split(self, sample_inds): # Sort samples by the split feature feat_vec = proj_X[:, feature] idx = np.argsort(feat_vec) - + feat_vec = feat_vec[idx] sample_inds = sample_inds[idx] @@ -290,8 +307,7 @@ def split(self, sample_inds): right_n_samples = len(right_idx) # See if we have no split - no_split = (left_n_samples == 0 or - right_n_samples == 0) + no_split = left_n_samples == 0 or right_n_samples == 0 # Evaluate improvement improvement = node_impurity - best_gini @@ -300,13 +316,25 @@ def split(self, sample_inds): left_impurity = self.impurity(left_idx) right_impurity = self.impurity(right_idx) - split_info = SplitInfo(feature, threshold, proj_mat, - left_impurity, left_idx, left_n_samples, - 
right_impurity, right_idx, right_n_samples, - no_split, improvement) + split_info = SplitInfo( + feature, + threshold, + proj_mat, + left_impurity, + left_idx, + left_n_samples, + right_impurity, + right_idx, + right_n_samples, + no_split, + improvement, + ) return split_info -#-------------------------------------------------------------------------- + + +# -------------------------------------------------------------------------- + class Node: """ @@ -320,22 +348,24 @@ class Node: --- None """ + def __init__(self): self.node_id = None self.is_leaf = None self.parent = None self.left_child = None self.right_child = None - + self.feature = None self.threshold = None self.impurity = None self.n_samples = None - + self.proj_mat = None self.label = None self.proba = None + class StackRecord: """ A class used to keep track of a node's parent and other information about the node and its split. @@ -360,8 +390,7 @@ class StackRecord: None """ - def __init__(self, parent, depth, is_left, - impurity, sample_idx, n_samples): + def __init__(self, parent, depth, is_left, impurity, sample_idx, n_samples): self.parent = parent self.depth = depth @@ -370,6 +399,7 @@ def __init__(self, parent, depth, is_left, self.sample_idx = sample_idx self.n_samples = n_samples + class ObliqueTree: """ A class used to represent a tree with oblique splits. @@ -399,13 +429,20 @@ class ObliqueTree: Finds the final node for each input sample as it passes through the decision tree. 
""" - def __init__(self, splitter, min_samples_split, min_samples_leaf, - max_depth, min_impurity_split, min_impurity_decrease): - + def __init__( + self, + splitter, + min_samples_split, + min_samples_leaf, + max_depth, + min_impurity_split, + min_impurity_decrease, + ): + # Tree parameters - #self.n_samples = n_samples - #self.n_features = n_features - #self.n_classes = n_classes + # self.n_samples = n_samples + # self.n_features = n_features + # self.n_classes = n_classes self.depth = 0 self.node_count = 0 self.nodes = [] @@ -418,12 +455,19 @@ def __init__(self, splitter, min_samples_split, min_samples_leaf, self.min_impurity_split = min_impurity_split self.min_impurity_decrease = min_impurity_decrease - - - def add_node(self, parent, is_left, - impurity, n_samples, is_leaf, - feature, threshold, proj_mat, - label, proba): + def add_node( + self, + parent, + is_left, + impurity, + n_samples, + is_leaf, + feature, + threshold, + proj_mat, + label, + proba, + ): """ Adds a node to the existing oblique tree. 
@@ -503,80 +547,93 @@ def build(self): # Initialize, add root node stack = [] - root = StackRecord(0, 1, False, - self.splitter.impurity(self.splitter.indices), - self.splitter.indices, - self.splitter.n_samples) + root = StackRecord( + 0, + 1, + False, + self.splitter.impurity(self.splitter.indices), + self.splitter.indices, + self.splitter.n_samples, + ) stack.append(root) - # Build tree while len(stack) > 0: - + # Pop a record off the stack cur = stack.pop() - # Evaluate if it is a leaf - is_leaf = (cur.depth >= self.max_depth or - cur.n_samples < self.min_samples_split or - cur.n_samples < 2 * self.min_samples_leaf or - cur.impurity <= self.min_impurity_split) + is_leaf = ( + cur.depth >= self.max_depth + or cur.n_samples < self.min_samples_split + or cur.n_samples < 2 * self.min_samples_leaf + or cur.impurity <= self.min_impurity_split + ) # Split if not if not is_leaf: split = self.splitter.split(cur.sample_idx) - is_leaf = (is_leaf or - split.no_split or - split.improvement <= self.min_impurity_decrease) + is_leaf = ( + is_leaf + or split.no_split + or split.improvement <= self.min_impurity_decrease + ) # Add the node to the tree if is_leaf: - + label, proba = self.splitter.leaf_label_proba(cur.sample_idx) - - node_id = self.add_node(cur.parent, - cur.is_left, - cur.impurity, - cur.n_samples, - is_leaf, - None, - None, - None, - label, - proba) - else: - node_id = self.add_node(cur.parent, - cur.is_left, - cur.impurity, - cur.n_samples, - is_leaf, - split.feature, - split.threshold, - split.proj_mat, - None, - None) + node_id = self.add_node( + cur.parent, + cur.is_left, + cur.impurity, + cur.n_samples, + is_leaf, + None, + None, + None, + label, + proba, + ) + else: + node_id = self.add_node( + cur.parent, + cur.is_left, + cur.impurity, + cur.n_samples, + is_leaf, + split.feature, + split.threshold, + split.proj_mat, + None, + None, + ) # Push the right and left children to the stack if applicable if not is_leaf: - right_child = StackRecord(node_id, - 
cur.depth + 1, - False, - split.right_impurity, - split.right_idx, - split.right_n_samples) + right_child = StackRecord( + node_id, + cur.depth + 1, + False, + split.right_impurity, + split.right_idx, + split.right_n_samples, + ) stack.append(right_child) - left_child = StackRecord(node_id, - cur.depth + 1, - True, - split.left_impurity, - split.left_idx, - split.left_n_samples) + left_child = StackRecord( + node_id, + cur.depth + 1, + True, + split.left_impurity, + split.left_idx, + split.left_n_samples, + ) stack.append(left_child) if cur.depth > self.depth: @@ -613,9 +670,12 @@ def predict(self, X): return predictions -#-------------------------------------------------------------------------- + +# -------------------------------------------------------------------------- """ Class for Oblique Tree """ + + class ObliqueTreeClassifier(BaseEstimator): """ A class used to represent a classifier that uses an oblique decision tree. @@ -653,43 +713,42 @@ class ObliqueTreeClassifier(BaseEstimator): Gets the log of the probability of the prediction labels for the test samples. 
""" - def __init__(self, *, - - #criterion="gini", - #splitter=None, - max_depth=np.inf, - min_samples_split=2, - min_samples_leaf=1, - #min_weight_fraction_leaf=0, - #max_features="auto", - #max_leaf_nodes=None, - random_state=None, - min_impurity_decrease=0, - min_impurity_split=0, - #class_weight=None, - #ccp_alpha=0.0, - - #New args - feature_combinations=1.2, - density=0.7 - - ): - - #self.criterion=criterion - self.max_depth=max_depth - self.min_samples_split=min_samples_split - self.min_samples_leaf=min_samples_leaf - #self.min_weight_fraction_leaf=min_weight_fraction_leaf - #self.max_features=max_features - #self.max_leaf_nodes=max_leaf_nodes - self.random_state=random_state - self.min_impurity_decrease=min_impurity_decrease - self.min_impurity_split=min_impurity_split - #self.class_weight=class_weight - #self.ccp_alpha=ccp_alpha - - self.feature_combinations=feature_combinations - self.density=density + def __init__( + self, + *, + # criterion="gini", + # splitter=None, + max_depth=np.inf, + min_samples_split=2, + min_samples_leaf=1, + # min_weight_fraction_leaf=0, + # max_features="auto", + # max_leaf_nodes=None, + random_state=None, + min_impurity_decrease=0, + min_impurity_split=0, + # class_weight=None, + # ccp_alpha=0.0, + # New args + feature_combinations=1.2, + density=0.7 + ): + + # self.criterion=criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + # self.min_weight_fraction_leaf=min_weight_fraction_leaf + # self.max_features=max_features + # self.max_leaf_nodes=max_leaf_nodes + self.random_state = random_state + self.min_impurity_decrease = min_impurity_decrease + self.min_impurity_split = min_impurity_split + # self.class_weight=class_weight + # self.ccp_alpha=ccp_alpha + + self.feature_combinations = feature_combinations + self.density = density def fit(self, X, y): """ @@ -709,17 +768,18 @@ def fit(self, X, y): """ self.proj_dims = int(np.ceil(X.shape[1]) / 
self.feature_combinations) - splitter = ObliqueSplitter(X, y, - self.proj_dims, - self.density, - self.random_state) - - self.tree = ObliqueTree(splitter, - self.min_samples_split, - self.min_samples_leaf, - self.max_depth, - self.min_impurity_split, - self.min_impurity_decrease) + splitter = ObliqueSplitter( + X, y, self.proj_dims, self.density, self.random_state + ) + + self.tree = ObliqueTree( + splitter, + self.min_samples_split, + self.min_samples_leaf, + self.max_depth, + self.min_impurity_split, + self.min_impurity_decrease, + ) self.tree.build() return self @@ -740,7 +800,7 @@ def apply(self, X): pred_nodes = self.tree.predict(X).astype(int) return pred_nodes - + def predict(self, X): """ Determines final label predictions for each sample in the test data. @@ -764,7 +824,6 @@ def predict(self, X): return preds - def predict_proba(self, X): """ Determines probabilities of the final label predictions for each sample in the test data. @@ -808,5 +867,3 @@ def predict_log_proba(self, X): proba[k] = np.log(proba[k]) return proba - - diff --git a/proglearn/tests/oblique_tree_test.py b/proglearn/tests/oblique_tree_test.py index 294ce9784e..f0659cbaff 100644 --- a/proglearn/tests/oblique_tree_test.py +++ b/proglearn/tests/oblique_tree_test.py @@ -8,9 +8,8 @@ class TestObliqueSplitter: - def test_sample_projmat(self): - + random_state = 0 rng.seed(random_state) @@ -19,11 +18,13 @@ def test_sample_projmat(self): density = 0.5 proj_dims = [10, 20, 40, 60, 80] - sample_inds = [np.linspace(0, 9, 10, dtype=int), - np.linspace(0, 19, 20, dtype=int), - np.linspace(0, 39, 40, dtype=int), - np.linspace(0, 59, 60, dtype=int), - np.linspace(0, 79, 80, dtype=int)] + sample_inds = [ + np.linspace(0, 9, 10, dtype=int), + np.linspace(0, 19, 20, dtype=int), + np.linspace(0, 39, 40, dtype=int), + np.linspace(0, 59, 60, dtype=int), + np.linspace(0, 79, 80, dtype=int), + ] n_sample_inds = [10, 20, 40, 60, 80] @@ -55,10 +56,10 @@ def test_score(self): assert 0 == score score = 
splitter.score(y, 1) - assert_almost_equal(5/11, score) + assert_almost_equal(5 / 11, score) def test_impurity(self): - + random_state = 0 rng.seed(random_state) @@ -70,14 +71,14 @@ def test_impurity(self): y = np.zeros(100) for i in range(10): for j in range(10): - y[10*i + j] = i - + y[10 * i + j] = i + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) - + # Impurity of one thing should be 0 impurity = splitter.impurity([0]) assert 0 == impurity - + # Impurity of one class should be 0 impurity = splitter.impurity(np.linspace(0, 9, 10, dtype=int)) assert 0 == impurity @@ -89,7 +90,7 @@ def test_impurity(self): # Impurity of all classes should be 10 * (1/10)(9/10) = 9/10 impurity = splitter.impurity(np.linspace(0, 99, 100, dtype=int)) assert_almost_equal(0.9, impurity) - + def test_split(self): random_state = 0 @@ -103,43 +104,33 @@ def test_split(self): y = np.zeros(100) for i in range(10): for j in range(10): - y[10*i + j] = i - + y[10 * i + j] = i + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) - + split_info = splitter.split(np.array([i for i in range(100)])) -class TestObliqueTree: +class TestObliqueTree: def test_add_node(self): - + # Add a root node tree = ObliqueTree(None, 0, 0, 0, 0, 0) - tree.add_node(0, False, - 0, 0, False, - 0, 0, None, - 0, 0) + tree.add_node(0, False, 0, 0, False, 0, 0, None, 0, 0) # Add a regular node - tree.add_node(0, False, - 0, 0, False, - 0, 0, None, - 0, 0) + tree.add_node(0, False, 0, 0, False, 0, 0, None, 0, 0) # Add a leaf node - tree.add_node(1, False, - 0, 0, True, - 0, 0, None, - 0, 0) + tree.add_node(1, False, 0, 0, True, 0, 0, None, 0, 0) assert 3 == len(tree.nodes) assert 3 == tree.node_count - def test_fit(self): - data = load_iris() + data = load_iris() clf = ObliqueTreeClassifier() clf.fit(data.data, data.target) @@ -162,4 +153,3 @@ def test_predict(self): bool_inc = np.all(np.log(preds_proba) == preds_log_proba) assert bool_inc - diff --git a/proglearn/transformers.py 
b/proglearn/transformers.py index 3c13846794..756d0bde28 100755 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -196,6 +196,7 @@ def transform(self, X): X = check_array(X) return self.transformer_.apply(X) + class ObliqueTreeClassificationTransformer(BaseTransformer): """ A class used to transform data from a category to a specialized representation. From 0e50784a506319a965b47caefc1dc8d68d08f0c5 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Sat, 21 Nov 2020 15:20:07 -0800 Subject: [PATCH 09/17] Fix docstring indents --- proglearn/oblique_tree.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/proglearn/oblique_tree.py b/proglearn/oblique_tree.py index 60bfe84765..ea6bcbe632 100644 --- a/proglearn/oblique_tree.py +++ b/proglearn/oblique_tree.py @@ -147,21 +147,21 @@ def sample_proj_mat(self, sample_inds): def leaf_label_proba(self, idx): """ - Finds the most common label and probability of this label from the samples at - the leaf node for which this is used on. + Finds the most common label and probability of this label from the samples at + the leaf node for which this is used on. - Parameters - --- - idx : array of shape [n_samples] - The indices of the samples that are at the leaf node for which the label - and probability need to be found. + Parameters + --- + idx : array of shape [n_samples] + The indices of the samples that are at the leaf node for which the label + and probability need to be found. - Returns - --- + Returns + --- label : int - The label for any sample that is predicted to be at this node. - proba : float - The probability of the predicted sample to have this node's label. + The label for any sample that is predicted to be at this node. + proba : float + The probability of the predicted sample to have this node's label. 
""" samples = self.y[idx] From 1328b75312cca03ffcc6e94b69c704332a01ee54 Mon Sep 17 00:00:00 2001 From: parthgvora Date: Fri, 11 Dec 2020 11:53:29 -0500 Subject: [PATCH 10/17] black formatting --- proglearn/__init__.py | 1 - proglearn/forest.py | 36 +- proglearn/oblique_tree.py | 869 -------------------------- proglearn/tests/oblique_tree_test.py | 155 ----- proglearn/tests/test_transformer.py | 159 ++++- proglearn/transformers.py | 875 ++++++++++++++++++++++++++- 6 files changed, 1063 insertions(+), 1032 deletions(-) delete mode 100644 proglearn/oblique_tree.py delete mode 100644 proglearn/tests/oblique_tree_test.py mode change 100755 => 100644 proglearn/transformers.py diff --git a/proglearn/__init__.py b/proglearn/__init__.py index 323e4439d7..37657e3dda 100755 --- a/proglearn/__init__.py +++ b/proglearn/__init__.py @@ -1,5 +1,4 @@ from .forest import * from .network import * -from .oblique_tree import * __version__ = "0.0.3" diff --git a/proglearn/forest.py b/proglearn/forest.py index 6deddf4e40..e8812b8148 100644 --- a/proglearn/forest.py +++ b/proglearn/forest.py @@ -48,13 +48,26 @@ def __init__( default_tree_construction_proportion=0.67, default_kappa=np.inf, default_max_depth=30, + oblique=False, + default_feature_combinations=1.5, + default_density=0.5, ): self.default_n_estimators = default_n_estimators self.default_tree_construction_proportion = default_tree_construction_proportion self.default_kappa = default_kappa self.default_max_depth = default_max_depth + self.oblique = oblique + + if oblique: + default_transformer_class = ObliqueTreeClassificationTransformer + self.default_feature_combinations = default_feature_combinations + self.default_density = default_density + + else: + default_transformer_class = TreeClassificationTransformer + self.pl_ = ClassificationProgressiveLearner( - default_transformer_class=TreeClassificationTransformer, + default_transformer_class=default_transformer_class, default_transformer_kwargs={}, 
default_voter_class=TreeClassificationVoter, default_voter_kwargs={"kappa": default_kappa}, @@ -71,6 +84,8 @@ def add_task( tree_construction_proportion="default", kappa="default", max_depth="default", + feature_combinations="default", + density="default", ): """ adds a task with id task_id, max tree depth max_depth, given input data matrix X @@ -120,6 +135,23 @@ def add_task( if max_depth == "default": max_depth = self.default_max_depth + if self.oblique: + if feature_combinations == "default": + feature_combinations = self.default_feature_combinations + if density == "default": + density = self.default_density + + transformer_kwargs = { + "kwargs": { + "max_depth": max_depth, + "feature_combinations": feature_combinations, + "density": density, + } + } + + else: + transformer_kwargs = ({"kwargs": {"max_depth": max_depth}},) + X, y = check_X_y(X, y) return self.pl_.add_task( X, @@ -131,7 +163,7 @@ def add_task( 0, ], num_transformers=n_estimators, - transformer_kwargs={"kwargs": {"max_depth": max_depth}}, + transformer_kwargs=transformer_kwargs, voter_kwargs={ "classes": np.unique(y), "kappa": kappa, diff --git a/proglearn/oblique_tree.py b/proglearn/oblique_tree.py deleted file mode 100644 index 60bfe84765..0000000000 --- a/proglearn/oblique_tree.py +++ /dev/null @@ -1,869 +0,0 @@ -""" -Authors: Parth Vora and Jay Mandavilli - -Oblique Decision Tree (SPORF) -""" -import numpy as np -from sklearn.base import BaseEstimator -from sklearn.random_projection import SparseRandomProjection - -# -------------------------------------------------------------------------- -class SplitInfo: - """ - A class used to store information about a certain split. - - Parameters: - --- - feature : int - The feature which is used for the particular split. 
- threshold : float - The feature value which defines the split, if an example has a value less - than this threshold for the feature of this split then it will go to the - left child, otherwise it wil go the right child where these children are - the children nodes of the node for which this split defines. - proj_mat : array of shape [n_components, n_features] - The sparse random projection matrix for this split. - left_impurity : float - This is Gini impurity of left side of the split. - left_idx : array of shape [left_n_samples] - This is the indices of the nodes that are in the left side of this split. - left_n_samples : int - The number of samples in the left side of this split. - right_impurity : float - This is Gini impurity of right side of the split. - right_idx : array of shape [right_n_samples] - This is the indices of the nodes that are in the right side of this split. - right_n_samples : int - The number of samples in the right side of this split. - no_split : bool - A boolean specifying if there is a valid split or not. Here an invalid - split means all of the samples would go to one side. - improvement : float - A metric to determine if the split improves the decision tree. - """ - - def __init__( - self, - feature, - threshold, - proj_mat, - left_impurity, - left_idx, - left_n_samples, - right_impurity, - right_idx, - right_n_samples, - no_split, - improvement, - ): - - self.feature = feature - self.threshold = threshold - self.proj_mat = proj_mat - self.left_impurity = left_impurity - self.left_idx = left_idx - self.left_n_samples = left_n_samples - self.right_impurity = right_impurity - self.right_idx = right_idx - self.right_n_samples = right_n_samples - self.no_split = no_split - self.improvement = improvement - - -class ObliqueSplitter: - """ - A class used to represent an oblique splitter, where splits are done on - the linear combination of the features. 
- - Parameters: - --- - X : array of shape [n_samples, n_features] - The input data X is a matrix of the examples and their respective feature - values for each of the features. - y : array of shape [n_samples] - The labels for each of the examples in X. - proj_dims : int - The dimensionality of the target projection space. - density : float - Ratio of non-zero component in the random projection matrix in the range '(0, 1]'. - random_state : int - Controls the pseudo random number generator used to generate the projection matrix. - - Methods - --- - sample_proj_mat(sample_inds) - This gets the projection matrix and it fits the transform to the samples of interest. - leaf_label_proba(idx) - This calculates the label and the probability for that label for a particular leaf - node. - score(y_sort, t) - Finds the Gini impurity for a split. - impurity(idx) - Finds the impurity for a certain set of samples. - split(sample_inds) - Determines the best possible split for the given set of samples. - """ - - def __init__(self, X, y, proj_dims, density, random_state): - - self.X = X - self.y = y - - self.classes = np.array(np.unique(y), dtype=int) - self.n_classes = len(self.classes) - self.indices = np.indices(y.shape)[0] - - self.n_samples = X.shape[0] - - self.proj_dims = proj_dims - self.density = density - self.random_state = random_state - - def sample_proj_mat(self, sample_inds): - """ - Gets the projection matrix and it fits the transform to the samples of interest. - - Parameters - --- - sample_inds : array of shape [n_samples] - The data we are transforming. - - Returns - --- - proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features) - The generated sparse random matrix. - proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features) - Projected matrix. 
- """ - - proj_mat = SparseRandomProjection( - density=self.density, - n_components=self.proj_dims, - random_state=self.random_state, - ) - - proj_X = proj_mat.fit_transform(self.X[sample_inds, :]) - return proj_X, proj_mat - - def leaf_label_proba(self, idx): - """ - Finds the most common label and probability of this label from the samples at - the leaf node for which this is used on. - - Parameters - --- - idx : array of shape [n_samples] - The indices of the samples that are at the leaf node for which the label - and probability need to be found. - - Returns - --- - label : int - The label for any sample that is predicted to be at this node. - proba : float - The probability of the predicted sample to have this node's label. - """ - - samples = self.y[idx] - n = len(samples) - labels, count = np.unique(samples, return_counts=True) - most = np.argmax(count) - - label = labels[most] - proba = count[most] / n - - return label, proba - - # Returns gini impurity for split - # Expects 0 < t < n - def score(self, y_sort, t): - """ - Finds the Gini impurity for the split of interest - - Parameters - --- - y_sort : array of shape [n_samples] - A sorted array of labels for the examples for which the Gini impurity - is being calculated. - t : float - The threshold determining where to split y_sort. - - Returns - --- - gini : float - The Gini impurity of the split. 
- """ - - left = y_sort[:t] - right = y_sort[t:] - - n_left = len(left) - n_right = len(right) - - left_unique, left_counts = np.unique(left, return_counts=True) - right_unique, right_counts = np.unique(right, return_counts=True) - - left_counts = left_counts / n_left - right_counts = right_counts / n_right - - left_gini = 1 - np.sum(np.power(left_counts, 2)) - right_gini = 1 - np.sum(np.power(right_counts, 2)) - - gini = (n_left / self.n_samples) * left_gini + ( - n_right / self.n_samples - ) * right_gini - return gini - - # Returns impurity for a group of examples - # expects idx not None - def impurity(self, idx): - """ - Finds the actual impurity for a set of samples - - Parameters - --- - idx : array of shape [n_samples] - The indices of the nodes in the set for which the impurity is being calculated. - - Returns - --- - impurity : float - Actual impurity of split. - """ - - samples = self.y[idx] - n = len(samples) - - if n == 0: - return 0 - - unique, count = np.unique(samples, return_counts=True) - count = count / n - gini = np.sum(np.power(count, 2)) - - return 1 - gini - - # Finds the best split - # This needs to be parallelized; its a major bottleneck - def split(self, sample_inds): - """ - Finds the optimal split for a set of samples. - Note that the code for this method needs to be parallelized. This is a major - bottleneck in integration with scikit-learn. - - Parameters - --- - sample_inds : array of shape [n_samples] - The indices of the nodes in the set for which the best split is found. - - Returns - --- - split_info : SplitInfo - Class holding information about the split. 
- """ - - # Project the data - proj_X, proj_mat = self.sample_proj_mat(sample_inds) - y_sample = self.y[sample_inds] - n_samples = len(sample_inds) - - # Score matrix - # No split score is just node impurity - Q = np.zeros((n_samples, self.proj_dims)) - node_impurity = self.impurity(sample_inds) - Q[0, :] = node_impurity - Q[-1, :] = node_impurity - - # Loop through projected features and examples to find best split - # This can be parallelized for sure - for j in range(self.proj_dims): - - # Sort labels by the jth feature - idx = np.argsort(proj_X[:, j]) - y_sort = y_sample[idx] - - Q[1:-1, j] = np.array( - [self.score(y_sort, i) for i in range(1, n_samples - 1)] - ) - - # Identify best split feature, minimum gini impurity - best_split_ind = np.argmin(Q) - thresh_i, feature = np.unravel_index(best_split_ind, Q.shape) - best_gini = Q[thresh_i, feature] - - # Sort samples by the split feature - feat_vec = proj_X[:, feature] - idx = np.argsort(feat_vec) - - feat_vec = feat_vec[idx] - sample_inds = sample_inds[idx] - - # Get the threshold, split samples into left and right - threshold = feat_vec[thresh_i] - left_idx = sample_inds[:thresh_i] - right_idx = sample_inds[thresh_i:] - - left_n_samples = len(left_idx) - right_n_samples = len(right_idx) - - # See if we have no split - no_split = left_n_samples == 0 or right_n_samples == 0 - - # Evaluate improvement - improvement = node_impurity - best_gini - - # Evaluate impurities for left and right children - left_impurity = self.impurity(left_idx) - right_impurity = self.impurity(right_idx) - - split_info = SplitInfo( - feature, - threshold, - proj_mat, - left_impurity, - left_idx, - left_n_samples, - right_impurity, - right_idx, - right_n_samples, - no_split, - improvement, - ) - - return split_info - - -# -------------------------------------------------------------------------- - - -class Node: - """ - A class used to represent an oblique node. 
- - Parameters: - --- - None - - Methods - --- - None - """ - - def __init__(self): - self.node_id = None - self.is_leaf = None - self.parent = None - self.left_child = None - self.right_child = None - - self.feature = None - self.threshold = None - self.impurity = None - self.n_samples = None - - self.proj_mat = None - self.label = None - self.proba = None - - -class StackRecord: - """ - A class used to keep track of a node's parent and other information about the node and its split. - - Parameters: - --- - parent : int - The index of the parent node. - depth : int - The depth at which this node is. - is_left : bool - Represents if the node is a left child or not. - impurity : float - This is Gini impurity of this node. - sample_idx : array of shape [n_samples] - This is the indices of the nodes that are in this node. - n_samples : int - The number of samples in this node. - - Methods - --- - None - """ - - def __init__(self, parent, depth, is_left, impurity, sample_idx, n_samples): - - self.parent = parent - self.depth = depth - self.is_left = is_left - self.impurity = impurity - self.sample_idx = sample_idx - self.n_samples = n_samples - - -class ObliqueTree: - """ - A class used to represent a tree with oblique splits. - - Parameters: - --- - splitter : class - The type of splitter for this tree, should be an ObliqueSplitter. - min_samples_split : int - Minimum number of samples possible at a node. - min_samples_leaf : int - Minimum number of samples possible at a leaf. - max_depth : int - Maximum depth allowed for the tree. - min_impurity_split : float - Minimum Gini impurity value that must be achieved for a split to occur on the node. - min_impurity_decrease : float - Minimum amount Gini impurity value must decrease by for a split to be valid. 
- - Methods - --- - add_node(parent, is_left, impurity, n_samples, is_leaf, feature, threshold, proj_mat, label, proba) - Adds a node to the existing tree - build() - This is what is initially called on to completely build the oblique tree. - predict(X) - Finds the final node for each input sample as it passes through the decision tree. - """ - - def __init__( - self, - splitter, - min_samples_split, - min_samples_leaf, - max_depth, - min_impurity_split, - min_impurity_decrease, - ): - - # Tree parameters - # self.n_samples = n_samples - # self.n_features = n_features - # self.n_classes = n_classes - self.depth = 0 - self.node_count = 0 - self.nodes = [] - - # Build parameters - self.splitter = splitter - self.min_samples_split = min_samples_split - self.min_samples_leaf = min_samples_leaf - self.max_depth = max_depth - self.min_impurity_split = min_impurity_split - self.min_impurity_decrease = min_impurity_decrease - - def add_node( - self, - parent, - is_left, - impurity, - n_samples, - is_leaf, - feature, - threshold, - proj_mat, - label, - proba, - ): - """ - Adds a node to the existing oblique tree. - - Parameters - --- - parent : int - The index of the parent node for the new node being added. - is_left : bool - Determines if this new node being added is a left or right child. - impurity : float - Impurity of this new node. - n_samples : int - Number of samples at this new node. - is_leaf : bool - Determines if this new node is a leaf of the tree or an internal node. - feature : int - Index of feature on which the split occurs at this node. - threshold : float - The threshold feature value for this node determining if a sample will go - to this node's left of right child. If a sample has a value less than the - threshold (for the feature of this node) it will go to the left childe, - otherwise it will go the right child. - proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features) - Projection matrix for this new node. 
- label : int - The label a sample will be given if it is predicted to be at this node. - proba : float - The probability a predicted sample has of being the node's label. - - Returns - --- - node_id : int - Index of the new node just added. - """ - - node = Node() - node.node_id = self.node_count - node.impurity = impurity - node.n_samples = n_samples - - # If not the root node, set parents - if self.node_count > 0: - node.parent = parent - if is_left: - self.nodes[parent].left_child = node.node_id - else: - self.nodes[parent].right_child = node.node_id - - # Set node parameters - if is_leaf: - node.is_leaf = True - node.label = label - node.proba = proba - else: - node.is_leaf = False - node.feature = feature - node.threshold = threshold - node.proj_mat = proj_mat - - self.node_count += 1 - self.nodes.append(node) - - return node.node_id - - def build(self): - """ - Builds the oblique tree. - - Parameters - --- - None - - Returns - --- - None - """ - - # Initialize, add root node - stack = [] - root = StackRecord( - 0, - 1, - False, - self.splitter.impurity(self.splitter.indices), - self.splitter.indices, - self.splitter.n_samples, - ) - stack.append(root) - - # Build tree - while len(stack) > 0: - - # Pop a record off the stack - cur = stack.pop() - - # Evaluate if it is a leaf - is_leaf = ( - cur.depth >= self.max_depth - or cur.n_samples < self.min_samples_split - or cur.n_samples < 2 * self.min_samples_leaf - or cur.impurity <= self.min_impurity_split - ) - - # Split if not - if not is_leaf: - split = self.splitter.split(cur.sample_idx) - - is_leaf = ( - is_leaf - or split.no_split - or split.improvement <= self.min_impurity_decrease - ) - - # Add the node to the tree - if is_leaf: - - label, proba = self.splitter.leaf_label_proba(cur.sample_idx) - - node_id = self.add_node( - cur.parent, - cur.is_left, - cur.impurity, - cur.n_samples, - is_leaf, - None, - None, - None, - label, - proba, - ) - - else: - node_id = self.add_node( - cur.parent, - cur.is_left, - 
cur.impurity, - cur.n_samples, - is_leaf, - split.feature, - split.threshold, - split.proj_mat, - None, - None, - ) - - # Push the right and left children to the stack if applicable - if not is_leaf: - - right_child = StackRecord( - node_id, - cur.depth + 1, - False, - split.right_impurity, - split.right_idx, - split.right_n_samples, - ) - stack.append(right_child) - - left_child = StackRecord( - node_id, - cur.depth + 1, - True, - split.left_impurity, - split.left_idx, - split.left_n_samples, - ) - stack.append(left_child) - - if cur.depth > self.depth: - self.depth = cur.depth - - def predict(self, X): - """ - Predicts final nodes of samples given. - - Parameters - --- - X : array of shape [n_samples, n_features] - The input array for which predictions are made. - - Returns - --- - predictions : array of shape [n_samples] - Array of the final node index for each input prediction sample. - """ - - predictions = np.zeros(X.shape[0]) - for i in range(X.shape[0]): - cur = self.nodes[0] - while not cur is None and not cur.is_leaf: - proj_X = cur.proj_mat.transform(X) - if proj_X[i, cur.feature] < cur.threshold: - id = cur.left_child - cur = self.nodes[id] - else: - id = cur.right_child - cur = self.nodes[id] - - predictions[i] = cur.node_id - - return predictions - - -# -------------------------------------------------------------------------- - -""" Class for Oblique Tree """ - - -class ObliqueTreeClassifier(BaseEstimator): - """ - A class used to represent a classifier that uses an oblique decision tree. - - Parameters: - --- - max_depth : int - Maximum depth allowed for oblique tree. - min_samples_split : int - Minimum number of samples possible at a node. - min_samples_leaf : int - Minimum number of samples possible at a leaf. - random_state : int - Maximum depth allowed for the tree. - min_impurity_decrease : float - Minimum amount Gini impurity value must decrease by for a split to be valid. 
- min_impurity_split : float - Minimum Gini impurity value that must be achieved for a split to occur on the node. - feature_combinations : float - The feature combinations to use for the oblique split. - density : float - Density estimate. - - Methods - --- - fit(X,y) - Fits the oblique tree to the training samples. - apply(X) - Calls on the predict function from the oblique tree for the test samples. - predict(X) - Gets the prediction labels for the test samples. - predict_proba(X) - Gets the probability of the prediction labels for the test samples. - predict_log_proba(X) - Gets the log of the probability of the prediction labels for the test samples. - """ - - def __init__( - self, - *, - # criterion="gini", - # splitter=None, - max_depth=np.inf, - min_samples_split=2, - min_samples_leaf=1, - # min_weight_fraction_leaf=0, - # max_features="auto", - # max_leaf_nodes=None, - random_state=None, - min_impurity_decrease=0, - min_impurity_split=0, - # class_weight=None, - # ccp_alpha=0.0, - # New args - feature_combinations=1.2, - density=0.7 - ): - - # self.criterion=criterion - self.max_depth = max_depth - self.min_samples_split = min_samples_split - self.min_samples_leaf = min_samples_leaf - # self.min_weight_fraction_leaf=min_weight_fraction_leaf - # self.max_features=max_features - # self.max_leaf_nodes=max_leaf_nodes - self.random_state = random_state - self.min_impurity_decrease = min_impurity_decrease - self.min_impurity_split = min_impurity_split - # self.class_weight=class_weight - # self.ccp_alpha=ccp_alpha - - self.feature_combinations = feature_combinations - self.density = density - - def fit(self, X, y): - """ - Predicts final nodes of samples given. - - Parameters - --- - X : array of shape [n_samples, n_features] - The training samples. - y : array of shape [n_samples] - Labels for the training samples. - - Returns - --- - ObliqueTreeClassifier - The fit classifier. 
- """ - - self.proj_dims = int(np.ceil(X.shape[1]) / self.feature_combinations) - splitter = ObliqueSplitter( - X, y, self.proj_dims, self.density, self.random_state - ) - - self.tree = ObliqueTree( - splitter, - self.min_samples_split, - self.min_samples_leaf, - self.max_depth, - self.min_impurity_split, - self.min_impurity_decrease, - ) - self.tree.build() - return self - - def apply(self, X): - """ - Gets predictions form the oblique tree for the test samples. - - Parameters - --- - X : array of shape [n_samples, n_features] - The testing samples. - - Returns - --- - pred_nodes : array of shape[n_samples] - The indices for each test sample's final node in the oblique tree. - """ - - pred_nodes = self.tree.predict(X).astype(int) - return pred_nodes - - def predict(self, X): - """ - Determines final label predictions for each sample in the test data. - - Parameters - --- - X : array of shape [n_samples, n_features] - The testing samples. - - Returns - --- - preds : array of shape[n_samples] - The predictions (labels) for each testing sample. - """ - - preds = np.zeros(X.shape[0]) - pred_nodes = self.apply(X) - for k in range(len(pred_nodes)): - id = pred_nodes[k] - preds[k] = self.tree.nodes[id].label - - return preds - - def predict_proba(self, X): - """ - Determines probabilities of the final label predictions for each sample in the test data. - - Parameters - --- - X : array of shape [n_samples, n_features] - The testing samples. - - Returns - --- - preds : array of shape[n_samples] - The probabilities of the predictions (labels) for each testing sample. - """ - - preds = np.zeros(X.shape[0]) - pred_nodes = self.apply(X) - for k in range(len(preds)): - id = pred_nodes[k] - preds[k] = self.tree.nodes[id].proba - - return preds - - def predict_log_proba(self, X): - """ - Determines log of the probabilities of the final label predictions for each sample in the test data. - - Parameters - --- - X : array of shape [n_samples, n_features] - The testing samples. 
- - Returns - --- - preds : array of shape[n_samples] - The log of the probabilities of the predictions (labels) for each testing sample. - """ - - proba = self.predict_proba(X) - for k in range(len(proba)): - proba[k] = np.log(proba[k]) - - return proba diff --git a/proglearn/tests/oblique_tree_test.py b/proglearn/tests/oblique_tree_test.py deleted file mode 100644 index f0659cbaff..0000000000 --- a/proglearn/tests/oblique_tree_test.py +++ /dev/null @@ -1,155 +0,0 @@ -import pytest -import numpy as np -from numpy import random as rng -from numpy.testing import assert_almost_equal, assert_warns, assert_raises -from sklearn.datasets import load_iris - -from proglearn.oblique_tree import * - - -class TestObliqueSplitter: - def test_sample_projmat(self): - - random_state = 0 - rng.seed(random_state) - - X = rng.rand(100, 100) - y = np.zeros(100) - - density = 0.5 - proj_dims = [10, 20, 40, 60, 80] - sample_inds = [ - np.linspace(0, 9, 10, dtype=int), - np.linspace(0, 19, 20, dtype=int), - np.linspace(0, 39, 40, dtype=int), - np.linspace(0, 59, 60, dtype=int), - np.linspace(0, 79, 80, dtype=int), - ] - - n_sample_inds = [10, 20, 40, 60, 80] - - for pd in proj_dims: - splitter = ObliqueSplitter(X, y, pd, density, random_state) - - for i in range(len(n_sample_inds)): - si = sample_inds[i] - n = n_sample_inds[i] - - proj_X, projmat = splitter.sample_proj_mat(si) - assert n == proj_X.shape[0] - assert pd == proj_X.shape[1] - - def test_score(self): - - random_state = 0 - rng.seed(random_state) - - X = rng.rand(11, 11) - - density = 0.5 - proj_dims = 5 - - y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) - splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) - - score = splitter.score(y, 6) - assert 0 == score - - score = splitter.score(y, 1) - assert_almost_equal(5 / 11, score) - - def test_impurity(self): - - random_state = 0 - rng.seed(random_state) - - X = rng.rand(100, 100) - - density = 0.5 - proj_dims = 50 - - y = np.zeros(100) - for i in range(10): - 
for j in range(10): - y[10 * i + j] = i - - splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) - - # Impurity of one thing should be 0 - impurity = splitter.impurity([0]) - assert 0 == impurity - - # Impurity of one class should be 0 - impurity = splitter.impurity(np.linspace(0, 9, 10, dtype=int)) - assert 0 == impurity - - # Impurity of two different classes with equal number should be 0.5 - impurity = splitter.impurity(np.linspace(0, 19, 20, dtype=int)) - assert 0.5 == impurity - - # Impurity of all classes should be 10 * (1/10)(9/10) = 9/10 - impurity = splitter.impurity(np.linspace(0, 99, 100, dtype=int)) - assert_almost_equal(0.9, impurity) - - def test_split(self): - - random_state = 0 - rng.seed(random_state) - - X = rng.rand(100, 100) - - density = 0.5 - proj_dims = 50 - - y = np.zeros(100) - for i in range(10): - for j in range(10): - y[10 * i + j] = i - - splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) - - split_info = splitter.split(np.array([i for i in range(100)])) - - -class TestObliqueTree: - def test_add_node(self): - - # Add a root node - tree = ObliqueTree(None, 0, 0, 0, 0, 0) - - tree.add_node(0, False, 0, 0, False, 0, 0, None, 0, 0) - - # Add a regular node - tree.add_node(0, False, 0, 0, False, 0, 0, None, 0, 0) - - # Add a leaf node - tree.add_node(1, False, 0, 0, True, 0, 0, None, 0, 0) - - assert 3 == len(tree.nodes) - assert 3 == tree.node_count - - def test_fit(self): - - data = load_iris() - clf = ObliqueTreeClassifier() - clf.fit(data.data, data.target) - - def test_predict(self): - Xtrain = np.random.rand(6, 5) - ytrain = np.array([0, 0, 1, 1, 0, 1]) - tree = ObliqueTreeClassifier() - tree.fit(Xtrain, ytrain) - Xtest = np.random.rand(3, 5) - preds = tree.predict(Xtest) - - assert len(preds) == len(Xtest) - - preds_proba = tree.predict_proba(Xtest) - preds_log_proba = tree.predict_log_proba(Xtest) - - assert len(preds_proba) == len(Xtest) - assert len(preds_log_proba) == len(Xtest) - - bool_inc = 
np.all(np.log(preds_proba) == preds_log_proba) - - assert bool_inc diff --git a/proglearn/tests/test_transformer.py b/proglearn/tests/test_transformer.py index f7d66f2b39..a4de499ef3 100644 --- a/proglearn/tests/test_transformer.py +++ b/proglearn/tests/test_transformer.py @@ -1,9 +1,16 @@ import pytest import numpy as np -from numpy.testing import assert_allclose +from numpy.testing import ( + assert_almost_equal, + assert_warns, + assert_raises, + assert_allclose, +) +from numpy import random as rng +from sklearn.datasets import load_iris from sklearn.exceptions import NotFittedError -from proglearn.transformers import TreeClassificationTransformer +from proglearn.transformers import * class TestTreeClassificationTransformer: @@ -31,3 +38,151 @@ def test_correct_transformation(self): u1 = trt.transform(np.array([0]).reshape(1, -1)) u2 = trt.transform(np.array([1]).reshape(1, -1)) assert u1 != u2 + + +class TestObliqueSplitter: + def test_sample_projmat(self): + + random_state = 0 + rng.seed(random_state) + + X = rng.rand(100, 100) + y = np.zeros(100) + + density = 0.5 + proj_dims = [10, 20, 40, 60, 80] + sample_inds = [ + np.linspace(0, 9, 10, dtype=int), + np.linspace(0, 19, 20, dtype=int), + np.linspace(0, 39, 40, dtype=int), + np.linspace(0, 59, 60, dtype=int), + np.linspace(0, 79, 80, dtype=int), + ] + + n_sample_inds = [10, 20, 40, 60, 80] + + for pd in proj_dims: + splitter = ObliqueSplitter(X, y, pd, density, random_state) + + for i in range(len(n_sample_inds)): + si = sample_inds[i] + n = n_sample_inds[i] + + proj_X, projmat = splitter.sample_proj_mat(si) + assert n == proj_X.shape[0] + assert pd == proj_X.shape[1] + + def test_score(self): + + random_state = 0 + rng.seed(random_state) + + X = rng.rand(11, 11) + + density = 0.5 + proj_dims = 5 + + y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) + + score = splitter.score(y, 6) + assert 0 == score + + score = splitter.score(y, 1) + 
assert_almost_equal(5 / 11, score) + + def test_impurity(self): + + random_state = 0 + rng.seed(random_state) + + X = rng.rand(100, 100) + + density = 0.5 + proj_dims = 50 + + y = np.zeros(100) + for i in range(10): + for j in range(10): + y[10 * i + j] = i + + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) + + # Impurity of one thing should be 0 + impurity = splitter.impurity([0]) + assert 0 == impurity + + # Impurity of one class should be 0 + impurity = splitter.impurity(np.linspace(0, 9, 10, dtype=int)) + assert 0 == impurity + + # Impurity of two different classes with equal number should be 0.5 + impurity = splitter.impurity(np.linspace(0, 19, 20, dtype=int)) + assert 0.5 == impurity + + # Impurity of all classes should be 10 * (1/10)(9/10) = 9/10 + impurity = splitter.impurity(np.linspace(0, 99, 100, dtype=int)) + assert_almost_equal(0.9, impurity) + + def test_split(self): + + random_state = 0 + rng.seed(random_state) + + X = rng.rand(100, 100) + + density = 0.5 + proj_dims = 50 + + y = np.zeros(100) + for i in range(10): + for j in range(10): + y[10 * i + j] = i + + splitter = ObliqueSplitter(X, y, proj_dims, density, random_state) + + split_info = splitter.split(np.array([i for i in range(100)])) + + +class TestObliqueTree: + def test_add_node(self): + + # Add a root node + tree = ObliqueTree(None, 0, 0, 0, 0, 0) + + tree.add_node(0, False, 0, 0, False, 0, 0, None, 0, 0) + + # Add a regular node + tree.add_node(0, False, 0, 0, False, 0, 0, None, 0, 0) + + # Add a leaf node + tree.add_node(1, False, 0, 0, True, 0, 0, None, 0, 0) + + assert 3 == len(tree.nodes) + assert 3 == tree.node_count + + def test_fit(self): + + data = load_iris() + clf = ObliqueTreeClassifier() + clf.fit(data.data, data.target) + + def test_predict(self): + Xtrain = np.random.rand(6, 5) + ytrain = np.array([0, 0, 1, 1, 0, 1]) + tree = ObliqueTreeClassifier() + tree.fit(Xtrain, ytrain) + Xtest = np.random.rand(3, 5) + preds = tree.predict(Xtest) + + assert 
len(preds) == len(Xtest) + + preds_proba = tree.predict_proba(Xtest) + preds_log_proba = tree.predict_log_proba(Xtest) + + assert len(preds_proba) == len(Xtest) + assert len(preds_log_proba) == len(Xtest) + + bool_inc = np.all(np.log(preds_proba) == preds_log_proba) + + assert bool_inc diff --git a/proglearn/transformers.py b/proglearn/transformers.py old mode 100755 new mode 100644 index 756d0bde28..cc22acaa20 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -5,6 +5,9 @@ import numpy as np from sklearn.tree import DecisionTreeClassifier +from sklearn.base import BaseEstimator +from sklearn.random_projection import SparseRandomProjection + from sklearn.utils.validation import ( check_X_y, @@ -15,7 +18,6 @@ import keras as keras from .base import BaseTransformer -from .oblique_tree import ObliqueTreeClassifier class NeuralClassificationTransformer(BaseTransformer): @@ -208,8 +210,8 @@ class ObliqueTreeClassificationTransformer(BaseTransformer): Attributes ---------- - transformer : sklearn.tree.DecisionTreeClassifier - an internal sklearn DecisionTreeClassifier + transformer : ObliqueTreeClassifier + an sklearn compliant oblique decisiotn tree (SPORF) """ def __init__(self, kwargs={}): @@ -257,3 +259,870 @@ def transform(self, X): check_is_fitted(self) X = check_array(X) return self.transformer_.apply(X) + + +""" +Authors: Parth Vora and Jay Mandavilli + +Oblique Decision Tree (SPORF) +""" +# -------------------------------------------------------------------------- +class SplitInfo: + """ + A class used to store information about a certain split. + + Parameters: + --- + feature : int + The feature which is used for the particular split. + threshold : float + The feature value which defines the split, if an example has a value less + than this threshold for the feature of this split then it will go to the + left child, otherwise it wil go the right child where these children are + the children nodes of the node for which this split defines. 
+ proj_mat : array of shape [n_components, n_features] + The sparse random projection matrix for this split. + left_impurity : float + This is Gini impurity of left side of the split. + left_idx : array of shape [left_n_samples] + This is the indices of the nodes that are in the left side of this split. + left_n_samples : int + The number of samples in the left side of this split. + right_impurity : float + This is Gini impurity of right side of the split. + right_idx : array of shape [right_n_samples] + This is the indices of the nodes that are in the right side of this split. + right_n_samples : int + The number of samples in the right side of this split. + no_split : bool + A boolean specifying if there is a valid split or not. Here an invalid + split means all of the samples would go to one side. + improvement : float + A metric to determine if the split improves the decision tree. + """ + + def __init__( + self, + feature, + threshold, + proj_mat, + left_impurity, + left_idx, + left_n_samples, + right_impurity, + right_idx, + right_n_samples, + no_split, + improvement, + ): + + self.feature = feature + self.threshold = threshold + self.proj_mat = proj_mat + self.left_impurity = left_impurity + self.left_idx = left_idx + self.left_n_samples = left_n_samples + self.right_impurity = right_impurity + self.right_idx = right_idx + self.right_n_samples = right_n_samples + self.no_split = no_split + self.improvement = improvement + + +class ObliqueSplitter: + """ + A class used to represent an oblique splitter, where splits are done on + the linear combination of the features. + + Parameters: + --- + X : array of shape [n_samples, n_features] + The input data X is a matrix of the examples and their respective feature + values for each of the features. + y : array of shape [n_samples] + The labels for each of the examples in X. + proj_dims : int + The dimensionality of the target projection space. 
+ density : float + Ratio of non-zero component in the random projection matrix in the range '(0, 1]'. + random_state : int + Controls the pseudo random number generator used to generate the projection matrix. + + Methods + --- + sample_proj_mat(sample_inds) + This gets the projection matrix and it fits the transform to the samples of interest. + leaf_label_proba(idx) + This calculates the label and the probability for that label for a particular leaf + node. + score(y_sort, t) + Finds the Gini impurity for a split. + impurity(idx) + Finds the impurity for a certain set of samples. + split(sample_inds) + Determines the best possible split for the given set of samples. + """ + + def __init__(self, X, y, proj_dims, density, random_state): + + self.X = X + self.y = y + + self.classes = np.array(np.unique(y), dtype=int) + self.n_classes = len(self.classes) + self.indices = np.indices(y.shape)[0] + + self.n_samples = X.shape[0] + + self.proj_dims = proj_dims + self.density = density + self.random_state = random_state + + def sample_proj_mat(self, sample_inds): + """ + Gets the projection matrix and it fits the transform to the samples of interest. + + Parameters + --- + sample_inds : array of shape [n_samples] + The data we are transforming. + + Returns + --- + proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features) + The generated sparse random matrix. + proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features) + Projected matrix. + """ + + proj_mat = SparseRandomProjection( + density=self.density, + n_components=self.proj_dims, + random_state=self.random_state, + ) + + proj_X = proj_mat.fit_transform(self.X[sample_inds, :]) + return proj_X, proj_mat + + def leaf_label_proba(self, idx): + """ + Finds the most common label and probability of this label from the samples at + the leaf node for which this is used on. 
+ + Parameters + --- + idx : array of shape [n_samples] + The indices of the samples that are at the leaf node for which the label + and probability need to be found. + + Returns + --- + label : int + The label for any sample that is predicted to be at this node. + proba : float + The probability of the predicted sample to have this node's label. + """ + + samples = self.y[idx] + n = len(samples) + labels, count = np.unique(samples, return_counts=True) + most = np.argmax(count) + + label = labels[most] + proba = count[most] / n + + return label, proba + + # Returns gini impurity for split + # Expects 0 < t < n + def score(self, y_sort, t): + """ + Finds the Gini impurity for the split of interest + + Parameters + --- + y_sort : array of shape [n_samples] + A sorted array of labels for the examples for which the Gini impurity + is being calculated. + t : float + The threshold determining where to split y_sort. + + Returns + --- + gini : float + The Gini impurity of the split. + """ + + left = y_sort[:t] + right = y_sort[t:] + + n_left = len(left) + n_right = len(right) + + left_unique, left_counts = np.unique(left, return_counts=True) + right_unique, right_counts = np.unique(right, return_counts=True) + + left_counts = left_counts / n_left + right_counts = right_counts / n_right + + left_gini = 1 - np.sum(np.power(left_counts, 2)) + right_gini = 1 - np.sum(np.power(right_counts, 2)) + + gini = (n_left / self.n_samples) * left_gini + ( + n_right / self.n_samples + ) * right_gini + return gini + + # Returns impurity for a group of examples + # expects idx not None + def impurity(self, idx): + """ + Finds the actual impurity for a set of samples + + Parameters + --- + idx : array of shape [n_samples] + The indices of the nodes in the set for which the impurity is being calculated. + + Returns + --- + impurity : float + Actual impurity of split. 
+ """ + + samples = self.y[idx] + n = len(samples) + + if n == 0: + return 0 + + unique, count = np.unique(samples, return_counts=True) + count = count / n + gini = np.sum(np.power(count, 2)) + + return 1 - gini + + # Finds the best split + # This needs to be parallelized; its a major bottleneck + def split(self, sample_inds): + """ + Finds the optimal split for a set of samples. + Note that the code for this method needs to be parallelized. This is a major + bottleneck in integration with scikit-learn. + + Parameters + --- + sample_inds : array of shape [n_samples] + The indices of the nodes in the set for which the best split is found. + + Returns + --- + split_info : SplitInfo + Class holding information about the split. + """ + + # Project the data + proj_X, proj_mat = self.sample_proj_mat(sample_inds) + y_sample = self.y[sample_inds] + n_samples = len(sample_inds) + + # Score matrix + # No split score is just node impurity + Q = np.zeros((n_samples, self.proj_dims)) + node_impurity = self.impurity(sample_inds) + Q[0, :] = node_impurity + Q[-1, :] = node_impurity + + # Loop through projected features and examples to find best split + # This can be parallelized for sure + for j in range(self.proj_dims): + + # Sort labels by the jth feature + idx = np.argsort(proj_X[:, j]) + y_sort = y_sample[idx] + + Q[1:-1, j] = np.array( + [self.score(y_sort, i) for i in range(1, n_samples - 1)] + ) + + # Identify best split feature, minimum gini impurity + best_split_ind = np.argmin(Q) + thresh_i, feature = np.unravel_index(best_split_ind, Q.shape) + best_gini = Q[thresh_i, feature] + + # Sort samples by the split feature + feat_vec = proj_X[:, feature] + idx = np.argsort(feat_vec) + + feat_vec = feat_vec[idx] + sample_inds = sample_inds[idx] + + # Get the threshold, split samples into left and right + threshold = feat_vec[thresh_i] + left_idx = sample_inds[:thresh_i] + right_idx = sample_inds[thresh_i:] + + left_n_samples = len(left_idx) + right_n_samples = len(right_idx) + 
+ # See if we have no split + no_split = left_n_samples == 0 or right_n_samples == 0 + + # Evaluate improvement + improvement = node_impurity - best_gini + + # Evaluate impurities for left and right children + left_impurity = self.impurity(left_idx) + right_impurity = self.impurity(right_idx) + + split_info = SplitInfo( + feature, + threshold, + proj_mat, + left_impurity, + left_idx, + left_n_samples, + right_impurity, + right_idx, + right_n_samples, + no_split, + improvement, + ) + + return split_info + + +# -------------------------------------------------------------------------- + + +class Node: + """ + A class used to represent an oblique node. + + Parameters: + --- + None + + Methods + --- + None + """ + + def __init__(self): + self.node_id = None + self.is_leaf = None + self.parent = None + self.left_child = None + self.right_child = None + + self.feature = None + self.threshold = None + self.impurity = None + self.n_samples = None + + self.proj_mat = None + self.label = None + self.proba = None + + +class StackRecord: + """ + A class used to keep track of a node's parent and other information about the node and its split. + + Parameters: + --- + parent : int + The index of the parent node. + depth : int + The depth at which this node is. + is_left : bool + Represents if the node is a left child or not. + impurity : float + This is Gini impurity of this node. + sample_idx : array of shape [n_samples] + This is the indices of the nodes that are in this node. + n_samples : int + The number of samples in this node. + + Methods + --- + None + """ + + def __init__(self, parent, depth, is_left, impurity, sample_idx, n_samples): + + self.parent = parent + self.depth = depth + self.is_left = is_left + self.impurity = impurity + self.sample_idx = sample_idx + self.n_samples = n_samples + + +class ObliqueTree: + """ + A class used to represent a tree with oblique splits. 
+ + Parameters: + --- + splitter : class + The type of splitter for this tree, should be an ObliqueSplitter. + min_samples_split : int + Minimum number of samples possible at a node. + min_samples_leaf : int + Minimum number of samples possible at a leaf. + max_depth : int + Maximum depth allowed for the tree. + min_impurity_split : float + Minimum Gini impurity value that must be achieved for a split to occur on the node. + min_impurity_decrease : float + Minimum amount Gini impurity value must decrease by for a split to be valid. + + Methods + --- + add_node(parent, is_left, impurity, n_samples, is_leaf, feature, threshold, proj_mat, label, proba) + Adds a node to the existing tree + build() + This is what is initially called on to completely build the oblique tree. + predict(X) + Finds the final node for each input sample as it passes through the decision tree. + """ + + def __init__( + self, + splitter, + min_samples_split, + min_samples_leaf, + max_depth, + min_impurity_split, + min_impurity_decrease, + ): + + # Tree parameters + # self.n_samples = n_samples + # self.n_features = n_features + # self.n_classes = n_classes + self.depth = 0 + self.node_count = 0 + self.nodes = [] + + # Build parameters + self.splitter = splitter + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.max_depth = max_depth + self.min_impurity_split = min_impurity_split + self.min_impurity_decrease = min_impurity_decrease + + def add_node( + self, + parent, + is_left, + impurity, + n_samples, + is_leaf, + feature, + threshold, + proj_mat, + label, + proba, + ): + """ + Adds a node to the existing oblique tree. + + Parameters + --- + parent : int + The index of the parent node for the new node being added. + is_left : bool + Determines if this new node being added is a left or right child. + impurity : float + Impurity of this new node. + n_samples : int + Number of samples at this new node. 
+ is_leaf : bool + Determines if this new node is a leaf of the tree or an internal node. + feature : int + Index of feature on which the split occurs at this node. + threshold : float + The threshold feature value for this node determining if a sample will go + to this node's left of right child. If a sample has a value less than the + threshold (for the feature of this node) it will go to the left childe, + otherwise it will go the right child. + proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features) + Projection matrix for this new node. + label : int + The label a sample will be given if it is predicted to be at this node. + proba : float + The probability a predicted sample has of being the node's label. + + Returns + --- + node_id : int + Index of the new node just added. + """ + + node = Node() + node.node_id = self.node_count + node.impurity = impurity + node.n_samples = n_samples + + # If not the root node, set parents + if self.node_count > 0: + node.parent = parent + if is_left: + self.nodes[parent].left_child = node.node_id + else: + self.nodes[parent].right_child = node.node_id + + # Set node parameters + if is_leaf: + node.is_leaf = True + node.label = label + node.proba = proba + else: + node.is_leaf = False + node.feature = feature + node.threshold = threshold + node.proj_mat = proj_mat + + self.node_count += 1 + self.nodes.append(node) + + return node.node_id + + def build(self): + """ + Builds the oblique tree. 
+ + Parameters + --- + None + + Returns + --- + None + """ + + # Initialize, add root node + stack = [] + root = StackRecord( + 0, + 1, + False, + self.splitter.impurity(self.splitter.indices), + self.splitter.indices, + self.splitter.n_samples, + ) + stack.append(root) + + # Build tree + while len(stack) > 0: + + # Pop a record off the stack + cur = stack.pop() + + # Evaluate if it is a leaf + is_leaf = ( + cur.depth >= self.max_depth + or cur.n_samples < self.min_samples_split + or cur.n_samples < 2 * self.min_samples_leaf + or cur.impurity <= self.min_impurity_split + ) + + # Split if not + if not is_leaf: + split = self.splitter.split(cur.sample_idx) + + is_leaf = ( + is_leaf + or split.no_split + or split.improvement <= self.min_impurity_decrease + ) + + # Add the node to the tree + if is_leaf: + + label, proba = self.splitter.leaf_label_proba(cur.sample_idx) + + node_id = self.add_node( + cur.parent, + cur.is_left, + cur.impurity, + cur.n_samples, + is_leaf, + None, + None, + None, + label, + proba, + ) + + else: + node_id = self.add_node( + cur.parent, + cur.is_left, + cur.impurity, + cur.n_samples, + is_leaf, + split.feature, + split.threshold, + split.proj_mat, + None, + None, + ) + + # Push the right and left children to the stack if applicable + if not is_leaf: + + right_child = StackRecord( + node_id, + cur.depth + 1, + False, + split.right_impurity, + split.right_idx, + split.right_n_samples, + ) + stack.append(right_child) + + left_child = StackRecord( + node_id, + cur.depth + 1, + True, + split.left_impurity, + split.left_idx, + split.left_n_samples, + ) + stack.append(left_child) + + if cur.depth > self.depth: + self.depth = cur.depth + + def predict(self, X): + """ + Predicts final nodes of samples given. + + Parameters + --- + X : array of shape [n_samples, n_features] + The input array for which predictions are made. + + Returns + --- + predictions : array of shape [n_samples] + Array of the final node index for each input prediction sample. 
+ """ + + predictions = np.zeros(X.shape[0]) + for i in range(X.shape[0]): + cur = self.nodes[0] + while cur is not None and not cur.is_leaf: + # Project only sample i; projecting all of X at every node repeats the same work for every sample. + proj_X = cur.proj_mat.transform(X[i : i + 1]) + if proj_X[0, cur.feature] < cur.threshold: + cur = self.nodes[cur.left_child] + else: + cur = self.nodes[cur.right_child] + + predictions[i] = cur.node_id + + return predictions + + +# -------------------------------------------------------------------------- + +""" Class for Oblique Tree """ + + +class ObliqueTreeClassifier(BaseEstimator): + """ + A class used to represent a classifier that uses an oblique decision tree. + + Parameters: + --- + max_depth : int + Maximum depth allowed for oblique tree. + min_samples_split : int + Minimum number of samples possible at a node. + min_samples_leaf : int + Minimum number of samples possible at a leaf. + random_state : int + Controls the pseudo random number generator used to generate the projection matrix. + min_impurity_decrease : float + Minimum amount Gini impurity value must decrease by for a split to be valid. + min_impurity_split : float + Minimum Gini impurity value that must be achieved for a split to occur on the node. + feature_combinations : float + The feature combinations to use for the oblique split. + density : float + Ratio of non-zero components in the random projection matrix, in the range (0, 1]. + + Methods + --- + fit(X,y) + Fits the oblique tree to the training samples. + apply(X) + Calls on the predict function from the oblique tree for the test samples. + predict(X) + Gets the prediction labels for the test samples. + predict_proba(X) + Gets the probability of the prediction labels for the test samples. + predict_log_proba(X) + Gets the log of the probability of the prediction labels for the test samples.
+ """ + + def __init__( + self, + *, + # criterion="gini", + # splitter=None, + max_depth=np.inf, + min_samples_split=2, + min_samples_leaf=1, + # min_weight_fraction_leaf=0, + # max_features="auto", + # max_leaf_nodes=None, + random_state=None, + min_impurity_decrease=0, + min_impurity_split=0, + # class_weight=None, + # ccp_alpha=0.0, + # New args + feature_combinations=1.2, + density=0.7 + ): + + # self.criterion=criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + # self.min_weight_fraction_leaf=min_weight_fraction_leaf + # self.max_features=max_features + # self.max_leaf_nodes=max_leaf_nodes + self.random_state = random_state + self.min_impurity_decrease = min_impurity_decrease + self.min_impurity_split = min_impurity_split + # self.class_weight=class_weight + # self.ccp_alpha=ccp_alpha + + self.feature_combinations = feature_combinations + self.density = density + + def fit(self, X, y): + """ + Fits the oblique tree to the training samples. + + Parameters + --- + X : array of shape [n_samples, n_features] + The training samples. + y : array of shape [n_samples] + Labels for the training samples. + + Returns + --- + ObliqueTreeClassifier + The fit classifier. + """ + + # Round up after dividing so that at least one projection dimension is kept. + self.proj_dims = int(np.ceil(X.shape[1] / self.feature_combinations)) + splitter = ObliqueSplitter( + X, y, self.proj_dims, self.density, self.random_state + ) + + self.tree = ObliqueTree( + splitter, + self.min_samples_split, + self.min_samples_leaf, + self.max_depth, + self.min_impurity_split, + self.min_impurity_decrease, + ) + self.tree.build() + return self + + def apply(self, X): + """ + Gets predictions from the oblique tree for the test samples. + + Parameters + --- + X : array of shape [n_samples, n_features] + The testing samples. + + Returns + --- + pred_nodes : array of shape[n_samples] + The indices for each test sample's final node in the oblique tree.
+ """ + + pred_nodes = self.tree.predict(X).astype(int) + return pred_nodes + + def predict(self, X): + """ + Determines final label predictions for each sample in the test data. + + Parameters + --- + X : array of shape [n_samples, n_features] + The testing samples. + + Returns + --- + preds : array of shape[n_samples] + The predictions (labels) for each testing sample. + """ + + preds = np.zeros(X.shape[0]) + pred_nodes = self.apply(X) + for k in range(len(pred_nodes)): + id = pred_nodes[k] + preds[k] = self.tree.nodes[id].label + + return preds + + def predict_proba(self, X): + """ + Determines probabilities of the final label predictions for each sample in the test data. + + Parameters + --- + X : array of shape [n_samples, n_features] + The testing samples. + + Returns + --- + preds : array of shape[n_samples] + The probabilities of the predictions (labels) for each testing sample. + """ + + preds = np.zeros(X.shape[0]) + pred_nodes = self.apply(X) + for k in range(len(preds)): + id = pred_nodes[k] + preds[k] = self.tree.nodes[id].proba + + return preds + + def predict_log_proba(self, X): + """ + Determines log of the probabilities of the final label predictions for each sample in the test data. + + Parameters + --- + X : array of shape [n_samples, n_features] + The testing samples. + + Returns + --- + preds : array of shape[n_samples] + The log of the probabilities of the predictions (labels) for each testing sample. 
+ """ + + proba = self.predict_proba(X) + for k in range(len(proba)): + proba[k] = np.log(proba[k]) + + return proba From ecb7b0718d7a26141b856bee85e71e1d7caaec42 Mon Sep 17 00:00:00 2001 From: parthgvora Date: Fri, 11 Dec 2020 12:06:21 -0500 Subject: [PATCH 11/17] oblique trees added --- proglearn/forest.py | 5 ++++- proglearn/tests/test_forest.py | 12 +++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/proglearn/forest.py b/proglearn/forest.py index e8812b8148..50ff12a3b8 100644 --- a/proglearn/forest.py +++ b/proglearn/forest.py @@ -3,7 +3,10 @@ Corresponding Email: levinewill@icloud.com """ from .progressive_learner import ClassificationProgressiveLearner -from .transformers import TreeClassificationTransformer +from .transformers import ( + TreeClassificationTransformer, + ObliqueTreeClassificationTransformer, +) from .voters import TreeClassificationVoter from .deciders import SimpleArgmaxAverage diff --git a/proglearn/tests/test_forest.py b/proglearn/tests/test_forest.py index acf7e9976b..b03a4d1323 100644 --- a/proglearn/tests/test_forest.py +++ b/proglearn/tests/test_forest.py @@ -4,7 +4,10 @@ import random from proglearn.forest import LifelongClassificationForest -from proglearn.transformers import TreeClassificationTransformer +from proglearn.transformers import ( + TreeClassificationTransformer, + ObliqueTreeClassificationTransformer, +) from proglearn.voters import TreeClassificationVoter from proglearn.deciders import SimpleArgmaxAverage @@ -47,3 +50,10 @@ def test_correct_default_n_estimators(self): def test_correct_true_initilization_finite_sample_correction(self): l2f = LifelongClassificationForest(default_kappa=np.inf) assert l2f.pl_.default_voter_kwargs == {"kappa": np.inf} + + def test_oblique_transformer(self): + l2f = LifelongClassificationForest(oblique=True) + assert l2f.pl_.default_transformer_class == ObliqueTreeClassificationTransformer + assert l2f.default_feature_combinations == 1.5 + assert l2f.default_density 
== 0.5 + assert l2f.pl_.default_transformer_kwargs == {} From 39cf94a311427212630f83098c1ea54125d9c770 Mon Sep 17 00:00:00 2001 From: parthgvora Date: Fri, 11 Dec 2020 12:36:32 -0500 Subject: [PATCH 12/17] consistency of default params --- proglearn/transformers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/proglearn/transformers.py b/proglearn/transformers.py index cc22acaa20..ea042606b1 100644 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -989,8 +989,8 @@ def __init__( # class_weight=None, # ccp_alpha=0.0, # New args - feature_combinations=1.2, - density=0.7 + feature_combinations=1.5, + density=0.5 ): # self.criterion=criterion From df391b6a7afc8139203803561dd72364e504ec71 Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 11 Dec 2020 17:41:29 -0700 Subject: [PATCH 13/17] made some documentation changes and created decision boundaries notebook --- .../oblique_decision_boundaries_functions.py | 96 +++++ .../functions/sporf_datasets_functions.py | 90 +++++ .../oblique_decision_boundaries.ipynb | 138 ++++++++ docs/tutorials/sporf_datasets.ipynb | 331 ++++++++++++++++++ proglearn/forest.py | 15 + proglearn/transformers.py | 86 ++--- 6 files changed, 713 insertions(+), 43 deletions(-) create mode 100644 docs/tutorials/functions/oblique_decision_boundaries_functions.py create mode 100644 docs/tutorials/functions/sporf_datasets_functions.py create mode 100644 docs/tutorials/oblique_decision_boundaries.ipynb create mode 100644 docs/tutorials/sporf_datasets.ipynb diff --git a/docs/tutorials/functions/oblique_decision_boundaries_functions.py b/docs/tutorials/functions/oblique_decision_boundaries_functions.py new file mode 100644 index 0000000000..7be5367339 --- /dev/null +++ b/docs/tutorials/functions/oblique_decision_boundaries_functions.py @@ -0,0 +1,96 @@ +from rerf.rerfClassifier import rerfClassifier + +import numpy as np +np.random.seed(42) + +import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap 
+from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.datasets import make_moons, make_circles, make_classification +from sklearn.ensemble import RandomForestClassifier + +from proglearn.forest import LifelongClassificationForest +from proglearn.voters import TreeClassificationVoter +from proglearn.transformers import TreeClassificationTransformer +from proglearn.transformers import ObliqueTreeClassificationTransformer +from proglearn.deciders import SimpleArgmaxAverage + +def test(NT, h, names, classifiers, datasets): + i = 1 + # iterate over datasets + for ds_cnt, ds in enumerate(datasets): + # preprocess dataset, split into training and test part + X, y = ds + X = StandardScaler().fit_transform(X) + X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=.4, random_state=42) + + x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 + y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 + xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) + + # just plot the dataset first + cm = plt.cm.RdBu + cm_bright = ListedColormap(['#FF0000', '#0000FF']) + ax = plt.subplot(len(datasets), len(classifiers) + 1, i) + if ds_cnt == 0: + ax.set_title("Input data") + # Plot the training points + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, + edgecolors='k') + # Plot the testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, + edgecolors='k') + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + i += 1 + + # iterate over classifiers + for name, clf in zip(names, classifiers): + ax = plt.subplot(len(datasets), len(classifiers) + 1, i) + + if "Proglearn" in name: + + clf = LifelongClassificationForest(oblique=True, + default_feature_combinations=1, default_density=0.5) + clf.add_task(X_train, y_train, n_estimators=NT) + y_hat = clf.predict(X_test, task_id=0) + 
score = np.sum(y_hat == y_test) / len(y_test) + + else: + clf.fit(X_train, y_train) + score = clf.score(X_test, y_test) + + # Plot the decision boundary. For that, we will assign a color to each + # point in the mesh [x_min, x_max]x[y_min, y_max]. + if hasattr(clf, "decision_function"): + Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) + elif "Proglearn" in name: + Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()], task_id=0)[:, 1] + else: + Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] + + # Put the result into a color plot + Z = Z.reshape(xx.shape) + ax.contourf(xx, yy, Z, cmap=cm, alpha=.8) + + # Plot the training points + ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, + edgecolors='k') + # Plot the testing points + ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, + edgecolors='k', alpha=0.6) + + ax.set_xlim(xx.min(), xx.max()) + ax.set_ylim(yy.min(), yy.max()) + ax.set_xticks(()) + ax.set_yticks(()) + if ds_cnt == 0: + ax.set_title(name) + ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'), + size=15, horizontalalignment='right') + i += 1 \ No newline at end of file diff --git a/docs/tutorials/functions/sporf_datasets_functions.py b/docs/tutorials/functions/sporf_datasets_functions.py new file mode 100644 index 0000000000..cbee97c497 --- /dev/null +++ b/docs/tutorials/functions/sporf_datasets_functions.py @@ -0,0 +1,90 @@ +import sys +import numpy as np +import pandas as pd +import csv +from numpy import genfromtxt + +from proglearn.progressive_learner import ProgressiveLearner +from proglearn.voters import TreeClassificationVoter +from proglearn.transformers import TreeClassificationTransformer +from proglearn.transformers import ObliqueTreeClassificationTransformer +from proglearn.deciders import SimpleArgmaxAverage + +from sklearn.model_selection import train_test_split, cross_val_score + +def load_simulated_data(file): + data = genfromtxt(file, delimiter=',') + X = data[:, :-1] + y 
= data[:, -1] + + return X, y + +def load_data(data_file, task_num): + if "Hill_Valley" in data_file: + df = pd.read_csv(data_file) + X = df[df.columns[:-1]].to_numpy() + y = df[df.columns[-1]].to_numpy() + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, stratify=y) + + if "acute" in data_file: + + df = pd.read_table(data_file, encoding='utf-16') + df[df == "no"] = 0 + df[df == "yes"] = 1 + + data = df.to_numpy() + temps = data[:, 0] + + temperature = [] + for i in range(len(temps)): + temp_str = temps[i] + temp_str = temp_str.replace(",", ".") + temperature.append(float(temp_str)) + + data[:, 0] = np.array(temperature) + + X = np.array(data[:, :5], dtype=float) + + # 6 for task 1, 7 for task 2 + if task_num == 1: + y = np.array(data[:, 6], dtype=float) + else: + y = np.array(data[:, 7], dtype=float) + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, stratify=y) + + return X_train, X_test, y_train, y_test, len(np.unique(y)) + + +def test(data_file, reps, n_trees, task_num, + default_transformer_class, default_transformer_kwargs): + default_voter_class = TreeClassificationVoter + default_voter_kwargs = {} + + default_decider_class = SimpleArgmaxAverage + + kappa = np.zeros(reps) + for i in range(reps): + X_train, X_test, y_train, y_test, n_classes = load_data(data_file, task_num) + default_decider_kwargs = {"classes": np.arange(n_classes)} + + pl = ProgressiveLearner( + default_transformer_class=default_transformer_class, + default_transformer_kwargs=default_transformer_kwargs, + default_voter_class=default_voter_class, + default_voter_kwargs=default_voter_kwargs, + default_decider_class=default_decider_class, + default_decider_kwargs=default_decider_kwargs) + + pl.add_task(X_train, y_train, num_transformers=n_trees) + + y_hat = pl.predict(X_test, task_id=0) + + acc = np.sum(y_test == y_hat) / len(y_test) + print("Accuracy after iteration ", i, ": ", acc) + + chance_pred = 1 / n_classes 
+ kappa[i] = (acc - chance_pred) / (1 - chance_pred) + + return np.mean(kappa) * 100, (np.std(kappa) * 100) / np.sqrt(reps) \ No newline at end of file diff --git a/docs/tutorials/oblique_decision_boundaries.ipynb b/docs/tutorials/oblique_decision_boundaries.ipynb new file mode 100644 index 0000000000..4332508d39 --- /dev/null +++ b/docs/tutorials/oblique_decision_boundaries.ipynb @@ -0,0 +1,138 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Use SPORF to Draw Decision Boundaries" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The purpose of this notebook is to show that the oblique tree in ProgLearn is correct and can accurately determine oblique splits." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import required packages" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from rerf.rerfClassifier import rerfClassifier\n", + "\n", + "import numpy as np\n", + "np.random.seed(42)\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib.colors import ListedColormap\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.datasets import make_moons, make_circles, make_classification\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "from proglearn.forest import LifelongClassificationForest\n", + "from proglearn.voters import TreeClassificationVoter\n", + "from proglearn.transformers import TreeClassificationTransformer\n", + "from proglearn.transformers import ObliqueTreeClassificationTransformer\n", + "from proglearn.deciders import SimpleArgmaxAverage\n", + "\n", + "from functions.oblique_decision_boundaries_functions import test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set parameters, initialize datasets, and initialize classifiers" + ] + }, + { + 
"cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "NT = 10\n", + "\n", + "h = .1 # step size in the mesh\n", + "\n", + "names = [\"RF\", \"RerF\", \"Proglearn-SPORF\"]\n", + "\n", + "classifiers = [\n", + " RandomForestClassifier(max_depth=5, n_estimators=NT, max_features=1),\n", + " rerfClassifier(n_estimators = NT, feature_combinations=1.5, max_features=2),\n", + " LifelongClassificationForest(oblique=True, default_feature_combinations=1, default_density=0.5)]\n", + "\n", + "X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,\n", + " random_state=1, n_clusters_per_class=1)\n", + "rng = np.random.RandomState(2)\n", + "X += 2 * rng.uniform(size=X.shape)\n", + "linearly_separable = (X, y)\n", + "\n", + "datasets = [make_moons(noise=0.3, random_state=0),\n", + " make_circles(noise=0.2, factor=0.5, random_state=1),\n", + " linearly_separable\n", + " ]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run on all datasets for all models" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "figure = plt.figure(figsize=(15, 9))\n", + "test(NT, h, names, classifiers, datasets)\n", + "plt.tight_layout()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/tutorials/sporf_datasets.ipynb b/docs/tutorials/sporf_datasets.ipynb new file mode 100644 index 0000000000..31ab947477 --- /dev/null +++ b/docs/tutorials/sporf_datasets.ipynb @@ -0,0 +1,331 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SPORF Tutorial" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The purpose of this tutorial is to prove that this pure python implementation of SPORF is identical, in terms of functionality, to the one used in the SPORF paper (Tomita, Tyler M., et al. \"Sparse projection oblique randomer forests.\" Journal of Machine Learning Research 21.104 (2020): 1-39.). In order to do this, this notebook runs this implementation of SPORF on 3 different data sets: hill valley, acute inflammation task 1, and acute inflammation task 2. Cohen's Kappa (fractional decrease in error rate over the chance error rate) is the metric that is being used to compare the implementations. If this implementation has the same kappa values (for the same data sets) as the one in the SPORF paper, we can say with confidence that this implementation is accurate. The datasets used in this notebook all had kappa values of 100 ± 0 in the SPORF paper implementation, which is also what is found when run on this SPORF implementation, as seen below. 
Thus, we can say with confidence that this implementation of SPORF is accurate." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import required packages" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from proglearn.progressive_learner import ProgressiveLearner\n", + "from proglearn.forest import LifelongClassificationForest\n", + "from proglearn.voters import TreeClassificationVoter\n", + "from proglearn.transformers import TreeClassificationTransformer\n", + "from proglearn.transformers import ObliqueTreeClassificationTransformer\n", + "from proglearn.deciders import SimpleArgmaxAverage\n", + "\n", + "from sklearn.model_selection import train_test_split, cross_val_score\n", + "\n", + "from functions.sporf_datasets_functions import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SPORF\n", + "\n", + "## Set parameters and run on hill valley without noise data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy after iteration 0 : 1.0\n", + "Accuracy after iteration 1 : 1.0\n", + "Accuracy after iteration 2 : 1.0\n", + "Accuracy after iteration 3 : 1.0\n", + "Accuracy after iteration 4 : 1.0\n", + "kappa: 100.0 , error: 0.0\n" + ] + } + ], + "source": [ + "max_depth = 10\n", + "feature_combinations = 2\n", + "density = 0.01\n", + "reps = 5\n", + "n_trees = 10\n", + "task_num = 1\n", + "\n", + "kwargs = {\"kwargs\" : {\"max_depth\" : max_depth, \"feature_combinations\" : feature_combinations, \"density\" : density}}\n", + "\n", + "kappa, err = test(\"https://archive.ics.uci.edu/ml/machine-learning-databases/hill-valley/Hill_Valley_without_noise_Training.data\", reps, n_trees, task_num,\n", + " ObliqueTreeClassificationTransformer,\n", + " 
kwargs)\n", + "\n", + "print(\"kappa: \", kappa, \", error:\", err)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set parameters and run on acute inflammation task 1 data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy after iteration 0 : 1.0\n", + "Accuracy after iteration 1 : 1.0\n", + "Accuracy after iteration 2 : 1.0\n", + "Accuracy after iteration 3 : 1.0\n", + "Accuracy after iteration 4 : 1.0\n", + "kappa: 100.0 , error: 0.0\n" + ] + } + ], + "source": [ + "max_depth = 10\n", + "feature_combinations = 1.5\n", + "density = 0.5\n", + "reps = 5\n", + "n_trees = 10\n", + "task_num = 1\n", + "\n", + "kwargs = {\"kwargs\" : {\"max_depth\" : max_depth, \"feature_combinations\" : feature_combinations, \"density\" : density}}\n", + "\n", + "kappa, err = test(\"https://archive.ics.uci.edu/ml/machine-learning-databases/acute/diagnosis.data\", reps, n_trees, task_num,\n", + " ObliqueTreeClassificationTransformer,\n", + " kwargs)\n", + "\n", + "print(\"kappa: \", kappa, \", error:\", err)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set parameters and run on acute inflammation task 2 data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy after iteration 0 : 1.0\n", + "Accuracy after iteration 1 : 1.0\n", + "Accuracy after iteration 2 : 1.0\n", + "Accuracy after iteration 3 : 1.0\n", + "Accuracy after iteration 4 : 1.0\n", + "kappa: 100.0 , error: 0.0\n" + ] + } + ], + "source": [ + "max_depth = 10\n", + "feature_combinations = 1.5\n", + "density = 0.5\n", + "reps = 5\n", + "n_trees = 10\n", + "task_num = 2\n", + "\n", + "kwargs = {\"kwargs\" : {\"max_depth\" : max_depth, \"feature_combinations\" : feature_combinations, \"density\" : density}}\n", + "\n", + "kappa, err = 
test(\"https://archive.ics.uci.edu/ml/machine-learning-databases/acute/diagnosis.data\", reps, n_trees, task_num,\n", + " ObliqueTreeClassificationTransformer,\n", + " kwargs)\n", + "\n", + "print(\"kappa: \", kappa, \", error:\", err)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Random Forest (RF)\n", + "\n", + "Now we will run the same datasets on a base Random forest. The goal of this is to show how SPORF can clearly outperform or perform as well as the Random Forest algorithm. As seen by the results below, SPORF has a much higher kappa value, than RF, for the hill valley without noise data and has the same value for the acute inflammation data sets. Having a high kappa value is desired since as mentioned above, it is a measure of how much the error rate over the chance error rate decreases." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set parameters and run on hill valley without noise data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy after iteration 0 : 0.5409836065573771\n", + "Accuracy after iteration 1 : 0.5901639344262295\n", + "Accuracy after iteration 2 : 0.5901639344262295\n", + "Accuracy after iteration 3 : 0.6885245901639344\n", + "Accuracy after iteration 4 : 0.5245901639344263\n", + "kappa: 17.37704918032787 , error: 5.1130724431784715\n" + ] + } + ], + "source": [ + "max_depth = 10\n", + "feature_combinations = 2\n", + "density = 0.01\n", + "reps = 5\n", + "n_trees = 10\n", + "task_num = 1\n", + "\n", + "kwargs = {\"kwargs\" : {\"max_depth\" : max_depth} }\n", + "\n", + "kappa, err = test(\"https://archive.ics.uci.edu/ml/machine-learning-databases/hill-valley/Hill_Valley_without_noise_Training.data\", reps, n_trees, task_num,\n", + " TreeClassificationTransformer,\n", + " kwargs)\n", + "\n", + "print(\"kappa: \", kappa, \", error:\", err)" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set parameters and run on acute inflammation task 1 data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy after iteration 0 : 1.0\n", + "Accuracy after iteration 1 : 1.0\n", + "Accuracy after iteration 2 : 1.0\n", + "Accuracy after iteration 3 : 1.0\n", + "Accuracy after iteration 4 : 1.0\n", + "kappa: 100.0 , error: 0.0\n" + ] + } + ], + "source": [ + "max_depth = 10\n", + "feature_combinations = 1.5\n", + "density = 0.5\n", + "reps = 5\n", + "n_trees = 10\n", + "task_num = 1\n", + "\n", + "kwargs = {\"kwargs\" : {\"max_depth\" : max_depth} }\n", + "\n", + "kappa, err = test(\"https://archive.ics.uci.edu/ml/machine-learning-databases/acute/diagnosis.data\", reps, n_trees, task_num,\n", + " TreeClassificationTransformer,\n", + " kwargs)\n", + "\n", + "print(\"kappa: \", kappa, \", error:\", err)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set parameters and run on acute inflammation task 2 data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy after iteration 0 : 1.0\n", + "Accuracy after iteration 1 : 1.0\n", + "Accuracy after iteration 2 : 1.0\n", + "Accuracy after iteration 3 : 1.0\n", + "Accuracy after iteration 4 : 1.0\n", + "kappa: 100.0 , error: 0.0\n" + ] + } + ], + "source": [ + "max_depth = 10\n", + "feature_combinations = 1.5\n", + "density = 0.5\n", + "reps = 5\n", + "n_trees = 10\n", + "task_num = 2\n", + "\n", + "kwargs = {\"kwargs\" : {\"max_depth\" : max_depth} }\n", + "\n", + "kappa, err = test(\"https://archive.ics.uci.edu/ml/machine-learning-databases/acute/diagnosis.data\", reps, n_trees, task_num,\n", + " TreeClassificationTransformer,\n", + " kwargs)\n", + "\n", + "print(\"kappa: \", kappa, \", error:\", err)" + 
] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/proglearn/forest.py b/proglearn/forest.py index 50ff12a3b8..5b04099535 100644 --- a/proglearn/forest.py +++ b/proglearn/forest.py @@ -38,6 +38,15 @@ class LifelongClassificationForest(ClassificationProgressiveLearner): The maximum depth of a tree in the Lifelong Classification Forest. This is used if 'max_depth' is not fed to add_task. + oblique : bool, default=False + Specifies if an oblique tree should be used for the classifier or not. + + feature_combinations : float, default=1.5 + The feature combinations to use for the oblique split. + + density : float, default=0.5 + Density estimate. + Attributes ---------- pl_ : ClassificationProgressiveLearner @@ -124,6 +133,12 @@ def add_task( The maximum depth of a tree in the Lifelong Classification Forest. The default is used if 'default' is provided. + feature_combinations : float, default='default' + The feature combinations to use for the oblique split. + + density : float, default='default' + Density estimate. + Returns ------- self : LifelongClassificationForest diff --git a/proglearn/transformers.py b/proglearn/transformers.py index ea042606b1..93546fc116 100644 --- a/proglearn/transformers.py +++ b/proglearn/transformers.py @@ -271,8 +271,8 @@ class SplitInfo: """ A class used to store information about a certain split. - Parameters: - --- + Parameters + ---------- feature : int The feature which is used for the particular split.
threshold : float @@ -334,8 +334,8 @@ class ObliqueSplitter: A class used to represent an oblique splitter, where splits are done on the linear combination of the features. - Parameters: - --- + Parameters + ---------- X : array of shape [n_samples, n_features] The input data X is a matrix of the examples and their respective feature values for each of the features. @@ -349,7 +349,7 @@ class ObliqueSplitter: Controls the pseudo random number generator used to generate the projection matrix. Methods - --- + ------- sample_proj_mat(sample_inds) This gets the projection matrix and it fits the transform to the samples of interest. leaf_label_proba(idx) @@ -383,12 +383,12 @@ def sample_proj_mat(self, sample_inds): Gets the projection matrix and it fits the transform to the samples of interest. Parameters - --- + ---------- sample_inds : array of shape [n_samples] The data we are transforming. Returns - --- + ------- proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features) The generated sparse random matrix. proj_mat : {ndarray, sparse matrix} of shape (n_samples, n_features) @@ -410,13 +410,13 @@ def leaf_label_proba(self, idx): the leaf node for which this is used on. Parameters - --- + ---------- idx : array of shape [n_samples] The indices of the samples that are at the leaf node for which the label and probability need to be found. Returns - --- + ------- label : int The label for any sample that is predicted to be at this node. proba : float @@ -440,7 +440,7 @@ def score(self, y_sort, t): Finds the Gini impurity for the split of interest Parameters - --- + ---------- y_sort : array of shape [n_samples] A sorted array of labels for the examples for which the Gini impurity is being calculated. @@ -448,7 +448,7 @@ def score(self, y_sort, t): The threshold determining where to split y_sort. Returns - --- + ------- gini : float The Gini impurity of the split. 
""" @@ -480,12 +480,12 @@ def impurity(self, idx): Finds the actual impurity for a set of samples Parameters - --- + ---------- idx : array of shape [n_samples] The indices of the nodes in the set for which the impurity is being calculated. Returns - --- + ------- impurity : float Actual impurity of split. """ @@ -511,12 +511,12 @@ def split(self, sample_inds): bottleneck in integration with scikit-learn. Parameters - --- + ---------- sample_inds : array of shape [n_samples] The indices of the nodes in the set for which the best split is found. Returns - --- + ------- split_info : SplitInfo Class holding information about the split. """ @@ -599,12 +599,12 @@ class Node: """ A class used to represent an oblique node. - Parameters: - --- + Parameters + ---------- None Methods - --- + ------- None """ @@ -629,8 +629,8 @@ class StackRecord: """ A class used to keep track of a node's parent and other information about the node and its split. - Parameters: - --- + Parameters + ---------- parent : int The index of the parent node. depth : int @@ -645,7 +645,7 @@ class StackRecord: The number of samples in this node. Methods - --- + ------- None """ @@ -663,8 +663,8 @@ class ObliqueTree: """ A class used to represent a tree with oblique splits. - Parameters: - --- + Parameters + ---------- splitter : class The type of splitter for this tree, should be an ObliqueSplitter. min_samples_split : int @@ -679,7 +679,7 @@ class ObliqueTree: Minimum amount Gini impurity value must decrease by for a split to be valid. Methods - --- + ------- add_node(parent, is_left, impurity, n_samples, is_leaf, feature, threshold, proj_mat, label, proba) Adds a node to the existing tree build() @@ -731,7 +731,7 @@ def add_node( Adds a node to the existing oblique tree. Parameters - --- + ---------- parent : int The index of the parent node for the new node being added. is_left : bool @@ -757,7 +757,7 @@ def add_node( The probability a predicted sample has of being the node's label. 
Returns - --- + ------- node_id : int Index of the new node just added. """ @@ -796,11 +796,11 @@ def build(self): Builds the oblique tree. Parameters - --- + ---------- None Returns - --- + ------- None """ @@ -903,12 +903,12 @@ def predict(self, X): Predicts final nodes of samples given. Parameters - --- + ---------- X : array of shape [n_samples, n_features] The input array for which predictions are made. Returns - --- + ------- predictions : array of shape [n_samples] Array of the final node index for each input prediction sample. """ @@ -939,8 +939,8 @@ class ObliqueTreeClassifier(BaseEstimator): """ A class used to represent a classifier that uses an oblique decision tree. - Parameters: - --- + Parameters + ---------- max_depth : int Maximum depth allowed for oblique tree. min_samples_split : int @@ -959,7 +959,7 @@ class ObliqueTreeClassifier(BaseEstimator): Density estimate. Methods - --- + ------- fit(X,y) Fits the oblique tree to the training samples. apply(X) @@ -1014,14 +1014,14 @@ def fit(self, X, y): Predicts final nodes of samples given. Parameters - --- + ---------- X : array of shape [n_samples, n_features] The training samples. y : array of shape [n_samples] Labels for the training samples. Returns - --- + ------- ObliqueTreeClassifier The fit classifier. """ @@ -1047,12 +1047,12 @@ def apply(self, X): Gets predictions form the oblique tree for the test samples. Parameters - --- + ---------- X : array of shape [n_samples, n_features] The testing samples. Returns - --- + ------- pred_nodes : array of shape[n_samples] The indices for each test sample's final node in the oblique tree. """ @@ -1065,12 +1065,12 @@ def predict(self, X): Determines final label predictions for each sample in the test data. Parameters - --- + ---------- X : array of shape [n_samples, n_features] The testing samples. Returns - --- + ------- preds : array of shape[n_samples] The predictions (labels) for each testing sample. 
""" @@ -1088,12 +1088,12 @@ def predict_proba(self, X): Determines probabilities of the final label predictions for each sample in the test data. Parameters - --- + ---------- X : array of shape [n_samples, n_features] The testing samples. Returns - --- + ------- preds : array of shape[n_samples] The probabilities of the predictions (labels) for each testing sample. """ @@ -1111,12 +1111,12 @@ def predict_log_proba(self, X): Determines log of the probabilities of the final label predictions for each sample in the test data. Parameters - --- + ---------- X : array of shape [n_samples, n_features] The testing samples. Returns - --- + ------- preds : array of shape[n_samples] The log of the probabilities of the predictions (labels) for each testing sample. """ From 822430499394f099c11eef8c4240324a4d238ce6 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Sat, 12 Dec 2020 15:12:09 -0800 Subject: [PATCH 14/17] Update tutorials.rst Co-Authored-By: jmandav1 <39231283+jmandavilli@users.noreply.github.com> Co-Authored-By: parthgvora --- docs/tutorials.rst | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/tutorials.rst b/docs/tutorials.rst index b1f18032af..c269c60894 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -8,8 +8,18 @@ The following tutorials highlight what one can do with the ``ProgLearn`` package :maxdepth: 1 tutorials/installation_guide - tutorials/xor_nxor_exp + tutorials/fte_bte_food101 tutorials/label_shuffle_exp tutorials/random_class_exp - tutorials/uncertaintyforest_fig1 + tutorials/rotation_cifar + tutorials/spiral_exp + tutorials/sporf_datasets + tutorials/oblique_decision_boundaries tutorials/uncertaintyforest_running_example + tutorials/uncertaintyforest_posteriorestimates + tutorials/uncertaintyforest_conditionalentropyestimates + tutorials/uncertaintyforest_mutualinformationestimates + tutorials/xor_nxor_exp + tutorials/xor_rxor_exp + tutorials/xor_rxor_with_cpd + tutorials/xor_rxor_with_icp From 
cf93af41af9a18ae104bb3a9f1772a8d8e92ee06 Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 14 Dec 2020 20:28:37 -0700 Subject: [PATCH 15/17] renamed notebooks --- docs/tutorials.rst | 2 +- ...ecision_boundaries.ipynb => sporf_decision_boundaries.ipynb} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename docs/tutorials/{oblique_decision_boundaries.ipynb => sporf_decision_boundaries.ipynb} (100%) diff --git a/docs/tutorials.rst b/docs/tutorials.rst index f6f777b9cc..8f7900f709 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -14,7 +14,7 @@ The following tutorials highlight what one can do with the ``ProgLearn`` package tutorials/rotation_cifar tutorials/spiral_exp tutorials/sporf_datasets - tutorials/oblique_decision_boundaries + tutorials/sporf_decision_boundaries tutorials/uncertaintyforest_running_example tutorials/uncertaintyforest_posteriorestimates tutorials/uncertaintyforest_conditionalentropyestimates diff --git a/docs/tutorials/oblique_decision_boundaries.ipynb b/docs/tutorials/sporf_decision_boundaries.ipynb similarity index 100% rename from docs/tutorials/oblique_decision_boundaries.ipynb rename to docs/tutorials/sporf_decision_boundaries.ipynb From 706ef4243d40ac868df3a11623d4663bc834638f Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Mon, 14 Dec 2020 19:38:48 -0800 Subject: [PATCH 16/17] Update and rename oblique_decision_boundaries_functions.py to sporf_decision_boundaries_functions.py Co-Authored-By: jmandav1 <39231283+jmandavilli@users.noreply.github.com> Co-Authored-By: parthgvora --- ...ries_functions.py => sporf_decision_boundaries_functions.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/tutorials/functions/{oblique_decision_boundaries_functions.py => sporf_decision_boundaries_functions.py} (97%) diff --git a/docs/tutorials/functions/oblique_decision_boundaries_functions.py b/docs/tutorials/functions/sporf_decision_boundaries_functions.py similarity index 97% rename from 
docs/tutorials/functions/oblique_decision_boundaries_functions.py rename to docs/tutorials/functions/sporf_decision_boundaries_functions.py index 7be5367339..baef1cfc89 100644 --- a/docs/tutorials/functions/oblique_decision_boundaries_functions.py +++ b/docs/tutorials/functions/sporf_decision_boundaries_functions.py @@ -93,4 +93,4 @@ def test(NT, h, names, classifiers, datasets): ax.set_title(name) ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'), size=15, horizontalalignment='right') - i += 1 \ No newline at end of file + i += 1 From 6e1205e8b16f4baafe31a327bc67cf82c8effd41 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Mon, 14 Dec 2020 19:40:00 -0800 Subject: [PATCH 17/17] Rename function for consistency Co-Authored-By: jmandav1 <39231283+jmandavilli@users.noreply.github.com> Co-Authored-By: parthgvora --- docs/tutorials/sporf_decision_boundaries.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/sporf_decision_boundaries.ipynb b/docs/tutorials/sporf_decision_boundaries.ipynb index 4332508d39..70b10f2802 100644 --- a/docs/tutorials/sporf_decision_boundaries.ipynb +++ b/docs/tutorials/sporf_decision_boundaries.ipynb @@ -45,7 +45,7 @@ "from proglearn.transformers import ObliqueTreeClassificationTransformer\n", "from proglearn.deciders import SimpleArgmaxAverage\n", "\n", - "from functions.oblique_decision_boundaries_functions import test" + "from functions.sporf_decision_boundaries_functions import test" ] }, {