diff --git a/examples/ensemble/ExtraTrees_vs_RF_Gridsearch.ipynb b/examples/ensemble/ExtraTrees_vs_RF_Gridsearch.ipynb new file mode 100644 index 0000000000000..e417dcda0d6d8 --- /dev/null +++ b/examples/ensemble/ExtraTrees_vs_RF_Gridsearch.ipynb @@ -0,0 +1,417 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Demonstration of warmstart grid search to compare classifier performance\n", + "\n", + "An important step in classifier performance comparison is hyperparameter \n", + "optimization. Here, we specify the classifier models we want to tune and a \n", + "dictionary of hyperparameter ranges (preferably similar for fairness in \n", + "comparison) for each classifier. Then, we find the optimal hyperparameters \n", + "through a function that implements warmstart grid search and refit the optimized \n", + "models to obtain accuracies. The performance of each hyperparameter value pairing is visualized in heatmaps.\n", + "\n", + "In this example, we tune hyperparameters for two classifiers, Random Forest and Extra Trees, and compare their performance on an OpenML-CC18 benchmarking suite dataset (https://www.openml.org/d/40979). We can see clearly in the resulting plot that the optimized models perform better than or at least similar to the default parameter models. 
On the dataset we use in this example, RF performs marginally better than ExtraTrees overall.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Automatically created module for IPython interactive environment\n" + ] + } + ], + "source": [ + "print(__doc__)\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import sklearn\n", + "from sklearn import metrics\n", + "from sklearn.metrics import cohen_kappa_score, make_scorer\n", + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.ensemble import ExtraTreesClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.datasets import fetch_openml\n", + "\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from warnings import simplefilter\n", + "\n", + "simplefilter(action=\"ignore\", category=FutureWarning)\n", + "from warnings import simplefilter\n", + "\n", + "simplefilter(action=\"ignore\", category=FutureWarning)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def hyperparameter_optimization(X, y, *argv):\n", + " \"\"\"\n", + " Given any number of classifier types and \n", + " a dictionary of two hyperparameters to tune for each classifier, \n", + " find optimal pairs of hyperparameters.\n", + "\n", + " Parameters\n", + " ----------\n", + " X : numpy.ndarray\n", + " Input data, shape (n_samples, n_features)\n", + " y : numpy.ndarray\n", + " Output data, shape (n_samples, n_outputs)\n", + " *argv : list of tuples (classifier, hyperparameters)\n", + " List of (classifier, hyperparameters) tuples:\n", + "\n", + " classifier : sklearn-compliant classifier\n", + " For example sklearn.ensemble.RandomForestRegressor\n", + " hyperparameters : dictionary of hyperparameter values or 
range\n", + "\n", + " Returns\n", + " -------\n", + " clf_best_params : dictionary\n", + " Dictionary of classifiers and their respective optimal hyperparameters\n", + " \"\"\"\n", + "\n", + " best_models = {}\n", + "\n", + " # Iterate over all (classifier, hyperparameter dict) pairs\n", + " for clf, params in argv:\n", + " best_params = grid_search(X, y, clf, params)\n", + " best_models[clf.__class__.__name__] = best_params\n", + "\n", + " return best_models\n", + "\n", + "\n", + "def grid_search(X, y, clf, params):\n", + " \"\"\"\n", + " Given a classifier and two hyperparameters and the \n", + " range/values to search for each, find optimal hyperparameter \n", + " values using warmstart grid search parameter sweeps.\n", + "\n", + " Parameters\n", + " ----------\n", + " X : numpy.ndarray\n", + " Input data, shape (n_samples, n_features)\n", + " y : numpy.ndarray\n", + " Output data, shape (n_samples, n_outputs)\n", + " clf : sklearn-compliant classifier\n", + " For example sklearn.ensemble.RandomForestRegressor\n", + " params : dictionary of hyperparameter values or range\n", + "\n", + " Returns\n", + " -------\n", + " best_params : dictionary\n", + " Dictionary of best hyperparameters\n", + " \"\"\"\n", + " param1_name = list(params.keys())[0]\n", + " param2_name = list(params.keys())[1]\n", + " param1 = params[param1_name]\n", + " param2 = params[param2_name]\n", + "\n", + " # sweep over all pairs of parameter combinations and collect mean scores\n", + " kappa_scorer = make_scorer(cohen_kappa_score)\n", + " mean_scores = np.zeros((np.shape(param1)[0], np.shape(param2)[0]))\n", + " for idx1, val1 in enumerate(param1):\n", + " clf.max_features = val1 #change .max_features to .name of 1st parameter\n", + " for idx2, val2 in enumerate(param2):\n", + " clf.n_estimators = val2 #change .n_estimators to .name of 2nd parameter \n", + " score = cross_val_score(clf, X, y, scoring=kappa_scorer, cv=5)\n", + " mean_scores[idx1][idx2] = np.mean(score)\n", + "\n", + " # select 
parameter pair with highest kappa score\n", + " best_idx1, best_idx2 = np.unravel_index(\n", + " np.argmax(mean_scores, axis=None), np.shape(mean_scores)\n", + " )\n", + " best_params = {param1_name: param1[best_idx1], param2_name: param2[best_idx2]}\n", + "\n", + " # generate heatmap\n", + " param_heatmap(params, mean_scores, clf.__class__.__name__)\n", + "\n", + " return best_params\n", + "\n", + "\n", + "def param_heatmap(params, scores, clf_name):\n", + " \"\"\"\n", + " Given a dictionary of two parameter ranges, scores \n", + " for each pair of parameter values, and classifier name, \n", + " generate heatmap showing model performance scores for each \n", + " pair of parameter values.\n", + "\n", + " Parameters\n", + " ----------\n", + " params : dictionary of hyperparameter ranges\n", + " scores : ndarray \n", + " Scores for each parameter value pair\n", + " clf_name : string\n", + " Name of sklearn-compliant classifier\n", + " \"\"\"\n", + " param1_name = list(params.keys())[0]\n", + " param2_name = list(params.keys())[1]\n", + " param1 = params[param1_name]\n", + " param2 = params[param2_name]\n", + "\n", + " scores = -np.array(scores)\n", + " scores = scores.ravel().argsort().argsort().reshape(scores.shape)\n", + " plt.figure(figsize=(8, 6))\n", + " plt.subplots_adjust(left=0.2, right=0.95, bottom=0.15, top=0.95)\n", + " plt.imshow(scores, interpolation=\"nearest\", cmap=plt.cm.Blues)\n", + " plt.xlabel(param2_name)\n", + " plt.ylabel(param1_name)\n", + " plt.colorbar()\n", + " plt.xticks(np.arange(len(param2)), param2)\n", + " plt.yticks(np.arange(len(param1)), param1)\n", + " plt.title(\"Grid Search Kappa Rank \" + clf_name)\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building classifiers and specifying parameter ranges or values to search\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# get some data\n", + "X, y = 
fetch_openml(data_id=40979, return_X_y=True, as_frame=True)\n", + "y = pd.factorize(y)[0]\n", + "X = X.apply(lambda x: pd.factorize(x)[0])\n", + "n_samples, n_features = np.shape(X)\n", + "\n", + "# build a classifier with warm_start=True\n", + "extraTrees = ExtraTreesClassifier(warm_start=True)\n", + "\n", + "# specify parameters and ranges or values to search\n", + "extraTrees_param_dict = {\n", + " \"max_features\": [\"sqrt\", \"log2\", None],\n", + " \"n_estimators\": [10, 30, 50, 70],\n", + "}\n", + "\n", + "# build another classifier with warm_start=True\n", + "rf = RandomForestClassifier(warm_start=True)\n", + "\n", + "# specify parameters and ranges or values to search\n", + "rf_param_dict = {\n", + " \"max_features\": [\"sqrt\", \"log2\", None],\n", + " \"n_estimators\": [10, 30, 50, 70],\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Obtaining best parameters dictionary and refitting" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'ExtraTreesClassifier': {'max_features': 'sqrt', 'n_estimators': 70}, 'RandomForestClassifier': {'max_features': 'sqrt', 'n_estimators': 70}}\n" + ] + } + ], + "source": [ + "tuned_params = hyperparameter_optimization(\n", + " X, y, (extraTrees, extraTrees_param_dict), (rf, rf_param_dict)\n", + ")\n", + "\n", + "print(tuned_params)\n", + "\n", + "# extract values from dict - seperate each classifier's param dict\n", + "keys, values = zip(*tuned_params.items())\n", + "\n", + "# train test split\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.33, random_state=42\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def get_accuracy(model, X_train, y_train, X_test, y_test):\n", + " \"\"\"\n", + " Given a model, train, and test data, \n", + " fit model and calculate accuracy of predictions.\n", + "\n", + " Parameters\n", + " ----------\n", + " model : sklearn-compliant classifier\n", + " X_train : numpy.ndarray\n", + " Train input data, shape (n_samples, n_features)\n", + " y_train numpy.ndarray\n", + " Train output data, shape (n_samples, n_outputs)\n", + " X_test: numpy.ndarray\n", + " Test input data, shape (n_samples, n_features)\n", + " y_test:numpy.ndarray\n", + " Test output data, shape (n_samples, n_outputs)\n", + "\n", + " Returns\n", + " -------\n", + " accuracy : float\n", + " An sklearn metric for model performance.\n", + " \"\"\"\n", + "\n", + " model.fit(X_train, y_train)\n", + " predictions = model.predict(X_test)\n", + " accuracy = metrics.accuracy_score(y_test, predictions)\n", + " return accuracy\n", + "\n", + "\n", + "# get accuracies of optimized and default models\n", + "extraTrees_models = [ExtraTreesClassifier(**values[0]), ExtraTreesClassifier()]\n", + "extraTrees_acc = []\n", 
+ "for model in extraTrees_models:\n", + " extraTrees_acc.append(get_accuracy(model, X_train, y_train, X_test, y_test))\n", + "\n", + "rf_models = [RandomForestClassifier(**values[1]), RandomForestClassifier()]\n", + "rf_acc = []\n", + "for model in rf_models:\n", + " rf_acc.append(get_accuracy(model, X_train, y_train, X_test, y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plotting the result" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "labels = [\"ExtraTrees\", \"RF\"]\n", + "x = np.arange(len(labels))\n", + "width = 0.35\n", + "fig, ax = plt.subplots()\n", + "rects1 = ax.bar(x - width / 2, extraTrees_acc, width, label=\"Optimized\")\n", + "rects2 = ax.bar(x + width / 2, rf_acc, width, label=\"Default\")\n", + "\n", + "# Add some text for labels, title and custom x-axis tick labels, etc.\n", + "ax.set_ylabel(\"Accuracy\")\n", + "ax.set_title(\n", + " \"Optimized/Default ExtraTrees and RF Performance on cylinder-bands Dataset\"\n", + ")\n", + "ax.set_xticks(x)\n", + "ax.set_xticklabels(labels)\n", + "ax.legend()\n", + "\n", + "def autolabel(rects):\n", + " \"\"\"Attach a text label above each bar in *rects*, displaying its height.\"\"\"\n", + " for rect in rects:\n", + " height = float(\"%.3f\" % (rect.get_height()))\n", + " ax.annotate(\n", + " \"{}\".format(height),\n", + " xy=(rect.get_x() + rect.get_width() / 2, height),\n", + " xytext=(0, 3), # 3 points vertical offset\n", + " textcoords=\"offset points\",\n", + " ha=\"center\",\n", + " va=\"bottom\",\n", + " )\n", + "\n", + "autolabel(rects1)\n", + "autolabel(rects2)\n", + "fig.tight_layout()\n", + "plt.ylim((0.9, 1))\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/ensemble/ExtraTrees_vs_RF_Gridsearch.py b/examples/ensemble/ExtraTrees_vs_RF_Gridsearch.py new file mode 100644 index 
"""
============================================================================
Demonstration of warmstart grid search to compare classifier performance
============================================================================
An important step in classifier performance comparison is hyperparameter
optimization. Here, we specify the classifier models we want to tune and a
dictionary of hyperparameter ranges (preferably similar for fairness in
comparison) for each classifier. Then, we find the optimal hyperparameters
through a function that implements warmstart grid search and refit the
optimized models to obtain accuracies. The performance of each hyperparameter
value pairing is visualized in heatmaps.

In this example, we tune hyperparameters for two classifiers, Random Forest
and Extra Trees, and compare their performance on an OpenML-CC18 benchmarking
suite dataset (https://www.openml.org/d/40979). We can see clearly in the
resulting plot that the optimized models perform better than or at least
similar to the default parameter models. On the dataset we use in this
example, RF performs marginally better than ExtraTrees overall.
"""
print(__doc__)

from warnings import simplefilter

import pandas as pd
import numpy as np

import sklearn
from sklearn import metrics
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_openml

import matplotlib
import matplotlib.pyplot as plt

simplefilter(action="ignore", category=FutureWarning)

# OpenML id of the data set used throughout the example (also shown in the
# final plot title so that text and data cannot drift apart).
DATA_ID = 40979


def hyperparameter_optimization(X, y, *argv):
    """Find the best pair of hyperparameter values for each classifier.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data.
    y : array-like of shape (n_samples,)
        Target labels.
    *argv : tuples (classifier, hyperparameters)
        One tuple per classifier to tune:

        classifier : sklearn-compliant classifier
            For example ``sklearn.ensemble.RandomForestClassifier()``.
        hyperparameters : dict
            Exactly two entries, each mapping a hyperparameter name to the
            list of candidate values to search.

    Returns
    -------
    best_models : dict
        Maps each classifier's class name to the dictionary of its best
        hyperparameters. Because the key is the class name, passing two
        classifiers of the same class keeps only the result of the last one.
    """
    best_models = {}

    # Iterate over all (classifier, hyperparameter dict) pairs
    for clf, params in argv:
        best_models[type(clf).__name__] = grid_search(X, y, clf, params)

    return best_models


def grid_search(X, y, clf, params):
    """Exhaustively score every pair of values of two hyperparameters.

    Each pair is evaluated with 5-fold cross-validation using Cohen's kappa;
    the pair with the highest mean score is returned and a rank heatmap of
    the whole grid is displayed via :func:`param_heatmap`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input data.
    y : array-like of shape (n_samples,)
        Target labels.
    clf : sklearn-compliant classifier
        For example ``sklearn.ensemble.RandomForestClassifier()``. It is
        cloned, so the instance passed in is left unmodified.
    params : dict
        Exactly two entries, each mapping a hyperparameter name to a
        non-empty list of candidate values.

    Returns
    -------
    best_params : dict
        The two hyperparameter names mapped to their best values.

    Raises
    ------
    ValueError
        If ``params`` does not hold exactly two non-empty grids, or if a
        name is not a valid parameter of ``clf`` (raised by ``set_params``).
    """
    if len(params) != 2:
        raise ValueError(
            "params must contain exactly two hyperparameters, got {}".format(
                len(params)
            )
        )
    (param1_name, param1), (param2_name, param2) = params.items()
    param1, param2 = list(param1), list(param2)
    if not param1 or not param2:
        raise ValueError("every hyperparameter needs at least one candidate value")

    kappa_scorer = make_scorer(cohen_kappa_score)
    # Work on a copy so the caller's estimator keeps its configuration.
    estimator = clone(clf)

    # sweep over all pairs of parameter combinations and collect mean scores
    mean_scores = np.zeros((len(param1), len(param2)))
    for idx1, val1 in enumerate(param1):
        for idx2, val2 in enumerate(param2):
            # Set the parameters by *name* so that any two hyperparameters
            # can be searched, not only max_features / n_estimators.
            estimator.set_params(**{param1_name: val1, param2_name: val2})
            fold_scores = cross_val_score(
                estimator, X, y, scoring=kappa_scorer, cv=5
            )
            mean_scores[idx1, idx2] = np.mean(fold_scores)

    # select parameter pair with highest kappa score
    best_idx1, best_idx2 = np.unravel_index(
        np.argmax(mean_scores), mean_scores.shape
    )
    best_params = {param1_name: param1[best_idx1], param2_name: param2[best_idx2]}

    # generate heatmap
    param_heatmap(params, mean_scores, type(clf).__name__)

    return best_params


def param_heatmap(params, scores, clf_name):
    """Show a heatmap of the score *rank* of every hyperparameter pair.

    Parameters
    ----------
    params : dict
        Hyperparameter grids; the first entry labels the rows (y axis) and
        the second entry labels the columns (x axis).
    scores : array-like of shape (len(first grid), len(second grid))
        Score of each parameter value pair.
    clf_name : str
        Classifier name appended to the plot title.
    """
    (param1_name, param1), (param2_name, param2) = list(params.items())[:2]

    # Negate so that ascending argsort orders from best to worst; the second
    # argsort turns that ordering into ranks (0 = highest score).
    negated = -np.array(scores)
    ranks = negated.ravel().argsort().argsort().reshape(negated.shape)

    plt.figure(figsize=(8, 6))
    plt.subplots_adjust(left=0.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(ranks, interpolation="nearest", cmap=plt.cm.Blues)
    plt.xlabel(param2_name)
    plt.ylabel(param1_name)
    colorbar = plt.colorbar()
    colorbar.set_label("rank (0 = best)")
    # str() so that a candidate such as None still gets a visible tick label.
    plt.xticks(np.arange(len(param2)), [str(value) for value in param2])
    plt.yticks(np.arange(len(param1)), [str(value) for value in param1])
    plt.title("Grid Search Kappa Rank " + clf_name)
    plt.show()


###############################################################################
# Building classifiers and specifying parameter ranges or values to search
# ----------------------------------------------------------
#

# get some data
X, y = fetch_openml(data_id=DATA_ID, return_X_y=True, as_frame=True)
y = pd.factorize(y)[0]
X = X.apply(lambda column: pd.factorize(column)[0])
n_samples, n_features = np.shape(X)

# Hold out the test set *before* tuning so that the data used to compare the
# final models never influences the choice of hyperparameters.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# build a classifier with warm_start=True
# NOTE(review): cross_val_score fits clones of the estimator for every fold,
# so trees grown for one grid point are presumably not reused for the next
# one -- confirm against the scikit-learn documentation.
extra_trees = ExtraTreesClassifier(warm_start=True)

# specify parameters and ranges or values to search
extra_trees_param_dict = {
    "max_features": ["sqrt", "log2", None],
    "n_estimators": [10, 30, 50, 70],
}

# build another classifier with warm_start=True
rf = RandomForestClassifier(warm_start=True)

# specify parameters and ranges or values to search
rf_param_dict = {
    "max_features": ["sqrt", "log2", None],
    "n_estimators": [10, 30, 50, 70],
}

###############################################################################
# Obtaining best parameters dictionary and refitting
# ----------------------------------------------------------
#

tuned_params = hyperparameter_optimization(
    X_train, y_train, (extra_trees, extra_trees_param_dict), (rf, rf_param_dict)
)

print(tuned_params)

# look the tuned parameters up by classifier name instead of dict position
extra_trees_best = tuned_params[type(extra_trees).__name__]
rf_best = tuned_params[type(rf).__name__]


def get_accuracy(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    model : sklearn-compliant classifier
        Fitted in place.
    X_train : array-like of shape (n_train_samples, n_features)
        Train input data.
    y_train : array-like of shape (n_train_samples,)
        Train labels.
    X_test : array-like of shape (n_test_samples, n_features)
        Test input data.
    y_test : array-like of shape (n_test_samples,)
        Test labels.

    Returns
    -------
    accuracy : float
        Result of ``sklearn.metrics.accuracy_score`` on the test predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return metrics.accuracy_score(y_test, predictions)


# get accuracies of optimized and default models, in that order
extra_trees_models = [ExtraTreesClassifier(**extra_trees_best), ExtraTreesClassifier()]
extra_trees_acc = [
    get_accuracy(model, X_train, y_train, X_test, y_test)
    for model in extra_trees_models
]

rf_models = [RandomForestClassifier(**rf_best), RandomForestClassifier()]
rf_acc = [
    get_accuracy(model, X_train, y_train, X_test, y_test) for model in rf_models
]

###############################################################################
# Plotting the result
# ----------------------------------------------------------
#

labels = ["ExtraTrees", "RF"]
# Regroup per bar series: each series needs one value per classifier, so the
# "Optimized" bars hold the tuned ExtraTrees and RF accuracies and the
# "Default" bars hold the untuned ones.
optimized_acc = [extra_trees_acc[0], rf_acc[0]]
default_acc = [extra_trees_acc[1], rf_acc[1]]

x = np.arange(len(labels))
width = 0.35
fig, ax = plt.subplots()
rects1 = ax.bar(x - width / 2, optimized_acc, width, label="Optimized")
rects2 = ax.bar(x + width / 2, default_acc, width, label="Default")

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel("Accuracy")
ax.set_title(
    "Optimized/Default ExtraTrees and RF Performance on OpenML dataset {}".format(
        DATA_ID
    )
)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()


def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height.

    The labels are drawn on the module-level ``ax``.
    """
    for rect in rects:
        height = rect.get_height()
        ax.annotate(
            "{:.3f}".format(height),
            xy=(rect.get_x() + rect.get_width() / 2, height),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha="center",
            va="bottom",
        )


autolabel(rects1)
autolabel(rects2)
# Zoom in on the top of the accuracy range, but never cut off a bar whose
# accuracy is below 0.9.
ax.set_ylim(min(0.9, min(optimized_acc + default_acc) - 0.05), 1)
fig.tight_layout()
plt.show()