Skip to content

Implement Shared Weights + Tests #33

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
1,827 changes: 1,827 additions & 0 deletions .ipynb_checkpoints/Arff tests-checkpoint.ipynb

Large diffs are not rendered by default.

681 changes: 681 additions & 0 deletions .ipynb_checkpoints/Olivetti_faces_test-checkpoint.ipynb

Large diffs are not rendered by default.

3,315 changes: 3,315 additions & 0 deletions Arff tests.ipynb

Large diffs are not rendered by default.

21 changes: 20 additions & 1 deletion examples/datasets/plot_nonlinear_regression_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
# License: BSD 3 clause

import matplotlib.pyplot as plt
#import pandas as pd

from sklearn.datasets import (make_independent_noise, make_log_regression,
make_multiplicative_noise, make_sin_regression,
Expand All @@ -40,9 +41,25 @@ def plot_simulation(simulation_name, ax):
if noise is not None:
X_pure, y_pure = sim(n_samples=1000, n_dimensions=1, noise=0)
X_noise, y_noise = sim(n_samples=100, n_dimensions=1, noise=noise)
'''
df = pd.DataFrame(X_pure)
df.to_csv('nonlinearSimsDataXpure' + simulation_name + '.csv')
df = pd.DataFrame(y_pure)
df.to_csv('nonlinearSimsDataypure' + simulation_name + '.csv')
df = pd.DataFrame(X_noise)
df.to_csv('nonlinearSimsDataXnoise' + simulation_name + '.csv')
df = pd.DataFrame(y_noise)
df.to_csv('nonlinearSimsDataynoise' + simulation_name + '.csv')
'''
else:
X_pure, y_pure = sim(n_samples=1000, n_dimensions=1)

'''
df = pd.DataFrame(X_pure)
df.to_csv('nonlinearSimsDataXpure' + simulation_name + '.csv')
df = pd.DataFrame(y_pure)
df.to_csv('nonlinearSimsDataypure' + simulation_name + '.csv')
'''

# Plot the noiseless and noisy data sets
ax.scatter(X_pure, y_pure, s=10, c="#17202A")
if noise is not None:
Expand All @@ -61,10 +78,12 @@ def plot_simulation(simulation_name, ax):
"Independence": (make_independent_noise, None),
}

plt.rcParams.update({'font.size': 18})
_, axs = plt.subplots(1, 5, figsize=(40, 4))
plt.subplots_adjust(bottom=.15)

for simulation_name, ax in zip(simulations.keys(), axs):
plot_simulation(simulation_name, ax)

plt.savefig("SimDataPlots5_14.png")
plt.show()
197 changes: 126 additions & 71 deletions examples/ensemble/plot_random_forest_regression_criteria_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,41 +2,44 @@
===============================================================================
Comparing different split criteria for random forest regression on toy datasets
===============================================================================

An example to compare the different split criteria available for
This is an example to compare the different split criteria available for
:class:`sklearn.ensemble.RandomForestRegressor`.

Metrics used to evaluate these splitters include Mean Squared Error (MSE), a
Metrics used to evaluate these split criteria include runtime and Mean Squared Error (MSE), a
measure of distance between the true target (`y_true`) and the predicted output
(`y_pred`), and runtime.

(`y_pred`).
For visual examples of these datasets, see
:ref:`sphx_glr_auto_examples_datasets_plot_nonlinear_regression_datasets.py`.
"""

# Author: Vivek Gopalakrishnan <[email protected]>
# Authors: Vivek Gopalakrishnan <[email protected]>
# Morgan Sanchez <[email protected]>
# License: BSD 3 clause

import time
from itertools import product
from multiprocessing import Pool
from joblib import Parallel, delayed

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.datasets import (make_independent_noise, make_log_regression,
make_multiplicative_noise, make_sin_regression,
make_square_regression)
from sklearn.datasets import (
make_independent_noise,
make_log_regression,
make_multiplicative_noise,
make_sin_regression,
make_square_regression,
)
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

print(__doc__)

random_state = 0

###############################################################################

random_state = 0
noise = 100.0
simulations = {
"Logarithmic": [make_log_regression, noise],
Expand All @@ -46,13 +49,17 @@
"Independence": [make_independent_noise, None],
}


###############################################################################
def _train_forest(X, y, criterion):
    """Fit and return the regressor selected by `criterion`.

    Parameters
    ----------
    X : array-like
        Training inputs, passed unchanged to ``fit``.
    y : array-like
        Training targets, passed unchanged to ``fit``.
    criterion : str
        ``"dummy"`` selects a ``DummyRegressor(strategy="mean")`` baseline.
        Any other value is forwarded as the ``criterion`` of a
        ``RandomForestRegressor`` built with ``n_estimators=500``,
        ``max_features="sqrt"`` and ``max_depth=5``.

    Returns
    -------
    regr : estimator
        The fitted regressor.
    """
    # Build exactly one estimator: "dummy" is not a forest criterion, so it
    # must never reach RandomForestRegressor, and fitting a forest that is
    # then thrown away would only distort the measured runtime.
    if criterion == "dummy":
        regr = DummyRegressor(strategy="mean")
    else:
        regr = RandomForestRegressor(
            n_estimators=500,
            criterion=criterion,
            max_features="sqrt",
            max_depth=5,
        )
    regr.fit(X, y)
    return regr


Expand All @@ -62,24 +69,70 @@ def _test_forest(X, y, regr):
return mean_squared_error(y, y_pred)


def _prep_data(sim_dict, simulation_name, max_n_samples, n_dimensions, n_trials):
    """Generate train and test data for every trial of one simulation.

    Parameters
    ----------
    sim_dict : dict
        Dictionary updated in place. ``sim_dict[simulation_name]`` is
        replaced by a new dict keyed by trial index.
    simulation_name : str
        Key from the module-level `simulations` dictionary.
    max_n_samples : int
        Number of training samples drawn for each trial.
    n_dimensions : int
        Forwarded to the simulation as ``n_dimensions``.
    n_trials : int
        Number of training sets to draw.

    Returns
    -------
    sim_dict : dict
        The same dictionary. ``sim_dict[simulation_name][i]`` is the tuple
        ``(X_train, y_train, X_test, y_test)`` of array copies for trial
        ``i``.
    """
    # The entry must already carry its validation set as a third element.
    sim, noise, (X_test, y_test) = simulations[simulation_name]
    n_samples = int(max_n_samples)
    n_dimensions = int(n_dimensions)

    # A private RandomState yields the same seed stream as
    # np.random.seed(random_state) followed by np.random.randint, but leaves
    # the process-wide NumPy RNG untouched. Because it is re-created from
    # `random_state` on every call, all simulations share the same seeds.
    rng = np.random.RandomState(random_state)
    seeds = rng.randint(10 ** 8, size=n_trials)

    # Simulations whose noise entry is None are called without `noise`.
    sim_kwargs = {"n_samples": n_samples, "n_dimensions": n_dimensions}
    if noise is not None:
        sim_kwargs["noise"] = noise

    trials = sim_dict[simulation_name] = {}
    for i in range(n_trials):
        X_train, y_train = sim(random_state=seeds[i], **sim_kwargs)
        # Copy so that no trial shares array storage with another trial or
        # with the validation set kept in `simulations`.
        trials[i] = (
            np.copy(X_train),
            np.copy(y_train),
            np.copy(X_test),
            np.copy(y_test),
        )

    return sim_dict


###############################################################################
def main(simulation_name, n_samples, criterion, n_dimensions, n_iter):
def main(simulation_name, sim_data, n_samples, criterion, n_dimensions, n_iter):
"""Measure the performance of RandomForest under simulation conditions.

Parameters
----------
simulation_name : str
Key from `simulations` dictionary.
sim_data: tuple (X_train, y_train, X_test, y_test)
X_train : array, shape (n_train_samples, n_features)
All X training data for given simulation
y_train : array, shape (n_train_samples, n_outputs)
All y training data for given simulation
X_test : array, shape (n_test_samples, n_features)
All X testing data for given simulation
y_test : array, shape (n_test_samples, n_outputs)
All y testing data for given simulation
n_samples : int
Number of training samples.
criterion : string
Split criterion used to train forest. Choose from
("mse", "mae", "friedman_mse", "axis", "oblique").
criterion : {'mse', 'mae', 'friedman_mse'}
Split criterion used to train forest:
- 'mse'
Mean Squared Error
- 'mae'
Mean Absolute Error
- 'friedman_mse'
Friedman Mean Squared Error
n_dimensions : int
Number of features and targets to sample.
n_iter : int
Which repeat of the same simulation parameter we're on. Ignored.

Returns
-------
simulation_name : str
Expand All @@ -96,28 +149,19 @@ def main(simulation_name, n_samples, criterion, n_dimensions, n_iter):
runtime : float
Runtime (in seconds).
"""
print(simulation_name, n_samples)
print(simulation_name, n_samples, criterion, n_dimensions, n_iter)

# Get simulation parameters and validation dataset
sim, noise, (X_test, y_test) = simulations[simulation_name]
n_samples = int(n_samples)
n_dimensions = int(n_dimensions)
# Unpack training and testing data
X_train, y_train, X_test, y_test = sim_data

# Sample training data
if noise is not None:
X_train, y_train = sim(n_samples=n_samples,
n_dimensions=n_dimensions,
noise=noise,
random_state=random_state)
else:
X_train, y_train = sim(n_samples=n_samples,
n_dimensions=n_dimensions,
random_state=random_state)
# Get subset of training data
curr_X_train = X_train[0:n_samples]
curr_y_train = y_train[0:n_samples]

# Train forest
start = time.time()
regr = _train_forest(X_train, y_train, criterion)
stop = time.time()
start = time.process_time()
regr = _train_forest(curr_X_train, curr_y_train, criterion)
stop = time.process_time()

# Evaluate on testing data and record runtime
mse = _test_forest(X_test, y_test, regr)
Expand All @@ -133,50 +177,61 @@ def main(simulation_name, n_samples, criterion, n_dimensions, n_iter):
n_dimensions = 10
simulation_names = simulations.keys()
sample_sizes = np.arange(5, 51, 3)
criteria = ["mae", "mse", "friedman_mse", "axis", "oblique"]
criteria = ["mae", "mse", "friedman_mse", "axis", "oblique", "dummy"]

# Number of times to repeat each simulation setting
n_repeats = 10
n_repeats = 30

# Create the parameter space
params = product(simulation_names, sample_sizes, criteria,
[n_dimensions], range(n_repeats))
params = product(simulation_names, sample_sizes, criteria, range(n_repeats))


###############################################################################
print("Constructing validation datasets...")
for simulation_name, (sim, noise) in simulations.items():
if noise is not None:
X_test, y_test = sim(n_samples=1000,
n_dimensions=n_dimensions,
noise=noise,
random_state=random_state)
X_test, y_test = sim(
n_samples=1000,
n_dimensions=n_dimensions,
noise=noise,
random_state=random_state,
)
else:
X_test, y_test = sim(n_samples=1000,
n_dimensions=n_dimensions,
random_state=random_state)
X_test, y_test = sim(
n_samples=1000, n_dimensions=n_dimensions, random_state=random_state
)
simulations[simulation_name].append((X_test, y_test))


###############################################################################
print("Running simulations...")

with Pool() as pool:

# Run the simulations in parallel
data = pool.starmap(main, params)

# Save results as a DataFrame
columns = ["simulation", "n_samples", "criterion",
"n_dimensions", "mse", "runtime"]
df = pd.DataFrame(data, columns=columns)

# Plot the results
sns.relplot(x="n_samples",
y="mse",
hue="criterion",
col="simulation",
kind="line",
data=df,
facet_kws={'sharey': False, 'sharex': True})
plt.show()
# Generate training and test data for simulations. Every trial draws the
# largest sample size once; `main` then slices the prefix it needs.
sim_data = {}
for name in simulation_names:
    sim_data = _prep_data(sim_data, name, sample_sizes[-1], n_dimensions, n_repeats)

# Run the simulations in parallel (n_jobs=-2 is passed to joblib unchanged).
data = Parallel(n_jobs=-2)(
    delayed(main)(
        sim_name, sim_data[sim_name][n_iter], n, crit, n_dimensions, n_iter
    )
    for sim_name, n, crit, n_iter in params
)

# Save results as a DataFrame
columns = ["simulation", "n_samples", "criterion", "n_dimensions", "mse", "runtime"]
df = pd.DataFrame(data, columns=columns)
# Write next to the script's working directory: a hard-coded ~/Desktop path
# fails on machines that have no such directory.
df.to_csv("sim.csv")

# Plot the results
sns.relplot(
    x="n_samples",
    y="mse",
    hue="criterion",
    col="simulation",
    kind="line",
    data=df,
    facet_kws={"sharey": False, "sharex": True},
)
plt.tight_layout()
plt.show()
Loading