Nonlinear regression simulations for existing split criteria #29

Open · wants to merge 5 commits into base: master

Changes from all commits

examples/datasets/plot_nonlinear_regression_datasets.py (12 changes: 8 additions & 4 deletions)
@@ -24,9 +24,13 @@
 
 import matplotlib.pyplot as plt
 
-from sklearn.datasets import (make_independent_noise, make_log_regression,
-                              make_multiplicative_noise, make_sin_regression,
-                              make_square_regression)
+from sklearn.datasets import (
+    make_independent_noise,
+    make_log_regression,
+    make_multiplicative_noise,
+    make_sin_regression,
+    make_square_regression,
+)
 
 print(__doc__)

@@ -62,7 +66,7 @@ def plot_simulation(simulation_name, ax):
 }
 
 _, axs = plt.subplots(1, 5, figsize=(40, 4))
-plt.subplots_adjust(bottom=.15)
+plt.subplots_adjust(bottom=0.15)
 
 for simulation_name, ax in zip(simulations.keys(), axs):
     plot_simulation(simulation_name, ax)
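
The generators imported above (make_sin_regression and friends) are not part of upstream scikit-learn; they appear to be additions in this fork. A minimal sketch of generating and plotting one dataset, assuming the call signature used elsewhere in this PR (n_samples, n_dimensions, noise, random_state) and an arbitrary noise value:

    import matplotlib.pyplot as plt
    from sklearn.datasets import make_sin_regression

    # noise=1.0 is an illustrative choice, not a value from the PR.
    X, y = make_sin_regression(n_samples=100, n_dimensions=10, noise=1.0, random_state=0)
    y = y if y.ndim == 1 else y[:, 0]  # targets may be (n_samples, n_outputs)
    plt.scatter(X[:, 0], y)
    plt.xlabel("first feature")
    plt.ylabel("target")
    plt.show()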

examples/ensemble/plot_random_forest_regression_criteria_comparison.py (168 changes: 107 additions & 61 deletions)
@@ -2,14 +2,11 @@
 ===============================================================================
 Comparing different split criteria for random forest regression on toy datasets
 ===============================================================================
-
 An example to compare the different split criteria available for
 :class:`sklearn.ensemble.RandomForestRegressor`.
-
 Metrics used to evaluate these splitters include Mean Squared Error (MSE), a
 measure of distance between the true target (`y_true`) and the predicted output
 (`y_pred`), and runtime.
-
 For visual examples of these datasets, see
 :ref:`sphx_glr_auto_examples_datasets_plot_nonlinear_regression_datasets.py`.
 """
@@ -19,16 +16,20 @@
 
 import time
 from itertools import product
-from multiprocessing import Pool
+from joblib import Parallel, delayed
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
 
-from sklearn.datasets import (make_independent_noise, make_log_regression,
-                              make_multiplicative_noise, make_sin_regression,
-                              make_square_regression)
+from sklearn.datasets import (
+    make_independent_noise,
+    make_log_regression,
+    make_multiplicative_noise,
+    make_sin_regression,
+    make_square_regression,
+)
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
@@ -46,12 +47,12 @@
     "Independence": [make_independent_noise, None],
 }
 
-
 ###############################################################################
 def _train_forest(X, y, criterion):
     """Fit a RandomForestRegressor with default parameters and specific criterion."""
     regr = RandomForestRegressor(
-        n_estimators=500, criterion=criterion, max_features="sqrt", max_depth=5)
+        n_estimators=500, criterion=criterion, max_features="sqrt", max_depth=5
+    )
     regr.fit(X, y)
     return regr
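
Together with _test_forest (whose body follows), _train_forest gives a single train-and-score cycle. A minimal sketch with toy data, not part of the PR:

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(50, 10))
    y = rng.uniform(size=50)

    regr = _train_forest(X, y, criterion="mse")
    mse = _test_forest(X, y, regr)  # in-sample here; the script scores held-out data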

@@ -62,24 +63,69 @@ def _test_forest(X, y, regr):
     return mean_squared_error(y, y_pred)
 
 
+def _prep_data(sim_dict, simulation_name, max_n_samples, n_dimensions, n_trials):
+    """Generate train and test data for all trials."""
+    # Get simulation parameters and validation dataset
+    sim, noise, (X_test, y_test) = simulations[simulation_name]
+    n_samples = int(max_n_samples)
+    n_dimensions = int(n_dimensions)
+
+    np.random.seed(random_state)
+    seeds = np.random.randint(1e8, size=n_trials)
+
+    sim_dict[simulation_name] = {}
+    for i in range(n_trials):
+        # Sample training data
+        if noise is not None:
+            X_train, y_train = sim(
+                n_samples=n_samples,
+                n_dimensions=n_dimensions,
+                noise=noise,
+                random_state=seeds[i],
+            )
+        else:
+            X_train, y_train = sim(
+                n_samples=n_samples, n_dimensions=n_dimensions, random_state=seeds[i]
+            )
+        sim_dict[simulation_name][i] = (
+            np.copy(X_train),
+            np.copy(y_train),
+            np.copy(X_test),
+            np.copy(y_test),
+        )
+    return sim_dict
+
+
 ###############################################################################
-def main(simulation_name, n_samples, criterion, n_dimensions, n_iter):
+def main(simulation_name, sim_data, n_samples, criterion, n_dimensions, n_iter):
     """Measure the performance of RandomForest under simulation conditions.
 
     Parameters
     ----------
     simulation_name : str
         Key from `simulations` dictionary.
+    sim_data : tuple of (X_train, y_train, X_test, y_test)
+        X_train : array, shape (n_train_samples, n_features)
+            All X training data for the given simulation.
+        y_train : array, shape (n_train_samples, n_outputs)
+            All y training data for the given simulation.
+        X_test : array, shape (n_test_samples, n_features)
+            All X testing data for the given simulation.
+        y_test : array, shape (n_test_samples, n_outputs)
+            All y testing data for the given simulation.
     n_samples : int
         Number of training samples.
-    criterion : string
-        Split criterion used to train forest. Choose from
-        ("mse", "mae", "friedman_mse", "axis", "oblique").
+    criterion : {'mse', 'mae', 'friedman_mse'}
+        Split criterion used to train the forest:
+        - 'mse'
+            Mean Squared Error
+        - 'mae'
+            Mean Absolute Error
+        - 'friedman_mse'
+            Friedman Mean Squared Error
     n_dimensions : int
         Number of features and targets to sample.
     n_iter : int
         Which repeat of the same simulation parameters we're on. Used only
         for logging.
 
     Returns
     -------
     simulation_name : str
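
A note on the seeding in _prep_data above: seeding the global generator once, then drawing one integer seed per trial, makes each trial reproducible on its own, independent of which worker runs it and in what order, which is exactly what the parallel execution below requires. The pattern in isolation (illustrative values):

    import numpy as np

    np.random.seed(42)                       # one global seed for the experiment
    seeds = np.random.randint(1e8, size=30)  # one derived seed per trial

    # Trial i can regenerate identical data from seeds[i] regardless of
    # scheduling, since each trial uses its own RandomState.
    rng = np.random.RandomState(seeds[3])
    X_train = rng.uniform(size=(50, 10))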
@@ -96,27 +142,18 @@ def main(simulation_name, n_samples, criterion, n_dimensions, n_iter):
     runtime : float
         Runtime (in seconds).
     """
-    print(simulation_name, n_samples)
+    print(simulation_name, n_samples, criterion, n_dimensions, n_iter)
 
-    # Get simulation parameters and validation dataset
-    sim, noise, (X_test, y_test) = simulations[simulation_name]
-    n_samples = int(n_samples)
-    n_dimensions = int(n_dimensions)
+    # Unpack training and testing data
+    X_train, y_train, X_test, y_test = sim_data
 
-    # Sample training data
-    if noise is not None:
-        X_train, y_train = sim(n_samples=n_samples,
-                               n_dimensions=n_dimensions,
-                               noise=noise,
-                               random_state=random_state)
-    else:
-        X_train, y_train = sim(n_samples=n_samples,
-                               n_dimensions=n_dimensions,
-                               random_state=random_state)
+    # Get subset of training data
+    curr_X_train = X_train[0:n_samples]
+    curr_y_train = y_train[0:n_samples]
 
     # Train forest
     start = time.time()
-    regr = _train_forest(X_train, y_train, criterion)
+    regr = _train_forest(curr_X_train, curr_y_train, criterion)
     stop = time.time()
 
     # Evaluate on testing data and record runtime
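
Because main() now slices the first n_samples rows from one pre-generated training set, the points of a learning curve are nested subsets of a single draw rather than independent samples; between curve points only the forest refit changes, which makes each trial's curve smoother at the cost of correlated points. The slicing in isolation (toy shapes):

    import numpy as np

    X_train = np.zeros((50, 10))  # stands in for one trial's full training draw
    y_train = np.zeros(50)

    for n_samples in (5, 8, 11):  # mirrors sample_sizes below
        curr_X_train = X_train[0:n_samples]  # each size reuses the same leading rows
        curr_y_train = y_train[0:n_samples]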
@@ -133,50 +170,59 @@
 n_dimensions = 10
 simulation_names = simulations.keys()
 sample_sizes = np.arange(5, 51, 3)
-criteria = ["mae", "mse", "friedman_mse", "axis", "oblique"]
+criteria = ["mae", "mse", "friedman_mse"]
 
 # Number of times to repeat each simulation setting
-n_repeats = 10
+n_repeats = 30
 
 # Create the parameter space
-params = product(simulation_names, sample_sizes, criteria,
-                 [n_dimensions], range(n_repeats))
+params = product(simulation_names, sample_sizes, criteria, range(n_repeats))
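
itertools.product materializes these sequences into every (simulation, sample size, criterion, repeat) combination; note that n_dimensions is no longer part of the grid and is instead passed to main() directly. In miniature, with toy values:

    from itertools import product

    grid = product(["toy_sim"], [5, 8], ["mse", "mae"], range(2))
    for sim_name, n, crit, n_iter in grid:
        print(sim_name, n, crit, n_iter)
    # 1 * 2 * 2 * 2 = 8 combinations, with the rightmost field varying fastest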


 ###############################################################################
 print("Constructing validation datasets...")
 for simulation_name, (sim, noise) in simulations.items():
     if noise is not None:
-        X_test, y_test = sim(n_samples=1000,
-                             n_dimensions=n_dimensions,
-                             noise=noise,
-                             random_state=random_state)
+        X_test, y_test = sim(
+            n_samples=1000,
+            n_dimensions=n_dimensions,
+            noise=noise,
+            random_state=random_state,
+        )
     else:
-        X_test, y_test = sim(n_samples=1000,
-                             n_dimensions=n_dimensions,
-                             random_state=random_state)
+        X_test, y_test = sim(
+            n_samples=1000, n_dimensions=n_dimensions, random_state=random_state
+        )
     simulations[simulation_name].append((X_test, y_test))
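
Each simulation thus carries one fixed 1000-sample test set, so every criterion, sample size, and repeat is scored against identical held-out data. Schematically (the "Independence" key comes from the visible part of the `simulations` dict; the shape assumption is that generators return (n_samples, n_features) arrays):

    # Each entry of `simulations` now holds [generator, noise, (X_test, y_test)],
    # which _prep_data unpacks as: sim, noise, (X_test, y_test) = simulations[name]
    sim, noise, (X_test, y_test) = simulations["Independence"]
    assert X_test.shape[0] == 1000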


###############################################################################
print("Running simulations...")

-with Pool() as pool:
-
-    # Run the simulations in parallel
-    data = pool.starmap(main, params)
-
-    # Save results as a DataFrame
-    columns = ["simulation", "n_samples", "criterion",
-               "n_dimensions", "mse", "runtime"]
-    df = pd.DataFrame(data, columns=columns)
-
-    # Plot the results
-    sns.relplot(x="n_samples",
-                y="mse",
-                hue="criterion",
-                col="simulation",
-                kind="line",
-                data=df,
-                facet_kws={'sharey': False, 'sharex': True})
-    plt.show()
+# Generate training and test data for simulations
+sim_data = {}
+for sim in simulation_names:
+    sim_data = _prep_data(sim_data, sim, sample_sizes[-1], n_dimensions, n_repeats)
+
+# Run the simulations in parallel
+data = Parallel(n_jobs=-2)(
+    delayed(main)(sim_name, sim_data[sim_name][n_iter], n, crit, n_dimensions, n_iter)
+    for sim_name, n, crit, n_iter in params
+)
+
+# Save results as a DataFrame
+columns = ["simulation", "n_samples", "criterion", "n_dimensions", "mse", "runtime"]
+df = pd.DataFrame(data, columns=columns)
+
+# Plot the results
+sns.relplot(
+    x="n_samples",
+    y="mse",
+    hue="criterion",
+    col="simulation",
+    kind="line",
+    data=df,
+    facet_kws={"sharey": False, "sharex": True},
+)
+plt.tight_layout()
+plt.show()
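
With n_repeats = 30 runs per setting, sns.relplot(kind="line") collapses the repeats at each sample size into a mean line with a confidence band, one line per criterion and one facet per simulation. A tabular counterpart of that aggregation, if the numbers themselves are wanted (not in the PR):

    summary = (
        df.groupby(["simulation", "criterion", "n_samples"])["mse"]
        .agg(["mean", "std"])
        .reset_index()
    )
    print(summary.head())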