NeuroDataDesign · morgsmss7 · Mar 14, 2020 · Mar 23, 2020 · Mar 23, 2020 · Mar 30, 2020
diff --git a/.ipynb_checkpoints/Arff tests-checkpoint.ipynb b/.ipynb_checkpoints/Arff tests-checkpoint.ipynb
diff --git a/.ipynb_checkpoints/Olivetti_faces_test-checkpoint.ipynb b/.ipynb_checkpoints/Olivetti_faces_test-checkpoint.ipynb
diff --git a/Arff tests.ipynb b/Arff tests.ipynb
diff --git a/examples/datasets/plot_nonlinear_regression_datasets.py b/examples/datasets/plot_nonlinear_regression_datasets.py
@@ -23,6 +23,7 @@
 # License: BSD 3 clause
 
 import matplotlib.pyplot as plt
+#import pandas as pd
 
 from sklearn.datasets import (make_independent_noise, make_log_regression,
                               make_multiplicative_noise, make_sin_regression,
@@ -40,9 +41,25 @@ def plot_simulation(simulation_name, ax):
     if noise is not None:
         X_pure, y_pure = sim(n_samples=1000, n_dimensions=1, noise=0)
         X_noise, y_noise = sim(n_samples=100, n_dimensions=1, noise=noise)
+        '''
+        df = pd.DataFrame(X_pure)
+        df.to_csv('nonlinearSimsDataXpure' + simulation_name + '.csv')
+        df = pd.DataFrame(y_pure)
+        df.to_csv('nonlinearSimsDataypure' + simulation_name + '.csv')
+        df = pd.DataFrame(X_noise)
+        df.to_csv('nonlinearSimsDataXnoise' + simulation_name + '.csv')
+        df = pd.DataFrame(y_noise)
+        df.to_csv('nonlinearSimsDataynoise' + simulation_name + '.csv')
+        '''
     else:
         X_pure, y_pure = sim(n_samples=1000, n_dimensions=1)
-
+        '''
+        df = pd.DataFrame(X_pure)
+        df.to_csv('nonlinearSimsDataXpure' + simulation_name + '.csv')
+        df = pd.DataFrame(y_pure)
+        df.to_csv('nonlinearSimsDataypure' + simulation_name + '.csv')
+        '''
+
     # Plot the noiseless and noisy data sets
     ax.scatter(X_pure, y_pure, s=10, c="#17202A")
     if noise is not None:
@@ -61,10 +78,12 @@ def plot_simulation(simulation_name, ax):
     "Independence": (make_independent_noise, None),
 }
 
+plt.rcParams.update({'font.size': 18})  # set before the figure is created so all five panels use it
 _, axs = plt.subplots(1, 5, figsize=(40, 4))
 plt.subplots_adjust(bottom=.15)
 
 for simulation_name, ax in zip(simulations.keys(), axs):
     plot_simulation(simulation_name, ax)
 
+plt.savefig("SimDataPlots5_14.png")  # NOTE(review): hard-coded file name, written relative to the working directory
 plt.show()
diff --git a/examples/ensemble/plot_random_forest_regression_criteria_comparison.py b/examples/ensemble/plot_random_forest_regression_criteria_comparison.py
@@ -2,41 +2,44 @@
 ===============================================================================
 Comparing different split criteria for random forest regression on toy datasets
 ===============================================================================
-
-An example to compare the different split criteria available for
+This is an example to compare the different split criteria available for
 :class:`sklearn.ensemble.RandomForestRegressor`.
-
-Metrics used to evaluate these splitters include Mean Squared Error (MSE), a
+Metrics used to evaluate these split criteria include runtime and Mean Squared Error (MSE), a
 measure of distance between the true target (`y_true`) and the predicted output
-(`y_pred`), and runtime.
-
+(`y_pred`).
 For visual examples of these datasets, see
 :ref:`sphx_glr_auto_examples_datasets_plot_nonlinear_regression_datasets.py`.
 """
 
-# Author: Vivek Gopalakrishnan <[email protected]>
+# Authors: Vivek Gopalakrishnan <[email protected]>
+#          Morgan Sanchez       <[email protected]>
 # License: BSD 3 clause
 
 import time
 from itertools import product
-from multiprocessing import Pool
+from joblib import Parallel, delayed
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
 
-from sklearn.datasets import (make_independent_noise, make_log_regression,
-                              make_multiplicative_noise, make_sin_regression,
-                              make_square_regression)
+from sklearn.datasets import (
+    make_independent_noise,
+    make_log_regression,
+    make_multiplicative_noise,
+    make_sin_regression,
+    make_square_regression,
+)
+from sklearn.dummy import DummyRegressor
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
 
 print(__doc__)
 
-random_state = 0
-
 ###############################################################################
+
+random_state = 0
 noise = 100.0
 simulations = {
     "Logarithmic": [make_log_regression, noise],
@@ -46,13 +49,17 @@
     "Independence": [make_independent_noise, None],
 }
 
-
 ###############################################################################
 def _train_forest(X, y, criterion):
     """Fit a RandomForestRegressor with default parameters and specific criterion."""
-    regr = RandomForestRegressor(
-        n_estimators=500, criterion=criterion, max_features="sqrt", max_depth=5)
-    regr.fit(X, y)
+    if criterion == "dummy":  # not a split criterion: fit a mean-predicting baseline
+        regr = DummyRegressor(strategy="mean")
+        regr.fit(X, y)
+    else:  # any other value is handed to the forest as its split criterion
+        regr = RandomForestRegressor(
+            n_estimators=500, criterion=criterion, max_features="sqrt", max_depth=5
+        )
+        regr.fit(X, y)
     return regr
 
 
@@ -62,24 +69,70 @@ def _test_forest(X, y, regr):
     return mean_squared_error(y, y_pred)
 
 
+def _prep_data(sim_dict, simulation_name, max_n_samples, n_dimensions, n_trials):
+    """Store one (X_train, y_train, X_test, y_test) tuple per trial in ``sim_dict``."""
+    # Look up the generator, noise level and validation set in the global ``simulations``
+    sim, noise, (X_test, y_test) = simulations[simulation_name]
+    n_samples = int(max_n_samples)  # every trial is generated at this (largest) size
+    n_dimensions = int(n_dimensions)
+
+    np.random.seed(random_state)  # reseeded on every call, so each simulation draws the same seeds
+    seeds = np.random.randint(1e8, size=n_trials)  # one generator seed per trial
+
+    sim_dict[simulation_name] = {}  # maps trial index -> data tuple
+    for i in range(n_trials):
+        # Sample training data (the noise argument is omitted when noise is None)
+        if noise is not None:
+            X_train, y_train = sim(
+                n_samples=n_samples,
+                n_dimensions=n_dimensions,
+                noise=noise,
+                random_state=seeds[i],
+            )
+        else:
+            X_train, y_train = sim(
+                n_samples=n_samples, n_dimensions=n_dimensions, random_state=seeds[i]
+            )
+        sim_dict[simulation_name][i] = (  # independent copies of all four arrays
+            np.copy(X_train),
+            np.copy(y_train),
+            np.copy(X_test),
+            np.copy(y_test),
+        )
+
+    return sim_dict  # the same dict that was passed in, mutated in place
+
+
 ###############################################################################
-def main(simulation_name, n_samples, criterion, n_dimensions, n_iter):
+def main(simulation_name, sim_data, n_samples, criterion, n_dimensions, n_iter):
     """Measure the performance of RandomForest under simulation conditions.
-
     Parameters
     ----------
     simulation_name : str
         Key from `simulations` dictionary.
+    sim_data : tuple (X_train, y_train, X_test, y_test)
+        X_train : array, shape (n_train_samples, n_features)
+            Training inputs for this trial; only the first `n_samples` rows are used
+        y_train : array, shape (n_train_samples, n_outputs)
+            Training targets for this trial; only the first `n_samples` rows are used
+        X_test : array, shape (n_test_samples, n_features)
+            All X testing data for given simulation
+        y_test : array, shape (n_test_samples, n_outputs)
+            All y testing data for given simulation
     n_samples : int
         Number of training samples.
-    criterion : string
-        Split criterion used to train forest. Choose from
-        ("mse", "mae", "friedman_mse", "axis", "oblique").
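+    criterion : {'mse', 'mae', 'friedman_mse', 'axis', 'oblique', 'dummy'}
+        Split criterion used to train forest:
+        - 'mse' : Mean Squared Error
+        - 'mae' : Mean Absolute Error
+        - 'friedman_mse' : Friedman Mean Squared Error
+        - 'axis', 'oblique' : also in the ``criteria`` list swept by this script
+        - 'dummy' : not a split criterion; fits a mean-predicting
+            ``DummyRegressor`` baseline instead of a forest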
     n_dimensions : int
         Number of features and targets to sample.
     n_iter : int
         Which repeat of the same simulation parameter we're on. Ignored.
-
     Returns
     -------
     simulation_name : str
@@ -96,28 +149,19 @@ def main(simulation_name, n_samples, criterion, n_dimensions, n_iter):
     runtime : float
         Runtime (in seconds).
     """
-    print(simulation_name, n_samples)
+    print(simulation_name, n_samples, criterion, n_dimensions, n_iter)
 
-    # Get simulation parameters and validation dataset
-    sim, noise, (X_test, y_test) = simulations[simulation_name]
-    n_samples = int(n_samples)
-    n_dimensions = int(n_dimensions)
+    # Unpack training and testing data
+    X_train, y_train, X_test, y_test = sim_data
 
-    # Sample training data
-    if noise is not None:
-        X_train, y_train = sim(n_samples=n_samples,
-                               n_dimensions=n_dimensions,
-                               noise=noise,
-                               random_state=random_state)
-    else:
-        X_train, y_train = sim(n_samples=n_samples,
-                               n_dimensions=n_dimensions,
-                               random_state=random_state)
+    # Get subset of training data
+    curr_X_train = X_train[0:n_samples]
+    curr_y_train = y_train[0:n_samples]
 
     # Train forest
-    start = time.time()
-    regr = _train_forest(X_train, y_train, criterion)
-    stop = time.time()
+    start = time.process_time()
+    regr = _train_forest(curr_X_train, curr_y_train, criterion)
+    stop = time.process_time()
 
     # Evaluate on testing data and record runtime
     mse = _test_forest(X_test, y_test, regr)
@@ -133,50 +177,61 @@ def main(simulation_name, n_samples, criterion, n_dimensions, n_iter):
 n_dimensions = 10
 simulation_names = simulations.keys()
 sample_sizes = np.arange(5, 51, 3)
-criteria = ["mae", "mse", "friedman_mse", "axis", "oblique"]
+criteria = ["mae", "mse", "friedman_mse", "axis", "oblique", "dummy"]  # "dummy": DummyRegressor baseline
 
 # Number of times to repeat each simulation setting
-n_repeats = 10
+n_repeats = 30
 
 # Create the parameter space
-params = product(simulation_names, sample_sizes, criteria,
-                 [n_dimensions], range(n_repeats))
+params = product(simulation_names, sample_sizes, criteria, range(n_repeats))  # n_dimensions is passed to main() separately
 
 
 ###############################################################################
 print("Constructing validation datasets...")
 for simulation_name, (sim, noise) in simulations.items():
     if noise is not None:
-        X_test, y_test = sim(n_samples=1000,
-                             n_dimensions=n_dimensions,
-                             noise=noise,
-                             random_state=random_state)
+        X_test, y_test = sim(  # validation set, built once per simulation
+            n_samples=1000,
+            n_dimensions=n_dimensions,
+            noise=noise,
+            random_state=random_state,
+        )
     else:
-        X_test, y_test = sim(n_samples=1000,
-                             n_dimensions=n_dimensions,
-                             random_state=random_state)
+        X_test, y_test = sim(
+            n_samples=1000, n_dimensions=n_dimensions, random_state=random_state
+        )
     simulations[simulation_name].append((X_test, y_test))
 
 
 ###############################################################################
 print("Running simulations...")
 
-with Pool() as pool:
-
-    # Run the simulations in parallel
-    data = pool.starmap(main, params)
-
-    # Save results as a DataFrame
-    columns = ["simulation", "n_samples", "criterion",
-               "n_dimensions", "mse", "runtime"]
-    df = pd.DataFrame(data, columns=columns)
-
-    # Plot the results
-    sns.relplot(x="n_samples",
-                y="mse",
-                hue="criterion",
-                col="simulation",
-                kind="line",
-                data=df,
-                facet_kws={'sharey': False, 'sharex': True})
-    plt.show()
+# Generate training and test data once per simulation, at the largest sample size
+sim_data = {}
+for sim in simulation_names:
+    sim_data = _prep_data(sim_data, sim, sample_sizes[-1], n_dimensions, n_repeats)
+
+# Run the simulations in parallel (joblib: n_jobs=-2 uses all CPUs but one)
+data = Parallel(n_jobs=-2)(
+    delayed(main)(sim_name, sim_data[sim_name][n_iter], n, crit, n_dimensions, n_iter)
+    for sim_name, n, crit, n_iter in params
+)
+
+# Save results as a DataFrame
+columns = ["simulation", "n_samples", "criterion", "n_dimensions", "mse", "runtime"]
+df = pd.DataFrame(data, columns=columns)
+df.head()  # NOTE(review): result is discarded, so this has no effect when run as a script
+df.to_csv("~/Desktop/sim.csv")  # NOTE(review): hard-coded, user-specific output path
+
+# Plot the results
+sns.relplot(
+    x="n_samples",
+    y="mse",
+    hue="criterion",
+    col="simulation",
+    kind="line",
+    data=df,
+    facet_kws={"sharey": False, "sharex": True},
+)
+plt.tight_layout()
+plt.show()