Skip to content

Commit 360b594

Browse files
author
morgsmss7
committed
generate new test sets for each run
1 parent db1cbb8 commit 360b594

File tree

1 file changed

+25
-19
lines changed

1 file changed

+25
-19
lines changed

examples/ensemble/plot_random_forest_regression_criteria_comparison.py

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@
4343
"Independence": [make_independent_noise, None],
4444
}
4545

46-
4746
###############################################################################
4847
def _train_forest(X, y, criterion):
4948
"""Fit a RandomForestRegressor with default parameters and specific criterion."""
@@ -58,24 +57,29 @@ def _test_forest(X, y, regr):
5857
y_pred = regr.predict(X)
5958
return mean_squared_error(y, y_pred)
6059

61-
def _prep_data(sim_dict, simulation_name, max_n_samples, n_dimensions):
60+
def _prep_data(sim_dict, simulation_name, max_n_samples, n_dimensions, n_trials):
6261
"""Generate train and test data for all trials."""
6362
# Get simulation parameters and validation dataset
6463
sim, noise, (X_test, y_test) = simulations[simulation_name]
6564
n_samples = int(max_n_samples)
6665
n_dimensions = int(n_dimensions)
6766

68-
# Sample training data
69-
if noise is not None:
70-
X_train, y_train = sim(n_samples=n_samples,
71-
n_dimensions=n_dimensions,
72-
noise=noise,
73-
random_state=random_state)
74-
else:
75-
X_train, y_train = sim(n_samples=n_samples,
76-
n_dimensions=n_dimensions,
77-
random_state=random_state)
78-
sim_dict[simulation_name] = (X_train, y_train, X_test, y_test)
67+
np.random.seed(random_state)
68+
seeds = np.random.randint(1e8, size=n_trials)
69+
70+
sim_dict[simulation_name] = {}
71+
for i in range(n_trials):
72+
# Sample training data
73+
if noise is not None:
74+
X_train, y_train = sim(n_samples=n_samples,
75+
n_dimensions=n_dimensions,
76+
noise=noise,
77+
random_state=seeds[i])
78+
else:
79+
X_train, y_train = sim(n_samples=n_samples,
80+
n_dimensions=n_dimensions,
81+
random_state=seeds[i])
82+
sim_dict[simulation_name][i] = (np.copy(X_train), np.copy(y_train), np.copy(X_test), np.copy(y_test))
7983
return sim_dict
8084

8185
###############################################################################
@@ -85,8 +89,8 @@ def main(simulation_name, sim_data, n_samples, criterion, n_dimensions, n_iter):
8589
----------
8690
simulation_name : str
8791
Key from `simulations` dictionary.
88-
sim_data: dict
89-
Contains X_train, y_train, X_test, and y_test for each simulation_name
92+
sim_data : tuple
93+
Contains X_train, y_train, X_test, and y_test
9094
X_train : np.array #TODO check this
9195
All X training data for given simulation
9296
y_train : np.array # TODO
@@ -178,12 +182,12 @@ def main(simulation_name, sim_data, n_samples, criterion, n_dimensions, n_iter):
178182
# Generate training and test data for simulations
179183
sim_data = {}
180184
for sim in simulation_names:
181-
sim_data = _prep_data(sim_data, sim, sample_sizes[-1], n_dimensions)
185+
sim_data = _prep_data(sim_data, sim, sample_sizes[-1], n_dimensions, n_repeats)
182186

183187
# Run the simulations in parallel
184188
data = Parallel(n_jobs=-2)(delayed(main)
185-
(sim, sim_data[simulation_name], n, crit, n_dimensions, n_iter)
186-
for sim, n, crit, n_iter in params)
189+
(sim_name, sim_data[sim_name][n_iter], n, crit, n_dimensions, n_iter)
190+
for sim_name, n, crit, n_iter in params)
187191

188192
# Save results as a DataFrame
189193
columns = ["simulation", "n_samples", "criterion",
@@ -198,4 +202,6 @@ def main(simulation_name, sim_data, n_samples, criterion, n_dimensions, n_iter):
198202
kind="line",
199203
data=df,
200204
facet_kws={'sharey': False, 'sharex': True})
201-
plt.show()
205+
plt.tight_layout()
206+
plt.savefig("splitter_comparison_02_17.png")
207+
plt.show()

0 commit comments

Comments
 (0)