    "Independence": [make_independent_noise, None],
}

-
###############################################################################
def _train_forest(X, y, criterion):
"""Fit a RandomForestRegressor with default parameters and specific criterion."""
@@ -58,24 +57,29 @@ def _test_forest(X, y, regr):
    y_pred = regr.predict(X)
    return mean_squared_error(y, y_pred)

-def _prep_data(sim_dict, simulation_name, max_n_samples, n_dimensions):
+def _prep_data(sim_dict, simulation_name, max_n_samples, n_dimensions, n_trials):
    """Generate train and test data for all trials."""
    # Get simulation parameters and validation dataset
    sim, noise, (X_test, y_test) = simulations[simulation_name]
    n_samples = int(max_n_samples)
    n_dimensions = int(n_dimensions)

-    # Sample training data
-    if noise is not None:
-        X_train, y_train = sim(n_samples=n_samples,
-                               n_dimensions=n_dimensions,
-                               noise=noise,
-                               random_state=random_state)
-    else:
-        X_train, y_train = sim(n_samples=n_samples,
-                               n_dimensions=n_dimensions,
-                               random_state=random_state)
-    sim_dict[simulation_name] = (X_train, y_train, X_test, y_test)
+    np.random.seed(random_state)
+    seeds = np.random.randint(1e8, size=n_trials)
+
+    sim_dict[simulation_name] = {}
+    for i in range(n_trials):
+        # Sample training data
+        if noise is not None:
+            X_train, y_train = sim(n_samples=n_samples,
+                                   n_dimensions=n_dimensions,
+                                   noise=noise,
+                                   random_state=seeds[i])
+        else:
+            X_train, y_train = sim(n_samples=n_samples,
+                                   n_dimensions=n_dimensions,
+                                   random_state=seeds[i])
+        sim_dict[simulation_name][i] = (np.copy(X_train), np.copy(y_train), np.copy(X_test), np.copy(y_test))
    return sim_dict

###############################################################################
@@ -85,8 +89,8 @@ def main(simulation_name, sim_data, n_samples, criterion, n_dimensions, n_iter):
    ----------
    simulation_name : str
        Key from `simulations` dictionary.
-    sim_data: dict
-        Contains X_train, y_train, X_test, and y_test for each simulation_name
+    sim_data: tuple
+        Contains X_train, y_train, X_test, and y_test for a single trial
    X_train : np.array  # TODO check this
        All X training data for the given simulation
    y_train : np.array  # TODO
@@ -178,12 +182,12 @@ def main(simulation_name, sim_data, n_samples, criterion, n_dimensions, n_iter):
# Generate training and test data for simulations
sim_data = {}
for sim in simulation_names:
-    sim_data = _prep_data(sim_data, sim, sample_sizes[-1], n_dimensions)
+    sim_data = _prep_data(sim_data, sim, sample_sizes[-1], n_dimensions, n_repeats)

# Run the simulations in parallel
data = Parallel(n_jobs=-2)(delayed(main)
-                           (sim, sim_data[simulation_name], n, crit, n_dimensions, n_iter)
-                           for sim, n, crit, n_iter in params)
+                           (sim_name, sim_data[sim_name][n_iter], n, crit, n_dimensions, n_iter)
+                           for sim_name, n, crit, n_iter in params)

# Save results as a DataFrame
columns = ["simulation", "n_samples", "criterion",
@@ -198,4 +202,6 @@ def main(simulation_name, sim_data, n_samples, criterion, n_dimensions, n_iter):
kind="line",
data=df,
facet_kws={'sharey': False, 'sharex': True})
-plt.show()
+plt.tight_layout()
+plt.savefig("splitter_comparison_02_17.png")
+plt.show()
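
For anyone reviewing the seeding change in isolation, here is a minimal sketch of the pattern this diff introduces. It is not part of the script: `toy_sim` and `prep_trials` are hypothetical stand-ins for the simulation functions and `_prep_data`. Drawing one child seed per trial from a single master seed keeps every trial reproducible across runs while still giving each trial its own training draw; the old code sampled a single training set per simulation, so every repeat saw identical data.

```python
import numpy as np


def toy_sim(n_samples, n_dimensions, noise=0.0, random_state=None):
    # Hypothetical stand-in for the simulation functions in this script:
    # a linear signal plus optional Gaussian noise.
    rng = np.random.RandomState(random_state)
    X = rng.uniform(-1, 1, size=(n_samples, n_dimensions))
    y = X @ np.arange(1, n_dimensions + 1) + noise * rng.randn(n_samples)
    return X, y


def prep_trials(master_seed, n_trials, n_samples=50, n_dimensions=3):
    # One master seed -> one fixed child seed per trial, so trial i is
    # reproducible regardless of which parallel worker ends up running it.
    rng = np.random.RandomState(master_seed)
    seeds = rng.randint(int(1e8), size=n_trials)
    return {i: toy_sim(n_samples, n_dimensions, noise=0.1, random_state=seeds[i])
            for i in range(n_trials)}


a = prep_trials(master_seed=42, n_trials=3)
b = prep_trials(master_seed=42, n_trials=3)
assert all(np.array_equal(a[i][0], b[i][0]) for i in a)  # same data across runs
assert not np.array_equal(a[0][0], a[1][0])              # but trials differ
print("per-trial seeding is reproducible")
```

The sketch uses a local `RandomState` rather than the global `np.random.seed` call in the diff; either way, `seeds[i]` is fixed once the master seed is, which is what lets the `Parallel` call hand `sim_data[sim_name][n_iter]` to each job and still produce reproducible results.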