@@ -58,14 +58,43 @@ def _test_forest(X, y, regr):
58
58
y_pred = regr .predict (X )
59
59
return mean_squared_error (y , y_pred )
60
60
61
+ def _prep_data (sim_dict , simulation_name , max_n_samples , n_dimensions ):
62
+ """Generate train and test data for all trials."""
63
+ # Get simulation parameters and validation dataset
64
+ sim , noise , (X_test , y_test ) = simulations [simulation_name ]
65
+ n_samples = int (max_n_samples )
66
+ n_dimensions = int (n_dimensions )
67
+
68
+ # Sample training data
69
+ if noise is not None :
70
+ X_train , y_train = sim (n_samples = n_samples ,
71
+ n_dimensions = n_dimensions ,
72
+ noise = noise ,
73
+ random_state = random_state )
74
+ else :
75
+ X_train , y_train = sim (n_samples = n_samples ,
76
+ n_dimensions = n_dimensions ,
77
+ random_state = random_state )
78
+ sim_dict [simulation_name ] = (X_train , y_train , X_test , y_test )
79
+ return sim_dict
61
80
62
81
###############################################################################
63
- def main (simulation_name , n_samples , criterion , n_dimensions , n_iter ):
82
+ def main (simulation_name , sim_data , n_samples , criterion , n_dimensions , n_iter ):
64
83
"""Measure the performance of RandomForest under simulation conditions.
65
84
Parameters
66
85
----------
67
86
simulation_name : str
68
87
Key from `simulations` dictionary.
88
+ sim_data: dict
89
+ Contains X_train, y_train, X_test, and y_test for each simulation_name
90
+ X_train : np.array #TODO check this
91
+ All X training data for given simulation
92
+ y_train : np.array # TODO
93
+ All y training data for given simulation
94
+ X_test : np.array #TODO check this
95
+ All X testing data for given simulation
96
+ y_test : np.array # TODO
97
+ All y testing data for given simulation
69
98
n_samples : int
70
99
Number of training samples.
71
100
criterion : string
@@ -93,25 +122,16 @@ def main(simulation_name, n_samples, criterion, n_dimensions, n_iter):
93
122
"""
94
123
print (simulation_name , n_samples , criterion , n_dimensions , n_iter )
95
124
96
- # Get simulation parameters and validation dataset
97
- sim , noise , (X_test , y_test ) = simulations [simulation_name ]
98
- n_samples = int (n_samples )
99
- n_dimensions = int (n_dimensions )
100
-
101
- # Sample training data
102
- if noise is not None :
103
- X_train , y_train = sim (n_samples = n_samples ,
104
- n_dimensions = n_dimensions ,
105
- noise = noise ,
106
- random_state = random_state )
107
- else :
108
- X_train , y_train = sim (n_samples = n_samples ,
109
- n_dimensions = n_dimensions ,
110
- random_state = random_state )
125
+ # Unpack training and testing data
126
+ X_train , y_train , X_test , y_test = sim_data
111
127
128
+ # Get subset of training data
129
+ curr_X_train = X_train [0 :n_samples ]
130
+ curr_y_train = y_train [0 :n_samples ]
131
+
112
132
# Train forest
113
133
start = time .time ()
114
- regr = _train_forest (X_train , y_train , criterion )
134
+ regr = _train_forest (curr_X_train , curr_y_train , criterion )
115
135
stop = time .time ()
116
136
117
137
# Evaluate on testing data and record runtime
@@ -131,11 +151,10 @@ def main(simulation_name, n_samples, criterion, n_dimensions, n_iter):
131
151
criteria = ["mae" , "mse" , "friedman_mse" ]
132
152
133
153
# Number of times to repeat each simulation setting
134
- n_repeats = 10
154
+ n_repeats = 30
135
155
136
156
# Create the parameter space
137
- params = product (simulation_names , sample_sizes , criteria ,
138
- [n_dimensions ], range (n_repeats ))
157
+ params = product (simulation_names , sample_sizes , criteria , range (n_repeats ))
139
158
140
159
141
160
###############################################################################
@@ -156,8 +175,15 @@ def main(simulation_name, n_samples, criterion, n_dimensions, n_iter):
156
175
###############################################################################
157
176
print ("Running simulations..." )
158
177
178
+ # Generate training and test data for simulations
179
+ sim_data = {}
180
+ for sim in simulation_names :
181
+ sim_data = _prep_data (sim_data , sim , sample_sizes [- 1 ], n_dimensions )
182
+
159
183
# Run the simulations in parallel
160
- data = Parallel (n_jobs = 4 )(delayed (main )(sim , n , crit , n_dim , n_iter ) for sim , n , crit , n_dim , n_iter in params )
184
+ data = Parallel (n_jobs = - 2 )(delayed (main )
185
+ (sim , sim_data [simulation_name ], n , crit , n_dimensions , n_iter )
186
+ for sim , n , crit , n_iter in params )
161
187
162
188
# Save results as a DataFrame
163
189
columns = ["simulation" , "n_samples" , "criterion" ,
0 commit comments