diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/README.md b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/README.md index 8bae1d01e9..fe63214fbd 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/README.md +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/README.md @@ -1,4 +1,4 @@ -# SECIR model with multiple age groups and one damping +# SECIR model with multiple age groups and multiple dampings -This model is an application of the SECIR model implemented in https://github.com/DLR-SC/memilio/tree/main/cpp/models/ode_secir/ stratified by age groups using one damping to represent a change in the contact matrice. -The example is based on https://github.com/DLR-SC/memilio/tree/main/pycode/examples/simulation/secir_groups.py which uses python bindings to run the underlying C++ code. +This model is an application of the SECIR model implemented in https://github.com/DLR-SC/memilio/tree/main/cpp/models/ode_secir/ stratified by age groups using dampings to represent changes in the contact matrix. +The example is based on https://github.com/DLR-SC/memilio/tree/main/pycode/examples/simulation/ode_secir_groups.py which uses python bindings to run the underlying C++ code. diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/data_generation.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/data_generation.py index 418e6276ca..6dc74881c9 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/data_generation.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/data_generation.py @@ -17,6 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################# + import copy import os import pickle @@ -33,65 +34,31 @@ from memilio.simulation.osecir import (Index_InfectionState, InfectionState, Model, interpolate_simulation_result, simulate) +import memilio.surrogatemodel.utils.dampings as dampings +from memilio.surrogatemodel.utils.helper_functions import ( + interpolate_age_groups, remove_confirmed_compartments, normalize_simulation_data) +import memilio.simulation as mio +import memilio.simulation.osecir as osecir -def interpolate_age_groups(data_entry): - """ Interpolates the age groups from the population data into the age groups used in the simulation. - We assume that the people in the age groups are uniformly distributed. - - :param data_entry: Data entry containing the population data. - :returns: List containing the population in each age group used in the simulation. - - """ - age_groups = { - "A00-A04": data_entry['<3 years'] + data_entry['3-5 years'] * 2 / 3, - "A05-A14": data_entry['3-5 years'] * 1 / 3 + data_entry['6-14 years'], - "A15-A34": data_entry['15-17 years'] + data_entry['18-24 years'] + data_entry['25-29 years'] + data_entry['30-39 years'] * 1 / 2, - "A35-A59": data_entry['30-39 years'] * 1 / 2 + data_entry['40-49 years'] + data_entry['50-64 years'] * 2 / 3, - "A60-A79": data_entry['50-64 years'] * 1 / 3 + data_entry['65-74 years'] + data_entry['>74 years'] * 1 / 5, - "A80+": data_entry['>74 years'] * 4 / 5 - } - return [age_groups[key] for key in age_groups] - - -def remove_confirmed_compartments(result_array): - """ Removes the confirmed compartments which are not used in the data generation. 
- - :param result_array: Array containing the simulation results. - :returns: Array containing the simulation results without the confirmed compartments. - - """ - num_groups = int(result_array.shape[1] / 10) - delete_indices = [index for i in range( - num_groups) for index in (3+10*i, 5+10*i)] - return np.delete(result_array, delete_indices, axis=1) - - -def transform_data(data, transformer, num_runs): - """ Transforms the data by a logarithmic normalization. - Reshaping is necessary, because the transformer needs an array with dimension <= 2. - - :param data: Data to be transformed. - :param transformer: Transformer used for the transformation. - :param num_runs: - :returns: Transformed data. - - """ - data = np.asarray(data).transpose(2, 0, 1).reshape(48, -1) - scaled_data = transformer.transform(data) - return tf.convert_to_tensor(scaled_data.transpose().reshape(num_runs, -1, 48)) - - -def run_secir_groups_simulation(days, damping_day, populations): +def run_secir_groups_simulation(days, damping_days, damping_factors, populations): """ Uses an ODE SECIR model allowing for asymptomatic infection with 6 different age groups. The model is not stratified by region. Virus-specific parameters are fixed and initial number of persons in the particular infection states are chosen randomly from defined ranges. :param days: Describes how many days we simulate within a single run. - :param damping_day: The day when damping is applied. + :param damping_days: The days when damping is applied. + :param damping_factors: damping factors associated to the damping days. :param populations: List containing the population in each age group. - :returns: List containing the populations in each compartment used to initialize the run. + :returns: Tuple of lists (list_of_simulation_results, list_of_damped_matrices), the first containing the simulation results, the second list containing the + damped contact matrices. 
""" + # Collect indices of confirmed compartments + del_indices = [] + + if len(damping_days) != len(damping_factors): + raise ValueError("Length of damping_days and damping_factors differ!") + set_log_level(LogLevel.Off) start_day = 1 @@ -99,9 +66,9 @@ def run_secir_groups_simulation(days, damping_day, populations): start_year = 2019 dt = 0.1 - # Define age Groups - groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80+'] - num_groups = len(groups) + age_groups = ['0-4', '5-14', '15-34', '35-59', '60-79', '80+'] + # Get number of age groups + num_groups = len(age_groups) # Initialize Parameters model = Model(num_groups) @@ -118,26 +85,26 @@ def run_secir_groups_simulation(days, damping_day, populations): # Initial number of people in each compartment with random numbers model.populations[AgeGroup(i), Index_InfectionState( InfectionState.Exposed)] = random.uniform( - 0.00025, 0.0005) * populations[i] + 0.00025, 0.005) * populations[i] model.populations[AgeGroup(i), Index_InfectionState( InfectionState.InfectedNoSymptoms)] = random.uniform( - 0.0001, 0.00035) * populations[i] + 0.0001, 0.0035) * populations[i] model.populations[AgeGroup(i), Index_InfectionState( InfectionState.InfectedNoSymptomsConfirmed)] = 0 model.populations[AgeGroup(i), Index_InfectionState( InfectionState.InfectedSymptoms)] = random.uniform( - 0.00007, 0.0001) * populations[i] + 0.00007, 0.001) * populations[i] model.populations[AgeGroup(i), Index_InfectionState( InfectionState.InfectedSymptomsConfirmed)] = 0 model.populations[AgeGroup(i), Index_InfectionState( InfectionState.InfectedSevere)] = random.uniform( - 0.00003, 0.00006) * populations[i] + 0.00003, 0.0006) * populations[i] model.populations[AgeGroup(i), Index_InfectionState( InfectionState.InfectedCritical)] = random.uniform( - 0.00001, 0.00002) * populations[i] + 0.00001, 0.0002) * populations[i] model.populations[AgeGroup(i), Index_InfectionState( InfectionState.Recovered)] = random.uniform( - 0.002, 0.008) * populations[i] + 0.002, 0.08) * populations[i] model.populations[AgeGroup(i), Index_InfectionState(InfectionState.Dead)] = 0 model.populations.set_difference_from_group_total_AgeGroup( @@ -154,6 +121,14 @@ def run_secir_groups_simulation(days, damping_day, populations): # twice the value of RiskOfInfectionFromSymptomatic model.parameters.MaxRiskOfInfectionFromSymptomatic[AgeGroup(i)] = 0.5 + # Collecting deletable indices + index_no_sym_conf = model.populations.get_flat_index( + osecir.MultiIndex_PopulationsArray(mio.AgeGroup(i), osecir.InfectionState.InfectedNoSymptomsConfirmed)) + index_sym_conf = model.populations.get_flat_index( + osecir.MultiIndex_PopulationsArray(mio.AgeGroup(i), osecir.InfectionState.InfectedSymptomsConfirmed)) + del_indices.append(index_no_sym_conf) + del_indices.append(index_sym_conf) + # StartDay is the n-th day of the year model.parameters.StartDay = ( date(start_year, start_month, start_day) - date(start_year, 1, 1)).days @@ -166,14 +141,16 @@ def run_secir_groups_simulation(days, damping_day, populations): model.parameters.ContactPatterns.cont_freq_mat[0].minimum = minimum # Generate a damping matrix and assign it to the model - damping = np.ones((num_groups, num_groups) - ) * np.float16(random.uniform(0, 0.5)) - - model.parameters.ContactPatterns.cont_freq_mat.add_damping(Damping( - coeffs=(damping), t=damping_day, level=0, type=0)) + damped_matrices = [] - damped_contact_matrix = model.parameters.ContactPatterns.cont_freq_mat.get_matrix_at( - damping_day+1) + for i in np.arange(len(damping_days)): + damping = 
np.ones((num_groups, num_groups) + ) * damping_factors[i] + day = damping_days[i] + model.parameters.ContactPatterns.cont_freq_mat.add_damping(Damping( + coeffs=(damping), t=day, level=0, type=0)) + damped_matrices.append(model.parameters.ContactPatterns.cont_freq_mat.get_matrix_at( + day+1)) # Apply mathematical constraints to parameters model.apply_constraints() @@ -184,21 +161,22 @@ def run_secir_groups_simulation(days, damping_day, populations): # Interpolate simulation result on days time scale result = interpolate_simulation_result(result) + # Omit first column, as the time points are not of interest here. result_array = remove_confirmed_compartments( - np.transpose(result.as_ndarray()[1:, :])) + np.transpose(result.as_ndarray()[1:, :]), del_indices) - # Omit first column, as the time points are not of interest here. dataset_entries = copy.deepcopy(result_array) + dataset_entries = np.nan_to_num(dataset_entries) - return dataset_entries.tolist(), damped_contact_matrix + return dataset_entries.tolist(), damped_matrices def generate_data( num_runs, path_out, path_population, input_width, label_width, - normalize=True, save_data=True): - """ Generate data sets of num_runs many equation-based model simulations and transforms the computed results by a log(1+x) transformation. + normalize=True, save_data=True, damping_method="random", number_dampings=5): + """ Generate data sets of num_runs many equation-based model simulations and possibly transforms the computed results by a log(1+x) transformation. Divides the results in input and label data sets and returns them as a dictionary of two TensorFlow Stacks. - In general, we have 8 different compartments and 6 age groups. If we choose, + In general, we have 8 different compartments and 6 age groups. If we choose input_width = 5 and label_width = 20, the dataset has - input with dimension 5 x 8 x 6 - labels with dimension 20 x 8 x 6 @@ -210,14 +188,17 @@ def generate_data( :param label_width: Int value that defines the size of the labels. :param normalize: Default: true Option to transform dataset by logarithmic normalization. :param save_data: Default: true Option to save the dataset. + :param damping_method: String specifying the damping method, that should be used. Possible values "classic", "active", "random". + :param number_dampings: Maximal number of possible dampings. :returns: Data dictionary of input and label data sets. """ data = { "inputs": [], "labels": [], - "contact_matrix": [], - "damping_day": [] + "contact_matrices": [], + "damping_factors": [], + "damping_days": [] } # The number of days is the same as the sum of input and label width. 
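# --- Editor's illustration (not part of the patch) -------------------------
# The reworked run_secir_groups_simulation above takes paired lists
# damping_days / damping_factors and expands every factor into a full
# (num_groups x num_groups) coefficient matrix before registering the
# damping. Minimal standalone sketch with made-up example values:
import numpy as np

num_groups = 6
damping_days = [12, 25, 40]          # hypothetical example values
damping_factors = [0.2, 0.35, 0.1]   # one factor per damping day

# Same consistency check as in the simulation function above.
if len(damping_days) != len(damping_factors):
    raise ValueError("Length of damping_days and damping_factors differ!")

# One coefficient matrix per damping day, mirroring the
# np.ones(...) * damping_factors[i] construction in the loop above.
coeff_matrices = [np.ones((num_groups, num_groups)) * factor
                  for factor in damping_factors]
print(len(coeff_matrices), coeff_matrices[0].shape)  # 3 (6, 6)
# ---------------------------------------------------------------------------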
@@ -232,16 +213,18 @@ def generate_data( bar = Bar('Number of Runs done', max=num_runs) for _ in range(0, num_runs): - # Generate a random damping day - damping_day = random.randrange( - input_width, input_width+label_width) + # Generate random damping days + damping_days, damping_factors = dampings.generate_dampings( + days, number_dampings, method=damping_method, min_distance=2, + min_damping_day=2) - data_run, damped_contact_matrix = run_secir_groups_simulation( - days, damping_day, population[random.randint(0, len(population) - 1)]) + data_run, damped_matrices = run_secir_groups_simulation( + days, damping_days, damping_factors, population[random.randint(0, len(population) - 1)]) data['inputs'].append(data_run[:input_width]) data['labels'].append(data_run[input_width:]) - data['contact_matrix'].append(np.array(damped_contact_matrix)) - data['damping_day'].append([damping_day]) + data['contact_matrices'].append(damped_matrices) + data['damping_factors'].append(damping_factors) + data['damping_days'].append(damping_days) bar.next() bar.finish() @@ -250,8 +233,10 @@ def generate_data( transformer = FunctionTransformer(np.log1p, validate=True) # transform inputs and labels - data['inputs'] = transform_data(data['inputs'], transformer, num_runs) - data['labels'] = transform_data(data['labels'], transformer, num_runs) + data['inputs'] = normalize_simulation_data( + data['inputs'], transformer, num_runs) + data['labels'] = normalize_simulation_data( + data['labels'], transformer, num_runs) else: data['inputs'] = tf.convert_to_tensor(data['inputs']) data['labels'] = tf.convert_to_tensor(data['labels']) @@ -261,8 +246,15 @@ def generate_data( if not os.path.isdir(path_out): os.mkdir(path_out) - # save dict to json file - with open(os.path.join(path_out, 'data_secir_groups.pickle'), 'wb') as f: + # save dict to pickle file + if num_runs < 1000: + filename = 'data_secir_groups_%ddays_%d_' % ( + label_width, num_runs) + damping_method+'.pickle' + else: + filename = 'data_secir_groups_%ddays_%dk_' % ( + label_width, num_runs//1000) + damping_method+'.pickle' + + with open(os.path.join(path_out, filename), 'wb') as f: pickle.dump(data, f) return data @@ -291,13 +283,13 @@ def getMinimumMatrix(): """ loads the minimum matrix""" minimum_contact_matrix0 = os.path.join( - "./data/contacts/minimum_home.txt") + "./data/Germany/contacts/minimum_home.txt") minimum_contact_matrix1 = os.path.join( - "./data/contacts/minimum_school_pf_eig.txt") + "./data/Germany/contacts/minimum_school_pf_eig.txt") minimum_contact_matrix2 = os.path.join( - "./data/contacts/minimum_work.txt") + "./data/Germany/contacts/minimum_work.txt") minimum_contact_matrix3 = os.path.join( - "./data/contacts/minimum_other.txt") + "./data/Germany/contacts/minimum_other.txt") minimum = np.loadtxt(minimum_contact_matrix0) \ + np.loadtxt(minimum_contact_matrix1) + \ @@ -310,7 +302,8 @@ def getMinimumMatrix(): def get_population(path): """ read population data in list from dataset - :param path: Path to the dataset containing the population data + :param path: Path to the dataset containing the population + :returns: List of interpolated age grouped population data """ @@ -332,7 +325,7 @@ def get_population(path): r"data//Germany//pydata//county_current_population.json") input_width = 5 - label_width = 30 - num_runs = 10000 + label_width = 90 + num_runs = 100 data = generate_data(num_runs, path_output, path_population, input_width, - label_width) + label_width, damping_method="active") diff --git 
a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/model.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/model.py index 221291f36f..8a917bfdb4 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/model.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_groups/model.py @@ -19,6 +19,8 @@ ############################################################################# from memilio.surrogatemodel.ode_secir_groups import network_architectures from memilio.simulation.osecir import InfectionState +from memilio.surrogatemodel.utils.helper_functions import ( + calc_split_index, flat_input) import os import pickle @@ -38,10 +40,9 @@ def plot_compartment_prediction_model( :param inputs: test inputs for model prediction. :param labels: test labels. :param modeltype: type of model. Can be 'classic' or 'timeseries' - :param model: trained model. (Default value = None) - :param plot_col: string name of compartment to be plotted. - :param max_subplots: Number of the simulation runs to be plotted and compared against. (Default value = 8) + :param model: trained model. (Default value = None) :param plot_compartment: (Default value = 'InfectedSymptoms') + :param max_subplots: Number of the simulation runs to be plotted and compared against. (Default value = 8) :returns: No return """ num_groups = 6 @@ -53,6 +54,9 @@ def plot_compartment_prediction_model( (inputs.shape[1] - (1 + num_groups * num_groups)) / (num_groups * num_compartments)) elif modeltype == 'timeseries': input_width = int(inputs.shape[1]) + else: + raise ValueError("Modeltype "+modeltype + " not known.") + label_width = int(labels.shape[1]) plt.figure(figsize=(12, 8)) @@ -117,7 +121,7 @@ def plot_compartment_prediction_model( input_series = tf.expand_dims(inputs[n], axis=0) pred = model(input_series) pred = pred.numpy() - pred = pred.reshape((30, 48)) + pred = pred.reshape((label_width, num_groups*num_compartments)) mean_per_day_pred = [] for i in pred: @@ -140,45 +144,12 @@ def plot_compartment_prediction_model( #################### # Helper functions # #################### -def calc_split_index(n, split_train=0.7, - split_valid=0.2, split_test=0.1): - """ - Calculating the indixes for a split_train:split_valid:split_test decomposition of a set with size n - - It must hold split_train + split_valid + split_test = 1 - - :param n: integer value - :param split_train: value between 0 and 1 - :param split_valid: value between 0 and 1 - :param split_test: value between 0 and 1 - :returns: a list of the form [i_train, i_valid, i_test] - """ - if split_train + split_valid + split_test > 1 + 1e-10: - raise ValueError( - "Summed data set shares are greater than 1. Please adjust the values.") - n_train = int(n * split_train) - n_valid = int(n * split_valid) - n_test = n - n_train - n_valid - - return [n_train, n_valid, n_test] - - -def flat_input(input): - """ Flatten input dimension - - :param input: input array of size (n,k,l) - :returns: reshaped array of size (n, k*l) - - """ - dim = tf.reduce_prod(tf.shape(input)[1:]) - return tf.reshape(input, [-1, dim]) - def prepare_data_classic(data): """ Transforming data to be processable by "classic" network, simply flattening and concatenating for each data instance.
- :param data: dictionary produces by data_generation + :param data: dictionary produced by data_generation :returns: dictionary with entries { "train_inputs", "train_labels", "valid_inputs", "valid_labels", "test_inputs", "test_labels"} @@ -192,35 +163,20 @@ def prepare_data_classic(data): # Flattening all inputs inputs = flat_input(data["inputs"]) labels = data["labels"] - cmatrix = flat_input(data["contact_matrix"]) - dampdays = data["damping_day"] + cmatrices = flat_input(data["contact_matrices"]) + dampdays = flat_input(data["damping_days"]) + + aggregated_inputs = tf.concat( + [tf.cast(inputs, tf.float32), + tf.cast(cmatrices, tf.float32), + tf.cast(dampdays, tf.float32)], + axis=1, name='concat') # Splitting the data - compinputs_train, compinputs_valid, compinputs_test = tf.split( - inputs, split_indices, 0) labels_train, labels_valid, labels_test = tf.split( labels, split_indices, 0) - cmatrix_train, cmatrix_valid, cmatrix_test = tf.split( - cmatrix, split_indices, 0) - dampdays_train, dampdays_valid, dampdays_test = tf.split( - dampdays, split_indices, 0) - - # Combining the ingredients to one input object - inputs_train = tf.concat( - [tf.cast(compinputs_train, tf.float32), - tf.cast(cmatrix_train, tf.float32), - tf.cast(dampdays_train, tf.float32)], - axis=1, name='concat') - inputs_valid = tf.concat( - [tf.cast(compinputs_valid, tf.float32), - tf.cast(cmatrix_valid, tf.float32), - tf.cast(dampdays_valid, tf.float32)], - axis=1, name='concat') - inputs_test = tf.concat( - [tf.cast(compinputs_test, tf.float32), - tf.cast(cmatrix_test, tf.float32), - tf.cast(dampdays_test, tf.float32)], - axis=1, name='concat') + inputs_train, inputs_valid, inputs_test = tf.split( + aggregated_inputs, split_indices, 0) return { "train_inputs": inputs_train, @@ -236,7 +192,7 @@ def prod_time_series(obj, n, length_input): """ Repeating static informations to fit into a time series framework - :param obj: an array of objects, which should be repeated + :param obj: an array of objects of shape (n, shape_rest), which should be repeated :param n: total number of samples :param length_input: number of days observed per input :returns: a tensor of shape [n, length_input, -1], where for each sample the static object is repeated length_input times @@ -253,7 +209,7 @@ def prod_time_series(obj, n, length_input): def prepare_data_timeseries(data): """ - Transforming data to be processable by "time_series" network, simply repeating static values, flattening and concatenating for each data instance. + Transforming data to be processable by "timeseries" network, simply repeating static values, flattening and concatenating for each data instance. 
:param data: dictionary produces by data_generation :returns: dictionary with entries { @@ -265,16 +221,17 @@ def prepare_data_timeseries(data): n = data["inputs"].shape[0] # number of days per input sample - input_width = 5 + input_width = data["inputs"][0].shape[0] # Reshaping the matrix input - cmatrix = flat_input(tf.stack(data["contact_matrix"])) + cmatrices = flat_input(tf.stack(data["contact_matrices"])) + dampdays = flat_input(tf.stack(data["damping_days"])) - # Repeat data (contact matrix and dampinhg day) to produce time series - cmatrix_repeated = prod_time_series(cmatrix, n, input_width) - dampdays_repeated = prod_time_series(data["damping_day"], n, input_width) + # Repeat data (contact matrix and damping day) to produce time series + cmatrices_repeated = prod_time_series(cmatrices, n, input_width) + dampdays_repeated = prod_time_series(dampdays, n, input_width) - # Calculate split inidces + # Calculate split indices split_indices = calc_split_index(n) # Splitting the data @@ -282,26 +239,26 @@ def prepare_data_timeseries(data): data["inputs"], split_indices, 0) labels_train, labels_valid, labels_test = tf.split( data["labels"], split_indices, 0) - cmatrix_train, cmatrix_valid, cmatrix_test = tf.split( - cmatrix_repeated, split_indices, 0) + cmatrices_train, cmatrices_valid, cmatrices_test = tf.split( + cmatrices_repeated, split_indices, 0) dampdays_train, dampdays_valid, dampdays_test = tf.split( dampdays_repeated, split_indices, 0) # Combining the ingredients to one input object inputs_train = tf.concat( - [tf.cast(compinputs_train, tf.float16), - tf.cast(cmatrix_train, tf.float16), - tf.cast(dampdays_train, tf.float16)], + [tf.cast(compinputs_train, tf.float32), + tf.cast(cmatrices_train, tf.float32), + tf.cast(dampdays_train, tf.float32)], axis=2, name='concat') inputs_valid = tf.concat( - [tf.cast(compinputs_valid, tf.float16), - tf.cast(cmatrix_valid, tf.float16), - tf.cast(dampdays_valid, tf.float16)], + [tf.cast(compinputs_valid, tf.float32), + tf.cast(cmatrices_valid, tf.float32), + tf.cast(dampdays_valid, tf.float32)], axis=2, name='concat') inputs_test = tf.concat( - [tf.cast(compinputs_test, tf.float16), - tf.cast(cmatrix_test, tf.float16), - tf.cast(dampdays_test, tf.float16)], + [tf.cast(compinputs_test, tf.float32), + tf.cast(cmatrices_test, tf.float32), + tf.cast(dampdays_test, tf.float32)], axis=2, name='concat') return { @@ -362,7 +319,7 @@ def initialize_model(parameters): def network_fit( - model, modeltype, training_parameter, path, filename='data_secir_groups_30days_Germany_10k_damp.pickle', plot=True): + model, modeltype, training_parameter, path, filename='data_secir_groups_30days_10k_active.pickle', plot_stats=True): """ Training and evaluation of a given model with mean squared error loss and Adam optimizer using the mean absolute error as a metric. :param model: Keras sequential model. @@ -372,7 +329,7 @@ def network_fit( metrics is a list of used training metrics, e.g. [tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.MeanAbsolutePercentageError()] :param path: path of the dataset. :param filename: name of the file containing the data - :param plot: (Default value = True) + :param plot_stats: (Default value = True) :returns: training history as returned by the keras fit() method. 
""" # Unpacking training parameters @@ -393,6 +350,9 @@ def network_fit( elif modeltype == 'timeseries': data_prep = prepare_data_timeseries(data) + else: + raise ValueError("modeltype must be either classic or timeseries!") + # Setting up the training parameters batch_size = 32 early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', @@ -412,20 +372,21 @@ def network_fit( batch_size=batch_size, callbacks=[early_stopping]) - if (plot): + if (plot_stats): plot_losses(history) plot_compartment_prediction_model( - test_inputs, test_labels, modeltype, model=model, + data_prep["test_inputs"], data_prep["test_labels"], modeltype, model=model, plot_compartment='InfectedSymptoms', max_subplots=3) - df = get_test_statistic(test_inputs, test_labels, model) + df = get_test_statistic( + data_prep["test_inputs"], data_prep["test_labels"], model) print(df) return history - ##################### # Plots etc. ##################### + def plot_losses(history): """ Plots the losses of the model training. @@ -474,21 +435,6 @@ def get_test_statistic(test_inputs, test_labels, model): return mean_percentage -def get_input_dim_lstm(path): - """ Extract the dimension of the input data - - :param path: path to the data - - """ - file = open(os.path.join(path, 'data_secir_groups.pickle'), 'rb') - - data = pickle.load(file) - input_dim = data['inputs'].shape[2] + np.asarray( - data['contact_matrix']).shape[1] * np.asarray(data['contact_matrix']).shape[2]+1 - - return input_dim - - if __name__ == "__main__": path = os.path.dirname(os.path.realpath(__file__)) path_data = os.path.join(os.path.dirname(os.path.realpath( @@ -502,7 +448,7 @@ def get_input_dim_lstm(path): neurons_in_hidden_layer = 512 activation_function = 'relu' modelname = "Dense" - modeltype = "classic" # or "classic" + modeltype = "classic" # or "timeseries" model_parameters = (label_width, number_age_groups, number_compartments, hidden_layers, neurons_in_hidden_layer, activation_function, modelname) @@ -515,7 +461,6 @@ def get_input_dim_lstm(path): metrics = [tf.keras.metrics.MeanAbsoluteError()] training_parameters = (early_stop, max_epochs, loss, optimizer, metrics) - # input_dim = get_input_dim_lstm(path_data) -> Warum? model = initialize_model(model_parameters) model_output = network_fit( diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/data_generation.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/data_generation.py index f7541f5e9f..318ef2a23a 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/data_generation.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/data_generation.py @@ -33,31 +33,9 @@ from memilio.simulation.osecir import (Index_InfectionState, InfectionState, Model, interpolate_simulation_result, simulate) - - -def remove_confirmed_compartments(result_array): - """Aggregates the confirmed and non-confirmed infection compartments. 
- - :param result_array: Array containing simulation results with compartment populations - :returns: Modified array with aggregated compartments - """ - - # Defining relevant indices - indices_inf_no_symp = [2, 3] - index_inf_no_symp = 2 - indices_inf_symp = [4, 5] - index_inf_symp = 4 - indices_removed = [3, 5] - - # Calculating the values for the merged compartments - sum_inf_no_symp = np.sum(result_array[:, indices_inf_no_symp], axis=1) - sum_inf_symp = np.sum(result_array[:, indices_inf_symp], axis=1) - - # Seting the new values - result_array[:, index_inf_no_symp] = sum_inf_no_symp - result_array[:, index_inf_symp] = sum_inf_symp - - return np.delete(result_array, indices_removed, axis=1) +from memilio.surrogatemodel.utils.helper_functions import ( + interpolate_age_groups, remove_confirmed_compartments, normalize_simulation_data) +import memilio.simulation.osecir as osecir def run_secir_simple_simulation(days): @@ -126,6 +104,13 @@ def run_secir_simple_simulation(days): model.parameters.ContactPatterns.cont_freq_mat[0].minimum = np.ones( (num_groups, num_groups)) * 0 + # Collecting deletable indices + index_no_sym_conf = model.populations.get_flat_index( + osecir.MultiIndex_PopulationsArray(A0, osecir.InfectionState.InfectedNoSymptomsConfirmed)) + index_sym_conf = model.populations.get_flat_index( + osecir.MultiIndex_PopulationsArray(A0, osecir.InfectionState.InfectedSymptomsConfirmed)) + del_indices = (index_no_sym_conf, index_sym_conf) + # Apply mathematical constraints to parameters model.apply_constraints() @@ -138,7 +123,7 @@ def run_secir_simple_simulation(days): result_array = result.as_ndarray() result_array = remove_confirmed_compartments( - result_array[1:, :].transpose()) + result_array[1:, :].transpose(), del_indices) dataset_entries = copy.deepcopy(result_array) @@ -188,19 +173,13 @@ def generate_data( if normalize: # logarithmic normalization transformer = FunctionTransformer(np.log1p, validate=True) - inputs = np.asarray(data['inputs']).transpose(2, 0, 1).reshape(8, -1) - scaled_inputs = transformer.transform(inputs) - scaled_inputs = scaled_inputs.transpose().reshape(num_runs, input_width, 8) - scaled_inputs_list = scaled_inputs.tolist() - - labels = np.asarray(data['labels']).transpose(2, 0, 1).reshape(8, -1) - scaled_labels = transformer.transform(labels) - scaled_labels = scaled_labels.transpose().reshape(num_runs, label_width, 8) - scaled_labels_list = scaled_labels.tolist() - - # cast dfs to tensors - data['inputs'] = tf.stack(scaled_inputs_list) - data['labels'] = tf.stack(scaled_labels_list) + data['inputs'] = normalize_simulation_data( + data['inputs'], transformer, num_runs, 1) + data['labels'] = normalize_simulation_data( + data['labels'], transformer, num_runs, 1) + else: + data['inputs'] = tf.convert_to_tensor(data['inputs']) + data['labels'] = tf.convert_to_tensor(data['labels']) if save_data: # check if data directory exists. If necessary, create it. 
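# --- Editor's illustration (not part of the patch) -------------------------
# Both data_generation scripts now collect the flat indices of the two
# "confirmed" compartments via model.populations.get_flat_index and pass them
# to the shared remove_confirmed_compartments helper. Toy NumPy sketch of the
# resulting column removal (indices hard-coded here instead of queried):
import numpy as np

num_groups, num_compartments = 1, 10   # assumed sizes for the sketch
result = np.arange(5 * num_groups * num_compartments,
                   dtype=float).reshape(5, -1)

# InfectedNoSymptomsConfirmed (3) and InfectedSymptomsConfirmed (5) per group.
del_indices = [c + num_compartments * g
               for g in range(num_groups) for c in (3, 5)]

trimmed = np.delete(result, del_indices, axis=1)
print(trimmed.shape)  # (5, 8): eight compartments per group remain
# ---------------------------------------------------------------------------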
diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/hyperparameter_tuning.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/hyperparameter_tuning.py index 0123c354c3..d1a6f69ba5 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/hyperparameter_tuning.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/hyperparameter_tuning.py @@ -25,7 +25,7 @@ import time from sklearn.model_selection import KFold import numpy as np -import memilio.surrogatemodel.ode_secir_simple.grid_search as grid_search +import memilio.surrogatemodel.utils.grid_search as grid_search import memilio.surrogatemodel.ode_secir_simple.model as md # Setting random seed diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/model.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/model.py index 4033a64120..4db94c0cb2 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/model.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/model.py @@ -27,6 +27,7 @@ from memilio.simulation.osecir import InfectionState import memilio.surrogatemodel.ode_secir_simple.network_architectures as architectures +from memilio.surrogatemodel.utils.helper_functions import (calc_split_index) def initialize_model(parameters): @@ -126,22 +127,17 @@ def split_data(inputs, labels, split_train=0.7, 'valid_labels', 'test_inputs', 'test_labels'} """ - if split_train + split_valid + split_test > 1 + 1e-10: - raise ValueError( - "Summed data set shares are greater than 1. Please adjust the values.") - elif inputs.shape[0] != labels.shape[0] or inputs.shape[2] != labels.shape[2]: + if inputs.shape[0] != labels.shape[0] or inputs.shape[2] != labels.shape[2]: raise ValueError( "Number of batches or features different for input and labels") n = inputs.shape[0] - n_train = int(n * split_train) - n_valid = int(n * split_valid) - n_test = n - n_train - n_valid + splitting = calc_split_index(n, split_train, split_valid, split_test) inputs_train, inputs_valid, inputs_test = tf.split( - inputs, [n_train, n_valid, n_test], 0) + inputs, splitting, 0) labels_train, labels_valid, labels_test = tf.split( - labels, [n_train, n_valid, n_test], 0) + labels, splitting, 0) data = { 'train_inputs': inputs_train, diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/README.md b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/README.md new file mode 100644 index 0000000000..aa0e12f76d --- /dev/null +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/README.md @@ -0,0 +1,4 @@ +# Functions that are used in the various surrogate models +The file dampings.py contains different methods to generate damping days and damping factors. +grid_search.py provides functionality to evaluate and compare different model choices. +Different functions to prepare data can be found helper_functions.py \ No newline at end of file diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/__init__.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/__init__.py new file mode 100644 index 0000000000..b311d22eda --- /dev/null +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/__init__.py @@ -0,0 +1,23 @@ +############################################################################# +# Copyright (C) 2020-2025 MEmilio +# +# Authors: Agatha Schmidt, Henrik Zunker +# +# Contact: Martin J. 
Kuehn +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################# + +""" +Various utils for surrogate models +""" diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/dampings.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/dampings.py new file mode 100644 index 0000000000..1b4d713c83 --- /dev/null +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/dampings.py @@ -0,0 +1,323 @@ +############################################################################# +# Copyright (C) 2020-2025 MEmilio +# +# Authors: Manuel Heger, Henrik Zunker +# +# Contact: Martin J. Kuehn +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################# + +import random +import numpy as np + + +def calc_dist_days(days, min_day, n_dampings, min_distance=1): + """ + Calculating distance between two dampings if there are n_dampings on the interval + (min_day, days) + + :param days: Total number of days + :min_day: First day on which a damping can be applied + :n_dampings: Number of dampings + :min_distance: Lower bound for the distance between two dampings + """ + res = np.floor((days-min_day)/n_dampings) + + if res < min_distance: + raise ValueError( + "It's not possible to arrange this number of dampings in the desired interval with the given minimal distance.") + + return res + + +def generate_dampings(days, number_dampings, method, min_distance=2, + min_damping_day=2): + """ + Producing dampings for timeseries of length days according to the used method. + + :param days: Number of days per time series + :param number_dampings: Number of days on which damping can occur. 
+ :param method: Method used to generate the dampings, possible values "classic", "active", "random" + :param min_distance: Minimal distance between two dampings + :param min_damping_day: First day, where a damping can be applied + :returns: two lists containing the damping days and the associated damping factors + """ + if method == "classic": + damp_days, damp_factors = dampings_classic( + days, number_dampings, min_distance, min_damping_day) + elif method == "active": + damp_days, damp_factors = dampings_active( + days, number_dampings, min_damping_day) + elif method == "random": + damp_days, damp_factors = dampings_random( + days, number_dampings, min_distance, min_damping_day) + else: + raise ValueError( + "The method argument has to be one of the following: 'classic', 'active' or 'random'.") + + return damp_days, damp_factors + + +# Active Damping +def dampings_active(days, number_dampings, min_damping_day): + """ + Generating list of damping days and corresponding damping factors using the active method. + + The damping days are created with equal distance on the interval (min_damping_day, days]. + + :param days: Number of simulated days + :param number_dampings: Maximal number of dampings per simulation + :param min_damping_day: First day, where a damping can be applied + :returns: list of days and damping factors + """ + + # Setting parameters + gamma_pos = -2 + alpha = -4 + p0 = 0.4 + t1_max = -1 + t1_min = -2.5 + t2_max = 0.95 + t2_min = -0.25 + + # Defining possible damping days + distance_between_days = calc_dist_days( + days, min_damping_day, number_dampings) + damp_days = [min_damping_day + distance_between_days*(i+1) + for i in np.arange(number_dampings)] + + # Generate damping factors + dampings = calc_factors_active( + number_dampings, gamma_pos, alpha, p0, t1_max, t1_min, t2_max, t2_min) + + return damp_days, dampings + + +def calc_factors_active(n_ddays, gamma_pos=0, alpha=-1, p0=0.5, t1_max=-0.3, t1_min=-1.2, t2_max=2, t2_min=-0.5): + ''' + Producing damping factors using the active damping method. + + The idea is the following: Damping factors are produced randomly until a threshold value is reached. + In this case the factors are reduced stepwise until a moderate level is reached. + The new contact matrix is always calculated with respect to the initial matrix according + to the following rule: + + M = exp(h)*M_0 + + The h-values are generated by the following rule: + 1) h is initialized as 0 + 2) If h > gamma_pos, we start to reduce the h-value actively by choosing changes delta_h uniformly in + the interval (t1_min, t1_max). After each step h is updated according to h = h + delta_h + This procedure is repeated till h < alpha + 3) If h < gamma_pos the changes of h, called delta_h are chosen randomly, s.t. delta_h = 0 with + probability 1-p0, otherwise the value is uniformly distributed on the interval (t2_min, t2_max).
+ + :param n_ddays: Number of damping days in time series + :param gamma_pos: upper bound for h value + :param alpha: upper bound for h value, where active damping should be stopped + :param p0: probability for a change of the damping factor + :param t1_max: upper end point for size of active damping changes + :param t1_min: lower end point for size of active damping changes + :param t2_max: upper end point for size of base damping changes + :param t2_min: lower end point for size of base damping changes + ''' + h = 0 + k = 0 + dampings = [] + + while k < n_ddays: + # If the threshold value is reached, active damping is started + if h > gamma_pos: + # active reducing the damping factor + while h > alpha and k < n_ddays: + delta_h = np.random.uniform(t1_min, t1_max) + h = h + delta_h + dampings.append(1-np.exp(h)) + k = k+1 + # otherwise changes of the damping factor are generated randomly + else: + # Whether or not a non-trivial change of the damping factor is applied + if np.random.binomial(1, p0): + delta_h = np.random.uniform(t2_min, t2_max) + + else: + delta_h = 0 + h = h+delta_h + dampings.append(1 - np.exp(h)) + k = k+1 + + return dampings + + +# Classic Damping +def dampings_classic(days, number_dampings, min_distance=2, + min_damping_day=2): + """ + Generate the damping days using shadow damping and picking days uniformly with a given minimal distance. + + The idea behind shadow damping is the following: Days are picked randomly in the interval between min_damping_day and days. + After picking one day a fixed number of days before and after the chosen day is blocked. + The procedure is repeated till number_dampings many days are chosen. To overcome the problem of higher probability at the boundary, + the interval is artificially increased, artificial days are not counted. + The corresponding factors are drawn uniformly from the interval (0,0.5). + + :param days: Number of days simulated per run. + :param number_dampings: Number of damping days generated. + :param min_distance: Minimal distance between two dampings + :param min_damping_day: First day, where a damping can be applied + :returns: Two lists of length number_dampings containing the days and the factors. + """ + # Checking, if the given parameters are compatible + calc_dist_days(days, min_damping_day, number_dampings, min_distance) + + # Generating damping days + damp_days = generate_dampings_withshadowdamp( + number_dampings, days, min_distance, min_damping_day, 1)[0] + + # Generating damping factors + damp_factors = np.random.uniform(0, 0.5, number_dampings).tolist() + + return damp_days, damp_factors + + +def generate_dampings_withshadowdamp(number_of_dampings, days, min_distance, min_damping_day, number_of_runs): + """ + Sampling the damping days using the shadow damping method. + + The idea is to draw dampings with a minimum distance while trying to keep + the distribution of damping days uniform. We create a list of all possible days, + draw one damping day and delete all days before and after the damping that + are within the range of the min_distance. To ensure that the data is not biased, + we include days outside the usual range. A day x in the middle of the list can + be removed from the list by a drawn day before and after x. A day in the beginning + of the list can be removed only by drawn days y, y>x. This leads to the effect that + the first and last days are chosen more often.
By drawing days outside of the allowed range + (forbidden dampings) which are removed after, we ensure that also the days at the beginning and + end of the list can be removed from the list because of the minimum distance. + + :param number_of_dampings: Number of damping days per run + :param days: Total number of days per run + :param min_distance: Minimal distance between two damping days + :param min_damping_day: First day when a damping can be applied + :param number_of_runs: Number of runs for which damping days should be generated + :returns: list of lists of damping days. + """ + + all_dampings = [] + count_shadow = 0 + while len(all_dampings) < number_of_runs: + # Reset the days list and dampings for each run + days_list = list(range(min_damping_day, days)) + dampings = [] + + if count_shadow < 2: + for _ in range(number_of_dampings): + if len(days_list) > 0: + damp = random.choice(days_list) + days_before = list(range(damp - min_distance, damp)) + days_after = list(range(damp, damp + min_distance + 1)) + dampings.append(damp) + days_list = [ele for ele in days_list if ele not in ( + days_before + days_after)] + else: + # Restart the process when days_list is empty + break + else: + # Exit loop only if dampings were successfully drawn + forbidden_damping_values = list( + range(0 - min_distance, 0)) + list(range(days + 1, days + min_distance + 1)) + dampings = [ + ele for ele in dampings if ele not in forbidden_damping_values] + if len(dampings) >= number_of_dampings: + all_dampings.append(sorted(dampings)) + continue + else: + # Generate forbidden damping + damp = random.choice( + list(range(0 - min_distance, 0)) + + list(range(days + 1, days + min_distance + 1)) + ) + dampings.append(damp) + for _ in range(number_of_dampings): + if len(days_list) > 0: + damp = random.choice(days_list) + days_before = list(range(damp - min_distance, damp)) + days_after = list(range(damp, damp + min_distance + 1)) + dampings.append(damp) + days_list = [ele for ele in days_list if ele not in ( + days_before + days_after)] + else: + # Restart the process when days_list is empty + break + else: + # Reset shadow count only if dampings were successfully drawn + count_shadow = 0 + forbidden_damping_values = list( + range(0 - min_distance, 0)) + list(range(days + 1, days + min_distance + 1)) + dampings = [ + ele for ele in dampings if ele not in forbidden_damping_values] + if len(dampings) == number_of_dampings: + all_dampings.append(sorted(dampings)) + continue + + # Restart process if any issue occurred + count_shadow += 1 + + return all_dampings + + +# Random Damping + +def dampings_random(days, number_dampings, min_damping_day=2, + min_distance_damping_day=2): + """ + Generate random damping days according to the following rule. + + The days are drawn using geometrically distributed waiting times and a fixed minimal distance between two + damping days. The first damping can occur at min_damping_day. The associated damping factors are drawn uniformly + between 0 and 0.5. + + :param days: Number of days simulated per run. + :param number_dampings: Number of damping days generated. + :param min_distance_damping_day: Minimal distance between two dampings + :param min_damping_day: First day, where a damping can be applied + :returns: Two lists of length number_dampings containing the days and the factors.
+ """ + # Setting parameters + + # Calculating the expected distance between two dampings + distance_between_days = calc_dist_days( + days, min_damping_day, number_dampings, min_distance_damping_day) + # Reducing due to minimal distance restriction + reduced_distance = distance_between_days - min_distance_damping_day + + # Try till one admissible configuration of waiting times is produced + running = True + while running: + dist = np.random.geometric(1/reduced_distance, number_dampings) + if np.sum(dist) + min_damping_day + number_dampings*min_distance_damping_day < days: + running = False + + # Reconstructing the days using the waiting times + ddays = [] + day = min_damping_day + for k in np.arange(len(dist)): + day = day + dist[k] + ddays.append(day) + day = day + min_distance_damping_day + + # Generating the associated damping factors + damping_factors = np.random.uniform(0, 0.5, number_dampings) + + return ddays, damping_factors diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/grid_search.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/grid_search.py similarity index 85% rename from pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/grid_search.py rename to pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/grid_search.py index 869c60404d..d6028f4a93 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/ode_secir_simple/grid_search.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/grid_search.py @@ -26,13 +26,15 @@ import time from sklearn.model_selection import KFold import numpy as np -import memilio.surrogatemodel.ode_secir_simple.network_architectures as architectures -import memilio.surrogatemodel.ode_secir_simple.model as md +import memilio.surrogatemodel.ode_secir_simple.network_architectures as architectures_simple +import memilio.surrogatemodel.ode_secir_groups.network_architectures as architectures_groups +import memilio.surrogatemodel.ode_secir_simple.model as md_simple +import memilio.surrogatemodel.ode_secir_groups.model as md_groups # Function to train and evaluate the model using cross-validation -def train_and_evaluate_model(param, inputs, labels, training_parameter, show_results=False): +def train_and_evaluate_model(param, inputs, labels, training_parameter, show_results=False, modeltype="simple", n_kfold=5): """ Training and evaluating a model with given architecture using 5-fold cross validation, returning a dictionary with the main training statistics. :param param: tuple of parameters describing the model architecture, it should be of the form @@ -43,19 +45,22 @@ def train_and_evaluate_model(param, inputs, labels, training_parameter, show_res (early_stop, max_epochs, loss, optimizer, metrics), where loss is a loss-function implemented in keras, optimizer is the name of the used optimizer, metrics is a list of used training metrics, e.g. [tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.MeanAbsolutePercentageError()] :param show_results: Boolean, whether or not the evaluation results are printed. + :param modeltype: String, specifying, which type of model is going to be tested. The possible values are "simple" - refering to the surrogate for the SECIR-model + without age_resolution, "groups" - for the age resolved SECIR model. 
+ :param n_kfold: number of partitions used to cross-validate :returns: a dictionary of training statistics of the form {"model", "activation","optimizer","mean_train_loss_kfold","mean_val_loss_kfold","training_time", "train_losses", "val_losses"} """ # Unpacking parameters - _, _, _, _, activation, modelname = param + activation, modelname = param[-2:] early_stop, max_epochs, loss, optimizer, metrics = training_parameter early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=early_stop, mode='min') # Preparing K-Fold Cross-Validation - kf = KFold(n_splits=5) + kf = KFold(n_splits=n_kfold) train_losses = [] val_losses = [] @@ -74,7 +79,12 @@ def train_and_evaluate_model(param, inputs, labels, training_parameter, show_res valid_labels = tf.gather(labels, indices=val_idx) # Initializing model - model = md.initialize_model(param) + if modeltype == "simple": + model = md_simple.initialize_model(param) + elif modeltype == "groups": + model = md_groups.initialize_model(param) + else: + raise ValueError(modeltype+" is not known.") # Compile the model model.compile(loss=loss, optimizer=optimizer, @@ -115,7 +125,7 @@ def train_and_evaluate_model(param, inputs, labels, training_parameter, show_res } -def perform_grid_search(model_parameters, inputs, labels, training_parameters, filename_df, path=None): +def perform_grid_search(model_parameters, inputs, labels, training_parameters, filename_df, path=None, modeltype="simple"): """ Performing grid search for a given set of model parameters The results are stored in directory 'secir_simple_grid_search', each row has the form @@ -132,6 +142,8 @@ def perform_grid_search(model_parameters, inputs, labels, training_parameters, f metrics is a list of used training metrics, e.g. [tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.MeanAbsolutePercentageError()] :param filename_df: String, giving name of the file, where the data is stored, actual filename is given by filename_df + ".pickle" :param path: String representing the path, where dataframe should be stored + :param modeltype: String, specifying, which type of model is going to be tested. The possible values are "simple" - referring to the surrogate for the SECIR-model + without age_resolution, "groups" - for the age resolved SECIR model.
""" # Create a DataFrame to store the results df_results = pd.DataFrame(columns=['model', 'optimizer', 'number_of_hidden_layers', 'number_of_neurons', 'activation', @@ -141,7 +153,7 @@ def perform_grid_search(model_parameters, inputs, labels, training_parameters, f # Iterate the different model architectures and save the training results for param in model_parameters: for training_parameter in training_parameters: - _, _, layer, neuron_number, activation, modelname = param + layer, neuron_number, activation, modelname = param[-4:] results = train_and_evaluate_model( param, inputs, labels, training_parameter) df_results.loc[len(df_results.index)] = [ @@ -156,7 +168,7 @@ def perform_grid_search(model_parameters, inputs, labels, training_parameters, f ] # Save the results in file - folder_name = 'secir_simple_grid_search' + folder_name = 'secir_' + modeltype + '_grid_search' if path is None: path = os.path.dirname(os.path.realpath(__file__)) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/helper_functions.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/helper_functions.py new file mode 100644 index 0000000000..7b91cd5de8 --- /dev/null +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel/utils/helper_functions.py @@ -0,0 +1,103 @@ +############################################################################# +# Copyright (C) 2020-2025 MEmilio +# +# Authors: Manuel Heger, Henrik Zunker +# +# Contact: Martin J. Kuehn +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################# + +import numpy as np +import tensorflow as tf +import os +import pandas as pd + + +def interpolate_age_groups(data_entry): + """! Interpolates the age groups from the population data into the age groups used in the simulation. + We assume that the people in the age groups are uniformly distributed. + @param data_entry Data entry containing the population data. + @return List containing the population in each age group used in the simulation. + """ + age_groups = { + "A00-A04": data_entry['<3 years'] + data_entry['3-5 years'] * 2 / 3, + "A05-A14": data_entry['3-5 years'] * 1 / 3 + data_entry['6-14 years'], + "A15-A34": data_entry['15-17 years'] + data_entry['18-24 years'] + data_entry['25-29 years'] + data_entry['30-39 years'] * 1 / 2, + "A35-A59": data_entry['30-39 years'] * 1 / 2 + data_entry['40-49 years'] + data_entry['50-64 years'] * 2 / 3, + "A60-A79": data_entry['50-64 years'] * 1 / 3 + data_entry['65-74 years'] + data_entry['>74 years'] * 1 / 5, + "A80+": data_entry['>74 years'] * 4 / 5 + } + return [age_groups[key] for key in age_groups] + + +def remove_confirmed_compartments(result_array, delete_indices): + """ Removes the confirmed compartments which are not used in the data generation. + + :param result_array: Array containing the simulation results. + :param delete_indices: flat indices indicating position containing data from confirmed compartments. 
+ :returns: Array containing the simulation results without the confirmed compartments. + + """ + return np.delete(result_array, delete_indices, axis=1) + + +def normalize_simulation_data(data, transformer, num_runs, num_groups=6, num_compartments=8): + """ Transforms the data by a logarithmic normalization. + Reshaping is necessary, because the transformer needs an array with dimension <= 2. + + :param data: Data to be transformed. + :param transformer: Transformer used for the transformation. + :param num_runs: Number of samples represented by the data. + :param num_groups: Number of age groups represented by data. + :param num_compartments: Number of compartments. + :returns: Transformed data. + + """ + data = np.asarray(data).reshape(num_runs, -1) + scaled_data = transformer.transform(data) + return tf.convert_to_tensor(scaled_data.reshape(num_runs, -1, num_groups*num_compartments)) + + +def calc_split_index(n, split_train=0.7, + split_valid=0.2, split_test=0.1): + """ + Calculating the indixes for a split_train:split_valid:split_test decomposition of a set with size n + + It must hold split_train + split_valid + split_test = 1 + + :param n: integer value + :param split_train: value between 0 and 1 + :param split_valid: value between 0 and 1 + :param split_test: value between 0 and 1 + :returns: a list of the form [i_train, i_valid, i_test] + """ + if split_train + split_valid + split_test > 1 + 1e-10: + raise ValueError( + "Summed data set shares are greater than 1. Please adjust the values.") + n_train = int(n * split_train) + n_valid = int(n * split_valid) + n_test = n - n_train - n_valid + + return [n_train, n_valid, n_test] + + +def flat_input(input): + """ Flatten input dimension + + :param input: input array of size (n,k,l) + :returns: reshaped array of size (n, k*l) + + """ + dim = tf.reduce_prod(tf.shape(input)[1:]) + return tf.reshape(input, [-1, dim]) diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_ode_secir_groups.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_ode_secir_groups.py index 80f9c0f90d..118fa16375 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_ode_secir_groups.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_ode_secir_groups.py @@ -1,7 +1,7 @@ ############################################################################# # Copyright (C) 2020-2025 MEmilio # -# Authors: +# Authors: Manuel Heger, Henrik Zunker # # Contact: Martin J. Kuehn # @@ -18,9 +18,9 @@ # limitations under the License. 
############################################################################# from pyfakefs import fake_filesystem_unittest - from memilio.surrogatemodel.ode_secir_groups import (data_generation, model, network_architectures) + from unittest.mock import patch import os import unittest @@ -49,25 +49,25 @@ def setUp(self): return_value=0.6 * np.ones((6, 6))) def test_simulation_run(self, mock_baseline, mock_minimum): """ - :param mock_baseline: :param mock_minimum: - """ + days_1 = 10 days_2 = 30 days_3 = 50 - damping_date = 5 + damping_days = [3, 5] + damping_factors = [0.3, 0.4] population = [5256.0, 10551, 32368.5, 43637.833333333336, 22874.066666666666, 8473.6] simulation_1 = data_generation.run_secir_groups_simulation( - days_1, damping_date, population) + days_1, damping_days, damping_factors, population) simulation_2 = data_generation.run_secir_groups_simulation( - days_2, damping_date, population) + days_2, damping_days, damping_factors, population) simulation_3 = data_generation.run_secir_groups_simulation( - days_3, damping_date, population) + days_3, damping_days, damping_factors, population) # result length self.assertEqual(len(simulation_1[0]), days_1+1) @@ -76,23 +76,23 @@ def test_simulation_run(self, mock_baseline, mock_minimum): # damping self.assertEqual( - simulation_1[1].size, + simulation_1[1][0].size, len(population) * len(population)) self.assertEqual( - simulation_2[1].size, + simulation_2[1][0].size, len(population) * len(population)) self.assertEqual( - simulation_3[1].size, + simulation_3[1][0].size, len(population) * len(population)) - self.assertLessEqual(simulation_1[1].max(), 1) - self.assertGreaterEqual(simulation_1[1].max(), 0) + self.assertLessEqual(simulation_1[1][0].max(), 1) + self.assertGreaterEqual(simulation_1[1][0].max(), 0) - self.assertLessEqual(simulation_2[1].max(), 1) - self.assertGreaterEqual(simulation_2[1].max(), 0) + self.assertLessEqual(simulation_2[1][0].max(), 1) + self.assertGreaterEqual(simulation_2[1][0].max(), 0) - self.assertLessEqual(simulation_3[1].max(), 1) - self.assertGreaterEqual(simulation_3[1].max(), 0) + self.assertLessEqual(simulation_3[1][0].max(), 1) + self.assertGreaterEqual(simulation_3[1][0].max(), 0) @patch('memilio.surrogatemodel.ode_secir_groups.data_generation.getMinimumMatrix', return_value=0.1 * np.ones((6, 6))) @@ -104,17 +104,13 @@ def test_simulation_run(self, mock_baseline, mock_minimum): def test_data_generation_runs( self, mock_population, mock_baseline, mock_minimum): """ - :param mock_population: :param mock_baseline: :param mock_minimum: - """ - - input_width_1 = 1 + input_width_1 = 10 input_width_2 = 5 - - label_width_1 = 1 + label_width_1 = 20 label_width_2 = 10 num_runs_1 = 1 @@ -122,7 +118,7 @@ def test_data_generation_runs( data_1 = data_generation.generate_data( num_runs_1, self.path, "", input_width_1, label_width_1, - save_data=False) + save_data=False, number_dampings=2) self.assertEqual(len(data_1['inputs']), num_runs_1) self.assertEqual(len(data_1['inputs'][0]), input_width_1) self.assertEqual(len(data_1['inputs'][0][0]), 48) @@ -132,7 +128,7 @@ def test_data_generation_runs( data_2 = data_generation.generate_data( num_runs_2, self.path, "", input_width_2, label_width_2, - save_data=False) + save_data=False, number_dampings=2) self.assertEqual(len(data_2['inputs']), num_runs_2) self.assertEqual(len(data_2['inputs'][0]), input_width_2) self.assertEqual(len(data_2['inputs'][0][0]), 48) @@ -150,25 +146,25 @@ def test_data_generation_runs( def test_data_generation_save( self, mock_population, 
mock_baseline, mock_minimum): """ - - :param mock_population: - :param mock_baseline: - :param mock_minimum: - + : param mock_population: + : param mock_baseline: + : param mock_minimum: """ - - input_width = 2 - label_width = 3 + input_width = 5 + label_width = 30 num_runs = 1 data_generation.generate_data(num_runs, self.path, "", input_width, - label_width) + label_width, damping_method="random", number_dampings=2) self.assertEqual(len(os.listdir(self.path)), 1) self.assertEqual(os.listdir(self.path), - ['data_secir_groups.pickle']) + ['data_secir_groups_30days_1_random.pickle']) + + +# # Testing network_architectures.py + -# Testing network_architectures.py def test_mlp_multi_single(self): with self.assertRaises(ValueError) as error: network_architectures.mlp_multi_input_single_output( @@ -248,8 +244,11 @@ def test_mlp_multi_multi(self): self.assertEqual(len(model.layers), 5) input_zero = np.zeros((5, 3, 7)) output_zeros = model(input_zero) + # Number of time series self.assertEqual(output_zeros.shape[0], 5) + # Number of days per predicted time series self.assertEqual(output_zeros.shape[1], 12) + # Size of feature space per day self.assertEqual(output_zeros.shape[2], 21) def test_cnn_multi_multi(self): @@ -303,7 +302,7 @@ def test_cnn_multi_multi(self): self.assertEqual(str(error.exception), error_message) model = network_architectures.cnn_multi_input_multi_output( - 21, 4, 3, 3, 256, 2) + 21, 4, 4, 3, 256, 2) self.assertEqual(len(model.layers), 6) input_zero = np.zeros((12, 5, 7)) output_zeros = model(input_zero) @@ -311,8 +310,8 @@ def test_cnn_multi_multi(self): self.assertEqual(output_zeros.shape[0], 12) # length of one time series self.assertEqual(output_zeros.shape[1], 21) - # Dimension of one time step - self.assertEqual(output_zeros.shape[2], 12) + # Dimension of one time step (16 = 4*4) + self.assertEqual(output_zeros.shape[2], 16) def test_lstm_multi_multi(self): with self.assertRaises(ValueError) as error: @@ -358,7 +357,7 @@ def test_lstm_multi_multi(self): self.assertEqual(str(error.exception), error_message) model = network_architectures.lstm_multi_input_multi_output( - 21, 4, 3, 12, 3, 12) + 21, 4, 4, 12, 3, 12) self.assertEqual(len(model.layers), 6) input_zero = np.zeros((12, 5, 7)) output_zeros = model(input_zero) @@ -366,53 +365,51 @@ def test_lstm_multi_multi(self): self.assertEqual(output_zeros.shape[0], 12) # length of one time series self.assertEqual(output_zeros.shape[1], 21) - # Dimension of one time step - self.assertEqual(output_zeros.shape[2], 12) - -# Testing model.py - def test_calc_split_index(self): - with self.assertRaises(ValueError) as error: - model.calc_split_index( - 10, 0.9, 0.1, 0.1 - ) - error_message = "Summed data set shares are greater than 1. Please adjust the values." 
- self.assertEqual(str(error.exception), error_message) - split_index = model.calc_split_index(10, 0.7, 0.1, 0.2) - self.assertEqual(split_index, [7, 1, 2]) + # Dimension of one time step (16 = 4*4) + self.assertEqual(output_zeros.shape[2], 16) def test_prepare_data_classic(self): data = { - "inputs": np.zeros((10, 5, 1)), - "labels": np.zeros((10, 2)), - "contact_matrix": [np.zeros((2, 2)) for _ in np.arange(10)], - "damping_day": [[1] for _ in np.arange(10)] + "inputs": np.zeros((10, 5, 48)), + "labels": np.zeros((10, 25, 48)), + "contact_matrices": [[np.zeros((6, 6)) for _ in np.arange(2)] for _ in np.arange(10)], + "damping_days": [[1, 2] for _ in np.arange(10)] } data_new = model.prepare_data_classic(data) - res = tf.cast([0, 0, 0, 0, 0, 0, 0, 0, 0, 1], tf.float32) - self.assertTrue(all(res == data_new["train_inputs"][0])) - self.assertEqual(data_new["train_inputs"].shape, (7, 10)) - self.assertEqual(data_new["valid_labels"].shape, (2, 2)) + a = np.zeros(5*48 + 2*36) + b = np.array([1, 2]) + res = tf.cast(np.hstack((a, b)).reshape( + (1, 5*48 + 2*36+2)), tf.float32) + self.assertTrue(np.asarray(res == data_new["train_inputs"][0]).all()) + self.assertEqual(data_new["train_inputs"].shape, (7, 5*48 + 2*36 + 2)) + self.assertEqual(data_new["valid_labels"].shape, (2, 25, 48)) def test_prod_time_series(self): - obj = [1, 0, 2] + A = np.ones((2, 3)) + obj = [i*A for i in np.arange(3)] ts = model.prod_time_series(obj, 3, 2) + res = [[0*A, 0*A], [A, A], [2*A, 2*A]] bl = (ts == tf.reshape( - tf.stack([[1, 1], [0, 0], [2, 2]]), [3, 2, 1])) + tf.stack(res), [3, 2, -1])) self.assertTrue(tf.math.reduce_all(bl)) - """ def test_prepare_data_timeseries(self): data = { - "inputs": np.zeros((10, 5, 1)), - "labels": np.zeros((10, 2)), - "contact_matrix": [np.zeros((2, 2)) for _ in np.arange(10)], - "damping_day": [[1] for _ in np.arange(10)] + "inputs": np.zeros((10, 5, 48)), + "labels": np.zeros((10, 25, 48)), + "contact_matrices": [[np.zeros((6, 6)) for _ in np.arange(2)] for _ in np.arange(10)], + "damping_days": [[1, 2] for _ in np.arange(10)] } data_new = model.prepare_data_timeseries(data) - res = tf.cast([[0, 0, 0, 0, 0, 1] for _ in np.arange(5)], tf.float16) + a = np.zeros(48 + 2*36) + b = np.array([1, 2]) + res = tf.cast(np.hstack((a, b)).reshape( + (1, 48 + 2*36+2)), tf.float32) + res = tf.cast([res for _ in np.arange(5)], tf.float32) + self.assertTrue(tf.math.reduce_all( + data_new["train_inputs"].shape == (7, 5, 48 + 2*36 + 2))) self.assertTrue(tf.math.reduce_all(res == data_new["train_inputs"][0])) - """ def test_initialize_model(self): # Helper function to normalize the .getconfig() output @@ -519,7 +516,7 @@ def test_network_fit(self, mock_population, mock_baseline, mock_minimum): label_width = 10 num_runs = 5 data_generation.generate_data(num_runs, self.path, "", input_width, - label_width) + label_width, number_dampings=1) # models with multiple outputs model_mlp_multi_input_multi_output = model.network_fit( @@ -527,8 +524,8 @@ def test_network_fit(self, mock_population, mock_baseline, mock_minimum): label_width), training_parameter=training_parameter, path=self.path, - filename="data_secir_groups.pickle", - modeltype='classic', plot=False) + filename="data_secir_groups_10days_5_random.pickle", + modeltype='classic', plot_stats=False) self.assertEqual( model_mlp_multi_input_multi_output.model.output_shape[1], label_width) @@ -542,8 +539,8 @@ def test_network_fit(self, mock_population, mock_baseline, mock_minimum): modeltype='timeseries', training_parameter=training_parameter, 
path=self.path, - filename="data_secir_groups.pickle", - plot=False) + filename="data_secir_groups_10days_5_random.pickle", + plot_stats=False) self.assertEqual( model_lstm_multi_output.model.output_shape[1], label_width) self.assertEqual( @@ -555,8 +552,8 @@ def test_network_fit(self, mock_population, mock_baseline, mock_minimum): modeltype='timeseries', training_parameter=training_parameter, path=self.path, - filename="data_secir_groups.pickle", - plot=False) + filename="data_secir_groups_10days_5_random.pickle", + plot_stats=False) self.assertEqual( cnn_output.model.output_shape[1], label_width) self.assertEqual( diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_ode_secir_simple.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_ode_secir_simple.py index 8bd3dd5a13..8fbad9f500 100644 --- a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_ode_secir_simple.py +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_ode_secir_simple.py @@ -17,14 +17,12 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################# -from pyfakefs import fake_filesystem_unittest +from pyfakefs import fake_filesystem_unittest from memilio.surrogatemodel.ode_secir_simple import (data_generation, - network_architectures, grid_search) + network_architectures) from memilio.surrogatemodel.ode_secir_simple import model as md import os -import pickle -import pandas as pd import unittest from unittest.mock import patch @@ -418,92 +416,6 @@ def test_network_fit(self): self.assertEqual( len(lstm_output.history['val_loss']), max_epochs) - def test_train_and_evaluate_model(self): - """ Testing evaluation procedure """ - inputs = tf.ones([5, 5, 8]) - labels = tf.ones([5, 10, 8]) - max_epochs = 1 - early_stop = 100 - loss = tf.keras.losses.MeanAbsolutePercentageError() - optimizer = 'Adam' - metric = [tf.keras.metrics.MeanAbsoluteError(), - tf.keras.metrics.MeanAbsolutePercentageError()] - model_parameter = ( - 10, 8, 1, 5, "relu", "Dense" - ) - - training_parameter = ( - early_stop, max_epochs, loss, optimizer, metric) - - res = grid_search.train_and_evaluate_model( - model_parameter, inputs, labels, training_parameter, False) - self.assertEqual(len(res["val_losses"][0]), 5) - self.assertEqual(len(res["val_losses"][0][0]), max_epochs) - self.assertEqual(res["optimizer"], "Adam") - - @patch('memilio.surrogatemodel.ode_secir_simple.grid_search.train_and_evaluate_model') - def test_perform_grid_search(self, mock_train_and_evaluate_model): - """ Testing gridsearch along a sequence of possible parameter values""" - mock_train_and_evaluate_model.return_value = { - "model": "._.", - "activation": "relu", - "optimizer": "Eva", - "mean_train_loss_kfold": 42, - "mean_val_loss_kfold": 12, - "training_time": 1, - "train_losses": [3], - "val_losses": [6] - } - filename_df = "dataframe_optimizer" - os.mkdir(self.path) - - # General grid search parameters for the training process: - early_stop = [100] - max_epochs = [1] - losses = [tf.keras.losses.MeanAbsolutePercentageError()] - optimizers = ['Adam', 'Adam'] - metrics = [[tf.keras.metrics.MeanAbsoluteError(), - tf.keras.metrics.MeanAbsolutePercentageError()]] - - # Define grid search parameters for the architecture - label_width = 3 - num_outputs = 2 - hidden_layers = [1, 2] - neurons_in_hidden_layer = [32] - activation_function = ['relu'] 
- models = ["Dense"] - - # Collecting parameters - training_parameters = [(early, epochs, loss, optimizer, metric) - for early in early_stop for epochs in max_epochs for loss in losses - for optimizer in optimizers for metric in metrics] - - model_parameters = [(label_width, num_outputs, layer, neuron_number, activation, modelname) - for layer in hidden_layers for neuron_number in neurons_in_hidden_layer - for activation in activation_function for modelname in models] - - # generate dataset with multiple output - inputs = tf.ones([5, 1, 2]) - labels = tf.ones([5, 3, 2]) - - grid_search.perform_grid_search( - model_parameters, inputs, labels, training_parameters, - filename_df, self.path - ) - - # testing saving the results - self.assertEqual(len(os.listdir(self.path)), 1) - - self.assertEqual(os.listdir(os.path.join(self.path, 'secir_simple_grid_search')), - [filename_df+".pickle"]) - - # testing size of the output - path_name = os.path.join(self.path, 'secir_simple_grid_search') - with open(os.path.join(path_name, filename_df + ".pickle"), "rb") as f: - d = pickle.load(f) - df = pd.DataFrame(d) - self.assertEqual(len(df['model']), 4) - if __name__ == '__main__': unittest.main() diff --git a/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_utils.py b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_utils.py new file mode 100644 index 0000000000..9c10b60bbf --- /dev/null +++ b/pycode/memilio-surrogatemodel/memilio/surrogatemodel_test/test_surrogatemodel_utils.py @@ -0,0 +1,334 @@ +############################################################################# +# Copyright (C) 2020-2025 MEmilio +# +# Authors: Manuel Heger, Henrik Zunker +# +# Contact: Martin J. Kuehn +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################# +from pyfakefs import fake_filesystem_unittest + +from memilio.surrogatemodel.ode_secir_groups import (data_generation, model, + network_architectures) +import memilio.surrogatemodel.utils.helper_functions as utils +from memilio.surrogatemodel.utils import dampings, grid_search +from sklearn.preprocessing import FunctionTransformer + +from unittest.mock import patch +import pandas as pd +import os +import unittest +import pickle + +import numpy as np +import tensorflow as tf +import logging + +# suppress all autograph warnings from tensorflow + +logging.getLogger("tensorflow").setLevel(logging.ERROR) + + +class TestSurrogatemodelUtils(fake_filesystem_unittest.TestCase): + """ """ + + path = '/home/' + + def setUp(self): + """ """ + self.setUpPyfakefs() + + def test_interpolate_age_groups(self): + res = [1666.6666666666665, 1333.3333333333333, 3500.0, + 2166.6666666666665, 1533.3333333333333, 800.0] + entry = {'ID_County': 1000, 'Population': 11000, '<3 years': 1000, '3-5 years': 1000, '6-14 years': 1000, '15-17 years': 1000, + '18-24 years': 1000, '25-29 years': 1000, '30-39 years': 1000, '40-49 years': 1000, '50-64 years': 1000, '65-74 years': 1000, '>74 years': 1000} + interpol_entry = utils.interpolate_age_groups( + entry) + self.assertEqual(res, interpol_entry) + + def test_calc_dist_days(self): + with self.assertRaises(ValueError) as error: + dampings.calc_dist_days(10, 2, 10, 2) + error_message = "It's not possible to arrange this number of dampings in the desired interval with the given minimal distance." + self.assertEqual(str(error.exception), error_message) + + with self.assertRaises(ValueError) as error: + dampings.calc_dist_days(102, 2, 50, 3) + error_message = "It's not possible to arrange this number of dampings in the desired interval with the given minimal distance." + self.assertEqual(str(error.exception), error_message) + + self.assertEqual(dampings.calc_dist_days(102, 2, 50, 1), 2) + self.assertEqual(dampings.calc_dist_days(30, 2, 10, 1), 2) + + def test_generate_dampings(self): + with self.assertRaises(ValueError) as error: + dampings.generate_dampings(100, 2, "gelb") + error_message = "The method argument has to be one of the following: 'classic', 'active' or 'random'." + self.assertEqual(str(error.exception), error_message) + + def test_dampings_active(self): + # length of generated sequence + factors1 = dampings.calc_factors_active(10) + factors2 = dampings.calc_factors_active(18) + # testing if damping factors are in the desired interval + self.assertTrue(np.all(np.log(1-np.array(factors1)) <= 2)) + self.assertTrue(np.all(np.log(1-np.array(factors2)) <= 2)) + self.assertEqual(len(factors1), 10) + self.assertEqual(len(factors2), 18) + + with self.assertRaises(ValueError) as error: + dampings.dampings_active(10, 200, 3) + error_message = "It's not possible to arrange this number of dampings in the desired interval with the given minimal distance." 
+ self.assertEqual(str(error.exception), error_message) + + # Combine days and factors + days1, factors1 = dampings.dampings_active(30, 5, 4) + days2, factors2 = dampings.dampings_active(40, 10, 4) + + self.assertEqual(len(days1), 5) + self.assertEqual(len(factors1), len(days1)) + self.assertEqual(len(days2), 10) + self.assertEqual(len(days2), len(factors2)) + + def test_dampings_classic(self): + + days1 = dampings.generate_dampings_withshadowdamp(4, 30, 3, 4, 1) + days2 = dampings.generate_dampings_withshadowdamp(10, 90, 2, 10, 1) + self.assertEqual(len(days1[0]), 4) + self.assertEqual(len(days2[0]), 10) + + with self.assertRaises(ValueError) as error: + dampings.dampings_classic(10, 200) + error_message = "It's not possible to arrange this number of dampings in the desired interval with the given minimal distance." + self.assertEqual(str(error.exception), error_message) + + # Combine days and factors + days1, factors1 = dampings.dampings_classic(30, 5) + days2, factors2 = dampings.dampings_classic(40, 5) + + self.assertEqual(len(days1), 5) + self.assertEqual(len(factors1), len(days1)) + self.assertEqual(len(days2), 5) + self.assertEqual(len(days2), len(factors2)) + # testing if damping factors are in the desired interval + self.assertTrue(np.all(np.array(factors1) <= 2)) + self.assertTrue(np.all(np.array(factors2) <= 2)) + + def test_dampings_random(self): + with self.assertRaises(ValueError) as error: + dampings.dampings_random(10, 200) + error_message = "It's not possible to arrange this number of dampings in the desired interval with the given minimal distance." + self.assertEqual(str(error.exception), error_message) + + days1, factors1 = dampings.dampings_random(30, 5) + days2, factors2 = dampings.dampings_random(40, 5) + + self.assertEqual(len(days1), 5) + self.assertEqual(len(factors1), len(days1)) + self.assertEqual(len(days2), 5) + self.assertEqual(len(days2), len(factors2)) + # testing if damping factors are in the desired interval + self.assertTrue(np.all(np.array(factors1) <= 2)) + self.assertTrue(np.all(np.array(factors2) <= 2)) + + # Testing helper_functions.py + def test_calc_split_index(self): + with self.assertRaises(ValueError) as error: + utils.calc_split_index( + 10, 0.9, 0.1, 0.1 + ) + error_message = "Summed data set shares are greater than 1. Please adjust the values." 
+ self.assertEqual(str(error.exception), error_message) + split_index = utils.calc_split_index(10, 0.7, 0.1, 0.2) + self.assertEqual(split_index, [7, 1, 2]) + + def test_remove_confirmed_compartments(self): + A = np.asarray([[1, 2, 3, 4, 5]]) + + A_new = utils.remove_confirmed_compartments(A, [1, 2]) + self.assertTrue(np.all(np.asarray([[1, 4, 5]]) == A_new)) + + def test_normalize_simulation_data(self): + transformer = FunctionTransformer(np.log1p, validate=True) + A = [[[1, 2], [1, 2]], + [[3, 4], [3, 4]] + ] + res = [[[0.69314718, 1.09861229], [0.69314718, 1.09861229], + [1.38629436, 1.60943791], [1.38629436, 1.60943791]]] + output = utils.normalize_simulation_data(A, transformer, 1, 1, 2) + for w1, w2 in zip(res, output): + np.testing.assert_allclose(w1, w2) + + def test_flat_input(self): + A = np.zeros((12, 12)) + a1 = [A for _ in np.arange(5)] + a2 = np.zeros((5, 144)) + a1_flatten = utils.flat_input(a1) + b = a2 == a1_flatten + self.assertTrue(np.asarray(b).all()) + + def test_train_and_evaluate_model(self): + """ Testing evaluation procedure """ + inputs = tf.ones([5, 5, 8]) + labels = tf.ones([5, 10, 8]) + max_epochs = 1 + early_stop = 100 + loss = tf.keras.losses.MeanAbsolutePercentageError() + optimizer = 'Adam' + metric = [tf.keras.metrics.MeanAbsoluteError(), + tf.keras.metrics.MeanAbsolutePercentageError()] + model_parameter = ( + 10, 8, 1, 5, "relu", "Dense" + ) + + training_parameter = ( + early_stop, max_epochs, loss, optimizer, metric) + + res = grid_search.train_and_evaluate_model( + model_parameter, inputs, labels, training_parameter, False) + self.assertEqual(len(res["val_losses"][0]), 5) + self.assertEqual(len(res["val_losses"][0][0]), max_epochs) + self.assertEqual(res["optimizer"], "Adam") + + @patch('memilio.surrogatemodel.utils.grid_search.train_and_evaluate_model') + def test_perform_grid_search_simple(self, mock_train_and_evaluate_model): + """ Testing grid search for simple model along a sequence of possible parameter values""" + mock_train_and_evaluate_model.return_value = { + "model": "._.", + "activation": "relu", + "optimizer": "Eva", + "mean_train_loss_kfold": 42, + "mean_val_loss_kfold": 12, + "training_time": 1, + "train_losses": [3], + "val_losses": [6] + } + filename_df = "dataframe_optimizer" + os.mkdir(self.path) + + # General grid search parameters for the training process: + early_stop = [100] + max_epochs = [1] + losses = [tf.keras.losses.MeanAbsolutePercentageError()] + optimizers = ['Adam', 'Adam'] + metrics = [[tf.keras.metrics.MeanAbsoluteError(), + tf.keras.metrics.MeanAbsolutePercentageError()]] + + # Define grid search parameters for the architecture + label_width = 3 + num_outputs = 2 + hidden_layers = [1, 2] + neurons_in_hidden_layer = [32] + activation_function = ['relu'] + models = ["Dense"] + + # Collecting parameters + training_parameters = [(early, epochs, loss, optimizer, metric) + for early in early_stop for epochs in max_epochs for loss in losses + for optimizer in optimizers for metric in metrics] + + model_parameters = [(label_width, num_outputs, layer, neuron_number, activation, modelname) + for layer in hidden_layers for neuron_number in neurons_in_hidden_layer + for activation in activation_function for modelname in models] + + # generate dataset with multiple output + inputs = tf.ones([5, 1, 2]) + labels = tf.ones([5, 3, 2]) + + grid_search.perform_grid_search( + model_parameters, inputs, labels, training_parameters, + filename_df, self.path + ) + + # testing saving the results + self.assertEqual(len(os.listdir(self.path)), 
1) + + self.assertEqual(os.listdir(os.path.join(self.path, 'secir_simple_grid_search')), + [filename_df+".pickle"]) + + # testing size of the output + path_name = os.path.join(self.path, 'secir_simple_grid_search') + with open(os.path.join(path_name, filename_df + ".pickle"), "rb") as f: + d = pickle.load(f) + df = pd.DataFrame(d) + self.assertEqual(len(df['model']), 4) + + @patch('memilio.surrogatemodel.utils.grid_search.train_and_evaluate_model') + def test_perform_grid_search_groups(self, mock_train_and_evaluate_model): + """ Testing grid search for groups model along a sequence of possible parameter values""" + mock_train_and_evaluate_model.return_value = { + "model": "._.", + "activation": "relu", + "optimizer": "Eva", + "mean_train_loss_kfold": 42, + "mean_val_loss_kfold": 12, + "training_time": 1, + "train_losses": [3], + "val_losses": [6] + } + filename_df = "dataframe_optimizer" + os.mkdir(self.path) + + # General grid search parameters for the training process: + early_stop = [100] + max_epochs = [1] + losses = [tf.keras.losses.MeanAbsolutePercentageError()] + optimizers = ['Adam', 'Adam'] + metrics = [[tf.keras.metrics.MeanAbsoluteError(), + tf.keras.metrics.MeanAbsolutePercentageError()]] + + # Define grid search parameters for the architecture + label_width = 3 + num_outputs = 2 + num_age_groups = 1 + hidden_layers = [1, 2] + neurons_in_hidden_layer = [32] + activation_function = ['relu'] + models = ["Dense"] + + # Collecting parameters + training_parameters = [(early, epochs, loss, optimizer, metric) + for early in early_stop for epochs in max_epochs for loss in losses + for optimizer in optimizers for metric in metrics] + + model_parameters = [(label_width, num_age_groups, num_outputs, layer, neuron_number, activation, modelname) + for layer in hidden_layers for neuron_number in neurons_in_hidden_layer + for activation in activation_function for modelname in models] + + # generate dataset with multiple output + inputs = tf.ones([5, 1, 2]) + labels = tf.ones([5, 3, 2]) + + grid_search.perform_grid_search( + model_parameters, inputs, labels, training_parameters, + filename_df, self.path, "groups" + ) + + # testing saving the results + self.assertEqual(len(os.listdir(self.path)), 1) + + self.assertEqual(os.listdir(os.path.join(self.path, 'secir_groups_grid_search')), + [filename_df+".pickle"]) + + # testing size of the output + path_name = os.path.join(self.path, 'secir_groups_grid_search') + with open(os.path.join(path_name, filename_df + ".pickle"), "rb") as f: + d = pickle.load(f) + df = pd.DataFrame(d) + self.assertEqual(len(df['model']), 4) + + +if __name__ == '__main__': + unittest.main()
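
The new utils.helper_functions module is exercised by test_surrogatemodel_utils.py above. The following minimal usage sketch (not part of the patch) shows how normalize_simulation_data, calc_split_index and flat_input fit together; the FunctionTransformer choice mirrors the tests, while the sample counts and the random stand-in data are purely illustrative.

import numpy as np
import tensorflow as tf
from sklearn.preprocessing import FunctionTransformer

import memilio.surrogatemodel.utils.helper_functions as utils

num_runs, num_days, num_groups, num_compartments = 4, 10, 6, 8

# Illustrative stand-in for simulation output: one (days x groups*compartments)
# block per run.
data = np.random.rand(num_runs, num_days, num_groups * num_compartments)

# Logarithmic normalization, as in test_normalize_simulation_data.
transformer = FunctionTransformer(np.log1p, validate=True)
scaled = utils.normalize_simulation_data(
    data, transformer, num_runs, num_groups, num_compartments)
print(scaled.shape)  # (4, 10, 48)

# 70/20/10 split of the runs and flattening of the per-run time series.
n_train, n_valid, n_test = utils.calc_split_index(num_runs, 0.7, 0.2, 0.1)
flat = utils.flat_input(scaled)
print(n_train, n_valid, n_test, flat.shape)  # 2 0 2 (4, 480)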
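The dampings utilities appear here only through their tests. Assuming the MEmilio Python bindings and the contact matrix data are available, they are meant to be combined with run_secir_groups_simulation roughly as sketched below; the number of days, the number of dampings and the 'random' method are illustrative choices, and the population vector is copied from test_simulation_run.

import memilio.surrogatemodel.utils.dampings as dampings
from memilio.surrogatemodel.ode_secir_groups import data_generation

days = 30

# Draw two damping days together with their damping factors; 'random' is one of
# the three methods ('classic', 'active', 'random') checked in test_generate_dampings.
damping_days, damping_factors = dampings.dampings_random(days, 2)

# Population per age group, copied from test_simulation_run.
population = [5256.0, 10551, 32368.5, 43637.833333333336,
              22874.066666666666, 8473.6]

# A single run returns the compartment data per day and the damped contact matrices.
results, damped_matrices = data_generation.run_secir_groups_simulation(
    days, damping_days, damping_factors, population)
print(len(results))  # days + 1, as asserted in test_simulation_run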