/*! Definition of a thread-safe fixed size entry queue. */

#pragma once

#include <condition_variable>
#include <cstddef>
#include <limits>
#include <mutex>
#include <queue>
#include <utility>

namespace analysis {

    /*!
     * Blocking FIFO guarded by a single mutex / condition variable pair.
     *
     * - Push blocks while the queue already holds max_size entries.
     * - At most max_entries entries are accepted over the lifetime of the
     *   queue; once that budget is used up, Push returns false.
     * - SetOutputNeeded(false) makes every pending and future Push return false.
     * - SetInputAvailable(false) makes Pop return false once the queue is
     *   drained instead of blocking forever.
     */
    template <typename Entry>
    class DNNEntryQueue {
      public:
        using Queue = std::queue<Entry>;
        using Mutex = std::mutex;
        using Lock = std::unique_lock<Mutex>;
        using CondVar = std::condition_variable;

      public:
        /*!
         * @param max_size     maximum number of entries held at any one time.
         * @param max_entries  maximum number of entries accepted in total
         *                     (unlimited by default).
         */
        explicit DNNEntryQueue(std::size_t max_size,
                               std::size_t max_entries = std::numeric_limits<std::size_t>::max())
            : max_size_(max_size), max_entries_(max_entries) {}

        /*!
         * Append a copy of @p entry, waiting for free space if necessary.
         * @return false if the entry budget is exhausted or output is no
         *         longer needed (nothing is stored in that case), true otherwise.
         */
        bool Push(const Entry& entry) { return PushImpl(entry); }

        /*! Same as above, but moves @p entry into the queue. */
        bool Push(Entry&& entry) { return PushImpl(std::move(entry)); }

        /*!
         * Remove the oldest entry into @p entry, waiting until one is
         * available or the producer has signalled the end of input.
         * @return true if @p entry was filled, false if the queue is empty
         *         and no more input is available (@p entry is left untouched).
         */
        bool Pop(Entry& entry) {
            bool entry_is_valid = false;
            {
                Lock lock(mutex_);
                cond_var_.wait(lock, [this] { return !queue_.empty() || !input_available_; });
                if (!queue_.empty()) {
                    // Move out of the queue: the element is popped right after.
                    entry = std::move(queue_.front());
                    queue_.pop();
                    entry_is_valid = true;
                }
            }
            // Notify outside the lock so woken threads can acquire it immediately.
            cond_var_.notify_all();
            return entry_is_valid;
        }

        /*! Tell consumers whether the producer may still push entries. */
        void SetInputAvailable(bool value) {
            {
                Lock lock(mutex_);
                input_available_ = value;
            }
            cond_var_.notify_all();
        }

        /*! Tell producers whether consumers still want entries. */
        void SetOutputNeeded(bool value) {
            {
                Lock lock(mutex_);
                output_needed_ = value;
            }
            cond_var_.notify_all();
        }

      private:
        /*! Shared implementation of both Push overloads. */
        template <typename T>
        bool PushImpl(T&& entry) {
            {
                Lock lock(mutex_);
                // Wake up when there is room, or when pushing has become pointless.
                cond_var_.wait(lock, [this] {
                    return queue_.size() < max_size_ || n_entries_ >= max_entries_ || !output_needed_;
                });
                if (n_entries_ >= max_entries_ || !output_needed_)
                    return false;
                queue_.push(std::forward<T>(entry));
                ++n_entries_;
            }
            cond_var_.notify_all();
            return true;
        }

      private:
        Queue queue_;
        const std::size_t max_size_, max_entries_;
        std::size_t n_entries_{0};  // total number of entries accepted so far
        bool input_available_{true}, output_needed_{true};
        Mutex mutex_;
        CondVar cond_var_;
    };

}  // namespace analysis
class DataWrapper:
    """Holds the arrays read from a training/validation ROOT file.

    After ``ReadFile`` the wrapper exposes:

    * ``features_no_param`` -- float32 array, one row per event and one column
      per name in ``feature_names``;
    * ``features`` -- the same array, with the ``X_mass`` column appended when
      ``use_parametric`` is set;
    * ``X_mass`` / ``physics_weight`` -- the ``X_mass`` and ``weight_Central``
      branches as returned by uproot;
    * ``param_values`` -- ``X_mass`` as a float32 column vector.

    ``ReadWeightFile`` fills ``class_weight`` and ``class_target``.
    """

    # Branches that are always read next to the input features. They are kept
    # in their own attributes and must not be added to ``feature_names``,
    # otherwise they would become inputs of the network.
    _AUX_BRANCHES = ("X_mass", "weight_Central")

    def __init__(self):
        print("Init data wrapper")

        self.feature_names = None

        self.features_no_param = None
        self.features = None

        self.param_values = None

        self.labels = None

        self.class_weight = None
        self.class_target = None

        self.param_list = [
            250,
            260,
            270,
            280,
            300,
            350,
            450,
            550,
            600,
            650,
            700,
            800,
            1000,
            1200,
            1400,
            1600,
            1800,
            2000,
            2500,
            3000,
            4000,
            5000,
        ]
        self.use_parametric = False

        self.features_paramSet = None

        self.X_mass = None
        self.physics_weight = None

    @staticmethod
    def _log_memory(stage):
        """Print the first field of psutil's memory_info() for this process, in MB."""
        usage_mb = psutil.Process(os.getpid()).memory_info()[0] / float(2**20)
        print(f"{stage}. Memory usage in MB is {usage_mb}")

    def _branches_to_load(self):
        """Return a NEW list: the feature branches plus the auxiliary branches.

        ``self.feature_names`` is left untouched, so calling this (or
        ``ReadFile``) repeatedly never grows the feature list.
        """
        to_load = list(self.feature_names)
        for aux_name in self._AUX_BRANCHES:
            if aux_name not in to_load:
                to_load.append(aux_name)
        return to_load

    def UseParametric(self, use_parametric):
        """Enable/disable appending the parametric mass column to the features."""
        self.use_parametric = use_parametric
        print(f"Parametric feature set to {use_parametric}")

    def SetParamList(self, param_list):
        """Replace the list of allowed parametric mass values."""
        self.param_list = param_list

    def SetPredictParamValue(self, param_value):
        """Build ``features_paramSet``: ``features_no_param`` plus a constant mass column.

        A value outside ``param_list`` only triggers a printed warning; the
        column is built regardless. The appended column takes NumPy's default
        dtype for ``param_value``, so the result is promoted exactly as
        ``np.append`` promotes the two inputs.

        Raises:
            RuntimeError: if no file has been read yet.
        """
        # During predict, we want to use a truly random param value even for signal!
        if param_value not in self.param_list:
            print(f"This param value {param_value} is not an option!")
        if self.features_no_param is None:
            raise RuntimeError("SetPredictParamValue called before ReadFile")

        n_events = len(self.features_no_param)
        new_params = np.full((n_events, 1), param_value)

        self.features_paramSet = np.append(self.features_no_param, new_params, axis=1)

    def AddInputFeatures(self, features):
        """Append ``features`` to the list of input branch names.

        The names are copied, so the caller's list (e.g. the one stored in the
        setup dictionary) is never modified by this object.
        """
        if self.feature_names is None:
            self.feature_names = list(features)
        else:
            self.feature_names = self.feature_names + list(features)

        print(f"Added features {features}")
        print(f"New feature list {self.feature_names}")

    def ReadFile(self, file_name, entry_start=None, entry_stop=None):
        """Read the feature, ``X_mass`` and ``weight_Central`` branches of the ``Events`` tree.

        Args:
            file_name: path of the ROOT file to open with uproot.
            entry_start, entry_stop: forwarded to ``tree.arrays`` to restrict
                the entry range.

        Does nothing (apart from a printed message) when no input features
        were registered.
        """
        if self.feature_names is None:
            print("Unknown branches to read! AddInputFeatures first!")
            return

        print(f"Reading file {file_name}")

        # A copy: the auxiliary branches must not leak into self.feature_names.
        features_to_load = self._branches_to_load()

        print(f"Only loading these features {features_to_load}")

        self._log_memory("Going to open file")

        with uproot.open(file_name) as file:
            tree = file["Events"]
            branches = tree.arrays(
                features_to_load, entry_start=entry_start, entry_stop=entry_stop
            )

        self._log_memory("Loaded branches")

        # One row per feature, then transpose -> one row per event.
        self.features = np.array(
            [branches[feature_name] for feature_name in self.feature_names],
            dtype="float32",
        ).transpose()

        self._log_memory("Set Features")

        # Add parametric variable
        self.X_mass = branches["X_mass"]
        self.physics_weight = branches["weight_Central"]
        self.param_values = np.array(
            [branches["X_mass"]], dtype="float32"
        ).transpose()  # Init wrong parametric, later we will fill with random sample
        print("Got the param values")

        self.features_no_param = self.features
        if self.use_parametric:
            self.features = np.append(self.features, self.param_values, axis=1)

        self._log_memory("End read")

    def ReadWeightFile(self, weight_name, entry_start=None, entry_stop=None):
        """Read ``class_weight`` and ``class_target`` from the ``weight_tree`` tree."""
        print(f"Reading weight file {weight_name}")
        # The context manager closes the file; no explicit close() is needed.
        with uproot.open(weight_name) as file:
            tree = file["weight_tree"]
            branches = tree.arrays(
                ["class_weight", "class_target"],
                entry_start=entry_start,
                entry_stop=entry_stop,
            )
            self.class_weight = np.array(branches["class_weight"], dtype="float32")
            self.class_target = np.array(branches["class_target"], dtype="float32")
layer_list.append(dropout) + + for n in range(setup["n_layers"]): + add_layer( + self.class_layers, + setup["n_units"], + setup["activation"], + f"layer_{n}", + ) + + self.class_output = tf.keras.layers.Dense( + setup["nClasses"], activation="softmax", name="class_output" + ) + + self.output_names = ["class_output"] + + def call(self, x): + for layer in self.class_layers: + x = layer(x) + class_output = self.class_output(x) + return class_output + + def _step(self, data, training): + x, y = data + + y_class = tf.cast(y[0], dtype=tf.float32) + + class_weight = tf.cast(y[1], dtype=tf.float32) + + def compute_losses(): + y_pred_class = self(x, training=training) + + class_loss_vec = self.class_loss(y_class, y_pred_class) + + class_loss = tf.reduce_mean(class_loss_vec * class_weight) + + return y_pred_class, class_loss_vec, class_loss + + if training: + with tf.GradientTape() as class_tape: + y_pred_class, class_loss_vec, class_loss = compute_losses() + else: + y_pred_class, class_loss_vec, class_loss = compute_losses() + + self.class_min_tracker.update_state(tf.reduce_min(y_pred_class[:, 0])) + self.class_max_tracker.update_state(tf.reduce_max(y_pred_class[:, 0])) + + for n in range(self.nClasses): + if n == 0: + continue + self.other_class_min_tracker[n - 1].update_state( + tf.reduce_min(y_pred_class[:, n]) + ) + self.other_class_max_tracker[n - 1].update_state( + tf.reduce_max(y_pred_class[:, n]) + ) + + class_accuracy_vec = self.class_accuracy(y_class, y_pred_class) + + self.class_loss_tracker.update_state(class_loss_vec, sample_weight=class_weight) + self.class_accuracy_tracker.update_state( + class_accuracy_vec, sample_weight=class_weight + ) + + if training: + grad = class_tape.gradient(class_loss, self.trainable_variables) + self.optimizer.apply_gradients(zip(grad, self.trainable_variables)) + + return {m.name: m.result() for m in self.metrics} + + def train_step(self, data): + return self._step(data, training=True) + + def test_step(self, data): + return 
self._step(data, training=False) + + @property + def metrics(self): + metric_list = [ + self.class_loss_tracker, + self.class_accuracy_tracker, + self.class_min_tracker, + self.class_max_tracker, + ] + metric_list = ( + metric_list + + [x for x in self.other_class_min_tracker] + + [x for x in self.other_class_max_tracker] + ) + + return metric_list + + +def train_dnn( + setup, + training_file, + weight_file, + test_training_file, + test_weight_file, + output_folder, +): + output_dnn_name = os.path.join(output_folder, f"best.onnx") + + dw = DataWrapper() + dw.AddInputFeatures(setup["features"]) + + dw.UseParametric(setup["UseParametric"]) + dw.SetParamList(setup["parametric_list"]) + + # Prep a test dw + # Must copy before reading file so we can read the test file instead + test_dw = copy.deepcopy(dw) + + entry_start = 0 + # entry_stop = batch_size * 500 # Only load 500 batches for debuging now + + # Do you want to make a larger batch? May increase speed + entry_stop = None + + dw.ReadFile( + training_file, + entry_start=entry_start, + entry_stop=entry_stop, + ) + dw.ReadWeightFile(weight_file, entry_start=entry_start, entry_stop=entry_stop) + + test_dw.ReadFile( + test_training_file, + entry_start=entry_start, + entry_stop=entry_stop, + ) + test_dw.ReadWeightFile( + test_weight_file, entry_start=entry_start, entry_stop=entry_stop + ) + + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" + os.environ["TF_DETERMINISTIC_OPS"] = "1" + tf.random.set_seed(42) + + nClasses = setup["nClasses"] + batch_size = setup["batch_size"] + train_tf_dataset = tf.data.Dataset.from_tensor_slices( + ( + dw.features, + (tf.one_hot(dw.class_target, nClasses), dw.class_weight), + ) + ).batch(batch_size, drop_remainder=True) + train_tf_dataset = train_tf_dataset.shuffle( + len(train_tf_dataset), reshuffle_each_iteration=True + ) + + test_tf_dataset = tf.data.Dataset.from_tensor_slices( + ( + test_dw.features, + ( + tf.one_hot(test_dw.class_target, nClasses), + test_dw.class_weight, + ), + ) + 
).batch(batch_size, drop_remainder=True) + test_tf_dataset = test_tf_dataset.shuffle( + len(test_tf_dataset), reshuffle_each_iteration=True + ) + + @tf.function + def new_param_map(*x): + dataset = x + features = dataset[0] + + # Need to randomize the features parametric mass + parametric_mass_probability = ( + np.ones(len(dw.param_list)) * 1.0 / len(dw.param_list) + ) + random_param_mass = tf.random.categorical( + tf.math.log([list(parametric_mass_probability)]), + tf.shape(features)[0], + dtype=tf.int64, + ) + + mass_values = tf.constant(dw.param_list) + mass_keys = tf.constant(np.arange(len(dw.param_list))) + table = tf.lookup.StaticHashTable( + tf.lookup.KeyValueTensorInitializer(mass_keys, mass_values), + default_value=-1, + ) + + actual_new_mass = table.lookup(random_param_mass) + actual_new_mass = tf.cast(actual_new_mass, tf.float32) + + # Lastly we need to keep the signal events the correct mass + class_targets = dataset[1][0] + old_mass_mask = tf.cast(class_targets[:, 0], tf.float32) + new_mass_mask = tf.cast((class_targets[:, 0] == 0), tf.float32) + + actual_mass = old_mass_mask * features[:, -1] + new_mass_mask * actual_new_mass + actual_mass = tf.transpose(actual_mass) + + features = tf.concat([features[:, :-1], actual_mass], axis=-1) + new_dataset = (features, dataset[1]) + return new_dataset + + if setup["UseParametric"]: + train_tf_dataset = train_tf_dataset.map(new_param_map) + test_tf_dataset = test_tf_dataset.map(new_param_map) + + input_shape = [None, dw.features.shape[1]] + input_signature = [tf.TensorSpec(input_shape, tf.double, name="x")] + + model = Model(setup) + model.compile( + loss=None, + optimizer=tf.keras.optimizers.Nadam( + learning_rate=setup["learning_rate"], weight_decay=setup["weight_decay"] + ), + ) + model(dw.features) + model.summary() + + callbacks = [] + + verbose = setup["verbose"] if "verbose" in setup else 0 + verbose = 1 + print("Fit model") + history = model.fit( + train_tf_dataset, + validation_data=test_tf_dataset, + 
def validate_dnn(
    setup,
    validation_file,
    validation_weight_file,
    output_file,
    model_name,
    model_config,
):
    """Plot the signal-class output of an ONNX model for several mass hypotheses.

    For each mass point the model is evaluated on the validation sample, the
    signal score is histogrammed in bins whose edges are the signal deciles,
    and one page per mass point is written to the multi-page file
    ``output_file``. Mass points without signal events are skipped.

    Args:
        setup: dict with at least ``features``, ``UseParametric`` and
            ``parametric_list``; an optional ``validation_mass_points`` list
            overrides the default mass points.
        validation_file: ROOT file with the ``Events`` tree.
        validation_weight_file: ROOT file with the ``weight_tree`` tree.
        output_file: path of the output document (multi-page PDF).
        model_name: path of the ONNX model.
        model_config: path of the YAML written next to the model.

    Returns:
        None.
    """
    print(f"Model load {model_name}")
    sess = ort.InferenceSession(model_name)

    # Parsed so that a missing or malformed config fails before any plotting;
    # its content is not used further in this function.
    with open(model_config, "r") as file:
        dnnConfig = yaml.safe_load(file)

    dw = DataWrapper()
    # Hand over a copy so the wrapper can never modify the caller's setup.
    dw.AddInputFeatures(list(setup["features"]))

    dw.UseParametric(setup["UseParametric"])
    dw.SetParamList(setup["parametric_list"])

    entry_start = 0
    entry_stop = None

    dw.ReadFile(
        validation_file,
        entry_start=entry_start,
        entry_stop=entry_stop,
    )
    dw.ReadWeightFile(
        validation_weight_file, entry_start=entry_start, entry_stop=entry_stop
    )

    para_masspoint_list = setup.get(
        "validation_mass_points", [300, 400, 600, 800, 1000, 3000, 4000]
    )

    nQuantBins = 10
    # Interior quantile levels: 0.1 ... 0.9 for 10 bins.
    quantile_levels = np.linspace(0.0, 1.0, nQuantBins + 1)[1:-1]

    canvases = []  # keep Python references to every canvas that was created
    pdf_opened = False  # becomes True once the first page has been written

    for para_masspoint in para_masspoint_list:
        print(f"Validating mass {para_masspoint}")
        if dw.use_parametric:
            dw.SetPredictParamValue(para_masspoint)
            features = dw.features_paramSet
        else:
            features = dw.features_no_param

        pred_class = sess.run(None, {"x": features})[0]
        pred_signal = pred_class[:, 0]

        physics_weight = dw.physics_weight

        # Signal: class 0 events generated at exactly this mass.
        Sig_mask = (dw.X_mass == para_masspoint) & (dw.class_target == 0)
        # NOTE(review): "Background" selects class_target == 1 only; if more
        # than one background class is trained this is not all backgrounds.
        Background_mask = dw.class_target == 1

        signal_scores = pred_signal[Sig_mask]
        if len(signal_scores) == 0:
            print("No signal events in this mass point! Skip!")
            continue

        # Bin edges: 0, the signal quantiles, 1 (nQuantBins bins -> +1 edges).
        quant_binning_class = np.zeros(nQuantBins + 1)
        quant_binning_class[1:nQuantBins] = np.quantile(signal_scores, quantile_levels)
        quant_binning_class[-1] = 1.0
        print("We found quant binning class")
        print(quant_binning_class)
        print("From the signal prediction")
        print(signal_scores)

        mask_dict = {
            "Signal": Sig_mask,
            "Background": Background_mask,
        }

        # ROOT registers objects by name, so names are made unique per mass point.
        canvas_name = f"c_mass{para_masspoint}"
        canvas = ROOT.TCanvas(canvas_name, canvas_name, 1200, 600 * len(mask_dict))
        canvases.append(canvas)
        canvas.Divide(1, len(mask_dict))

        # References held until the canvas has been printed.
        Class_list = []
        legend_list = []
        pads_list = []
        for i, (process_name, mask) in enumerate(mask_dict.items()):
            canvas.cd(i + 1)

            class_out_hist, bins = np.histogram(
                pred_signal[mask],
                bins=quant_binning_class,
                weights=physics_weight[mask],
            )
            # Sum of squared weights per bin -> squared statistical error.
            class_out_hist_w2, bins = np.histogram(
                pred_signal[mask],
                bins=quant_binning_class,
                weights=physics_weight[mask] ** 2,
            )

            hist_name = f"ClassOutput_{process_name}_M{para_masspoint}"
            # The quantile bins are drawn with equal width on a 0-1 axis.
            ROOT_ClassOutput = ROOT.TH1D(hist_name, hist_name, nQuantBins, 0.0, 1.0)
            Class_list.append(ROOT_ClassOutput)

            for binnum in range(nQuantBins):
                ROOT_ClassOutput.SetBinContent(binnum + 1, class_out_hist[binnum])
                ROOT_ClassOutput.SetBinError(
                    binnum + 1, class_out_hist_w2[binnum] ** (0.5)
                )

            if ROOT_ClassOutput.Integral() == 0:
                print(
                    f"Process {process_name} has no class entries, maybe the background doesn't exist?"
                )
                continue

            pad_name = f"p_{process_name}_M{para_masspoint}"
            p1 = ROOT.TPad(pad_name, pad_name, 0.0, 0.3, 1.0, 0.9, 0, 0, 0)
            pads_list.append(p1)
            p1.SetTopMargin(0)
            p1.Draw()

            p1.cd()

            plotlabel = f"Class Output for {process_name} ParaMass {para_masspoint} GeV"
            ROOT_ClassOutput.Draw()
            ROOT_ClassOutput.SetTitle(plotlabel)
            ROOT_ClassOutput.SetStats(0)
            # Floor the minimum so the log-scale axis range stays positive.
            min_val = max(
                0.0001,
                ROOT_ClassOutput.GetMinimum(),
            )
            max_val = ROOT_ClassOutput.GetMaximum()

            ROOT_ClassOutput.GetYaxis().SetRangeUser(0.001 * min_val, 1000 * max_val)

            legend = ROOT.TLegend(0.5, 0.8, 0.9, 0.9)
            legend_list.append(legend)
            legend.AddEntry(ROOT_ClassOutput, f"{process_name}")
            legend.Draw()

            print(f"Setting canvas to log scale with range {min_val}, {max_val}")
            p1.SetLogy()
            p1.SetGrid()

        page_title = f"Title:Mass {para_masspoint} GeV"
        if not pdf_opened:
            # "(" opens the multi-page file and writes this canvas as page one.
            # Keyed on "first page actually written", not on list position, so
            # skipped mass points cannot leave the file unopened.
            canvas.Print(f"{output_file}(", page_title)
            pdf_opened = True
            print("Saved [")
        else:
            canvas.Print(f"{output_file}", page_title)
        print(f"Saved mass {para_masspoint}")

        canvas.Close()

    if pdf_opened:
        # "]" closes the file without adding a page, whichever mass point was last.
        closer = ROOT.TCanvas("c_pdf_close", "c_pdf_close")
        closer.Print(f"{output_file}]")
        closer.Close()
        print("Saved ]")
    else:
        print("No mass point had signal events, no output file was written")
"--batch_config", required=True, type=str, help="Batch config file" - ) parser.add_argument( "--test_training_file", required=True, type=str, help="Test file" ) parser.add_argument( "--test_weight_file", required=True, type=str, help="Test weight file" ) - parser.add_argument( - "--test_hme_friend_file", - required=False, - type=str, - default=None, - help="Test HME friend file", - ) - parser.add_argument( - "--test_batch_config", required=True, type=str, help="Test batch config file" - ) parser.add_argument("--output_folder", required=True, type=str, help="Output model") parser.add_argument( "--setup-config", required=True, type=str, help="Setup config for training" @@ -54,24 +34,13 @@ thread = threading.Thread(target=update_kinit_thread) thread.start() - config_dict = {} - with open(args.batch_config, "r") as file: - config_dict = yaml.safe_load(file) - test_config_dict = {} - with open(args.test_batch_config, "r") as file: - test_config_dict = yaml.safe_load(file) - model = DNNClass.train_dnn( setup, args.training_file, args.weight_file, - config_dict, args.test_training_file, args.test_weight_file, - test_config_dict, args.output_folder, - args.hme_friend_file, - args.test_hme_friend_file, ) finally: diff --git a/Studies/DNN/DNN_Validator_Condor.py b/Studies/DNN/DNN_Validator_Condor.py index e6d7a128..12322123 100644 --- a/Studies/DNN/DNN_Validator_Condor.py +++ b/Studies/DNN/DNN_Validator_Condor.py @@ -1,5 +1,5 @@ import argparse -import Studies.DNN.DNN_Class as DNNClass +import Studies.DNN.DNN_Class_HistTuples as DNNClass import threading import yaml from FLAF.RunKit.crabLaw import cond as kInit_cond, update_kinit_thread @@ -12,16 +12,6 @@ parser.add_argument( "--validation_weight_file", required=True, type=str, help="Weight file" ) - parser.add_argument( - "--validation_hme_friend_file", - required=False, - type=str, - default=None, - help="Validation HME friend file", - ) - parser.add_argument( - "--validation_batch_config", required=True, type=str, 
help="Batch config file" - ) parser.add_argument("--output_file", required=True, type=str, help="Output Pdf") parser.add_argument( "--setup-config", required=True, type=str, help="Setup config for training" @@ -46,20 +36,13 @@ thread = threading.Thread(target=update_kinit_thread) thread.start() - config_dict = {} - with open(args.validation_batch_config, "r") as file: - config_dict = yaml.safe_load(file) - - # model = DNNClass.validate_dnn(setup, args.validation_file, args.validation_weight_file, config_dict, args.output_file, args.model_name, args.model_config) - model = DNNClass.validate_disco_dnn( + model = DNNClass.validate_dnn( setup, args.validation_file, args.validation_weight_file, - config_dict, args.output_file, args.model_name, args.model_config, - args.validation_hme_friend_file, ) finally: diff --git a/Studies/DNN/TupleMaker.h b/Studies/DNN/TupleMaker.h deleted file mode 100644 index c6e4c896..00000000 --- a/Studies/DNN/TupleMaker.h +++ /dev/null @@ -1,176 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -#include "DNNEntryQueue.h" - -using RVecF = ROOT::VecOps::RVec; -using RVecI = ROOT::VecOps::RVec; -using RVecUC = ROOT::VecOps::RVec; -using RVecShort = ROOT::VecOps::RVec; - -namespace analysis { - typedef std::variant - DNNMultiType; - - struct DNNEntry { - std::vector var_values; - - explicit DNNEntry(size_t size) : var_values(size) {} - - template - void Add(int index, const T& value) { - if (index >= 0) { - var_values.at(index) = value; - } - } - - template - const T& GetValue(int idx) const { - return std::get(var_values.at(idx)); - } - }; - - namespace detail { - inline void putEntry(std::shared_ptr& entry, int index) {} - - template - //void putEntry(std::vector& entries, int var_index, const T& value, Args&& ...args){ - void putEntry(std::shared_ptr& entry, int var_index, const T& value, Args&&... 
args) { - // std::cout << "Var index is " << var_index << std::endl; - // std::cout << "And value is " << value << std::endl; - entry->Add(var_index, value); - putEntry(entry, var_index + 1, std::forward(args)...); - } - - inline void read(int index) {} - - template - void read(int var_index, const T& value, Args&&... args) { - //index 138 is lep1_pt, lets only look at that one - if (var_index == 138) { - std::cout << "Var index is " << var_index << std::endl; - std::cout << "And value is " << value << std::endl; - } - read(var_index + 1, std::forward(args)...); - } - - } // namespace detail - - struct DNNStopLoop {}; - - template //Using a template will allow us to pass the column types as a 'variable'! Trop Cool! - struct TupleMaker { - TupleMaker(size_t queue_size, size_t max_entries) : queue(queue_size, max_entries) { - std::cout << "Initializing tuplemaker with queue size " << queue_size << " and max entries " << max_entries - << std::endl; - } - - void readDF(ROOT::RDF::RNode df, const std::vector& column_names) { - int entry_counter = 0; - df.Foreach( - [&](const Args&... args) { - std::cout << "New entry! " << entry_counter << std::endl; - entry_counter++; - detail::read(0, args...); - std::cout << "End of entry" << std::endl; - }, - column_names); - } - - ROOT::RDF::RNode FillDF(ROOT::RDF::RNode new_df, - ROOT::RDF::RNode in_df, - const std::vector& local_to_master_map, - const int master_size, - const std::vector& local_column_names, - int nBatchStart, - int nBatchEnd, - int batch_size) { - auto df0 = in_df.Define( - "_entry", - [=](const Args&... args) { - // auto entry = std::make_shared(local_column_names.size()); - auto entry = std::make_shared(master_size); - int index = 0; - (void)std::initializer_list{(entry->Add(local_to_master_map.at(index++), args), 0)...}; - - return entry; - }, - local_column_names); - - thread = std::make_unique([=]() { - std::cout << "TupleMaker::FillDF: thread started." 
<< std::endl; - try { - //std::cout << "Passed the lock step, starting foreach" << std::endl; - ROOT::RDF::RNode df = df0; - - // df.Foreach([&](const Args& ...args){ - // auto entry = std::make_shared(column_names.size()); - // detail::putEntry(entry, 0, args...); - // if(!queue.Push(entry)){ - // std::cout << "Hey the push returned false" << std::endl; - // throw StopLoop(); - // } - // }, column_names); - - df.Foreach( - [&](const std::shared_ptr& entry) { - if (!queue.Push(entry)) { - std::cout << "Hey the push returned false" << std::endl; - throw DNNStopLoop(); - } - }, - {"_entry"}); - } catch (DNNStopLoop) { - } - - std::cout << "Finished foreach" << std::endl; - - queue.SetInputAvailable(false); - }); - - new_df = new_df.Define("_entry", - [=](ULong64_t rdfentry) { - // Entry entry(column_names.size()); - std::shared_ptr entry; - //int batch_size = 100; - int start_idx = nBatchStart; - int end_idx = nBatchEnd; - const int index = rdfentry % batch_size; - if (index >= start_idx && index < end_idx) { - queue.Pop(entry); - } - return entry; - }, - {"rdfentry_"}); - return new_df; - } - - void join() { - if (thread) { - queue.SetOutputNeeded(false); - thread->join(); - } - } - - DNNEntryQueue> queue; - std::unique_ptr thread; - std::condition_variable cond_var; - }; - -} // namespace analysis \ No newline at end of file diff --git a/Studies/DNN/config/YH_split_doubleLep.yaml b/Studies/DNN/config/YH_split_doubleLep.yaml deleted file mode 100644 index 075a2330..00000000 --- a/Studies/DNN/config/YH_split_doubleLep.yaml +++ /dev/null @@ -1,79 +0,0 @@ -batch_dict: - XtoYHto2B2W: 100 # It is important to put signal first since signal has extra branches - TT: 1200 - DY: 250 - ST: 200 - Other: 250 - - -storage_folder: /eos/uscms/store/user/lpcflaf/HH_bbWW/anaTuples/v2601a/ - -selection_branches: [ 'lep1_legType', 'lep2_legType', 'weight_MC_Lumi_pu', 'event', 'centralJet_pt', 'SelectedFatJet_pt' ] - -# Cuts for uproot -selection_cut: (tree['lep1_legType'] > 0) & 
(tree['lep2_legType'] > 0) & ( (ak.count(tree['centralJet_pt'], axis=1) > 1) | (ak.count(tree['SelectedFatJet_pt'], axis=1) > 0) ) -parity_func: (tree['event']%{nParity} == {parity_scan}) - -nParity: 4 - -# Cuts for RDF in creating the training file -iterate_cut: (lep1_legType > 0) && (lep2_legType > 0) && ( (centralJet_pt.size() > 1) || (SelectedFatJet_pt.size() > 0) ) && (event%{nParity} == {parity_scan}) - - -signal: - XtoYHto2B2W: - class_value: 0 - spin: 0 - mass_points: [ 300, 400, 500, 550, 600, 650, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, 3500, 4000 ] - dataset_name_format: - XtoYHto2B2Wto2B2L2Nu_MX_{}_MY_125 - combined_name: - XtoYHto2B2Wto2B2L2Nu_Combined - use_combined: - True - - -background: - TT: - class_value: 1 - background_datasets: # If there is an extension, just add more to the list - TTto2L2Nu: [ ] - # TTto4Q: [ ] - TTtoLNu2Q: [ ] - DY: - class_value: 2 - background_datasets: - DYto2E_M_10to50_amcatnloFXFX: [ ] - # DYto2E_M_50_0J_amcatnloFXFX: [ ] - DYto2E_M_50_1J_amcatnloFXFX: [ ] - DYto2E_M_50_2J_amcatnloFXFX: [ ] - - DYto2Mu_M_10to50_amcatnloFXFX: [ ] - # DYto2Mu_M_50_0J_amcatnloFXFX: [ ] - DYto2Mu_M_50_1J_amcatnloFXFX: [ ] - DYto2Mu_M_50_2J_amcatnloFXFX: [ ] - - # DYto2Tau_M_50_0J_amcatnloFXFX: [ ] - DYto2Tau_M_50_1J_amcatnloFXFX: [ ] - DYto2Tau_M_50_2J_amcatnloFXFX: [ ] - - ST: - class_value: 3 - background_datasets: - TbarWplusto2L2Nu: [ ] - TbarWplustoLNu2Q: [ ] - TbarWplusto4Q: [ ] - - TWminusto2L2Nu: [ ] - TWminustoLNu2Q: [ ] - TWminusto4Q: [ ] - - Other: - class_value: 4 - background_datasets: - WW: [ ] - WZ: [ ] - - # WtoLNu_0J_amcatnloFXFX: [ ] - # WtoLNu_1J_amcatnloFXFX: [ ] - # WtoLNu_2J_amcatnloFXFX: [ ] diff --git a/Studies/DNN/config/YH_split_singleLep.yaml b/Studies/DNN/config/YH_split_singleLep.yaml deleted file mode 100644 index d523321c..00000000 --- a/Studies/DNN/config/YH_split_singleLep.yaml +++ /dev/null @@ -1,80 +0,0 @@ -batch_dict: - XtoYHto2B2W: 200 # It is important to put signal first since 
signal has extra branches - TT: 2800 - ST: 700 - W: 300 - Other: 300 - - -storage_folder: /eos/user/d/daebi/bbWW_development/anaTuples/3Nov25/ - -selection_branches: [ 'lep1_legType', 'lep2_legType', 'weight_MC_Lumi_pu', 'event', 'centralJet_pt', 'SelectedFatJet_pt' ] - -# Cuts for uproot -selection_cut: (tree['lep1_legType'] > 0) & (tree['lep2_legType'] <= 0) & ( (ak.count(tree['centralJet_pt'], axis=1) > 1) | (ak.count(tree['SelectedFatJet_pt'], axis=1) > 0) ) -parity_func: (tree['event']%{nParity} == {parity_scan}) - -nParity: 4 - -# Cuts for RDF in creating the training file -iterate_cut: (lep1_legType > 0) && (lep2_legType <= 0) && ( (centralJet_pt.size() > 1) || (SelectedFatJet_pt.size() > 0) ) && (event%{nParity} == {parity_scan}) - - -signal: - XtoYHto2B2W: - class_value: 0 - spin: 0 - mass_points: [ 300, 400, 500, 550, 600, 650, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, 3500, 4000 ] - dataset_name_format: - XtoYHto2B2Wto2B2Q1L1Nu_MX_{}_MY_125 - combined_name: - XtoYHto2B2Wto2B2Q1L1Nu_Combined - use_combined: - True - - -background: - TT: - class_value: 1 - background_datasets: # If there is an extension, just add more to the list - TTto2L2Nu: [ ] - TTto4Q: [ ] - TTtoLNu2Q: [ ] - - W: - class_value: 2 - background_datasets: - WtoLNu_0J_amcatnloFXFX: [ ] - WtoLNu_1J_amcatnloFXFX: [ ] - WtoLNu_2J_amcatnloFXFX: [ ] - - ST: - class_value: 3 - background_datasets: - TbarWplusto2L2Nu: [ ] - TbarWplustoLNu2Q: [ ] - TbarWplusto4Q: [ ] - - TWminusto2L2Nu: [ ] - TWminustoLNu2Q: [ ] - TWminusto4Q: [ ] - - Other: - class_value: 4 - background_datasets: - WW: [ ] - WZ: [ ] - - DYto2E_M_10to50_amcatnloFXFX: [ ] - DYto2E_M_50_0J_amcatnloFXFX: [ ] - DYto2E_M_50_1J_amcatnloFXFX: [ ] - DYto2E_M_50_2J_amcatnloFXFX: [ ] - - DYto2Mu_M_10to50_amcatnloFXFX: [ ] - DYto2Mu_M_50_0J_amcatnloFXFX: [ ] - DYto2Mu_M_50_1J_amcatnloFXFX: [ ] - DYto2Mu_M_50_2J_amcatnloFXFX: [ ] - - DYto2Tau_M_50_0J_amcatnloFXFX: [ ] - DYto2Tau_M_50_1J_amcatnloFXFX: [ ] - 
DYto2Tau_M_50_2J_amcatnloFXFX: [ ] diff --git a/Studies/DNN/config/dataset_setup_doubleLep_boosted.yaml b/Studies/DNN/config/dataset_setup_doubleLep_boosted.yaml new file mode 100644 index 00000000..e7fa1549 --- /dev/null +++ b/Studies/DNN/config/dataset_setup_doubleLep_boosted.yaml @@ -0,0 +1,95 @@ +# storage_folder: /eos/uscms/store/user/lpcflaf/HH_bbWW/histTuples/v2601a/ +storage_folder: /eos/user/d/daebi/HH_bbWW/histTuples/v2601a/*/ # Include all eras with * + +nParity: 4 + +# Cuts for RDF in creating the training file +iterate_cut: ( + (ZVeto_OS_Iso == 1) && + (boosted == 1) && + ( (channelId == 11) || (channelId == 12) || (channelId == 22) ) + ) +parity_cut: ( + (event%{nParity} == {parity_scan}) + ) + +signal: + XtoYHto2B2W: + class_value: 0 + spin: 0 + mass_points: [ 300, 400, 500, 550, 600, 650, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, 3500, 4000 ] + dataset_name_format: + XtoYHto2B2Wto2B2L2Nu_MX_{}_MY_125 + combined_name: + XtoYHto2B2Wto2B2L2Nu_Combined + use_combined: + True + +background: + TT: + class_value: 1 + background_datasets: # If there is an extension, just add more to the list + - TTto2L2Nu + - TTtoLNu2Q + - TTto4Q + DY: + class_value: 2 + background_datasets: + - DYto2E_M_10to50_amcatnloFXFX + - DYto2E_M_50_0J_amcatnloFXFX + - DYto2E_M_50_1J_amcatnloFXFX + - DYto2E_M_50_2J_amcatnloFXFX + + - DYto2E_M_50_PTLL_40to100_1J_amcatnloFXFX + - DYto2E_M_50_PTLL_40to100_2J_amcatnloFXFX + + - DYto2E_M_50_PTLL_100to200_1J_amcatnloFXFX + - DYto2E_M_50_PTLL_100to200_2J_amcatnloFXFX + + - DYto2E_M_50_PTLL_200to400_1J_amcatnloFXFX + - DYto2E_M_50_PTLL_200to400_2J_amcatnloFXFX + + - DYto2E_M_50_PTLL_400to600_1J_amcatnloFXFX + - DYto2E_M_50_PTLL_400to600_2J_amcatnloFXFX + + - DYto2E_M_50_PTLL_600_1J_amcatnloFXFX + - DYto2E_M_50_PTLL_600_2J_amcatnloFXFX + + - DYto2Mu_M_10to50_amcatnloFXFX + - DYto2Mu_M_50_0J_amcatnloFXFX + - DYto2Mu_M_50_1J_amcatnloFXFX + - DYto2Mu_M_50_2J_amcatnloFXFX + + - DYto2Mu_M_50_PTLL_40to100_1J_amcatnloFXFX + - 
DYto2Mu_M_50_PTLL_40to100_2J_amcatnloFXFX + + - DYto2Mu_M_50_PTLL_100to200_1J_amcatnloFXFX + - DYto2Mu_M_50_PTLL_100to200_2J_amcatnloFXFX + + - DYto2Mu_M_50_PTLL_200to400_1J_amcatnloFXFX + - DYto2Mu_M_50_PTLL_200to400_2J_amcatnloFXFX + + - DYto2Mu_M_50_PTLL_400to600_1J_amcatnloFXFX + - DYto2Mu_M_50_PTLL_400to600_2J_amcatnloFXFX + + - DYto2Mu_M_50_PTLL_600_1J_amcatnloFXFX + - DYto2Mu_M_50_PTLL_600_2J_amcatnloFXFX + + - DYto2Tau_M_10to50_amcatnloFXFX + - DYto2Tau_M_50_0J_amcatnloFXFX + - DYto2Tau_M_50_1J_amcatnloFXFX + - DYto2Tau_M_50_2J_amcatnloFXFX + + Other: + class_value: 3 + background_datasets: + - TbarWplusto2L2Nu + - TbarWplustoLNu2Q + - TbarWplusto4Q + + - TWminusto2L2Nu + - TWminustoLNu2Q + - TWminusto4Q + + - WW + - WZ diff --git a/Studies/DNN/config/dataset_setup_doubleLep_resolved.yaml b/Studies/DNN/config/dataset_setup_doubleLep_resolved.yaml new file mode 100644 index 00000000..0ea71151 --- /dev/null +++ b/Studies/DNN/config/dataset_setup_doubleLep_resolved.yaml @@ -0,0 +1,97 @@ +# storage_folder: /eos/uscms/store/user/lpcflaf/HH_bbWW/histTuples/v2601a/ +storage_folder: /eos/user/d/daebi/HH_bbWW/histTuples/v2601a/*/ # Include all eras with * + +nParity: 4 + +# Cuts for RDF in creating the training file +iterate_cut: ( + (ZVeto_OS_Iso == 1) && + (bb_mass_PNetRegPtRawCorr_PNetRegPtRawCorrNeutrino > 70) && + (bb_mass_PNetRegPtRawCorr_PNetRegPtRawCorrNeutrino < 150) && + ( (res2b == 1) || (recovery == 1) ) && + ( (channelId == 11) || (channelId == 12) || (channelId == 22) ) + ) +parity_cut: ( + (event%{nParity} == {parity_scan}) + ) + +signal: + XtoYHto2B2W: + class_value: 0 + spin: 0 + mass_points: [ 300, 400, 500, 550, 600, 650, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, 3500, 4000 ] + dataset_name_format: + XtoYHto2B2Wto2B2L2Nu_MX_{}_MY_125 + combined_name: + XtoYHto2B2Wto2B2L2Nu_Combined + use_combined: + True + +background: + TT: + class_value: 1 + background_datasets: # If there is an extension, just add more to the list + - 
TTto2L2Nu + - TTtoLNu2Q + - TTto4Q + DY: + class_value: 2 + background_datasets: + - DYto2E_M_10to50_amcatnloFXFX + - DYto2E_M_50_0J_amcatnloFXFX + - DYto2E_M_50_1J_amcatnloFXFX + - DYto2E_M_50_2J_amcatnloFXFX + + - DYto2E_M_50_PTLL_40to100_1J_amcatnloFXFX + - DYto2E_M_50_PTLL_40to100_2J_amcatnloFXFX + + - DYto2E_M_50_PTLL_100to200_1J_amcatnloFXFX + - DYto2E_M_50_PTLL_100to200_2J_amcatnloFXFX + + - DYto2E_M_50_PTLL_200to400_1J_amcatnloFXFX + - DYto2E_M_50_PTLL_200to400_2J_amcatnloFXFX + + - DYto2E_M_50_PTLL_400to600_1J_amcatnloFXFX + - DYto2E_M_50_PTLL_400to600_2J_amcatnloFXFX + + - DYto2E_M_50_PTLL_600_1J_amcatnloFXFX + - DYto2E_M_50_PTLL_600_2J_amcatnloFXFX + + - DYto2Mu_M_10to50_amcatnloFXFX + - DYto2Mu_M_50_0J_amcatnloFXFX + - DYto2Mu_M_50_1J_amcatnloFXFX + - DYto2Mu_M_50_2J_amcatnloFXFX + + - DYto2Mu_M_50_PTLL_40to100_1J_amcatnloFXFX + - DYto2Mu_M_50_PTLL_40to100_2J_amcatnloFXFX + + - DYto2Mu_M_50_PTLL_100to200_1J_amcatnloFXFX + - DYto2Mu_M_50_PTLL_100to200_2J_amcatnloFXFX + + - DYto2Mu_M_50_PTLL_200to400_1J_amcatnloFXFX + - DYto2Mu_M_50_PTLL_200to400_2J_amcatnloFXFX + + - DYto2Mu_M_50_PTLL_400to600_1J_amcatnloFXFX + - DYto2Mu_M_50_PTLL_400to600_2J_amcatnloFXFX + + - DYto2Mu_M_50_PTLL_600_1J_amcatnloFXFX + - DYto2Mu_M_50_PTLL_600_2J_amcatnloFXFX + + - DYto2Tau_M_10to50_amcatnloFXFX + - DYto2Tau_M_50_0J_amcatnloFXFX + - DYto2Tau_M_50_1J_amcatnloFXFX + - DYto2Tau_M_50_2J_amcatnloFXFX + + Other: + class_value: 3 + background_datasets: + - TbarWplusto2L2Nu + - TbarWplustoLNu2Q + - TbarWplusto4Q + + - TWminusto2L2Nu + - TWminustoLNu2Q + - TWminusto4Q + + - WW + - WZ diff --git a/Studies/DNN/config/dataset_split.yaml b/Studies/DNN/config/dataset_split.yaml deleted file mode 100644 index d991a5be..00000000 --- a/Studies/DNN/config/dataset_split.yaml +++ /dev/null @@ -1,61 +0,0 @@ -batch_dict: - GluGlutoRadion: 100 # It is important to put signal first since signal has extra branches - GluGlutoBulkGraviton: 100 - TT: 2800 - # DY: 0 - - -storage_folder: 
/eos/user/d/daebi/bbWW/anaTuples/shared/Run3_2022/ - -selection_branches: [ 'lep1_type', 'lep2_type', 'weight_base', 'event', 'centralJet_pt' ] - -# Cuts for uproot -selection_cut: (tree['lep1_type'] > 0) & (tree['lep2_type'] > 0) & (ak.count(tree['centralJet_pt'], axis=1) > 1) -parity_func: (tree['event']%{nParity} == {parity_scan}) - -nParity: 4 - -# Cuts for RDF in creating the training file -iterate_cut: (lep1_type > 0) && (lep2_type > 0) && (centralJet_pt.size() > 1) && (event%{nParity} == {parity_scan}) - - -signal: - GluGlutoRadion: - class_value: 0 - spin: 0 - mass_points: [ 250, 260, 270, 280, 300, 350, 450, 550, 600, 650, 700, 800, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, 4000, 5000 ] - dataset_name_format: - GluGlutoRadiontoHHto2B2Vto2B2L2Nu_M_{} - combined_name: - GluGlutoRadiontoHHto2B2Vto2B2L2Nu_Combined - use_combined: - True - GluGlutoBulkGraviton: - class_value: 0 - spin: 2 - mass_points: [ 250, 260, 270, 280, 300, 350, 450, 550, 600, 650, 700, 800, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, 4000, 5000 ] - dataset_name_format: - GluGlutoBulkGravitontoHHto2B2Vto2B2L2Nu_M_{} - combined_name: - GluGlutoBulkGravitontoHHto2B2Vto2B2L2Nu_Combined - use_combined: - True - - -background: - TT: - class_value: 1 - background_datasets: # If there is an extension, just add more to the list - # TT: #When we use TTinclusive and appropriately weight, we actually can not make many batches ): - # - TT_ext1 - TTto2L2Nu: - - TTto2L2Nu_ext1 - TTto4Q: - - TTto4Q_ext1 - TTtoLNu2Q: - - TTtoLNu2Q_ext1 -# DY: -# class_value: 2 -# background_datasets: -# DYto2L_M_10to50_amcatnloFXFX: [] -# DYto2L_M_50_amcatnloFXFX: [] diff --git a/Studies/DNN/config/default_split_doubleLep.yaml b/Studies/DNN/config/default_split_doubleLep.yaml deleted file mode 100644 index 29ba1200..00000000 --- a/Studies/DNN/config/default_split_doubleLep.yaml +++ /dev/null @@ -1,77 +0,0 @@ -batch_dict: - GluGlutoRadion: 100 # It is important to put signal first since signal has extra branches 
- GluGlutoBulkGraviton: 100 - TT: 1200 - DY: 250 - ST: 100 - - -storage_folder: /eos/user/d/daebi/bbWW_development/anaTuples/FullForTraining/ - -selection_branches: [ 'lep1_legType', 'lep2_legType', 'weight_base', 'event', 'centralJet_pt', 'SelectedFatJet_pt' ] - -# Cuts for uproot -selection_cut: (tree['lep1_legType'] > 0) & (tree['lep2_legType'] > 0) & ( (ak.count(tree['centralJet_pt'], axis=1) > 1) | (ak.count(tree['SelectedFatJet_pt'], axis=1) > 0) ) -parity_func: (tree['event']%{nParity} == {parity_scan}) - -nParity: 4 - -# Cuts for RDF in creating the training file -iterate_cut: (lep1_legType > 0) && (lep2_legType > 0) && ( (centralJet_pt.size() > 1) || (SelectedFatJet_pt.size() > 0) ) && (event%{nParity} == {parity_scan}) - - -signal: - GluGlutoRadion: - class_value: 0 - spin: 0 - mass_points: [ 250, 260, 270, 280, 300, 350, 450, 550, 600, 650, 700, 800, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, 4000, 5000 ] - dataset_name_format: - GluGlutoRadiontoHHto2B2Vto2B2L2Nu_M_{} - combined_name: - GluGlutoRadiontoHHto2B2Vto2B2L2Nu_Combined - use_combined: - True - GluGlutoBulkGraviton: - class_value: 0 - spin: 2 - mass_points: [ 250, 260, 270, 280, 300, 350, 450, 550, 600, 650, 700, 800, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, 4000, 5000 ] - dataset_name_format: - GluGlutoBulkGravitontoHHto2B2Vto2B2L2Nu_M_{} - combined_name: - GluGlutoBulkGravitontoHHto2B2Vto2B2L2Nu_Combined - use_combined: - True - - -background: - TT: - class_value: 1 - background_datasets: # If there is an extension, just add more to the list - # TT: #When we use TTinclusive and appropriately weight, we actually can not make many batches ): - # - TT_ext1 - TTto2L2Nu: - - TTto2L2Nu_ext1 - TTto4Q: - - TTto4Q_ext1 - TTtoLNu2Q: - - TTtoLNu2Q_ext1 - DY: - class_value: 2 - background_datasets: - DYto2L_M_10to50_amcatnloFXFX: [ ] - DYto2L_M_50_0J_amcatnloFXFX: [ ] - DYto2L_M_50_1J_amcatnloFXFX: [ ] - DYto2L_M_50_2J_amcatnloFXFX: [ ] - ST: - class_value: 3 - background_datasets: - # 
ST_s_channel_antitop_4f_leptonDecays: [ ] # These samples have very low stats -- when scaled to XS they are limiting our stats - # ST_s_channel_top_4f_leptonDecays: [ ] - # ST_t_channel_antitop_4f_InclusiveDecays: [ ] - # ST_t_channel_top_4f_InclusiveDecays: [ ] - ST_tW_antitop_2L2Nu: [ ] - ST_tW_antitop_4Q: [ ] - ST_tW_antitop_LNu2Q: [ ] - ST_tW_top_2L2Nu: [ ] - ST_tW_top_4Q: [ ] - ST_tW_top_LNu2Q: [ ] diff --git a/Studies/DNN/config/default_split_singleLep.yaml b/Studies/DNN/config/default_split_singleLep.yaml deleted file mode 100644 index 4a7729eb..00000000 --- a/Studies/DNN/config/default_split_singleLep.yaml +++ /dev/null @@ -1,82 +0,0 @@ -batch_dict: - GluGlutoRadion: 100 # It is important to put signal first since signal has extra branches - GluGlutoBulkGraviton: 100 - TT: 2800 - # DY: 250 - ST: 700 - W: 300 - - -storage_folder: /eos/user/d/daebi/bbWW_development/anaTuples/FullForTraining/ - -selection_branches: [ 'lep1_legType', 'lep2_legType', 'weight_base', 'event', 'centralJet_pt', 'SelectedFatJet_pt' ] - -# Cuts for uproot -selection_cut: (tree['lep1_legType'] > 0) & (tree['lep2_legType'] <= 0) & ( (ak.count(tree['centralJet_pt'], axis=1) > 1) | (ak.count(tree['SelectedFatJet_pt'], axis=1) > 0) ) -parity_func: (tree['event']%{nParity} == {parity_scan}) - -nParity: 4 - -# Cuts for RDF in creating the training file -iterate_cut: (lep1_legType > 0) && (lep2_legType <= 0) && ( (centralJet_pt.size() > 1) || (SelectedFatJet_pt.size() > 0) ) && (event%{nParity} == {parity_scan}) - - -signal: - GluGlutoRadion: - class_value: 0 - spin: 0 - mass_points: [ 250, 260, 270, 280, 300, 350, 450, 550, 600, 650, 700, 800, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, 4000, 5000 ] - dataset_name_format: - GluGlutoRadiontoHHto2B2Vto2B2JLNu_M_{} - combined_name: - GluGlutoRadiontoHHto2B2Vto2B2JLNu_Combined - use_combined: - True - GluGlutoBulkGraviton: - class_value: 0 - spin: 2 - mass_points: [ 250, 260, 270, 280, 300, 350, 450, 550, 600, 650, 700, 800, 1000, 1200, 
1400, 1600, 1800, 2000, 2500, 3000, 4000, 5000 ] - dataset_name_format: - GluGlutoBulkGravitontoHHto2B2Vto2B2JLNu_M_{} - combined_name: - GluGlutoBulkGravitontoHHto2B2Vto2B2JLNu_Combined - use_combined: - True - - -background: - TT: - class_value: 1 - background_datasets: # If there is an extension, just add more to the list - # TT: #When we use TTinclusive and appropriately weight, we actually can not make many batches ): - # - TT_ext1 - TTto2L2Nu: - - TTto2L2Nu_ext1 - TTto4Q: - - TTto4Q_ext1 - TTtoLNu2Q: - - TTtoLNu2Q_ext1 - DY: - class_value: 2 - background_datasets: - DYto2L_M_10to50_amcatnloFXFX: [ ] - DYto2L_M_50_amcatnloFXFX: [ ] - ST: - class_value: 3 - background_datasets: - # ST_s_channel_antitop_4f_leptonDecays: [] # These samples have very low stats -- when scaled to XS they are limiting our stats - # ST_s_channel_top_4f_leptonDecays: [] - # ST_t_channel_antitop_4f_InclusiveDecays: [] - # ST_t_channel_top_4f_InclusiveDecays: [] - ST_tW_antitop_2L2Nu: [ ] - ST_tW_antitop_4Q: [ ] - ST_tW_antitop_LNu2Q: [ ] - ST_tW_top_2L2Nu: [ ] - ST_tW_top_4Q: [ ] - ST_tW_top_LNu2Q: [ ] - W: - class_value: 4 - background_datasets: - WtoLNu_0J_amcatnloFXFX: [ ] - WtoLNu_1J_amcatnloFXFX: [ ] - WtoLNu_2J_amcatnloFXFX: [ ] diff --git a/Studies/DNN/config/default_training_setup_doubleLep.yaml b/Studies/DNN/config/default_training_setup_doubleLep.yaml deleted file mode 100644 index e3c65dae..00000000 --- a/Studies/DNN/config/default_training_setup_doubleLep.yaml +++ /dev/null @@ -1,130 +0,0 @@ -UseParametric: true -adv_activation: relu -adv_grad_factor: 1.0 -adv_learning_rate: 0.001 -adv_model: false -adv_submodule_steps: 50 -adv_submodule_tracker: 0 -adv_weight_decay: 0.004 -apply_common_gradients: true -batch_compression_factor: 10 -nClasses: 4 -class_activation: tanh -class_grad_factor: 0.1 -common_activation: tanh -continue_model: DNN_Models/default/default.keras -continue_training: false -disco_activation: tanh -disco_lambda_factor: 100 -do_step2: false -do_training: true 
-dropout: 0.0 -features: - - lep1_pt - - lep1_phi - - lep1_eta - - lep1_mass - - lep2_pt - - lep2_phi - - lep2_eta - - lep2_mass - - met_pt - - met_phi -highlevelfeatures: - - HT - - dR_dilep - - dR_dibjet - - dR_dilep_dibjet - - dR_dilep_dijet - - dPhi_lep1_lep2 - - dPhi_jet1_jet2 - - dPhi_MET_dilep - - dPhi_MET_dibjet - # - min_dR_lep0_jets # For some reason, this is sometimes inf - # - min_dR_lep1_jets # Check event 2030682 in this dataset may30 - - MT - - MT2_ll - - MT2_bb - - MT2_blbl - - ll_mass - - CosTheta_bb -hmefeatures: - - DoubleLep_DeepHME_mass -input_folder: /eos/user/d/daebi/DNN_Training_Datasets/DoubleLepton_v3/Dataset_Run3_2022/ -learning_rate: 0.001 -listfeatures: - - - - centralJet_pt - - centralJet_phi - - centralJet_eta - - centralJet_mass - - centralJet_btagPNetB - - 0 - - - - centralJet_pt - - centralJet_phi - - centralJet_eta - - centralJet_mass - - centralJet_btagPNetB - - 1 - - - - centralJet_pt - - centralJet_phi - - centralJet_eta - - centralJet_mass - - centralJet_btagPNetB - - 2 - - - - centralJet_pt - - centralJet_phi - - centralJet_eta - - centralJet_mass - - centralJet_btagPNetB - - 3 - - - - SelectedFatJet_pt - - SelectedFatJet_phi - - SelectedFatJet_eta - - SelectedFatJet_mass - - SelectedFatJet_particleNetWithMass_HbbvsQCD - - SelectedFatJet_SubJet1_btagDeepB - - SelectedFatJet_SubJet1_eta - - SelectedFatJet_SubJet1_mass - - SelectedFatJet_SubJet1_phi - - SelectedFatJet_SubJet1_pt - - SelectedFatJet_SubJet2_btagDeepB - - SelectedFatJet_SubJet2_eta - - SelectedFatJet_SubJet2_mass - - SelectedFatJet_SubJet2_phi - - SelectedFatJet_SubJet2_pt - - 0 -n_adv_layers: 3 -n_adv_units: 128 -n_class_layers: 3 -n_class_units: 128 -n_common_layers: 5 -n_common_units: 256 -n_disco_layers: 5 -n_disco_units: 512 -n_epochs: 40 -parametric_list: - - 250 - - 260 - - 270 - - 280 - - 300 - - 350 - - 450 - - 550 - - 600 - - 650 - - 700 - - 800 - - 1000 - - 1200 - - 1400 - - 1600 - - 1800 - - 2000 - - 2500 - - 3000 - - 4000 - - 5000 -patience: 100 
-use_batch_norm: false -weight_decay: 0.004 diff --git a/Studies/DNN/config/default_training_setup_singleLep.yaml b/Studies/DNN/config/default_training_setup_singleLep.yaml deleted file mode 100644 index eb914a80..00000000 --- a/Studies/DNN/config/default_training_setup_singleLep.yaml +++ /dev/null @@ -1,116 +0,0 @@ -UseParametric: true -adv_activation: relu -adv_grad_factor: 1.0 -adv_learning_rate: 0.001 -adv_model: false -adv_submodule_steps: 50 -adv_submodule_tracker: 0 -adv_weight_decay: 0.004 -apply_common_gradients: true -batch_compression_factor: 1 -nClasses: 5 -class_activation: tanh -class_grad_factor: 0.1 -common_activation: tanh -continue_model: DNN_Models/default/default.keras -continue_training: false -disco_activation: tanh -disco_lambda_factor: 100 -do_step2: true -do_training: false -dropout: 0.0 -features: - - lep1_pt - - lep1_phi - - lep1_eta - - lep1_mass - - met_pt - - met_phi -highlevelfeatures: - - HT - - dR_dibjet - - dPhi_jet1_jet2 - - dPhi_MET_dibjet - # - min_dR_lep0_jets # For some reason, this is sometines inf - # - min_dR_lep1_jets # Check event 2030682 in this dataset may30 - - MT - - CosTheta_bb -# input_folder: DNN_Datasets/Dataset_2025-05-30-01-19-55/ -input_folder: /eos/user/d/daebi/DNN_Training_Datasets/SingleLepton_v3/Dataset_Run3_2022/ -# input_folder: DNN_Datasets/Dataset_SmallTest/ -learning_rate: 0.001 -listfeatures: - - - - centralJet_pt - - centralJet_phi - - centralJet_eta - - centralJet_mass - - centralJet_btagPNetB - - 0 - - - - centralJet_pt - - centralJet_phi - - centralJet_eta - - centralJet_mass - - centralJet_btagPNetB - - 1 - - - - centralJet_pt - - centralJet_phi - - centralJet_eta - - centralJet_mass - - centralJet_btagPNetB - - 2 - - - - centralJet_pt - - centralJet_phi - - centralJet_eta - - centralJet_mass - - centralJet_btagPNetB - - 3 - - - - SelectedFatJet_pt - - SelectedFatJet_phi - - SelectedFatJet_eta - - SelectedFatJet_mass - - SelectedFatJet_SubJet1_btagDeepB - - SelectedFatJet_SubJet1_eta - - 
SelectedFatJet_SubJet1_mass - - SelectedFatJet_SubJet1_phi - - SelectedFatJet_SubJet1_pt - - SelectedFatJet_SubJet2_btagDeepB - - SelectedFatJet_SubJet2_eta - - SelectedFatJet_SubJet2_mass - - SelectedFatJet_SubJet2_phi - - SelectedFatJet_SubJet2_pt - - 0 -n_adv_layers: 5 -n_adv_units: 128 -n_class_layers: 5 -n_class_units: 128 -n_common_layers: 10 -n_common_units: 256 -n_disco_layers: 5 -n_disco_units: 512 -n_epochs: 40 -parametric_list: - - 250 - - 260 - - 270 - - 280 - - 300 - - 350 - - 450 - - 550 - - 600 - - 650 - - 700 - - 800 - - 1000 - - 1200 - - 1400 - - 1600 - - 1800 - - 2000 - - 2500 - - 3000 - - 4000 - - 5000 -patience: 100 -use_batch_norm: true -weight_decay: 0.004 diff --git a/Studies/DNN/config/step2_training_setup_doubleLep.yaml b/Studies/DNN/config/step2_training_setup_doubleLep.yaml deleted file mode 100644 index a01c35a7..00000000 --- a/Studies/DNN/config/step2_training_setup_doubleLep.yaml +++ /dev/null @@ -1,86 +0,0 @@ -UseParametric: true -apply_common_gradients: true -batch_compression_factor: 10 -nClasses: 2 -disco_activation: tanh -disco_lambda_factor: 0 -dropout: 0.0 -features: - - lep1_pt - - lep1_phi - - lep1_eta - - lep1_mass - - lep2_pt - - lep2_phi - - lep2_eta - - lep2_mass - - met_pt - - met_phi -highlevelfeatures: - - HT - - dR_dilep - - dR_dibjet - - dR_dilep_dibjet - - dR_dilep_dijet - - dPhi_lep1_lep2 - - dPhi_jet1_jet2 - - dPhi_MET_dilep - - dPhi_MET_dibjet - - MT - - MT2_ll - - MT2_bb - - MT2_blbl - - ll_mass - - CosTheta_bb -hmefeatures: [ ] -input_folder: /eos/user/d/daebi/DNN_Training_Datasets/DoubleLepton_v3/Dataset_Run3_2022/ -learning_rate: 0.001 -listfeatures: - - - - centralJet_pt - - centralJet_phi - - centralJet_eta - - centralJet_mass - - centralJet_btagPNetB - - 0 - - - - centralJet_pt - - centralJet_phi - - centralJet_eta - - centralJet_mass - - centralJet_btagPNetB - - 1 - - - - centralJet_pt - - centralJet_phi - - centralJet_eta - - centralJet_mass - - centralJet_btagPNetB - - 2 - - - - centralJet_pt - - 
centralJet_phi - - centralJet_eta - - centralJet_mass - - centralJet_btagPNetB - - 3 - - - - SelectedFatJet_pt - - SelectedFatJet_phi - - SelectedFatJet_eta - - SelectedFatJet_mass - - SelectedFatJet_particleNetWithMass_HbbvsQCD - - SelectedFatJet_SubJet1_btagDeepB - - SelectedFatJet_SubJet1_eta - - SelectedFatJet_SubJet1_mass - - SelectedFatJet_SubJet1_phi - - SelectedFatJet_SubJet1_pt - - SelectedFatJet_SubJet2_btagDeepB - - SelectedFatJet_SubJet2_eta - - SelectedFatJet_SubJet2_mass - - SelectedFatJet_SubJet2_phi - - SelectedFatJet_SubJet2_pt - - 0 -n_disco_layers: 5 -n_disco_units: 512 -n_epochs: 40 -parametric_list: - - 600 -patience: 100 -use_batch_norm: false -weight_decay: 0.004 diff --git a/Studies/DNN/config/training_setup_doubleLep_boosted.yaml b/Studies/DNN/config/training_setup_doubleLep_boosted.yaml new file mode 100644 index 00000000..7a5d2586 --- /dev/null +++ b/Studies/DNN/config/training_setup_doubleLep_boosted.yaml @@ -0,0 +1,99 @@ +UseParametric: true +nClasses: 2 +activation: tanh +batch_size: 1000 +dropout: 0.5 +features: + - lep1_pt + - lep1_phi + - lep1_eta + - lep1_mass + - lep2_pt + - lep2_phi + - lep2_eta + - lep2_mass + - PuppiMET_pt + - PuppiMET_phi + - HT + - dR_dilep + - dR_dibjet + - dR_dilep_dibjet + - dR_dilep_dijet + - dPhi_lep1_lep2 + - dPhi_jet1_jet2 + - dPhi_MET_dilep + - dPhi_MET_dibjet + - MT + - MT2_ll + - MT2_bb + - MT2_blbl + - MT2_blbl2 + - ll_mass + - CosTheta_bb + - bjet1_pt + - bjet1_phi + - bjet1_eta + - bjet1_mass + - bjet1_btagPNetB + - bjet2_pt + - bjet2_phi + - bjet2_eta + - bjet2_mass + - bjet2_btagPNetB + - other_jet1_pt + - other_jet1_phi + - other_jet1_eta + - other_jet1_mass + - other_jet1_btagPNetB + - other_jet2_pt + - other_jet2_phi + - other_jet2_eta + - other_jet2_mass + - other_jet2_btagPNetB + - fatbjet_pt + - fatbjet_phi + - fatbjet_eta + - fatbjet_mass + - fatbjet_mass_PNetCorr + - fatbjet_particleNetWithMass_HbbvsQCD + - fatbjet_tau1 + - fatbjet_tau2 + - fatbjet_tau3 + - fatbjet_tau4 + - 
fatbjet_msoftdrop + - fatbjet_neMultiplicity + - fatbjet_neHEF + - fatbjet_nConstituents + - fatbjet_muEF + - fatbjet_neEmEF + + - DoubleLep_DeepHME_mass + - DoubleLep_DeepHME_mass_error + +learning_rate: 0.00001 +n_layers: 5 +n_units: 512 +n_epochs: 100 +parametric_list: + - 300 + - 400 + - 500 + - 550 + - 600 + - 650 + - 700 + - 800 + - 900 + - 1000 + - 1200 + - 1400 + - 1600 + - 1800 + - 2000 + - 2500 + - 3000 + - 3500 + - 4000 +patience: 100 +use_batch_norm: true +weight_decay: 0.004 diff --git a/Studies/DNN/config/training_setup_doubleLep_resolved.yaml b/Studies/DNN/config/training_setup_doubleLep_resolved.yaml new file mode 100644 index 00000000..39d8b8d6 --- /dev/null +++ b/Studies/DNN/config/training_setup_doubleLep_resolved.yaml @@ -0,0 +1,83 @@ +UseParametric: true +nClasses: 2 +activation: tanh +batch_size: 10000 +dropout: 0.3 +features: + - lep1_pt + - lep1_phi + - lep1_eta + - lep1_mass + - lep2_pt + - lep2_phi + - lep2_eta + - lep2_mass + - PuppiMET_pt + - PuppiMET_phi + - HT + - dR_dilep + - dR_dibjet + - dR_dilep_dibjet + - dR_dilep_dijet + - dPhi_lep1_lep2 + - dPhi_jet1_jet2 + - dPhi_MET_dilep + - dPhi_MET_dibjet + - MT + - MT2_ll + - MT2_bb + - MT2_blbl + - MT2_blbl2 + - ll_mass + - CosTheta_bb + - bjet1_pt + - bjet1_phi + - bjet1_eta + - bjet1_mass + - bjet1_btagPNetB + - bjet2_pt + - bjet2_phi + - bjet2_eta + - bjet2_mass + - bjet2_btagPNetB + - other_jet1_pt + - other_jet1_phi + - other_jet1_eta + - other_jet1_mass + - other_jet1_btagPNetB + - other_jet2_pt + - other_jet2_phi + - other_jet2_eta + - other_jet2_mass + - other_jet2_btagPNetB + + - DoubleLep_DeepHME_mass + - DoubleLep_DeepHME_mass_error + +learning_rate: 0.0001 +n_layers: 5 +n_units: 512 +n_epochs: 500 +parametric_list: + - 300 + - 400 + - 500 + - 550 + - 600 + - 650 + - 700 + - 800 + - 900 + - 1000 + - 1200 + - 1400 + - 1600 + - 1800 + - 2000 + - 2500 + - 3000 + - 3500 + - 4000 +patience: 100 +use_batch_norm: true +weight_decay: 0.004 diff --git 
a/Studies/DNN/create_condor_configs.py b/Studies/DNN/create_condor_configs.py index d6619740..9bfca222 100644 --- a/Studies/DNN/create_condor_configs.py +++ b/Studies/DNN/create_condor_configs.py @@ -2,54 +2,30 @@ import yaml import awkward as ak +# Resolved +# template = "config/training_setup_doubleLep_resolved.yaml" +# output_folder = "CondorConfigs_25Feb_DoubleLepton_Resolved_Full_HME" +# input_file_template = "/afs/cern.ch/work/d/daebi/diHiggs/HH_bbWW_v2601a/Studies/DNN/ResolvedDataset_Feb25/Dataset/nParity{}_Merged.root" +# weight_file_template = "/afs/cern.ch/work/d/daebi/diHiggs/HH_bbWW_v2601a/Studies/DNN/ResolvedDataset_Feb25/Dataset/nParity{}_Merged_weight.root" +# training_name = "DNN_DoubleLepton_Resolved_Training{i}_par{j}" +# var_parse_dict = { +# 'learning_rate': [ 0.00001 ], +# 'n_epochs': [ 500 ], +# 'dropout': [ 0.3 ], +# } -template = "config/default_training_setup_singleLep.yaml" -output_folder = "CondorConfigs_Oct12_SingleLepton" -input_file_template = "/eos/user/d/daebi/DNN_Training_Datasets/SingleLepton_v4/Dataset_Run3_2022/batchfile{}.root" -weight_file_template = "/eos/user/d/daebi/DNN_Training_Datasets/SingleLepton_v4/Dataset_Run3_2022/weightfile{}.root" -hme_file_template = "/eos/user/d/daebi/DNN_Training_Datasets/SingleLepton_v4/Dataset_Run3_2022/batchfile{}_HME_Friend.root" -batch_config_template = "/eos/user/d/daebi/DNN_Training_Datasets/SingleLepton_v4/Dataset_Run3_2022/batch_config_parity{}.yaml" -training_name = "DNN_SingleLep_Training{i}_par{j}" +# Boosted +template = "config/training_setup_doubleLep_boosted.yaml" +output_folder = "CondorConfigs_25Feb_DoubleLepton_Boosted_Full_HME" +input_file_template = "/afs/cern.ch/work/d/daebi/diHiggs/HH_bbWW_v2601a/Studies/DNN/BoostedDataset_Feb25/Dataset/nParity{}_Merged.root" +weight_file_template = "/afs/cern.ch/work/d/daebi/diHiggs/HH_bbWW_v2601a/Studies/DNN/BoostedDataset_Feb25/Dataset/nParity{}_Merged_weight.root" +training_name = "DNN_DoubleLepton_Boosted_Training{i}_par{j}" 
var_parse_dict = { - "adv_grad_factor": [0.0], - "adv_learning_rate": [0.0001], - "adv_submodule_steps": [0], - "class_grad_factor": [1.0], "learning_rate": [0.00001], "n_epochs": [100], "dropout": [0.3], - "weight_decay": [None], - "adv_weight_decay": [None], - "disco_lambda_factor": [0], - "n_disco_layers": [5], - "n_disco_units": [256], - "disco_activation": ["relu"], - "hmefeatures": [["SingleLep_DeepHME_mass", "SingleLep_DeepHME_mass_error"], None], } - -# template = "config/default_training_setup_doubleLep.yaml" -# output_folder = "CondorConfigs_Oct10_DoubleLepton" -# input_file_template = "/eos/user/d/daebi/DNN_Training_Datasets/DoubleLepton_v5/Dataset_Run3_2022/batchfile{}.root" -# weight_file_template = "/eos/user/d/daebi/DNN_Training_Datasets/DoubleLepton_v5/Dataset_Run3_2022/weightfile{}.root" -# hme_file_template = "/eos/user/d/daebi/DNN_Training_Datasets/DoubleLepton_v5/Dataset_Run3_2022/batchfile{}_HME_Friend.root" -# batch_config_template = "/eos/user/d/daebi/DNN_Training_Datasets/DoubleLepton_v5/Dataset_Run3_2022/batch_config_parity{}.yaml" -# training_name = "DNN_DoubleLep_Training{i}_par{j}" -# var_parse_dict = { -# 'adv_grad_factor': [ 0.0 ], -# 'adv_learning_rate': [ 0.0001 ], -# 'adv_submodule_steps': [ 0,], -# 'class_grad_factor': [ 1.0 ], -# 'learning_rate': [ 0.00001 ], -# 'n_epochs': [ 50 ], -# 'dropout': [ 0.1 ], -# 'weight_decay': [ None ], -# 'adv_weight_decay': [ None ], -# 'disco_lambda_factor': [ 0 ], -# 'n_disco_layers': [ 10 ], -# 'hmefeatures': [ ['DoubleLep_DeepHME_mass', 'DoubleLep_DeepHME_mass_error'], None ] -# } - os.makedirs(output_folder, exist_ok=True) with open(template, "r") as f: @@ -64,20 +40,14 @@ config = default_config.copy() for name, var in zip(var_names, varset.tolist()): config[name] = var - for j in range(1): + for j in range(4): # Set up each parity config["training_file"] = input_file_template.format(j) config["weight_file"] = weight_file_template.format(j) - config["hme_friend_file"] = 
hme_file_template.format(j) - config["batch_config"] = batch_config_template.format(j) config["test_training_file"] = input_file_template.format((j + 1) % 4) config["test_weight_file"] = weight_file_template.format((j + 1) % 4) - config["test_hme_friend_file"] = hme_file_template.format((j + 1) % 4) - config["test_batch_config"] = batch_config_template.format((j + 1) % 4) config["validation_file"] = input_file_template.format((j + 2) % 4) config["validation_weight_file"] = weight_file_template.format((j + 2) % 4) - config["validation_hme_friend_file"] = hme_file_template.format((j + 2) % 4) - config["validation_batch_config"] = batch_config_template.format((j + 2) % 4) config["training_name"] = training_name.format(i=i, j=j) diff --git a/Studies/DNN/create_dataset.py b/Studies/DNN/create_dataset.py index 81f11e6b..31f3a195 100644 --- a/Studies/DNN/create_dataset.py +++ b/Studies/DNN/create_dataset.py @@ -1,1710 +1,246 @@ -import os, sys, gc, psutil +import os import uproot import numpy as np -from datetime import datetime import yaml -import awkward as ak from tqdm import tqdm import ROOT -import Analysis.hh_bbww as analysis - -import importlib - import FLAF.RunKit.grid_tools as grid_tools +from FLAF.RunKit.run_tools import ps_call ROOT.gROOT.SetBatch(True) ROOT.EnableThreadSafety() -# ROOT.EnableImplicitMT(4) - -sys.path.append(os.environ["ANALYSIS_PATH"]) -ana_path = os.environ["ANALYSIS_PATH"] - -header_path_AnalysisTools = "FLAF/include/AnalysisTools.h" -ROOT.gInterpreter.Declare( - f'#include "{os.path.join(ana_path,header_path_AnalysisTools)}"' -) -header_path_AnalysisMath = "FLAF/include/AnalysisMath.h" -ROOT.gInterpreter.Declare( - f'#include "{os.path.join(ana_path,header_path_AnalysisMath)}"' -) -header_path_MT2 = "FLAF/include/MT2.h" -ROOT.gInterpreter.Declare(f'#include "{os.path.join(ana_path,header_path_MT2)}"') -header_path_Lester_mt2_bisect = "FLAF/include/Lester_mt2_bisect.cpp" -ROOT.gInterpreter.Declare( - f'#include 
"{os.path.join(ana_path,header_path_Lester_mt2_bisect)}"' -) -lep1_p4 = "ROOT::Math::LorentzVector>(lep1_pt,lep1_eta,lep1_phi,lep1_mass)" -lep2_p4 = "ROOT::Math::LorentzVector>(lep2_pt,lep2_eta,lep2_phi,lep2_mass)" -b1_p4 = "ROOT::Math::LorentzVector>(centralJet_pt[0],centralJet_eta[0],centralJet_phi[0],centralJet_mass[0])" -b2_p4 = "ROOT::Math::LorentzVector>(centralJet_pt[1],centralJet_eta[1],centralJet_phi[1],centralJet_mass[1])" -MET_p4 = "ROOT::Math::LorentzVector>(met_pt, 0., met_phi, 0.)" - - -def create_signal_files(config_dict, output_folder, era): - storage_folder = os.path.join(config_dict["storage_folder"], era) - - for signal_name in config_dict["signal"]: - signal_dict = config_dict["signal"][signal_name] - mass_points = signal_dict["mass_points"] - dataset_name_format = signal_dict["dataset_name_format"] - use_combined = signal_dict["use_combined"] - - if use_combined: - combined_name = signal_dict["combined_name"] - - out_file_folder = os.path.join(output_folder, combined_name) - out_file_name = f"{os.path.join(out_file_folder, combined_name)}.root" - - if os.path.exists(out_file_name): - print(f"Combined file {out_file_name} already exists, skip") - continue - out_file = uproot.recreate(out_file_name) - - new_array = {} - nEvents = 0 - - print(f"Starting to merge signal {signal_name} files") - - for mass_point in tqdm(mass_points): - dataset_name = dataset_name_format.format(mass_point) - extension_list = [ - fn - for fn in os.listdir(storage_folder) - if fn.startswith(f"{dataset_name}_ext") - ] - - for ext_name in [dataset_name] + extension_list: - process_dir = os.path.join(storage_folder, ext_name) - for nano_file in [ - x for x in os.listdir(process_dir) if x.endswith(".root") - ]: - with uproot.open( - f"{os.path.join(process_dir, nano_file)}:Events" - ) as h: - tree = h.arrays() - nEvents += h.num_entries - - keys = tree.fields - for key in keys: - if key not in new_array.keys(): - new_array[key] = tree[key] - else: - new_array[key] = 
ak.concatenate( - [new_array[key], tree[key]] - ) - - # Shuffle the signal data - index = np.arange(nEvents) - np.random.shuffle(index) - for key in new_array.keys(): - new_array[key] = new_array[key][index] - - out_file["Events"] = new_array - out_file.close() +ROOT.EnableImplicitMT(4) -def create_signal_files_gfal(config_dict, output_folder, era): - storage_folder = os.path.join(config_dict["storage_folder"], era) +def measure_cut_datasets(config_dict, output_folder, remote=False): + storage_folder = os.path.join(config_dict["storage_folder"]) - for signal_name in config_dict["signal"]: - signal_dict = config_dict["signal"][signal_name] - mass_points = signal_dict["mass_points"] - dataset_name_format = signal_dict["dataset_name_format"] - use_combined = signal_dict["use_combined"] - - if use_combined: - combined_name = signal_dict["combined_name"] - - out_file_folder = os.path.join(output_folder, combined_name) - out_file_name = f"{os.path.join(out_file_folder, combined_name)}.root" - - if os.path.exists(out_file_name): - print(f"Combined file {out_file_name} already exists, skip") - continue - out_file = uproot.recreate(out_file_name) - - new_array = {} - nEvents = 0 - - print(f"Starting to merge signal {signal_name} files") - - for mass_point in tqdm(mass_points): - dataset_name = dataset_name_format.format(mass_point) - extension_list = [ - fn.name - for fn in grid_tools.gfal_ls( - f"davs://cmseos.fnal.gov:9000/{storage_folder}" - ) - if fn.name.startswith(f"{dataset_name}_ext") - ] - - for ext_name in [dataset_name] + extension_list: - process_dir = os.path.join(storage_folder, ext_name) - gfal_listdir = grid_tools.gfal_ls( - f"davs://cmseos.fnal.gov:9000/{process_dir}" - ) - for nano_file in [ - x.name for x in gfal_listdir if x.name.endswith(".root") - ]: - tmp_file = os.path.join( - f"root://cmseos.fnal.gov/{process_dir}", nano_file - ) - with uproot.open(f"{tmp_file}:Events") as h: - tree = h.arrays() - nEvents += h.num_entries - - keys = tree.fields - 
for key in keys: - if key not in new_array.keys(): - new_array[key] = tree[key] - else: - new_array[key] = ak.concatenate( - [new_array[key], tree[key]] - ) - - # Shuffle the signal data - index = np.arange(nEvents) - np.random.shuffle(index) - for key in new_array.keys(): - new_array[key] = new_array[key][index] - - out_file["Events"] = new_array - out_file.close() - - -def create_dict(config_dict, output_folder, era): - batch_dict = config_dict["batch_dict"] - storage_folder = os.path.join(config_dict["storage_folder"], era) - selection_branches = config_dict["selection_branches"] - selection_cut = config_dict["selection_cut"] - - if "batch_size" not in batch_dict.keys(): - batch_dict["batch_size"] = 0 process_dict = {} - for key in batch_dict.keys(): - if key == "batch_size": - continue - batch_dict["batch_size"] += batch_dict[key] - process_dict[key] = {} - for nParity in range(config_dict["nParity"]): - empty_dict_example_file = "" - nParity_Cut = config_dict["parity_func"].format( - nParity=config_dict["nParity"], parity_scan=nParity - ) - total_cut = f"{selection_cut} & {nParity_Cut}" + iterate_cut = config_dict["iterate_cut"] + parity_cut = config_dict["parity_cut"] - out_yaml = f"batch_config_parity{nParity}.yaml" + signal_list = config_dict["signal"] - if os.path.exists(os.path.join(output_folder, out_yaml)): - print(f"YAML file {out_yaml} already exists, skip") - continue - - print("Looping over signals in config") - for signal_name in tqdm(config_dict["signal"]): - signal_dict = config_dict["signal"][signal_name] - class_value = signal_dict["class_value"] - spin_value = signal_dict["spin"] - mass_points = signal_dict["mass_points"] - dataset_name_format = signal_dict["dataset_name_format"] - use_combined = signal_dict["use_combined"] - - # If a combined file exists, lets use that - # if f"{signal_dict['combined_name']}.root" in os.listdir(output_folder): - if use_combined: - dataset_name = signal_dict["combined_name"] - - 
process_dict[signal_name][dataset_name] = { - "total": 0, - "total_cut": 0, - "weight_cut": 0, - "nBatches": 0, - "batch_size": 0, - "batch_start": 0, - "class_value": class_value, - "spin": spin_value, - "mass": -1, - "all_extensions": [], - "storage_folder": os.path.join(os.getcwd(), output_folder), - } - - process_dict[signal_name][dataset_name]["all_extensions"] = [ - dataset_name - ] - - with uproot.open( - f"{os.path.join(output_folder, dataset_name, dataset_name)}.root:Events" - ) as h: - tree = h.arrays(selection_branches) - process_dict[signal_name][dataset_name]["total"] += int( - h.num_entries - ) - process_dict[signal_name][dataset_name]["total_cut"] += int( - np.sum(eval(total_cut)) - ) - eval_string = f"float(np.sum(tree[{total_cut}].weight_base))" - process_dict[signal_name][dataset_name]["weight_cut"] += eval( - eval_string - ) - - for mass_point in mass_points: - dataset_name = dataset_name_format.format(mass_point) - - process_dict[signal_name][dataset_name] = { - "total": 0, - "total_cut": 0, - "weight_cut": 0, - "nBatches": 0, - "batch_size": 0, - "batch_start": 0, - "class_value": class_value, - "spin": spin_value, - "mass": mass_point, - "all_extensions": [], - "storage_folder": storage_folder, - } - - extension_list = [ - fn - for fn in os.listdir(storage_folder) - if fn.startswith(f"{dataset_name}_ext") - ] - process_dict[signal_name][dataset_name]["all_extensions"] = [ - dataset_name - ] + extension_list - - for ext_name in process_dict[signal_name][dataset_name][ - "all_extensions" - ]: - process_dir = os.path.join(storage_folder, ext_name) - for nano_file in [ - x for x in os.listdir(process_dir) if x.endswith(".root") - ]: - with uproot.open( - f"{os.path.join(process_dir, nano_file)}:Events" - ) as h: - tree = h.arrays(selection_branches) - process_dict[signal_name][dataset_name]["total"] += int( - h.num_entries - ) - process_dict[signal_name][dataset_name]["total_cut"] += int( - np.sum(eval(total_cut)) - ) - eval_string = ( - 
f"float(np.sum(tree[{total_cut}].weight_base))" - ) - process_dict[signal_name][dataset_name][ - "weight_cut" - ] += eval(eval_string) - empty_dict_example_file = os.path.join(process_dir, nano_file) - - print("Looping over backgrounds in config") - for background_name in config_dict["background"]: - background_dict = config_dict["background"][background_name] - class_value = background_dict["class_value"] - dataset_names = background_dict["background_datasets"] - - if background_name not in process_dict.keys(): - print(f"Background {background_name} not in process_dict, skip") - continue - - print(f"Looping background {background_name}") - for dataset_name in tqdm(dataset_names): - process_dict[background_name][dataset_name] = { - "total": 0, - "total_cut": 0, - "weight_cut": 0, - "nBatches": 0, - "batch_size": 0, - "batch_start": 0, - "class_value": class_value, - "all_extensions": [], - "storage_folder": storage_folder, - } - - extension_list = [ - fn - for fn in os.listdir(storage_folder) - if fn.startswith(f"{dataset_name}_ext") - ] - process_dict[background_name][dataset_name]["all_extensions"] = [ - dataset_name - ] + extension_list - - for ext_name in process_dict[background_name][dataset_name][ - "all_extensions" - ]: - process_dir = os.path.join(storage_folder, ext_name) - for nano_file in [ - x for x in os.listdir(process_dir) if x.endswith(".root") - ]: - with uproot.open( - f"{os.path.join(process_dir, nano_file)}:Events" - ) as h: - tree = h.arrays(selection_branches) - process_dict[background_name][dataset_name]["total"] += int( - h.num_entries - ) - process_dict[background_name][dataset_name][ - "total_cut" - ] += int(np.sum(eval(total_cut))) - eval_string = ( - f"float(np.sum(tree[{total_cut}].weight_base))" - ) - process_dict[background_name][dataset_name][ - "weight_cut" - ] += eval(eval_string) - - print( - f"Finished background {dataset_name}, how many total? 
{process_dict[background_name][dataset_name]['total_cut']}" - ) - - # Add totals to start the spin/mass dist and remove the individual signal files - signal_names = config_dict["signal"].keys() - for process in process_dict: - process_dict[process]["total"] = 0 - process_dict[process]["weight"] = 0 - use_combined = False - if process in signal_names: - use_combined = config_dict["signal"][process]["use_combined"] - for subprocess in process_dict[process].keys(): - if subprocess.startswith("total") or subprocess.startswith("weight"): - continue - if use_combined: - if subprocess == config_dict["signal"][process]["combined_name"]: - continue - process_dict[process]["total"] += process_dict[process][subprocess][ - "total_cut" - ] - process_dict[process]["weight"] += process_dict[process][subprocess][ - "weight_cut" - ] - - # Calculate the random spin/mass distribution for backgrounds to be assigned during parametric DNN - spin_mass_dist = {} - - total_signal = 0 - for signal_name in config_dict["signal"]: - total_signal += process_dict[signal_name]["total"] - for signal_name in config_dict["signal"]: - keys_to_remove = ( - [] - ) # Keys we want to remove if the combined option is being used - use_combined = config_dict["signal"][signal_name]["use_combined"] - for subprocess in process_dict[signal_name]: - if subprocess.startswith("total") or subprocess.startswith("weight"): - continue - if use_combined: - if ( - subprocess - == config_dict["signal"][signal_name]["combined_name"] - ): - continue - subprocess_dict = process_dict[signal_name][subprocess] - if f"{subprocess_dict['spin']}" not in spin_mass_dist.keys(): - spin_mass_dist[f"{subprocess_dict['spin']}"] = {} - spin_mass_dist[f"{subprocess_dict['spin']}"][ - f"{subprocess_dict['mass']}" - ] = (subprocess_dict["total_cut"] / total_signal) - if use_combined: - keys_to_remove.append(subprocess) - # Remove unneeded keys since we will use combined anyway - for key in keys_to_remove: - del 
process_dict[signal_name][key] - - for process in process_dict: - batch_size_sum = 0 - for subprocess in process_dict[process]: - if subprocess.startswith("total") or subprocess.startswith("weight"): - continue - process_dict[process][subprocess]["batch_size"] = int( - batch_dict[process] - * process_dict[process][subprocess]["weight_cut"] - / process_dict[process]["weight"] - ) - print(f"Looking at subprocess {subprocess}") - nBatches = 0 - if process_dict[process][subprocess]["batch_size"] != 0: - nBatches = int( - process_dict[process][subprocess]["total_cut"] - / process_dict[process][subprocess]["batch_size"] - ) - process_dict[process][subprocess]["nBatches"] = nBatches - batch_size_sum += process_dict[process][subprocess]["batch_size"] - - print(f"Process {process} has batch size sum {batch_size_sum}") - while batch_size_sum != batch_dict[process]: - print( - f"Warning this is bad batch size, size={batch_size_sum} where goal is {batch_dict[process]}" - ) - max_batches_subprocess = "" - max_batches_val = 0 - for subprocess in process_dict[process].keys(): - if subprocess.startswith("total") or subprocess.startswith( - "weight" - ): - continue - if process_dict[process][subprocess]["nBatches"] > max_batches_val: - max_batches_val = process_dict[process][subprocess]["nBatches"] - max_batches_subprocess = subprocess - - print( - f"Trying to fix, incrementing {max_batches_subprocess} batch size {process_dict[process][max_batches_subprocess]['batch_size']} by 1" - ) - process_dict[process][max_batches_subprocess]["batch_size"] += 1 - print( - f"nBatches went from {process_dict[process][max_batches_subprocess]['nBatches']}" - ) - process_dict[process][max_batches_subprocess]["nBatches"] = int( - process_dict[process][max_batches_subprocess]["total_cut"] - / process_dict[process][max_batches_subprocess]["batch_size"] - ) - print(f"To {process_dict[process][max_batches_subprocess]['nBatches']}") - batch_size_sum += 1 - - current_index = 0 - for process in 
process_dict.keys(): - for subprocess in process_dict[process].keys(): - if subprocess.startswith("total") or subprocess.startswith("weight"): - continue - process_dict[process][subprocess]["batch_start"] = current_index - current_index += process_dict[process][subprocess]["batch_size"] - - nBatches = 1e100 - for process in process_dict.keys(): - for subprocess in process_dict[process].keys(): - if subprocess.startswith("total") or subprocess.startswith("weight"): - continue - if process_dict[process][subprocess]["nBatches"] < nBatches and ( - process_dict[process][subprocess]["nBatches"] != 0 - ): - nBatches = process_dict[process][subprocess]["nBatches"] - - print(f"Creating {nBatches} batches, according to distribution. ") - print(process_dict) - print(f"And total batch size is {batch_dict['batch_size']}") - - machine_yaml = { - "meta_data": {}, - "processes": [], - } - - machine_yaml["meta_data"]["storage_folder"] = storage_folder - machine_yaml["meta_data"]["batch_dict"] = batch_dict - machine_yaml["meta_data"]["selection_branches"] = selection_branches - machine_yaml["meta_data"]["selection_cut"] = total_cut - machine_yaml["meta_data"]["iterate_cut"] = config_dict["iterate_cut"].format( - nParity=config_dict["nParity"], parity_scan=nParity - ) - machine_yaml["meta_data"][ - "empty_dict_example" - ] = empty_dict_example_file # Example for empty dict structure - machine_yaml["meta_data"]["input_filename"] = f"batchfile{nParity}.root" - machine_yaml["meta_data"][ - "hme_friend_filename" - ] = f"batchfile{nParity}_HME_Friend.root" - machine_yaml["meta_data"][ - "output_DNNname" - ] = f"ResHH_Classifier_parity{nParity}" - - machine_yaml["meta_data"][ - "spin_mass_dist" - ] = spin_mass_dist # Dict of spin/mass distribution values for random choice parametric - - for process in process_dict: - for subprocess in process_dict[process]: - if subprocess.startswith("total") or subprocess.startswith("weight"): - continue - print("Using subprocess ", subprocess) - 
subprocess_dict = process_dict[process][subprocess] - datasets_full_pathway = [ - os.path.join(subprocess_dict["storage_folder"], fn) - for fn in subprocess_dict["all_extensions"] - ] - tmp_process_dict = { - "datasets": datasets_full_pathway, - "class_value": subprocess_dict["class_value"], - "batch_start": subprocess_dict["batch_start"], - "batch_size": subprocess_dict["batch_size"], - "nBatches": subprocess_dict["nBatches"], - } - machine_yaml["processes"].append(tmp_process_dict) - - with open(os.path.join(output_folder, out_yaml), "w") as outfile: - yaml.dump(machine_yaml, outfile) - - -def create_dict_gfal(config_dict, output_folder, era): - batch_dict = config_dict["batch_dict"] - storage_folder = os.path.join(config_dict["storage_folder"], era) - selection_branches = config_dict["selection_branches"] - selection_cut = config_dict["selection_cut"] - - if "batch_size" not in batch_dict.keys(): - batch_dict["batch_size"] = 0 - process_dict = {} - for key in batch_dict.keys(): - if key == "batch_size": - continue - batch_dict["batch_size"] += batch_dict[key] - process_dict[key] = {} + background_list = config_dict["background"] for nParity in range(config_dict["nParity"]): - empty_dict_example_file = "" - nParity_Cut = config_dict["parity_func"].format( - nParity=config_dict["nParity"], parity_scan=nParity - ) - total_cut = f"{selection_cut} & {nParity_Cut}" + output_nParity = os.path.join(output_folder, f"nParity{nParity}_Merged") + os.makedirs(output_nParity, exist_ok=True) - out_yaml = f"batch_config_parity{nParity}.yaml" + nParity_string = f"nParity_{nParity}" - if os.path.exists(os.path.join(output_folder, out_yaml)): - print(f"YAML file {out_yaml} already exists, skip") - continue + process_dict[nParity_string] = {} - print("Looping over signals in config") - for signal_name in tqdm(config_dict["signal"]): + for signal_name in config_dict["signal"]: + process_dict[nParity_string][signal_name] = {} signal_dict = config_dict["signal"][signal_name] - 
class_value = signal_dict["class_value"] - spin_value = signal_dict["spin"] mass_points = signal_dict["mass_points"] - dataset_name_format = signal_dict["dataset_name_format"] - use_combined = signal_dict["use_combined"] - - # If a combined file exists, lets use that - # if f"{signal_dict['combined_name']}.root" in os.listdir(output_folder): - if use_combined: - dataset_name = signal_dict["combined_name"] - - process_dict[signal_name][dataset_name] = { - "total": 0, - "total_cut": 0, - "weight_cut": 0, - "nBatches": 0, - "batch_size": 0, - "batch_start": 0, - "class_value": class_value, - "spin": spin_value, - "mass": -1, - "all_extensions": [], - "storage_folder": os.path.join(os.getcwd(), output_folder), - } - - process_dict[signal_name][dataset_name]["all_extensions"] = [ - dataset_name - ] - - with uproot.open( - f"{os.path.join(output_folder, dataset_name, dataset_name)}.root:Events" - ) as h: - tree = h.arrays(selection_branches) - process_dict[signal_name][dataset_name]["total"] += int( - h.num_entries - ) - process_dict[signal_name][dataset_name]["total_cut"] += int( - np.sum(eval(total_cut)) - ) - eval_string = f"float(np.sum(tree[{total_cut}].weight_MC_Lumi_pu))" - process_dict[signal_name][dataset_name]["weight_cut"] += eval( - eval_string - ) - for mass_point in mass_points: - dataset_name = dataset_name_format.format(mass_point) - - process_dict[signal_name][dataset_name] = { + process_dict[nParity_string][signal_name][mass_point] = { "total": 0, "total_cut": 0, - "weight_cut": 0, - "nBatches": 0, - "batch_size": 0, - "batch_start": 0, - "class_value": class_value, - "spin": spin_value, - "mass": mass_point, - "all_extensions": [], - "storage_folder": storage_folder, + "total_cut_weighted": 0.0, } - extension_list = [ - fn.name - for fn in grid_tools.gfal_ls( - f"davs://cmseos.fnal.gov:9000/{storage_folder}" - ) - if fn.name.startswith(f"{dataset_name}_ext") - ] - - process_dict[signal_name][dataset_name]["all_extensions"] = [ - dataset_name - ] + 
extension_list - - for ext_name in process_dict[signal_name][dataset_name][ - "all_extensions" - ]: - process_dir = os.path.join(storage_folder, ext_name) - gfal_listdir = grid_tools.gfal_ls( - f"davs://cmseos.fnal.gov:9000/{process_dir}" - ) - for nano_file in [ - x.name for x in gfal_listdir if x.name.endswith(".root") - ]: - tmp_file = os.path.join( - f"root://cmseos.fnal.gov/{process_dir}", nano_file - ) - with uproot.open(f"{tmp_file}:Events") as h: - tree = h.arrays(selection_branches) - process_dict[signal_name][dataset_name]["total"] += int( - h.num_entries - ) - process_dict[signal_name][dataset_name]["total_cut"] += int( - np.sum(eval(total_cut)) - ) - eval_string = ( - f"float(np.sum(tree[{total_cut}].weight_MC_Lumi_pu))" - ) - process_dict[signal_name][dataset_name][ - "weight_cut" - ] += eval(eval_string) - empty_dict_example_file = os.path.join(process_dir, nano_file) - - print("Looping over backgrounds in config") for background_name in config_dict["background"]: + process_dict[nParity_string][background_name] = {} background_dict = config_dict["background"][background_name] - class_value = background_dict["class_value"] dataset_names = background_dict["background_datasets"] - - if background_name not in process_dict.keys(): - print(f"Background {background_name} not in process_dict, skip") - continue - - print(f"Looping background {background_name}") - for dataset_name in tqdm(dataset_names): - process_dict[background_name][dataset_name] = { + for dataset_name in dataset_names: + process_dict[nParity_string][background_name][dataset_name] = { "total": 0, "total_cut": 0, - "weight_cut": 0, - "nBatches": 0, - "batch_size": 0, - "batch_start": 0, - "class_value": class_value, - "all_extensions": [], - "storage_folder": storage_folder, + "total_cut_weighted": 0.0, } - extension_list = [ - fn.name - for fn in grid_tools.gfal_ls( - f"davs://cmseos.fnal.gov:9000/{storage_folder}" - ) - if fn.name.startswith(f"{dataset_name}_ext") - ] - - 
process_dict[background_name][dataset_name]["all_extensions"] = [ - dataset_name - ] + extension_list - - for ext_name in process_dict[background_name][dataset_name][ - "all_extensions" - ]: - process_dir = os.path.join(storage_folder, ext_name) - gfal_listdir = grid_tools.gfal_ls( - f"davs://cmseos.fnal.gov:9000/{process_dir}" - ) - for nano_file in [ - x.name for x in gfal_listdir if x.name.endswith(".root") - ]: - tmp_file = os.path.join( - f"root://cmseos.fnal.gov/{process_dir}", nano_file - ) - with uproot.open(f"{tmp_file}:Events") as h: - tree = h.arrays(selection_branches) - process_dict[background_name][dataset_name]["total"] += int( - h.num_entries - ) - process_dict[background_name][dataset_name][ - "total_cut" - ] += int(np.sum(eval(total_cut))) - eval_string = ( - f"float(np.sum(tree[{total_cut}].weight_MC_Lumi_pu))" - ) - process_dict[background_name][dataset_name][ - "weight_cut" - ] += eval(eval_string) + print("Looping signal datasets") + for signal_name in signal_list: + signal_dict = config_dict["signal"][signal_name] + mass_points = signal_dict["mass_points"] + dataset_name_format = signal_dict["dataset_name_format"] + class_value = signal_dict["class_value"] - print( - f"Finished background {dataset_name}, how many total? 
{process_dict[background_name][dataset_name]['total_cut']}" - ) + for mass_point in tqdm(mass_points): + X_mass = mass_point + dataset_name = dataset_name_format.format(mass_point) - # Add totals to start the spin/mass dist and remove the individual signal files - signal_names = config_dict["signal"].keys() - for process in process_dict: - process_dict[process]["total"] = 0 - process_dict[process]["weight"] = 0 - use_combined = False - if process in signal_names: - use_combined = config_dict["signal"][process]["use_combined"] - for subprocess in process_dict[process].keys(): - if subprocess.startswith("total") or subprocess.startswith("weight"): - continue - if use_combined: - if subprocess == config_dict["signal"][process]["combined_name"]: - continue - process_dict[process]["total"] += process_dict[process][subprocess][ - "total_cut" - ] - process_dict[process]["weight"] += process_dict[process][subprocess][ - "weight_cut" - ] + process_dir = os.path.join(storage_folder, dataset_name) - # Calculate the random spin/mass distribution for backgrounds to be assigned during parametric DNN - spin_mass_dist = {} + if remote: + input_files = f"root://cmseos.fnal.gov/{process_dir}/*.root" + else: + input_files = f"{process_dir}/*.root" + treeName = "Events" + rdf = ROOT.RDataFrame(treeName, input_files) - total_signal = 0 - for signal_name in config_dict["signal"]: - total_signal += process_dict[signal_name]["total"] - for signal_name in config_dict["signal"]: - keys_to_remove = ( - [] - ) # Keys we want to remove if the combined option is being used - use_combined = config_dict["signal"][signal_name]["use_combined"] - for subprocess in process_dict[signal_name]: - if subprocess.startswith("total") or subprocess.startswith("weight"): - continue - if use_combined: - if ( - subprocess - == config_dict["signal"][signal_name]["combined_name"] - ): - continue - subprocess_dict = process_dict[signal_name][subprocess] - if f"{subprocess_dict['spin']}" not in 
spin_mass_dist.keys(): - spin_mass_dist[f"{subprocess_dict['spin']}"] = {} - spin_mass_dist[f"{subprocess_dict['spin']}"][ - f"{subprocess_dict['mass']}" - ] = (subprocess_dict["total_cut"] / total_signal) - if use_combined: - keys_to_remove.append(subprocess) - # Remove unneeded keys since we will use combined anyway - for key in keys_to_remove: - del process_dict[signal_name][key] + total = rdf.Count().GetValue() + rdf = rdf.Filter(iterate_cut) - for process in process_dict: - batch_size_sum = 0 - for subprocess in process_dict[process]: - if subprocess.startswith("total") or subprocess.startswith("weight"): - continue - process_dict[process][subprocess]["batch_size"] = int( - batch_dict[process] - * process_dict[process][subprocess]["weight_cut"] - / process_dict[process]["weight"] + for nParity in range(config_dict["nParity"]): + nParity_string = f"nParity_{nParity}" + parity_cut_formatted = parity_cut.format( + nParity=config_dict["nParity"], parity_scan=nParity ) - print(f"Looking at subprocess {subprocess}") - nBatches = 0 - if process_dict[process][subprocess]["batch_size"] != 0: - nBatches = int( - process_dict[process][subprocess]["total_cut"] - / process_dict[process][subprocess]["batch_size"] - ) - process_dict[process][subprocess]["nBatches"] = nBatches - batch_size_sum += process_dict[process][subprocess]["batch_size"] + output_nParity = os.path.join(output_folder, f"nParity{nParity}_Merged") + output_file = os.path.join(output_nParity, f"{dataset_name}_merge.root") + + rdf_tmp = rdf.Filter(parity_cut_formatted) + cut = rdf_tmp.Count().GetValue() + weighted_cut = rdf_tmp.Sum("weight_Central").GetValue() + + process_dict[nParity_string][signal_name][mass_point]["total"] = total + process_dict[nParity_string][signal_name][mass_point]["total_cut"] = cut + process_dict[nParity_string][signal_name][mass_point][ + "total_cut_weighted" + ] = weighted_cut + rdf_tmp = rdf_tmp.Define("class_value", f"{class_value}") + rdf_tmp = rdf_tmp.Define("X_mass", 
f"{X_mass}") + rdf_tmp.Snapshot(treeName, output_file) + + for background_name in background_list: + background_dict = config_dict["background"][background_name] + dataset_names = background_dict["background_datasets"] + class_value = background_dict["class_value"] + X_mass = 0 + + print(f"Looping background {background_name}") + for dataset_name in tqdm(dataset_names): + process_dir = os.path.join(storage_folder, dataset_name) + + if remote: + input_files = f"root://cmseos.fnal.gov/{process_dir}/*.root" + else: + input_files = f"{process_dir}/*.root" + treeName = "Events" + rdf = ROOT.RDataFrame(treeName, input_files) - print(f"Process {process} has batch size sum {batch_size_sum}") - while batch_size_sum != batch_dict[process]: - print( - f"Warning this is bad batch size, size={batch_size_sum} where goal is {batch_dict[process]}" - ) - max_batches_subprocess = "" - max_batches_val = 0 - for subprocess in process_dict[process].keys(): - if subprocess.startswith("total") or subprocess.startswith( - "weight" - ): - continue - if process_dict[process][subprocess]["nBatches"] > max_batches_val: - max_batches_val = process_dict[process][subprocess]["nBatches"] - max_batches_subprocess = subprocess + total = rdf.Count().GetValue() + rdf = rdf.Filter(iterate_cut) - print( - f"Trying to fix, incrementing {max_batches_subprocess} batch size {process_dict[process][max_batches_subprocess]['batch_size']} by 1" - ) - process_dict[process][max_batches_subprocess]["batch_size"] += 1 - print( - f"nBatches went from {process_dict[process][max_batches_subprocess]['nBatches']}" - ) - process_dict[process][max_batches_subprocess]["nBatches"] = int( - process_dict[process][max_batches_subprocess]["total_cut"] - / process_dict[process][max_batches_subprocess]["batch_size"] + for nParity in range(config_dict["nParity"]): + nParity_string = f"nParity_{nParity}" + parity_cut_formatted = parity_cut.format( + nParity=config_dict["nParity"], parity_scan=nParity ) - print(f"To 
{process_dict[process][max_batches_subprocess]['nBatches']}") - batch_size_sum += 1 + output_nParity = os.path.join(output_folder, f"nParity{nParity}_Merged") + output_file = os.path.join(output_nParity, f"{dataset_name}_merge.root") - current_index = 0 - for process in process_dict.keys(): - for subprocess in process_dict[process].keys(): - if subprocess.startswith("total") or subprocess.startswith("weight"): - continue - process_dict[process][subprocess]["batch_start"] = current_index - current_index += process_dict[process][subprocess]["batch_size"] + rdf_tmp = rdf.Filter(parity_cut_formatted) + cut = rdf_tmp.Count().GetValue() + weighted_cut = rdf_tmp.Sum("weight_Central").GetValue() - nBatches = 1e100 - for process in process_dict.keys(): - for subprocess in process_dict[process].keys(): - if subprocess.startswith("total") or subprocess.startswith("weight"): - continue - if process_dict[process][subprocess]["nBatches"] < nBatches and ( - process_dict[process][subprocess]["nBatches"] != 0 - ): - nBatches = process_dict[process][subprocess]["nBatches"] - - print(f"Creating {nBatches} batches, according to distribution. 
") - print(process_dict) - print(f"And total batch size is {batch_dict['batch_size']}") - - machine_yaml = { - "meta_data": {}, - "processes": [], - } - - machine_yaml["meta_data"]["storage_folder"] = storage_folder - machine_yaml["meta_data"]["batch_dict"] = batch_dict - machine_yaml["meta_data"]["selection_branches"] = selection_branches - machine_yaml["meta_data"]["selection_cut"] = total_cut - machine_yaml["meta_data"]["iterate_cut"] = config_dict["iterate_cut"].format( - nParity=config_dict["nParity"], parity_scan=nParity - ) - machine_yaml["meta_data"][ - "empty_dict_example" - ] = empty_dict_example_file # Example for empty dict structure - machine_yaml["meta_data"]["input_filename"] = f"batchfile{nParity}.root" - machine_yaml["meta_data"][ - "hme_friend_filename" - ] = f"batchfile{nParity}_HME_Friend.root" - machine_yaml["meta_data"][ - "output_DNNname" - ] = f"ResHH_Classifier_parity{nParity}" - - machine_yaml["meta_data"][ - "spin_mass_dist" - ] = spin_mass_dist # Dict of spin/mass distribution values for random choice parametric - - for process in process_dict: - for subprocess in process_dict[process]: - if subprocess.startswith("total") or subprocess.startswith("weight"): - continue - print("Using subprocess ", subprocess) - subprocess_dict = process_dict[process][subprocess] - datasets_full_pathway = [ - os.path.join(subprocess_dict["storage_folder"], fn) - for fn in subprocess_dict["all_extensions"] - ] - tmp_process_dict = { - "datasets": datasets_full_pathway, - "class_value": subprocess_dict["class_value"], - "batch_start": subprocess_dict["batch_start"], - "batch_size": subprocess_dict["batch_size"], - "nBatches": subprocess_dict["nBatches"], - } - machine_yaml["processes"].append(tmp_process_dict) + process_dict[nParity_string][background_name][dataset_name][ + "total" + ] = total + process_dict[nParity_string][background_name][dataset_name][ + "total_cut" + ] = cut + process_dict[nParity_string][background_name][dataset_name][ + 
"total_cut_weighted" + ] = weighted_cut + rdf_tmp = rdf_tmp.Define("class_value", f"{class_value}") + rdf_tmp = rdf_tmp.Define("X_mass", f"{X_mass}") + rdf_tmp.Snapshot(treeName, output_file) + for nParity in range(config_dict["nParity"]): + nParity_string = f"nParity_{nParity}" + out_yaml = f"dataset_distribution_parity{nParity}.yaml" with open(os.path.join(output_folder, out_yaml), "w") as outfile: - yaml.dump(machine_yaml, outfile) - - -def create_file(config_dict, output_folder, out_filename): - print( - f"Starting create file. Memory usage in MB is {psutil.Process(os.getpid()).memory_info()[0] / float(2 ** 20)}" - ) - nBatches = None - print(config_dict.keys()) - for process in config_dict["processes"]: - if (nBatches is None) or ( - (process["nBatches"] <= nBatches) and (process["nBatches"] != 0) - ): - nBatches = process["nBatches"] - - print(f"Going to make {nBatches} batches") - batch_size = config_dict["meta_data"]["batch_dict"]["batch_size"] - - step_idx = 0 + yaml.dump(process_dict[nParity_string], outfile) - # Get the name/type (And order!) 
of signal columns - master_column_types = [] - master_column_names_vec = ROOT.std.vector("string")() - # Assume master(signal) is saved first and use idx==0 entry to fill + # hadd the files together to make a final merged.root + hadd_out = os.path.join(output_folder, f"nParity{nParity}_Merged.root") + hadd_in = os.path.join(output_folder, f"nParity{nParity}_Merged/*.root") + ps_call(f"hadd {hadd_out} {hadd_in}", verbose=1) - for process in config_dict["processes"]: - process_filelist = [f"{x}/*.root" for x in process["datasets"]] - - tmp_filename = os.path.join(output_folder, f"tmp{step_idx}.root") - tmpnext_filename = os.path.join(output_folder, f"tmp{step_idx+1}.root") - - print(process_filelist) - df_in = ROOT.RDataFrame("Events", process_filelist) - - # Filter for nLeps and Parity (iterate cut in config) - df_in = df_in.Filter(config_dict["meta_data"]["iterate_cut"]) - - nEntriesPerBatch = process["batch_size"] - nBatchStart = process["batch_start"] - nBatchEnd = nBatchStart + nEntriesPerBatch - - if nEntriesPerBatch == 0: - print(f"Process has batch size of 0, skip the save loop") - continue - - # Load df_out, if first iter then load an empty, otherwise load the past file - if step_idx == 0: - df_out = ROOT.RDataFrame(nBatches * batch_size) - df_out = df_out.Define("is_valid", "false") - # Fill master column nametype - for name in df_in.GetColumnNames(): - if name.startswith("gen"): - continue - if name.startswith("weight_") and not name == "weight_base": - continue - master_column_names_vec.push_back(name) - master_column_types = [ - str(df_in.GetColumnType(str(c))) for c in master_column_names_vec - ] - else: - df_out = ROOT.RDataFrame("Events", tmp_filename) - - local_column_names_vec = ROOT.std.vector("string")() - for name in df_in.GetColumnNames(): - if name.startswith("gen"): - continue - if name.startswith("weight_") and not name == "weight_base": - continue - local_column_names_vec.push_back(name) - local_column_types = [ - 
str(df_in.GetColumnType(str(c))) for c in local_column_names_vec - ] - # Need a local_to_master_map so that local columns keep the same index as the master columns - local_to_master_map = [ - list(master_column_names_vec).index(local_name) - for local_name in local_column_names_vec - ] - master_size = len(master_column_names_vec) - - queue_size = 10 - max_entries = nEntriesPerBatch * nBatches - - tuple_maker = ROOT.analysis.TupleMaker(*local_column_types)( - queue_size, max_entries - ) - - df_out = tuple_maker.FillDF( - ROOT.RDF.AsRNode(df_out), - ROOT.RDF.AsRNode(df_in), - local_to_master_map, - master_size, - local_column_names_vec, - nBatchStart, - nBatchEnd, - batch_size, - ) - - for column_idx, column_name in enumerate(master_column_names_vec): - column_type = master_column_types[column_idx] - - if step_idx == 0: - df_out = df_out.Define( - str(column_name), - f"_entry ? _entry->GetValue<{column_type}>({column_idx}) : {column_type}() ", - ) - else: - if column_name not in local_column_names_vec: - continue - df_out = df_out.Redefine( - str(column_name), - f"_entry ? _entry->GetValue<{column_type}>({column_idx}) : {column_name} ", - ) - - class_value = process["class_value"] - if step_idx == 0: - df_out = df_out.Define( - "class_value", f"_entry ? int({class_value}) : int()" - ) - else: - df_out = df_out.Redefine( - "class_value", f"_entry ? 
int({class_value}) : class_value" - ) - - df_out = df_out.Redefine("is_valid", "(is_valid) || (_entry)") - - snapshotOptions = ROOT.RDF.RSnapshotOptions() - # snapshotOptions.fOverwriteIfExists=False - # snapshotOptions.fLazy=True - snapshotOptions.fMode = "RECREATE" - snapshotOptions.fCompressionAlgorithm = getattr( - ROOT.ROOT.RCompressionSetting.EAlgorithm, "k" + "ZLIB" - ) - snapshotOptions.fCompressionLevel = 4 - ROOT.RDF.Experimental.AddProgressBar(df_out) - print("Going to snapshot") - save_column_names = ROOT.std.vector("string")(master_column_names_vec) - save_column_names.push_back("is_valid") - save_column_names.push_back("class_value") - df_out.Snapshot("Events", tmpnext_filename, save_column_names, snapshotOptions) - - if step_idx != 0: - os.system(f"rm {tmp_filename}") - - tuple_maker.join() - - step_idx += 1 - - print("Finished create file loop, now we must add the DNN variables") - # Increment the name indexes before I embarass myself again - tmp_filename = os.path.join(output_folder, f"tmp{step_idx}.root") - tmpnext_filename = os.path.join(output_folder, f"tmp{step_idx+1}.root") - - df_out = ROOT.RDataFrame("Events", tmp_filename) - - snapshotOptions = ROOT.RDF.RSnapshotOptions() - # snapshotOptions.fOverwriteIfExists=False - # snapshotOptions.fLazy=True - snapshotOptions.fMode = "RECREATE" - snapshotOptions.fCompressionAlgorithm = getattr( - ROOT.ROOT.RCompressionSetting.EAlgorithm, "k" + "ZLIB" - ) - snapshotOptions.fCompressionLevel = 4 - ROOT.RDF.Experimental.AddProgressBar(df_out) - print("Going to snapshot") - # Only need to save the prexisting columns plus the new DNN variables - save_column_names = ROOT.std.vector("string")(df_out.GetColumnNames()) - df_out = analysis.defineAllP4(df_out) - df_out = analysis.AddDNNVariables(df_out) - highlevel_names = [ - "HT", - "dR_dilep", - "dR_dibjet", - "dR_dilep_dijet", - "dR_dilep_dibjet", - "dPhi_lep1_lep2", - "dPhi_jet1_jet2", - "dPhi_MET_dilep", - "dPhi_MET_dibjet", - "min_dR_lep0_jets", - 
"min_dR_lep1_jets", - "MT", - "MT2", - "MT2_ll", - "MT2_bb", - "MT2_blbl", - "CosTheta_bb", - "ll_mass", - "bb_mass", - "bb_mass_PNetRegPtRawCorr", - "bb_mass_PNetRegPtRawCorr_PNetRegPtRawCorrNeutrino", +def add_weight_file(output_folder): + inNames = [ + os.path.join(output_folder, x) + for x in os.listdir(output_folder) + if x.endswith(".root") ] - for highlevel_name in highlevel_names: - save_column_names.push_back(highlevel_name) - df_out.Snapshot("Events", tmpnext_filename, save_column_names, snapshotOptions) - - print(f"Finished create file, will copy tmp file to final output {out_filename}") - - os.system(f"mv {tmpnext_filename} {out_filename}") - os.system(f"rm {tmp_filename}") - - -def create_file_gfal(config_dict, output_folder, out_filename): - print( - f"Starting create file. Memory usage in MB is {psutil.Process(os.getpid()).memory_info()[0] / float(2 ** 20)}" - ) - nBatches = None - print(config_dict.keys()) - for process in config_dict["processes"]: - if (nBatches is None) or ( - (process["nBatches"] <= nBatches) and (process["nBatches"] != 0) - ): - nBatches = process["nBatches"] - - print(f"Going to make {nBatches} batches") - batch_size = config_dict["meta_data"]["batch_dict"]["batch_size"] - - step_idx = 0 - - # Get the name/type (And order!) 
of signal columns - master_column_types = [] - master_column_names_vec = ROOT.std.vector("string")() - # Assume master(signal) is saved first and use idx==0 entry to fill - - for process in config_dict["processes"]: - process_filelist = [ - ( - f"root://cmseos.fnal.gov/{x}/*.root" - if x.startswith("/eos/") - else f"{x}/*.root" - ) - for x in process["datasets"] - ] - print("Process filelist is ", process_filelist) - - tmp_filename = os.path.join(output_folder, f"tmp{step_idx}.root") - tmpnext_filename = os.path.join(output_folder, f"tmp{step_idx+1}.root") - - print(process_filelist) - df_in = ROOT.RDataFrame("Events", process_filelist) - - # Filter for nLeps and Parity (iterate cut in config) - df_in = df_in.Filter(config_dict["meta_data"]["iterate_cut"]) - - nEntriesPerBatch = process["batch_size"] - nBatchStart = process["batch_start"] - nBatchEnd = nBatchStart + nEntriesPerBatch - - if nEntriesPerBatch == 0: - print(f"Process has batch size of 0, skip the save loop") + for inName in inNames: + if "weight" in inName: continue + print(f"On file {inName}") + in_file = uproot.open(inName) + outName = f"{inName[:-5]}_weight.root" + out_file = uproot.recreate(outName) + + tree = in_file["Events"] + branches_to_load = [ + "class_value", + "X_mass", + "weight_Central", + ] + branches = tree.arrays(branches_to_load) - # Load df_out, if first iter then load an empty, otherwise load the past file - if step_idx == 0: - df_out = ROOT.RDataFrame(nBatches * batch_size) - df_out = df_out.Define("is_valid", "false") - # Fill master column nametype - for name in df_in.GetColumnNames(): - if name.startswith("gen"): - continue - if name.startswith("weight_") and not name == "weight_MC_Lumi_pu": - continue - master_column_names_vec.push_back(name) - master_column_types = [ - str(df_in.GetColumnType(str(c))) for c in master_column_names_vec - ] - else: - df_out = ROOT.RDataFrame("Events", tmp_filename) + class_targets = branches["class_value"] + class_weight = 
branches["weight_Central"] - local_column_names_vec = ROOT.std.vector("string")() - for name in df_in.GetColumnNames(): - if name.startswith("gen"): - continue - if name.startswith("weight_") and not name == "weight_MC_Lumi_pu": - continue - local_column_names_vec.push_back(name) - local_column_types = [ - str(df_in.GetColumnType(str(c))) for c in local_column_names_vec - ] + # Set to binary for now actually + class_targets = np.where(class_targets > 0, 1, class_targets) - # Need a local_to_master_map so that local columns keep the same index as the master columns - local_to_master_map = [ - list(master_column_names_vec).index(local_name) - for local_name in local_column_names_vec - ] - master_size = len(master_column_names_vec) + # Set any negative weight events to 0 + class_weight = np.where(class_weight <= 0, 0.0, class_weight) - queue_size = 10 - max_entries = nEntriesPerBatch * nBatches + # Total_Signal == Total_Background + total_signal = np.sum(np.where(class_targets == 0, class_weight, 0.0)) + total_background = np.sum(np.where(class_targets != 0, class_weight, 0.0)) - tuple_maker = ROOT.analysis.TupleMaker(*local_column_types)( - queue_size, max_entries + print(f"Total signal: {total_signal}") + print(f"Total background: {total_background}") + norm_factor = total_signal / total_background + class_weight = np.where( + class_targets == 0, class_weight, class_weight * norm_factor ) - - df_out = tuple_maker.FillDF( - ROOT.RDF.AsRNode(df_out), - ROOT.RDF.AsRNode(df_in), - local_to_master_map, - master_size, - local_column_names_vec, - nBatchStart, - nBatchEnd, - batch_size, + print(f"After reweight") + print( + f"Total signal: {np.sum(np.where(class_targets == 0, class_weight, 0.0))}" ) - - for column_idx, column_name in enumerate(master_column_names_vec): - column_type = master_column_types[column_idx] - - if step_idx == 0: - df_out = df_out.Define( - str(column_name), - f"_entry ? 
_entry->GetValue<{column_type}>({column_idx}) : {column_type}() ", - ) - else: - if column_name not in local_column_names_vec: - continue - df_out = df_out.Redefine( - str(column_name), - f"_entry ? _entry->GetValue<{column_type}>({column_idx}) : {column_name} ", - ) - - class_value = process["class_value"] - if step_idx == 0: - df_out = df_out.Define( - "class_value", f"_entry ? int({class_value}) : int()" - ) - else: - df_out = df_out.Redefine( - "class_value", f"_entry ? int({class_value}) : class_value" - ) - - df_out = df_out.Redefine("is_valid", "(is_valid) || (_entry)") - - snapshotOptions = ROOT.RDF.RSnapshotOptions() - # snapshotOptions.fOverwriteIfExists=False - # snapshotOptions.fLazy=True - snapshotOptions.fMode = "RECREATE" - snapshotOptions.fCompressionAlgorithm = getattr( - ROOT.ROOT.RCompressionSetting.EAlgorithm, "k" + "ZLIB" + print( + f"Total background: {np.sum(np.where(class_targets != 0, class_weight, 0.0))}" ) - snapshotOptions.fCompressionLevel = 4 - ROOT.RDF.Experimental.AddProgressBar(df_out) - print("Going to snapshot") - save_column_names = ROOT.std.vector("string")(master_column_names_vec) - save_column_names.push_back("is_valid") - save_column_names.push_back("class_value") - df_out.Snapshot("Events", tmpnext_filename, save_column_names, snapshotOptions) - - if step_idx != 0: - os.system(f"rm {tmp_filename}") - - tuple_maker.join() - - step_idx += 1 - - print("Finished create file loop, now we must add the DNN variables") - # Increment the name indexes before I embarass myself again - tmp_filename = os.path.join(output_folder, f"tmp{step_idx}.root") - tmpnext_filename = os.path.join(output_folder, f"tmp{step_idx+1}.root") - - df_out = ROOT.RDataFrame("Events", tmp_filename) - - snapshotOptions = ROOT.RDF.RSnapshotOptions() - # snapshotOptions.fOverwriteIfExists=False - # snapshotOptions.fLazy=True - snapshotOptions.fMode = "RECREATE" - snapshotOptions.fCompressionAlgorithm = getattr( - ROOT.ROOT.RCompressionSetting.EAlgorithm, "k" + 
"ZLIB" - ) - snapshotOptions.fCompressionLevel = 4 - ROOT.RDF.Experimental.AddProgressBar(df_out) - print("Going to snapshot") - # Only need to save the prexisting columns plus the new DNN variables - save_column_names = ROOT.std.vector("string")(df_out.GetColumnNames()) - df_out = analysis.defineAllP4(df_out) - df_out = analysis.AddDNNVariables(df_out) - highlevel_names = [ - "HT", - "dR_dilep", - "dR_dibjet", - "dR_dilep_dijet", - "dR_dilep_dibjet", - "dPhi_lep1_lep2", - "dPhi_jet1_jet2", - "dPhi_MET_dilep", - "dPhi_MET_dibjet", - "min_dR_lep0_jets", - "min_dR_lep1_jets", - "MT", - "MT2", - "MT2_ll", - "MT2_bb", - "MT2_blbl", - "CosTheta_bb", - "ll_mass", - "bb_mass", - "bb_mass_PNetRegPtRawCorr", - "bb_mass_PNetRegPtRawCorr_PNetRegPtRawCorrNeutrino", - ] - for highlevel_name in highlevel_names: - save_column_names.push_back(highlevel_name) - df_out.Snapshot("Events", tmpnext_filename, save_column_names, snapshotOptions) - - print(f"Finished create file, will copy tmp file to final output {out_filename}") - - os.system(f"mv {tmpnext_filename} {out_filename}") - os.system(f"rm {tmp_filename}") - - -def add_HME(config_dict, output_folder, input_filename, out_filename): - print(f"Starting add HME to {out_filename}") - - with open(os.path.join(ana_path, "config/global.yaml")) as f: - global_cfg_dict = yaml.safe_load(f) - - Single_producer_config = global_cfg_dict["payload_producers"]["SingleLep_DeepHME"] - Single_producers_module_name = Single_producer_config["producers_module_name"] - Single_producer_name = Single_producer_config["producer_name"] - Single_producers_module = importlib.import_module(Single_producers_module_name) - Single_producer_class = getattr(Single_producers_module, Single_producer_name) - Single_producer = Single_producer_class(Single_producer_config, "SingleLep_DeepHME") - - Double_producer_config = global_cfg_dict["payload_producers"]["DoubleLep_DeepHME"] - Double_producers_module_name = Double_producer_config["producers_module_name"] - 
Double_producer_name = Double_producer_config["producer_name"] - Double_producers_module = importlib.import_module(Double_producers_module_name) - Double_producer_class = getattr(Double_producers_module, Double_producer_name) - Double_producer = Double_producer_class(Double_producer_config, "DoubleLep_DeepHME") - - final_array = None - uproot_stepsize = Single_producer_config.get("uproot_stepsize", "100MB") - for array in uproot.iterate( - f"{input_filename}:Events", step_size=uproot_stepsize - ): # For DNN 50MB translates to ~300_000 events - new_array1 = Single_producer.run(ak.copy(array)) - new_array2 = Double_producer.run(ak.copy(array)) - tmp_combo = new_array1 - for branch in new_array2.fields: - if branch == "FullEventId": - continue - tmp_combo[branch] = new_array2[branch] - if final_array is None: - final_array = tmp_combo - else: - final_array = ak.concatenate([final_array, tmp_combo]) - - uprootCompression = getattr(uproot, "ZLIB") - uprootCompression = uprootCompression(4) - - with uproot.recreate(out_filename, compression=uprootCompression) as outfile: - outfile["Events"] = final_array - - print(f"Finished add payloads to {out_filename}") - - -def add_DNN(config_dict, output_folder, input_filename, out_filename): - print(f"Starting add HME to {out_filename}") - - with open(os.path.join(ana_path, "config/global.yaml")) as f: - global_cfg_dict = yaml.safe_load(f) - - Single_producer_config = global_cfg_dict["payload_producers"][ - "DNNParametric_SL_NoHME" - ] - Single_producers_module_name = Single_producer_config["producers_module_name"] - Single_producer_name = Single_producer_config["producer_name"] - Single_producers_module = importlib.import_module(Single_producers_module_name) - Single_producer_class = getattr(Single_producers_module, Single_producer_name) - Single_producer = Single_producer_class( - Single_producer_config, "DNNParametric_SL_NoHME" - ) - - Double_producer_config = global_cfg_dict["payload_producers"][ - "DNNParametric_DL_NoHME" - ] 
- Double_producers_module_name = Double_producer_config["producers_module_name"] - Double_producer_name = Double_producer_config["producer_name"] - Double_producers_module = importlib.import_module(Double_producers_module_name) - Double_producer_class = getattr(Double_producers_module, Double_producer_name) - Double_producer = Double_producer_class( - Double_producer_config, "DNNParametric_DL_NoHME" - ) - - final_array = None - uproot_stepsize = Single_producer_config.get("uproot_stepsize", "100MB") - for array in uproot.iterate( - f"{input_filename}:Events", step_size=uproot_stepsize - ): # For DNN 50MB translates to ~300_000 events - new_array1 = Single_producer.run(ak.copy(array)) - new_array2 = Double_producer.run(ak.copy(array)) - tmp_combo = new_array1 - for branch in new_array2.fields: - if branch == "FullEventId": - continue - tmp_combo[branch] = new_array2[branch] - if final_array is None: - final_array = tmp_combo - else: - final_array = ak.concatenate([final_array, tmp_combo]) - uprootCompression = getattr(uproot, "ZLIB") - uprootCompression = uprootCompression(4) - - with uproot.recreate(out_filename, compression=uprootCompression) as outfile: - outfile["Events"] = final_array - - print(f"Finished add payloads to {out_filename}") - - -def add_DNN(config_dict, output_folder, input_filename, out_filename): - print(f"Starting add HME to {out_filename}") - - with open(os.path.join(ana_path, "config/global.yaml")) as f: - global_cfg_dict = yaml.safe_load(f) - - Single_producer_config = global_cfg_dict["payload_producers"][ - "DNNParametric_SL_NoHME" - ] - Single_producers_module_name = Single_producer_config["producers_module_name"] - Single_producer_name = Single_producer_config["producer_name"] - Single_producers_module = importlib.import_module(Single_producers_module_name) - Single_producer_class = getattr(Single_producers_module, Single_producer_name) - Single_producer = Single_producer_class( - Single_producer_config, "DNNParametric_SL_NoHME" - ) - - 
Double_producer_config = global_cfg_dict["payload_producers"][ - "DNNParametric_DL_NoHME" - ] - Double_producers_module_name = Double_producer_config["producers_module_name"] - Double_producer_name = Double_producer_config["producer_name"] - Double_producers_module = importlib.import_module(Double_producers_module_name) - Double_producer_class = getattr(Double_producers_module, Double_producer_name) - Double_producer = Double_producer_class( - Double_producer_config, "DNNParametric_DL_NoHME" - ) - - final_array = None - uproot_stepsize = Single_producer_config.get("uproot_stepsize", "100MB") - for array in uproot.iterate( - f"{input_filename}:Events", step_size=uproot_stepsize - ): # For DNN 50MB translates to ~300_000 events - new_array1 = Single_producer.run(ak.copy(array)) - new_array2 = Double_producer.run(ak.copy(array)) - tmp_combo = new_array1 - for branch in new_array2.fields: - if branch == "FullEventId": - continue - tmp_combo[branch] = new_array2[branch] - if final_array is None: - final_array = tmp_combo - else: - final_array = ak.concatenate([final_array, tmp_combo]) - - uprootCompression = getattr(uproot, "ZLIB") - uprootCompression = uprootCompression(4) - - with uproot.recreate(out_filename, compression=uprootCompression) as outfile: - outfile["Events"] = final_array - - print(f"Finished add payloads to {out_filename}") - - -def create_weight_file(inName, outName, bb_low=70, bb_high=150, bb_min=70, bb_max=300): - print(f"On file {inName}") - in_file = uproot.open(inName) - out_file = uproot.recreate(outName) - - tree = in_file["Events"] - branches_to_load = [ - "class_value", - "bb_mass", - "bb_mass_PNetRegPtRawCorr", - "bb_mass_PNetRegPtRawCorr_PNetRegPtRawCorrNeutrino", - "X_mass", - "centralJet_hadronFlavour", - "centralJet_pt", - "SelectedFatJet_hadronFlavour", - "weight_base", - ] - branches = tree.arrays(branches_to_load) - - class_value = branches["class_value"] - bb_mass = branches["bb_mass"] - bb_mass = branches["bb_mass_PNetRegPtRawCorr"] 
- bb_mass = branches["bb_mass_PNetRegPtRawCorr_PNetRegPtRawCorrNeutrino"] - - X_mass = branches["X_mass"] - - hadronFlavour = ak.fill_none( - ak.pad_none(branches["centralJet_hadronFlavour"], 2, axis=1), 0 - ) - ak8_hadronFlavour = ak.fill_none( - ak.pad_none(branches["SelectedFatJet_hadronFlavour"], 1, axis=1), 0 - ) - - # type_to_name = {'1': 'Signal', '2': 'Signal', '8': 'TT', '5': 'DY', '9': 'ST'} # 1 is Radion, 2 is Graviton - # type_to_target = {'1': 0, '2': 0, '8': 1, '5': 2, '9': 3} # Multiclass type-to-target - # type_to_target = {'1': 0, '2': 0, '8': 1, '5': 1, '9': 1} # Binary type-to-target - - value_to_name = {"0": "Signal", "1": "TT", "2": "DY", "3": "ST", "4": "W"} - # value_to_target = {'0': 0, '1': 1, '2': 1, '3': 1, '4': 1} # Binary type-to-target - value_to_target = { - "0": 0, - "1": 1, - "2": 2, - "3": 3, - "4": 4, - } # Multiclass type-to-target - - sample_name = np.array([value_to_name[str(value)] for value in class_value]) - class_targets = np.array([value_to_target[str(value)] for value in class_value]) - - # Initialize the two branches, class weight and adv weight - # Starting from their genWeight (includes XS and such) - class_weight = branches["weight_base"] - adv_weight = branches["weight_base"] - - # Lets just flatten the weight by bb_mass first for each sample - for this_name in np.unique(sample_name): - bb_mass_thissample = np.where(sample_name == this_name, bb_mass, -1.0) - weights_thissample = np.where(sample_name == this_name, class_weight, 0.0) - - # Create a histogram of bb_mass and change the weight of each event - # to be 1 / (nEvents in that bin) - # This will make the bb_mass distribution flat - hist, bin_edges = np.histogram( - bb_mass_thissample, bins=100, range=(70, 300), weights=weights_thissample - ) - bin_indices = np.digitize(bb_mass_thissample, bin_edges) - bin_indices = np.where( - bin_indices == len(hist) + 1, len(hist), bin_indices - ) # If we are in the overflow bin, set to last bin - bin_indices = np.where( - 
bin_indices == 0, 1, bin_indices - ) # If we are in the underflow bin, set to first bin - bin_counts = hist[bin_indices - 1] # Get the counts for each event - # For a non-blind don't norm class weights over mbb + # Total_Background1 == Total_Background2 == Total_Background3 + # Scale each background to total, then reduce all to total + ### Do not scale backgrounds to each other in binary classifier ### + # total_background = np.sum(np.where(class_targets != 0, class_weight, 0.0)) + # for class_value in np.unique(class_targets): + # if class_value == 0: + # continue # Don't do anything with signal here + # this_total = np.sum(np.where(class_targets == class_value, class_weight, 0.0)) + # rescale_factor = total_background / this_total + # class_weight = np.where( + # class_targets == class_value, class_weight*rescale_factor, class_weight + # ) + # current_total = np.sum(np.where(class_targets != 0, class_weight, 0.0)) + # rescale_factor = total_background / current_total # class_weight = np.where( - # (sample_name == this_name) & (bin_counts > 0), - # class_weight / bin_counts, - # class_weight, + # class_targets != 0, class_weight*rescale_factor, class_weight # ) - adv_weight = np.where( - (sample_name == this_name) & (bin_counts > 0), - adv_weight / bin_counts, - adv_weight, - ) - - # First step, remove any sample types we want to - # samples_to_remove = [ 'DY' ] - samples_to_remove = [] - - for sample_to_remove in samples_to_remove: - class_weight = np.where(sample_name == sample_to_remove, 0.0, class_weight) - - adv_weight = np.where(sample_name == sample_to_remove, 0.0, adv_weight) - - # Next normalize between sample types (class) - - # First remove the signal that is not gen bb - # class_weight = np.where( - # sample_name == 'Signal', - # np.where( - # ((hadronFlavour[:,0] == 5) & (hadronFlavour[:,1] == 5)) | (ak8_hadronFlavour[:,0] == 5), # & (X_mass == 800), # For now, only train on m450 - # class_weight, - # 0.0 - # ), - # class_weight - # ) - - # 
Total_Signal == Total_DY + Total_TT (Equal weight of signal vs background in binary) - total_signal = np.sum(np.where(sample_name == "Signal", class_weight, 0.0)) - total_background = np.sum(np.where(sample_name != "Signal", class_weight, 0.0)) - - norm_factor = total_signal / total_background - class_weight = np.where( - sample_name == "Signal", class_weight, class_weight * norm_factor - ) - - # Next normalize between m_bb regions (adversarial) - # TT_Low == TT_Mid == TT_High - # DY_Low == DY_Mid == DY_High - - # TT_Total / DY_Total == TT_yield / DY_yield - adv_weight = np.where(sample_name == "Signal", 0.0, adv_weight) - # bb_low = 70 - # bb_high = 150 - - # Set adv targets - adv_targets = np.where(bb_mass < bb_low, -1, np.where(bb_mass < bb_high, 0, 1)) - - # Option to set an lower and upper - # bb_min = 70 - # bb_max = 300 - adv_weight = np.where(bb_mass > bb_min, adv_weight, 0.0) - adv_weight = np.where(bb_mass < bb_max, adv_weight, 0.0) - for this_name in np.unique(sample_name): - if this_name == "Signal": - continue - print(f"On sample {this_name}") - total_low = np.sum( - np.where((sample_name == this_name) & (bb_mass < bb_low), adv_weight, 0.0) - ) - total_mid = np.sum( - np.where( - (sample_name == this_name) & (bb_mass > bb_low) & (bb_mass < bb_high), - adv_weight, - 0.0, - ) - ) - total_high = np.sum( - np.where((sample_name == this_name) & (bb_mass > bb_high), adv_weight, 0.0) - ) - # norm to mid - adv_weight = np.where( - (sample_name == this_name) & (bb_mass < bb_low), - # total_mid * adv_weight / total_low, - 0.0, # For now, we will just ignore the down category - adv_weight, - ) - adv_weight = np.where( - (sample_name == this_name) & (bb_mass > bb_high), - total_mid * adv_weight / total_high, - adv_weight, + print(f"Final reweight") + print( + f"Total signal: {np.sum(np.where(class_targets == 0, class_weight, 0.0))}" ) - - total_scaled = np.sum(np.where(sample_name == this_name, adv_weight, 0.0)) - adv_weight = np.where( - (sample_name == 
this_name), adv_weight / total_scaled, adv_weight + print( + f"Total background: {np.sum(np.where(class_targets != 0, class_weight, 0.0))}" ) - # Nan to num for any divide by 0 errors - class_weight = np.nan_to_num(class_weight, 0.0) - adv_weight = np.nan_to_num(adv_weight, 0.0) - - # Normalize both class weights and adv weights to nEvents - print( - f"Before normalization our class total {np.sum(class_weight)} and adv total {np.sum(adv_weight)}" - ) - nEvents = len(class_weight) - class_weight = (nEvents / np.sum(class_weight)) * class_weight - adv_weight = (nEvents / np.sum(adv_weight)) * adv_weight - print( - f"After normalization our class total {np.sum(class_weight)} and adv total {np.sum(adv_weight)}" - ) - - out_dict = { - "class_weight": class_weight, - "adv_weight": adv_weight, - "class_target": class_targets, - "adv_target": adv_targets, - } - - out_file["weight_tree"] = out_dict + out_dict = { + "class_weight": class_weight, + "class_target": class_targets, + } - # m_bb weight validation plots - import matplotlib.pyplot as plt - import mplhep as hep + print("Finished with dict") + print(out_dict) - plt.style.use(hep.style.ROOT) - fig, ax = plt.subplots(1, 1) - for this_name in np.unique(sample_name): - mask = sample_name == this_name - ax.hist( - bb_mass[mask], - weights=class_weight[mask], - bins=100, - range=(0, 500), - histtype="step", - label=this_name, - ) - # ax.set_yscale('log') - ax.set_xlabel(r"$m_{bb}$ [GeV]") - ax.set_ylabel("Weighted Events") - ax.legend() - plt.savefig(outName.replace(".root", "_class_weight_bbmass.png")) - plt.close() - - fig, ax = plt.subplots(1, 1) - for this_name in np.unique(sample_name): - mask = sample_name == this_name - ax.hist( - bb_mass[mask], - weights=adv_weight[mask], - bins=100, - range=(0, 500), - histtype="step", - label=this_name, - ) - # ax.set_yscale('log') - ax.set_xlabel(r"$m_{bb}$ [GeV]") - ax.set_ylabel("Weighted Events") - ax.legend() - plt.savefig(outName.replace(".root", "_adv_weight_bbmass.png")) 
- plt.close() + out_file["weight_tree"] = out_dict + out_file.close() if __name__ == "__main__": @@ -1725,13 +261,6 @@ def create_weight_file(inName, outName, bb_low=70, bb_high=150, bb_min=70, bb_ma default="/eos/user/d/daebi/DNN_Training_Datasets", help="Output folder to store dataset", ) - parser.add_argument( - "--era", - required=False, - type=str, - default="Run3_2022", - help="Era of data taking", - ) args = parser.parse_args() @@ -1740,112 +269,11 @@ def create_weight_file(inName, outName, bb_low=70, bb_high=150, bb_min=70, bb_ma config_dict = yaml.safe_load(file) output_base = args.output_folder - output_folder = os.path.join(output_base, f"Dataset_{args.era}") + output_folder = os.path.join(output_base, f"Dataset") if os.path.exists(output_folder): print(f"Output folder {output_folder} exists!!!") os.makedirs(output_folder, exist_ok=True) os.system(f"cp {config_file} {output_folder}/.") - print("Will create signal files") - create_signal_files_gfal(config_dict, output_folder, args.era) - print("Creating the batch dict") - create_dict_gfal(config_dict, output_folder, args.era) - - gc.collect() - - print( - f"We have finished making the dicts. Memory usage in MB is {psutil.Process(os.getpid()).memory_info()[0] / float(2 ** 20)}" - ) - - headers_dir = os.path.dirname(os.path.abspath(__file__)) - # headers = [ 'AnalysisTools.h', 'TupleMaker.h' ] #Order here matters since TupleMaker requires AnalysisTools - headers = [ - "TupleMaker.h" - ] # Order here matters since TupleMaker requires AnalysisTools - for header in headers: - header_path = os.path.join(headers_dir, header) - if not ROOT.gInterpreter.Declare(f'#include "{header_path}"'): - raise RuntimeError(f"Failed to load {header_path}") - - print( - f"Starting the create file loop. 
Memory usage in MB is {psutil.Process(os.getpid()).memory_info()[0] / float(2 ** 20)}" - ) - print(output_folder) - print(os.listdir(output_folder)) - yaml_list = [ - fname - for fname in os.listdir(output_folder) - if ((".yaml" in fname) and ("batch_config_parity" in fname)) - ] - yaml_list.sort() - for i, yamlname in enumerate(yaml_list): - print(f"Starting batch {i} with yaml {yamlname}") - config_dict = {} - with open(os.path.join(output_folder, yamlname), "r") as file: - config_dict = yaml.safe_load(file) - if not os.path.exists( - os.path.join(output_folder, config_dict["meta_data"]["input_filename"]) - ): - create_file_gfal( - config_dict, - output_folder, - os.path.join(output_folder, config_dict["meta_data"]["input_filename"]), - ) - """ - for i, yamlname in enumerate(yaml_list): - print(f"Starting HME {i} with yaml {yamlname}") - config_dict = {} - with open(os.path.join(output_folder, yamlname), "r") as file: - config_dict = yaml.safe_load(file) - HME_friend_name = ( - os.path.join( - output_folder, config_dict["meta_data"]["input_filename"] - ).split(".")[0] - + "_HME_Friend.root" - ) - if not os.path.exists(HME_friend_name): - add_HME( - config_dict, - output_folder, - os.path.join(output_folder, config_dict["meta_data"]["input_filename"]), - HME_friend_name, - ) - DNN_friend_name = ( - os.path.join( - output_folder, config_dict["meta_data"]["input_filename"] - ).split(".")[0] - + "_DNN_Friend.root" - if not os.path.exists(DNN_friend_name): - add_DNN( - config_dict, - output_folder, - os.path.join(output_folder, config_dict["meta_data"]["input_filename"]), - DNN_friend_name, - ) - """ - - """ - print("Finished making all the batch files, now we will make the weight files") - inDir = output_folder - batchfiles = [ - x for x in os.listdir(inDir) if "batchfile" in x and "Friend" not in x - ] - - bb_low = 70 - bb_high = 150 - - bb_min = 70 - bb_max = 300 - - for batchfile_name in batchfiles: - weightfile_name = f"weightfile{batchfile_name[-6:]}" - - 
in_file = os.path.join(inDir, batchfile_name) - out_file = os.path.join(inDir, weightfile_name) - - print(f"Starting infile {in_file} and making outfile {out_file}") - if os.path.exists(out_file): - print(f"Weight file {out_file} exists, skip") - # continue - create_weight_file(in_file, out_file, bb_low, bb_high, bb_min, bb_max) - """ + measure_cut_datasets(config_dict, output_folder) + add_weight_file(output_folder) diff --git a/Studies/DNN/tasks.py b/Studies/DNN/tasks.py index e89d7c1a..47c27834 100644 --- a/Studies/DNN/tasks.py +++ b/Studies/DNN/tasks.py @@ -50,8 +50,8 @@ def output(self): os.path.basename(config_name), ) return [ - self.remote_target(output_path, fs=self.fs_anaTuple), - self.remote_target(config_path, fs=self.fs_anaTuple), + self.remote_target(output_path, fs=self.fs_histograms), + self.remote_target(config_path, fs=self.fs_histograms), ] def run(self): @@ -67,15 +67,9 @@ def run(self): training_file = config["training_file"] weight_file = config["weight_file"] - batch_config = config["batch_config"] test_training_file = config["test_training_file"] test_weight_file = config["test_weight_file"] - test_batch_config = config["test_batch_config"] - hme_friend_file = config["hme_friend_file"] - test_hme_friend_file = config["test_hme_friend_file"] - - # with config["training_file"].localize("r") as training_file, config["weight_file"].localize("r") as weight_file, config["batch_config"].localize("r") as batch_config, config["test_training_file"].localize("r") as test_training_file, config["test_weight_file"].localize("r") as test_weight_file, config["test_batch_config"].localize("r") as test_batch_config: dnn_trainer_cmd = [ "python3", "-u", @@ -84,22 +78,14 @@ def run(self): training_file, "--weight_file", weight_file, - "--batch_config", - batch_config, "--test_training_file", test_training_file, "--test_weight_file", test_weight_file, - "--test_batch_config", - test_batch_config, "--output_folder", tmpFolder, "--setup-config", config_name, - 
"--hme_friend_file", - hme_friend_file, - "--test_hme_friend_file", - test_hme_friend_file, ] ps_call(dnn_trainer_cmd, verbose=1) @@ -152,7 +138,7 @@ def output(self): output_path = os.path.join( "DNNTraining", self.version, self.period, training_name, outFileName ) - return [self.remote_target(output_path, fs=self.fs_anaTuple)] + return [self.remote_target(output_path, fs=self.fs_histograms)] def run(self): config, config_name, n_branch = self.branch_data @@ -167,13 +153,10 @@ def run(self): validation_file = config["validation_file"] valitation_weight_file = config["validation_weight_file"] - valitation_batch_config = config["validation_batch_config"] - - validation_hme_friend_file = config["validation_hme_friend_file"] tmp_local = os.path.join(self.input()[0].path, "best.onnx") - # with self.input()[0].localize("r") as model_file, self.input()[1].localize("r") as model_config: - with self.remote_target(tmp_local, fs=self.fs_anaTuple).localize( + + with self.remote_target(tmp_local, fs=self.fs_histograms).localize( "r" ) as model_file, self.input()[1].localize("r") as model_config: print(os.listdir()) @@ -185,8 +168,6 @@ def run(self): validation_file, "--validation_weight_file", valitation_weight_file, - "--validation_batch_config", - valitation_batch_config, "--output_file", tmpFile, "--setup-config", @@ -195,8 +176,6 @@ def run(self): model_file.path, "--model-config", model_config.path, - "--validation_hme_friend_file", - validation_hme_friend_file, ] ps_call(dnn_validator_cmd, verbose=1)