Simplify methods to output stats

chembl · Feb 21, 2024 · 0c82748 · 0c82748
1 parent fe2a49a
commit 0c82748
Show file tree

Hide file tree

Showing 5 changed files with 143 additions and 162 deletions.
diff --git a/src/dataset.py b/src/dataset.py
@@ -11,12 +11,15 @@ class Dataset:
                                 used for DTI assignments
     drug_mechanism_targets_set: Set of targets in the drug_mechanism table,
                                 used for DTI assigments
-    df_sizes_all:               List of intermediate sized of the dataset used for debugging
-    df_sizes_pchembl:           List of intermediate sized of the dataset used for debugging
+    df_sizes_all:               Pandas DataFrame of intermediate sizes of the dataset,
+                                used for debugging
+    df_sizes_pchembl:           Pandas DataFrame of intermediate sizes of the dataset,
+                                restricted to entries with a pchembl value,
+                                used for debugging
     """
 
     df_result: pd.DataFrame
     drug_mechanism_pairs_set: set
     drug_mechanism_targets_set: set
-    df_sizes_all: list[int]
-    df_sizes_pchembl: list[int]
+    df_sizes_all: pd.DataFrame
+    df_sizes_pchembl: pd.DataFrame
diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py
@@ -250,7 +250,7 @@ def get_aggregated_activity_ct_pairs(
         df_result,
         set(),
         set(),
-        [],
-        [],
+        pd.DataFrame(),
+        pd.DataFrame(),
     )
     return dataset
diff --git a/src/get_dataset.py b/src/get_dataset.py
@@ -83,4 +83,4 @@ def get_ct_pair_dataset(
     write_subsets.output_all_stats(dataset, args, out)
 
     if logging.DEBUG >= logging.root.level:
-        write_subsets.output_debug_sizes(dataset, out)
+        write_subsets.write_debug_sizes(dataset, out)
diff --git a/src/get_stats.py b/src/get_stats.py
@@ -4,94 +4,33 @@
 from dataset import Dataset
 
 
-##### Debugging Stats #####
-def calculate_dataset_sizes(df: pd.DataFrame) -> list[int]:
+##### Logging Stats #####
+def get_stats_columns() -> tuple[list[str], list[str]]:
     """
-    Calculate the number of unique compounds, targets and pairs
-    for df and df limited to drugs.
-
-    :param df: Pandas DataFrame for which the dataset sizes should be calculated.
-    :type df: pd.DataFrame
-    :return: List of calculated unique counts.
-    :rtype: list[int]
+    Get the relevant columns for which stats should be calculated
+    and a list of descriptions corresponding to the columns.
     """
-    now_mols = df["parent_molregno"].nunique()
-    now_targets = df["tid"].nunique()
-    now_targets_mutation = df["tid_mutation"].nunique()
-    now_pairs = df["cpd_target_pair"].nunique()
-    now_pairs_mutation = df["cpd_target_pair_mutation"].nunique()
-
-    if "DTI" in df.columns:
-        # drugs = compounds of a compound-target pair with a known interaction
-        df_drugs = df[df["DTI"] == "D_DT"]
-    else:
-        df_drugs = df[df["max_phase"] == 4]
-
-    now_drugs = df_drugs["parent_molregno"].nunique()
-    now_drug_targets = df_drugs["tid"].nunique()
-    now_drug_targets_mutation = df_drugs["tid_mutation"].nunique()
-    now_drug_pairs = df_drugs["cpd_target_pair"].nunique()
-    now_drug_pairs_mutation = df_drugs["cpd_target_pair_mutation"].nunique()
-
-    return [
-        now_mols,
-        now_drugs,
-        now_targets,
-        now_drug_targets,
-        now_targets_mutation,
-        now_drug_targets_mutation,
-        now_pairs,
-        now_drug_pairs,
-        now_pairs_mutation,
-        now_drug_pairs_mutation,
+    df_columns = [
+        "parent_molregno",
+        "tid",
+        "tid_mutation",
+        "cpd_target_pair",
+        "cpd_target_pair_mutation",
     ]
+    columns_descs = [
+        "compound ID",
+        "target ID",
+        "target ID with mutation annotations",
+        "compound-target pair",
+        "compound-target pair with mutation annotations",
+    ]
+    return df_columns, columns_descs
 
 
-def add_dataset_sizes(
-    dataset: Dataset,
-    df: pd.DataFrame,
-    label: str,
-):
-    """
-    Count and add representative counts of df used for debugging to the dataset.
-
-    :param dataset: Dataset with compound-target pairs and debugging sizes.
-    :type dataset: Dataset
-    :param df: Pandas DataFrame with current compound-target pairs
-    :type df: pd.DataFrame
-    :param label: Description of pipeline step (e.g., initial query).
-    :type label: str
-    """
-    df_copy = df.copy()
-    dataset.df_sizes_all.append([label] + calculate_dataset_sizes(df_copy))
-
-    # restrict to data with any pchembl value (any data with a pchembl,
-    # even if it is based on only functional data)
-    # these statistics are purely based on removing
-    # compound-target pairs without pchembl information,
-    # i.e., the subset of the dataset is determined by the given df and not recalculated
-    df_pchembl = df_copy.dropna(
-        subset=[x for x in df_copy.columns if x.startswith("pchembl_value")], how="all"
-    )
-    dataset.df_sizes_pchembl.append([label] + calculate_dataset_sizes(df_pchembl))
-
-
-def add_debugging_info(
-    dataset: Dataset,
-    df: pd.DataFrame,
-    label: str,
-):
-    """
-    Wrapper for add_dataset_sizes.
-    Handles logging level.
-    """
-    if logging.DEBUG >= logging.root.level:
-        add_dataset_sizes(dataset, df, label)
-
-
-##### Logging Stats #####
 def get_stats_for_column(
-    df: pd.DataFrame, column: str, columns_desc: str
+    df: pd.DataFrame,
+    column: str,
+    columns_desc: str,
 ) -> list[list[str, str, int]]:
     """
     Calculate the number of unique values in df[column] and various subsets of df.
@@ -145,3 +84,78 @@ def get_stats_for_column(
             df[df["DTI"] == "C0_DT"][column].nunique(),
         ],
     ]
+
+
+##### Debugging Stats #####
+def get_dataset_sizes(df: pd.DataFrame, label: str) -> pd.DataFrame:
+    """
+    Build a one-row summary of unique-value counts for one pipeline step.
+
+    For every column returned by get_stats_columns, the number of unique
+    values is counted twice: once over all rows of df (``<column>_all``)
+    and once over the rows that are considered drugs (``<column>_drugs``).
+
+    :param df: Pandas DataFrame for which the dataset sizes should be calculated.
+    :type df: pd.DataFrame
+    :param label: Description of pipeline step (e.g., initial query).
+    :type label: str
+    :return: Pandas DataFrame with a single row holding the label
+        (column ``step``) and the calculated unique counts.
+    :rtype: pd.DataFrame
+    """
+    if "DTI" in df.columns:
+        # drugs = compounds of a compound-target pair with a known interaction
+        drug_rows = df["DTI"] == "D_DT"
+    else:
+        # no DTI column available: select rows with max_phase == 4 instead
+        drug_rows = df["max_phase"] == 4
+    df_drugs = df[drug_rows]
+
+    counted_columns = get_stats_columns()[0]
+    row = {"step": label}
+    for name in counted_columns:
+        row.update(
+            {
+                f"{name}_all": df[name].nunique(),
+                f"{name}_drugs": df_drugs[name].nunique(),
+            }
+        )
+
+    return pd.DataFrame([row])
+
+
+def add_dataset_sizes(
+    dataset: Dataset,
+    df: pd.DataFrame,
+    label: str,
+):
+    """
+    Count and add representative counts of df used for debugging to the dataset.
+
+    One row is appended to dataset.df_sizes_all (counts for df) and one row
+    to dataset.df_sizes_pchembl (counts for df restricted to rows with at
+    least one pchembl value). df itself is not modified.
+
+    :param dataset: Dataset with compound-target pairs and debugging sizes.
+    :type dataset: Dataset
+    :param df: Pandas DataFrame with current compound-target pairs
+    :type df: pd.DataFrame
+    :param label: Description of pipeline step (e.g., initial query).
+    :type label: str
+    """
+    # ignore_index keeps the row labels unique (0, 1, 2, ...) instead of
+    # repeating the label 0 of every appended single-row DataFrame
+    dataset.df_sizes_all = pd.concat(
+        [dataset.df_sizes_all, get_dataset_sizes(df, label)],
+        ignore_index=True,
+    )
+
+    # restrict to data with any pchembl value (any data with a pchembl,
+    # even if it is based on only functional data)
+    # these statistics are purely based on removing
+    # compound-target pairs without pchembl information,
+    # i.e., the subset of the dataset is determined by the given df and not recalculated
+    # dropna returns a new DataFrame, so no defensive copy of df is needed
+    pchembl_columns = [x for x in df.columns if x.startswith("pchembl_value")]
+    df_pchembl = df.dropna(subset=pchembl_columns, how="all")
+    dataset.df_sizes_pchembl = pd.concat(
+        [dataset.df_sizes_pchembl, get_dataset_sizes(df_pchembl, label)],
+        ignore_index=True,
+    )
+
+
+def add_debugging_info(
+    dataset: Dataset,
+    df: pd.DataFrame,
+    label: str,
+):
+    """
+    Record the dataset sizes of a pipeline step, but only when debugging.
+
+    Wrapper for add_dataset_sizes: delegates to it if the level of the
+    root logger is DEBUG or lower and does nothing otherwise.
+
+    :param dataset: Dataset with compound-target pairs and debugging sizes.
+    :type dataset: Dataset
+    :param df: Pandas DataFrame with current compound-target pairs
+    :type df: pd.DataFrame
+    :param label: Description of pipeline step (e.g., initial query).
+    :type label: str
+    """
+    debugging_disabled = logging.root.level > logging.DEBUG
+    if debugging_disabled:
+        return
+    add_dataset_sizes(dataset, df, label)
diff --git a/src/write_subsets.py b/src/write_subsets.py
@@ -8,6 +8,7 @@
 from dataset import Dataset
 
 
+##### Writing Output #####
 def write_output(
     df: pd.DataFrame,
     filename: str,
@@ -73,6 +74,7 @@ def write_and_check_output(
     )
 
 
+##### Output Specific Results #####
 def write_full_dataset_to_file(
     dataset: Dataset,
     args: CalculationArgs,
@@ -97,59 +99,6 @@ def write_full_dataset_to_file(
         write_and_check_output(dataset.df_result, name_all, desc, args, out)
 
 
-def output_debug_sizes(
-    dataset: Dataset,
-    out: OutputArgs,
-):
-    """
-    Output counts at various points during calculating the final dataset for debugging.
-
-    :param dataset: Dataset with compound-target pairs and debugging sizes.
-    :type dataset: Dataset
-    :param args: Arguments related to how to calculate the dataset
-    :type args: CalculationArgs
-    :param out: Arguments related to how to output the dataset
-    :type out: OutputArgs
-    """
-    column_names = [
-        "type",
-        "#mols",
-        "#drugs",
-        "#targets",
-        "#drug_ targets",
-        "#targets_ mutation",
-        "#drug_ targets_mutation",
-        "#cpd_tid_ pairs",
-        "#drug_tid_ pairs",
-        "#cpd_ tid_mutation_ pairs",
-        "#drug_ tid_mutation_ pairs",
-    ]
-
-    logging.debug("Size of full dataset at different points.")
-    full_df_sizes = pd.DataFrame(dataset.df_sizes_all, columns=column_names)
-    logging.debug(full_df_sizes)
-    name_full_df_sizes = os.path.join(out.output_path, "debug_full_df_sizes")
-    write_output(
-        full_df_sizes,
-        name_full_df_sizes,
-        out,
-    )
-
-    logging.debug("Size of dataset with any pchembl values at different points.")
-    logging.debug(
-        "This includes data for which we only have pchembl data \
-            for functional assays but not for binding assays."
-    )
-    df_pchembl_sizes = pd.DataFrame(dataset.df_sizes_pchembl, columns=column_names)
-    logging.debug(df_pchembl_sizes)
-    name_pchembl_df_sizes = os.path.join(out.output_path, "debug_pchembl_df_sizes")
-    write_output(
-        full_df_sizes,
-        name_pchembl_df_sizes,
-        out,
-    )
-
-
 def output_stats(
     df: pd.DataFrame,
     output_file: str,
@@ -171,33 +120,15 @@ def output_stats(
     :param out: Arguments related to how to output the dataset
     :type out: OutputArgs
     """
-    df_columns = [
-        "parent_molregno",
-        "tid",
-        "tid_mutation",
-        "cpd_target_pair",
-        "cpd_target_pair_mutation",
-    ]
-    columns_descs = [
-        "compound ID",
-        "target ID",
-        "target ID with mutation annotations",
-        "compound-target pair",
-        "compound-target pair with mutation annotations",
-    ]
-
     logging.debug("Stats for %s", output_file)
     stats = []
+    df_columns, columns_descs = get_stats.get_stats_columns()
     for column, columns_desc in zip(df_columns, columns_descs):
         logging.debug("Stats for column %s:", column)
         column_stats = get_stats.get_stats_for_column(df, column, columns_desc)
         stats += column_stats
         for colum_stat in column_stats:
-            logging.debug(
-                "%20s %s",
-                colum_stat[2],
-                colum_stat[3],
-            )
+            logging.debug("%20s %s", colum_stat[2], colum_stat[3])
 
     df_stats = pd.DataFrame(
         stats, columns=["column", "column_description", "subset_type", "counts"]
@@ -252,3 +183,36 @@ def output_all_stats(dataset: Dataset, args: CalculationArgs, out: OutputArgs):
             output_file,
             out,
         )
+
+
+def write_debug_sizes(
+    dataset: Dataset,
+    out: OutputArgs,
+):
+    """
+    Write the debugging size tables collected while calculating the dataset.
+
+    Two tables are passed to write_output, both placed in out.output_path:
+    dataset.df_sizes_all as "debug_full_df_sizes" and
+    dataset.df_sizes_pchembl as "debug_pchembl_df_sizes".
+
+    :param dataset: Dataset with compound-target pairs and debugging sizes.
+    :type dataset: Dataset
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    debug_tables = (
+        # Size of full dataset at different points.
+        (dataset.df_sizes_all, "debug_full_df_sizes"),
+        # Size of dataset with any pchembl values at different points.
+        # This includes data for which we only have pchembl data
+        # for functional assays but not for binding assays.
+        (dataset.df_sizes_pchembl, "debug_pchembl_df_sizes"),
+    )
+    for df_sizes, basename in debug_tables:
+        write_output(
+            df_sizes,
+            os.path.join(out.output_path, basename),
+            out,
+        )