Skip to content

Commit

Permalink
Simplify methods to output stats
Browse files Browse the repository at this point in the history
  • Loading branch information
LinaHeinzke committed Feb 21, 2024
1 parent fe2a49a commit 0c82748
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 162 deletions.
11 changes: 7 additions & 4 deletions src/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@ class Dataset:
used for DTI assignments
drug_mechanism_targets_set: Set of targets in the drug_mechanism table,
used for DTI assignments
df_sizes_all: List of intermediate sized of the dataset used for debugging
df_sizes_pchembl: List of intermediate sized of the dataset used for debugging
df_sizes_all: Pandas DataFrame of intermediate sizes of the dataset,
used for debugging
df_sizes_pchembl: Pandas DataFrame of intermediate sizes of the dataset,
restricted to entries with a pchembl value,
used for debugging
"""

df_result: pd.DataFrame
drug_mechanism_pairs_set: set
drug_mechanism_targets_set: set
df_sizes_all: list[int]
df_sizes_pchembl: list[int]
df_sizes_all: pd.DataFrame
df_sizes_pchembl: pd.DataFrame
4 changes: 2 additions & 2 deletions src/get_activity_ct_pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ def get_aggregated_activity_ct_pairs(
df_result,
set(),
set(),
[],
[],
pd.DataFrame(),
pd.DataFrame(),
)
return dataset
2 changes: 1 addition & 1 deletion src/get_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,4 @@ def get_ct_pair_dataset(
write_subsets.output_all_stats(dataset, args, out)

if logging.DEBUG >= logging.root.level:
write_subsets.output_debug_sizes(dataset, out)
write_subsets.write_debug_sizes(dataset, out)
178 changes: 96 additions & 82 deletions src/get_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,94 +4,33 @@
from dataset import Dataset


##### Debugging Stats #####
def calculate_dataset_sizes(df: pd.DataFrame) -> list[int]:
##### Logging Stats #####
def get_stats_columns() -> tuple[list[str], list[str]]:
"""
Calculate the number of unique compounds, targets and pairs
for df and df limited to drugs.
:param df: Pandas DataFrame for which the dataset sizes should be calculated.
:type df: pd.DataFrame
:return: List of calculated unique counts.
:rtype: list[int]
Get the relevant columns for which stats should be calculated
and a list of descriptions corresponding to the columns.
"""
now_mols = df["parent_molregno"].nunique()
now_targets = df["tid"].nunique()
now_targets_mutation = df["tid_mutation"].nunique()
now_pairs = df["cpd_target_pair"].nunique()
now_pairs_mutation = df["cpd_target_pair_mutation"].nunique()

if "DTI" in df.columns:
# drugs = compounds of a compound-target pair with a known interaction
df_drugs = df[df["DTI"] == "D_DT"]
else:
df_drugs = df[df["max_phase"] == 4]

now_drugs = df_drugs["parent_molregno"].nunique()
now_drug_targets = df_drugs["tid"].nunique()
now_drug_targets_mutation = df_drugs["tid_mutation"].nunique()
now_drug_pairs = df_drugs["cpd_target_pair"].nunique()
now_drug_pairs_mutation = df_drugs["cpd_target_pair_mutation"].nunique()

return [
now_mols,
now_drugs,
now_targets,
now_drug_targets,
now_targets_mutation,
now_drug_targets_mutation,
now_pairs,
now_drug_pairs,
now_pairs_mutation,
now_drug_pairs_mutation,
df_columns = [
"parent_molregno",
"tid",
"tid_mutation",
"cpd_target_pair",
"cpd_target_pair_mutation",
]
columns_descs = [
"compound ID",
"target ID",
"target ID with mutation annotations",
"compound-target pair",
"compound-target pair with mutation annotations",
]
return df_columns, columns_descs


def add_dataset_sizes(
    dataset: Dataset,
    df: pd.DataFrame,
    label: str,
):
    """
    Append one row of debugging counts for df to each size list of the dataset.

    Two rows are recorded: one for df as given and one for df restricted
    to rows that have at least one pchembl value.

    :param dataset: Dataset with compound-target pairs and debugging sizes.
    :type dataset: Dataset
    :param df: Pandas DataFrame with current compound-target pairs
    :type df: pd.DataFrame
    :param label: Description of pipeline step (e.g., initial query).
    :type label: str
    """
    snapshot = df.copy()
    dataset.df_sizes_all.append([label, *calculate_dataset_sizes(snapshot)])

    # Keep rows with any pchembl value (even if it is based on only
    # functional data). The subset is obtained purely by dropping rows
    # without pchembl information from the given df; nothing is recalculated.
    pchembl_columns = [
        name for name in snapshot.columns if name.startswith("pchembl_value")
    ]
    with_pchembl = snapshot.dropna(subset=pchembl_columns, how="all")
    dataset.df_sizes_pchembl.append(
        [label, *calculate_dataset_sizes(with_pchembl)]
    )


def add_debugging_info(
    dataset: Dataset,
    df: pd.DataFrame,
    label: str,
):
    """
    Record debugging sizes for df via add_dataset_sizes,
    but only if the root logger level is DEBUG or lower.

    :param dataset: Dataset that collects the debugging sizes.
    :type dataset: Dataset
    :param df: Pandas DataFrame with current compound-target pairs
    :type df: pd.DataFrame
    :param label: Description of pipeline step (e.g., initial query).
    :type label: str
    """
    debug_enabled = logging.root.level <= logging.DEBUG
    if not debug_enabled:
        # Skip the counting entirely when debug output is not requested.
        return
    add_dataset_sizes(dataset, df, label)

##### Logging Stats #####
def get_stats_for_column(
df: pd.DataFrame, column: str, columns_desc: str
df: pd.DataFrame,
column: str,
columns_desc: str,
) -> list[list[str, str, int]]:
"""
Calculate the number of unique values in df[column] and various subsets of df.
Expand Down Expand Up @@ -145,3 +84,78 @@ def get_stats_for_column(
df[df["DTI"] == "C0_DT"][column].nunique(),
],
]


##### Debugging Stats #####
def get_dataset_sizes(df: pd.DataFrame, label: str) -> pd.DataFrame:
    """
    Count unique values of the stats columns for df and for its drug subset.

    The result is a single-row DataFrame with a "step" column holding label
    and, for every column returned by get_stats_columns(), the two columns
    "<column>_all" and "<column>_drugs".

    :param df: Pandas DataFrame for which the dataset sizes should be calculated.
    :type df: pd.DataFrame
    :param label: Description of pipeline step (e.g., initial query).
    :type label: str
    :return: Pandas DataFrame with calculated unique counts.
    :rtype: pd.DataFrame
    """
    if "DTI" in df.columns:
        # drugs = compounds of a compound-target pair with a known interaction
        drug_mask = df["DTI"] == "D_DT"
    else:
        # no DTI column available: select rows with max_phase == 4 instead
        drug_mask = df["max_phase"] == 4
    subsets = {"all": df, "drugs": df[drug_mask]}

    counted_columns, _ = get_stats_columns()
    row = {"step": label}
    for name in counted_columns:
        for suffix, frame in subsets.items():
            row[f"{name}_{suffix}"] = frame[name].nunique()

    return pd.DataFrame([row])


def add_dataset_sizes(
    dataset: Dataset,
    df: pd.DataFrame,
    label: str,
):
    """
    Count and add representative counts of df used for debugging to the dataset.

    One row is appended to dataset.df_sizes_all (counts for df as given) and
    one row to dataset.df_sizes_pchembl (counts for df restricted to rows
    with at least one pchembl value).

    :param dataset: Dataset with compound-target pairs and debugging sizes.
    :type dataset: Dataset
    :param df: Pandas DataFrame with current compound-target pairs
    :type df: pd.DataFrame
    :param label: Description of pipeline step (e.g., initial query).
    :type label: str
    """
    # Every stats frame has exactly one row with index label 0.
    # ignore_index=True renumbers the accumulated table 0..n-1 so that the
    # index stays unique instead of becoming 0, 0, 0, ...
    dataset.df_sizes_all = pd.concat(
        [dataset.df_sizes_all, get_dataset_sizes(df, label)],
        ignore_index=True,
    )

    # restrict to data with any pchembl value (any data with a pchembl,
    # even if it is based on only functional data)
    # these statistics are purely based on removing
    # compound-target pairs without pchembl information,
    # i.e., the subset of the dataset is determined by the given df and not recalculated
    pchembl_columns = [
        name for name in df.columns if name.startswith("pchembl_value")
    ]
    # dropna returns a new DataFrame, so df is not modified and no copy is needed
    df_pchembl = df.dropna(subset=pchembl_columns, how="all")
    dataset.df_sizes_pchembl = pd.concat(
        [dataset.df_sizes_pchembl, get_dataset_sizes(df_pchembl, label)],
        ignore_index=True,
    )


def add_debugging_info(
    dataset: Dataset,
    df: pd.DataFrame,
    label: str,
):
    """
    Logging-level-aware wrapper around add_dataset_sizes.

    The sizes are only calculated and stored
    if the root logger level is DEBUG or lower.

    :param dataset: Dataset that collects the debugging sizes.
    :type dataset: Dataset
    :param df: Pandas DataFrame with current compound-target pairs
    :type df: pd.DataFrame
    :param label: Description of pipeline step (e.g., initial query).
    :type label: str
    """
    if logging.root.level > logging.DEBUG:
        # Debug output is not requested, so skip the counting.
        return
    add_dataset_sizes(dataset, df, label)
110 changes: 37 additions & 73 deletions src/write_subsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from dataset import Dataset


##### Writing Output #####
def write_output(
df: pd.DataFrame,
filename: str,
Expand Down Expand Up @@ -73,6 +74,7 @@ def write_and_check_output(
)


##### Output Specific Results #####
def write_full_dataset_to_file(
dataset: Dataset,
args: CalculationArgs,
Expand All @@ -97,59 +99,6 @@ def write_full_dataset_to_file(
write_and_check_output(dataset.df_result, name_all, desc, args, out)


def output_debug_sizes(
    dataset: Dataset,
    out: OutputArgs,
):
    """
    Output counts at various points during calculating the final dataset for debugging.

    Both size tables are logged at debug level and written with write_output
    to "debug_full_df_sizes" and "debug_pchembl_df_sizes" below out.output_path.

    :param dataset: Dataset with compound-target pairs and debugging sizes.
    :type dataset: Dataset
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    """
    column_names = [
        "type",
        "#mols",
        "#drugs",
        "#targets",
        "#drug_ targets",
        "#targets_ mutation",
        "#drug_ targets_mutation",
        "#cpd_tid_ pairs",
        "#drug_tid_ pairs",
        "#cpd_ tid_mutation_ pairs",
        "#drug_ tid_mutation_ pairs",
    ]

    logging.debug("Size of full dataset at different points.")
    full_df_sizes = pd.DataFrame(dataset.df_sizes_all, columns=column_names)
    logging.debug(full_df_sizes)
    name_full_df_sizes = os.path.join(out.output_path, "debug_full_df_sizes")
    write_output(
        full_df_sizes,
        name_full_df_sizes,
        out,
    )

    logging.debug("Size of dataset with any pchembl values at different points.")
    logging.debug(
        "This includes data for which we only have pchembl data "
        "for functional assays but not for binding assays."
    )
    df_pchembl_sizes = pd.DataFrame(dataset.df_sizes_pchembl, columns=column_names)
    logging.debug(df_pchembl_sizes)
    name_pchembl_df_sizes = os.path.join(out.output_path, "debug_pchembl_df_sizes")
    # Write the pchembl-restricted table here; the full table was already
    # written to debug_full_df_sizes above.
    write_output(
        df_pchembl_sizes,
        name_pchembl_df_sizes,
        out,
    )


def output_stats(
df: pd.DataFrame,
output_file: str,
Expand All @@ -171,33 +120,15 @@ def output_stats(
:param out: Arguments related to how to output the dataset
:type out: OutputArgs
"""
df_columns = [
"parent_molregno",
"tid",
"tid_mutation",
"cpd_target_pair",
"cpd_target_pair_mutation",
]
columns_descs = [
"compound ID",
"target ID",
"target ID with mutation annotations",
"compound-target pair",
"compound-target pair with mutation annotations",
]

logging.debug("Stats for %s", output_file)
stats = []
df_columns, columns_descs = get_stats.get_stats_columns()
for column, columns_desc in zip(df_columns, columns_descs):
logging.debug("Stats for column %s:", column)
column_stats = get_stats.get_stats_for_column(df, column, columns_desc)
stats += column_stats
for colum_stat in column_stats:
logging.debug(
"%20s %s",
colum_stat[2],
colum_stat[3],
)
logging.debug("%20s %s", colum_stat[2], colum_stat[3])

df_stats = pd.DataFrame(
stats, columns=["column", "column_description", "subset_type", "counts"]
Expand Down Expand Up @@ -252,3 +183,36 @@ def output_all_stats(dataset: Dataset, args: CalculationArgs, out: OutputArgs):
output_file,
out,
)


def write_debug_sizes(
    dataset: Dataset,
    out: OutputArgs,
):
    """
    Write the debugging size tables collected while calculating the dataset.

    :param dataset: Dataset with compound-target pairs and debugging sizes.
    :type dataset: Dataset
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    """
    debug_tables = (
        # Size of full dataset at different points.
        (dataset.df_sizes_all, "debug_full_df_sizes"),
        # Size of dataset with any pchembl values at different points.
        # This includes data for which we only have pchembl data
        # for functional assays but not for binding assays.
        (dataset.df_sizes_pchembl, "debug_pchembl_df_sizes"),
    )
    for df_sizes, basename in debug_tables:
        write_output(
            df_sizes,
            os.path.join(out.output_path, basename),
            out,
        )

0 comments on commit 0c82748

Please sign in to comment.