Skip to content

Commit

Permalink
Add module to add filtering columns
Browse files Browse the repository at this point in the history
  • Loading branch information
LinaHeinzke committed Feb 15, 2024
1 parent c7e76b5 commit d8331c9
Show file tree
Hide file tree
Showing 3 changed files with 236 additions and 318 deletions.
233 changes: 233 additions & 0 deletions src/add_filtering_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
import logging
import os

import pandas as pd

from arguments import CalculationArgs, OutputArgs
import get_stats
import write_subsets


def get_data_subsets(
    data: pd.DataFrame, min_nof_cpds: int, desc: str
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Calculate and return the different subsets of interest.

    :param data: Pandas DataFrame with compound-target pairs
    :type data: pd.DataFrame
    :param min_nof_cpds: Minimum number of compounds per target
    :type min_nof_cpds: int
    :param desc: Types of assays data contains information about.
        Options: "BF" (binding+functional), "B" (binding).
        Any value other than "B" is handled like "BF".
    :type desc: str
    :return:
        - data: Pandas DataFrame with compound-target pairs
          without the annotations for the opposite desc
          (e.g. if desc = "BF", the average pchembl value based on
          binding data only is dropped), without existing filtering
          columns (names starting with "B_" or "BF_"),
          and without duplicated rows
        - df_enough_cpds: Rows of data for targets
          with at least *min_nof_cpds* compounds with a pchembl value
        - df_c_dt_d_dt: As df_enough_cpds but restricted to targets with
          at least one compound-target pair labelled as
          'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction)
        - df_d_dt: As df_enough_cpds but restricted to targets with
          at least one compound-target pair labelled as
          'D_DT' (i.e., known drug-target interaction)
    :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame)
    """
    drop_desc = "BF" if desc == "B" else "B"

    # columns related to the other assay types
    other_assay_columns = [
        f"pchembl_value_mean_{drop_desc}",
        f"pchembl_value_max_{drop_desc}",
        f"pchembl_value_median_{drop_desc}",
        f"first_publication_cpd_target_pair_{drop_desc}",
        f"first_publication_cpd_target_pair_w_pchembl_{drop_desc}",
        f"LE_{drop_desc}",
        f"BEI_{drop_desc}",
        f"SEI_{drop_desc}",
        f"LLE_{drop_desc}",
    ]
    # filtering columns that were added by earlier runs
    filtering_columns = [
        col for col in data.columns if col.startswith(("B_", "BF_"))
    ]
    data = data.drop(
        columns=other_assay_columns + filtering_columns
    ).drop_duplicates()

    # Restrict the dataset to targets with at least *min_nof_cpds*
    # compounds with a pchembl value.
    # NOTE: this counts rows with a non-null parent_molregno per target,
    # which equals the number of compounds only if every compound-target
    # pair occurs in a single row.
    has_pchembl = data[f"pchembl_value_mean_{desc}"].notnull()
    comparator_counts = (
        data[has_pchembl].groupby(["tid_mutation"])["parent_molregno"].count()
    )
    targets_w_enough_cpds = comparator_counts[
        comparator_counts >= min_nof_cpds
    ].index
    df_enough_cpds = data[data["tid_mutation"].isin(targets_w_enough_cpds)]

    # Restrict the dataset further to targets
    # with at least one compound-target pair labelled as
    # 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT',
    # i.e., compound-target pairs with a known interaction.
    is_known_interaction = df_enough_cpds["DTI"].isin(
        ["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT"]
    )
    c_dt_d_dt_targets = set(
        df_enough_cpds.loc[is_known_interaction, "tid_mutation"]
    )
    df_c_dt_d_dt = df_enough_cpds[
        df_enough_cpds["tid_mutation"].isin(c_dt_d_dt_targets)
    ]

    # Restrict the dataset further to targets with
    # at least one compound-target pair labelled as 'D_DT',
    # i.e., known drug-target interactions.
    is_drug_target = df_enough_cpds["DTI"] == "D_DT"
    d_dt_targets = set(df_enough_cpds.loc[is_drug_target, "tid_mutation"])
    df_d_dt = df_enough_cpds[df_enough_cpds["tid_mutation"].isin(d_dt_targets)]

    return data, df_enough_cpds, df_c_dt_d_dt, df_d_dt


def add_subset_filtering_columns(
    df_combined_subset: pd.DataFrame,
    df_combined: pd.DataFrame,
    desc: str,
    args: CalculationArgs,
    out: OutputArgs,
    df_sizes,
) -> pd.DataFrame:
    """
    Add filtering columns for the binding+functional (BF) or binding (B)
    subsets to df_combined and optionally write the subsets to file.

    Three boolean columns are added to df_combined (in place), where
    <n> is the minimum number of compounds used for *desc*:

    - "<desc>_<n>": row is part of the subset of targets with enough compounds
    - "<desc>_<n>_c_dt_d_dt": row is part of the subset of targets
      with at least one known compound-target interaction
    - "<desc>_<n>_d_dt": row is part of the subset of targets
      with at least one known drug-target interaction

    :param df_combined_subset: Subset with binding+functional (BF) or binding (B)
        assay-based data in df_combined
    :type df_combined_subset: pd.DataFrame
    :param df_combined: Pandas DataFrame with compound-target pairs
    :type df_combined: pd.DataFrame
    :param desc: Assay description,
        either "BF" (binding+functional) or "B" (binding)
    :type desc: str
    :param args: Arguments related to how to calculate the dataset
    :type args: CalculationArgs
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    :param df_sizes: Object passed on to get_stats.add_dataset_sizes
        to record the subset sizes (only used at debug log level)
    :return: df_combined with the added filtering columns
    :rtype: pd.DataFrame
    """
    # Use the same threshold for the calculation, the file names, the column
    # names and the debugging output. Previously the names always used
    # args.min_nof_cpds_bf, which mislabelled the "B" subsets whenever
    # min_nof_cpds_b differed from min_nof_cpds_bf.
    min_nof_cpds = args.min_nof_cpds_bf if desc == "BF" else args.min_nof_cpds_b

    (
        df_combined_subset,
        df_combined_subset_enough_cpds,
        df_combined_subset_c_dt_d_dt,
        df_combined_subset_d_dt,
    ) = get_data_subsets(
        df_combined_subset,
        min_nof_cpds,
        desc,
    )

    filtered_subsets = [
        df_combined_subset_enough_cpds,
        df_combined_subset_c_dt_d_dt,
        df_combined_subset_d_dt,
    ]
    # used both as file name suffixes and as filtering column names
    filtered_subset_names = [
        f"{desc}_{min_nof_cpds}",
        f"{desc}_{min_nof_cpds}_c_dt_d_dt",
        f"{desc}_{min_nof_cpds}_d_dt",
    ]

    # write subsets if required
    if (desc == "BF" and out.write_bf) or (desc == "B" and out.write_b):
        for df_subset, subset_desc in zip(
            [df_combined_subset] + filtered_subsets,
            [f"{desc}"] + filtered_subset_names,
        ):
            name_subset = os.path.join(
                out.output_path,
                f"ChEMBL{args.chembl_version}_"
                f"CTI_{args.limited_flag}_"
                f"{subset_desc}",
            )
            write_subsets.write_and_check_output(
                df_subset,
                name_subset,
                desc,
                args,
                out,
            )

    # add filtering columns to df_combined
    for df_subset, col_name in zip(filtered_subsets, filtered_subset_names):
        # rows are matched via the index, which the subsets share with df_combined
        df_combined[col_name] = df_combined.index.isin(df_subset.index)
        # check that filtering works
        assert df_combined.loc[df_combined[col_name], df_subset.columns].equals(
            df_subset
        ), f"Filtering is not accurate for {col_name}."

    if logging.DEBUG >= logging.root.level:
        # Label the sizes according to the assay types and threshold in use
        # instead of always reporting them as "BF, >= 100".
        assay_label = "binding + functional" if desc == "BF" else "binding"
        size_label = f"{desc}, >= {min_nof_cpds}"
        get_stats.add_dataset_sizes(df_combined_subset, assay_label, df_sizes)
        get_stats.add_dataset_sizes(
            df_combined_subset_enough_cpds, size_label, df_sizes
        )
        get_stats.add_dataset_sizes(
            df_combined_subset_c_dt_d_dt, f"{size_label}, c_dt and d_dt", df_sizes
        )
        get_stats.add_dataset_sizes(
            df_combined_subset_d_dt, f"{size_label}, d_dt", df_sizes
        )

    return df_combined


def add_filtering_columns(
    df_combined,
    df_sizes,
    args,
    out,
):
    """
    Add the subset filtering columns for both assay descriptions.

    The binding+functional ("BF") columns are added first, based on a copy
    of all rows of df_combined. The binding ("B") columns are added
    afterwards, based on the rows where "keep_for_binding" is True.

    :param df_combined: Pandas DataFrame with compound-target pairs
    :type df_combined: pd.DataFrame
    :param df_sizes: Object passed on to add_subset_filtering_columns
        for recording dataset sizes
    :param args: Arguments related to how to calculate the dataset
    :param out: Arguments related to how to output the dataset
    :return: The DataFrame returned by the last call to
        add_subset_filtering_columns
    :rtype: pd.DataFrame
    """
    # The order matters: "BF" is processed before "B".
    for assay_desc in ("BF", "B"):
        if assay_desc == "BF":
            # binding and functional assays: consider all rows
            subset = df_combined.copy()
        else:
            # binding assays only: consider the rows flagged for binding
            binding_rows = df_combined["keep_for_binding"].eq(True)
            subset = df_combined[binding_rows].copy()

        df_combined = add_subset_filtering_columns(
            subset,
            df_combined,
            assay_desc,
            args,
            out,
            df_sizes,
        )

    return df_combined
13 changes: 3 additions & 10 deletions src/get_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import write_subsets
import get_stats
from arguments import OutputArgs, CalculationArgs
import add_filtering_columns


def get_ct_pair_dataset(
Expand Down Expand Up @@ -102,16 +103,8 @@ def get_ct_pair_dataset(
args.calculate_rdkit,
)

logging.info("write_BF_to_file")
df_combined = write_subsets.write_bf_to_file(
df_combined,
df_sizes,
args,
out,
)

logging.info("write_B_to_file")
df_combined = write_subsets.write_b_to_file(
logging.info("add_filtering_columns")
add_filtering_columns.add_filtering_columns(
df_combined,
df_sizes,
args,
Expand Down
Loading

0 comments on commit d8331c9

Please sign in to comment.