Skip to content

Commit

Permalink
Group arguments together in dataclasses
Browse files Browse the repository at this point in the history
- add CalculationArgs dataclass with arguments related to how the dataset is calculated
- add OutputArgs dataclass with arguments related to the output
  • Loading branch information
LinaHeinzke committed Feb 15, 2024
1 parent 6982ee6 commit c7e76b5
Show file tree
Hide file tree
Showing 6 changed files with 458 additions and 565 deletions.
35 changes: 11 additions & 24 deletions src/add_chembl_target_class_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas as pd

import write_subsets
from arguments import OutputArgs, CalculationArgs


########### Add Target Class Annotations Based on ChEMBL Data ###########
Expand Down Expand Up @@ -81,12 +82,8 @@ def get_target_class_table(
def add_chembl_target_class_annotations(
df_combined: pd.DataFrame,
chembl_con: sqlite3.Connection,
output_path: str,
write_to_csv: bool,
write_to_excel: bool,
delimiter: str,
chembl_version: str,
limited_flag: str,
args: CalculationArgs,
out: OutputArgs,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""
Add level 1 and 2 target class annotations.
Expand All @@ -101,19 +98,10 @@ def add_chembl_target_class_annotations(
:type df_combined: pd.DataFrame
:param chembl_con: Sqlite3 connection to ChEMBL database.
:type chembl_con: sqlite3.Connection
:param output_path: Path to write the targets with more than one target class assignment to
:type output_path: str
:param write_to_csv: True if output should be written to csv
:type write_to_csv: bool
:param write_to_excel: True if output should be written to excel
:type write_to_excel: bool
:param delimiter: Delimiter in csv-output
:type delimiter: str
:param chembl_version: Version of ChEMBL for output files
:type chembl_version: str
:param limited_flag: Document suffix indicating
whether the dataset was limited to literature sources
:type limited_flag: str
:param args: Arguments related to how to calculate the dataset
:type args: CalculationArgs
:param out: Arguments related to how to output the dataset
:type out: OutputArgs
:return: - Pandas DataFrame with added target class annotations \\
- Pandas DataFrame with mapping from target id to level 1 target class \\
- Pandas DataFrame with mapping from target id to level 2 target class
Expand Down Expand Up @@ -199,15 +187,14 @@ def add_chembl_target_class_annotations(
)

name_more_than_one_tclass = os.path.join(
output_path,
f"ChEMBL{chembl_version}_CTI_{limited_flag}_targets_w_more_than_one_tclass",
out.output_path,
f"ChEMBL{args.chembl_version}_"
f"CTI_{args.limited_flag}_targets_w_more_than_one_tclass",
)
write_subsets.write_output(
more_than_one_tclass,
name_more_than_one_tclass,
write_to_csv,
write_to_excel,
delimiter,
out,
)

return df_combined, target_classes_level1, target_classes_level2
172 changes: 172 additions & 0 deletions src/arguments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import argparse
from dataclasses import dataclass


@dataclass(frozen=True)
class CalculationArgs:
    """
    Immutable bundle of the settings that control how the dataset is calculated.

    Fields:

    - chembl_version: ChEMBL version, used in output file names
    - calculate_rdkit: whether RDKit-based compound properties are calculated
    - limit_to_literature: whether only literature sources are included
    - limited_flag: string form of limit_to_literature, used in file names
    - min_nof_cpds_bf: minimum number of compounds per target
      for the BF (binding+functional) subset
    - min_nof_cpds_b: minimum number of compounds per target
      for the B (binding) subset
    """

    # Version and property calculation
    chembl_version: str
    calculate_rdkit: bool

    # Source restriction (boolean plus its file-name representation)
    limit_to_literature: bool
    limited_flag: str

    # Per-target size thresholds for the subsets
    min_nof_cpds_bf: int
    min_nof_cpds_b: int


@dataclass(frozen=True)
class OutputArgs:
    """
    Immutable bundle of the settings that control how the dataset is written.

    Fields:

    - output_path: path the output files are written to
    - delimiter: delimiter used in csv output
    - write_to_csv: whether output is written to csv
    - write_to_excel: whether output is written to excel
    - write_full_dataset: whether the full dataset is written to output
    - write_bf: whether subsets based on binding+functional data are written
    - write_b: whether subsets based on binding data only are written
    """

    # Where and in which csv dialect to write
    output_path: str
    delimiter: str

    # Output formats
    write_to_csv: bool
    write_to_excel: bool

    # Which parts of the dataset to write
    write_full_dataset: bool
    write_bf: bool
    write_b: bool


def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """
    Get arguments with argparse.

    :param argv: Argument strings to parse.
        If None (the default), argparse reads them from sys.argv,
        which is the behaviour of the previous parameterless version.
        Passing an explicit list makes the parser usable from other code and tests.
    :type argv: list[str] | None
    :return: Populated argparse.Namespace
    :rtype: argparse.Namespace
    """
    # The annotation above is quoted so that the union syntax
    # is never evaluated at runtime on older Python versions.

    # Long texts are built from adjacent string literals rather than
    # backslash continuations inside one literal, so that source indentation
    # does not end up inside the strings. argparse normalises whitespace
    # when rendering, so the displayed help is unchanged.
    parser = argparse.ArgumentParser(
        description=(
            "Extract the compound-target pairs dataset from ChEMBL. "
            "The full dataset plus filtering columns "
            "for binding vs. binding+functional data "
            "will always be written to csv. "
            "Additional outputs and output types "
            "can be chosen with the parameters below."
        )
    )

    # NOTE(review): the help text states that --chembl is required
    # when --sqlite is set, but this is not enforced here;
    # confirm that the caller checks it.
    parser.add_argument(
        "--chembl",
        "-v",
        dest="chembl_version",
        metavar="<version>",
        type=str,
        default=None,
        help=(
            "ChEMBL version. "
            "Latest version if None. "
            "Required if a path to a SQLite database is provided, "
            "i.e., if --sqlite is set. (default: None)"
        ),
    )
    parser.add_argument(
        "--sqlite",
        "-s",
        metavar="<path>",
        type=str,
        default=None,
        help=(
            "Path to SQLite database. "
            "ChEMBL is downloaded as an SQLite database "
            "and handled by chembl_downloader if None. (default: None)"
        ),
    )
    parser.add_argument(
        "--output",
        "-o",
        dest="output_path",
        metavar="<path>",
        type=str,
        required=True,
        help="Path to write the output file(s) to. (required)",
    )
    parser.add_argument(
        "--delimiter",
        "-d",
        metavar="<delimiter>",
        type=str,
        default=";",
        help="Delimiter in output csv-files. (default: ;)",
    )
    parser.add_argument(
        "--all_sources",
        action="store_true",
        help=(
            "If this is set, the dataset is calculated "
            "based on all sources in ChEMBL. "
            "This includes data from BindingDB which may skew the results. "
            "Default (not set): the dataset is calculated "
            "based on only literature data."
        ),
    )
    parser.add_argument(
        "--rdkit",
        dest="calculate_rdkit",
        action="store_true",
        help="Calculate RDKit-based compound properties.",
    )
    parser.add_argument(
        "--excel",
        dest="write_to_excel",
        action="store_true",
        help=(
            "Write the results to excel. "
            "Note: this may fail if the output is too large."
        ),
    )
    parser.add_argument(
        "--BF",
        dest="write_bf",
        action="store_true",
        help="Write binding+functional data subsets.",
    )
    parser.add_argument(
        "--B", dest="write_b", action="store_true", help="Write binding data subsets."
    )
    parser.add_argument(
        "--debug", action="store_true", help="Log additional debugging information."
    )

    # argv=None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)


def get_args() -> tuple[argparse.Namespace, CalculationArgs, OutputArgs]:
    """
    Parse the command line and group the settings into argument dataclasses.

    Values that are not exposed on the command line
    (csv output, writing the full dataset, minimum compound counts)
    are filled in with fixed defaults here.

    :return: parsed arguments,
        arguments related to how to calculate the dataset as CalculationArgs,
        arguments related to how to output the dataset as OutputArgs
    :rtype: tuple[argparse.Namespace, CalculationArgs, OutputArgs]
    """
    parsed = parse_args()

    # The dataset is limited to literature sources unless --all_sources is set.
    literature_only = not parsed.all_sources

    calculation = CalculationArgs(
        chembl_version=parsed.chembl_version,
        calculate_rdkit=parsed.calculate_rdkit,
        limit_to_literature=literature_only,
        # String form of the flag, used in file names.
        limited_flag="literature_only" if literature_only else "all_sources",
        min_nof_cpds_bf=100,
        min_nof_cpds_b=100,
    )

    output = OutputArgs(
        output_path=parsed.output_path,
        delimiter=parsed.delimiter,
        # The results are always written to csv.
        write_to_csv=True,
        write_to_excel=parsed.write_to_excel,
        # The full dataset plus filtering columns
        # for binding vs. binding+functional data is always written.
        write_full_dataset=True,
        write_bf=parsed.write_bf,
        write_b=parsed.write_b,
    )

    return parsed, calculation, output
15 changes: 2 additions & 13 deletions src/get_activity_ct_pairs.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
import logging
import sqlite3

import numpy as np
import pandas as pd

import get_stats


########### Get Initial Compound-Target Data From ChEMBL ###########
def get_compound_target_pairs_with_pchembl(
chembl_con: sqlite3.Connection,
limit_to_literature: bool,
df_sizes: list[list[int], list[int]],
) -> pd.DataFrame:
"""
Query ChEMBL activities and related assay for compound-target pairs
Expand All @@ -27,8 +23,6 @@ def get_compound_target_pairs_with_pchembl(
:param limit_to_literature: Include only literature sources if True.
Include all available sources otherwise.
:type limit_to_literature: bool
:param df_sizes: List of intermediate sized of the dataset used for debugging.
:type df_sizes: list[list[int], list[int]]
:return: Pandas DataFrame with compound-target pairs with a pchembl value.
:rtype: pd.DataFrame
"""
Expand Down Expand Up @@ -84,9 +78,6 @@ def get_compound_target_pairs_with_pchembl(
f"{a}_{b}" for a, b in zip(df_mols["parent_molregno"], df_mols["tid_mutation"])
]

if logging.DEBUG >= logging.root.level:
get_stats.add_dataset_sizes(df_mols, "initial query", df_sizes)

return df_mols


Expand Down Expand Up @@ -173,7 +164,6 @@ def get_average_info(df: pd.DataFrame, suffix: str) -> pd.DataFrame:
def get_aggregated_activity_ct_pairs(
chembl_con: sqlite3.Connection,
limit_to_literature: bool,
df_sizes: list[list[int], list[int]],
) -> pd.DataFrame:
"""
Get dataset of compound target-pairs with an associated pchembl value
Expand All @@ -194,14 +184,13 @@ def get_aggregated_activity_ct_pairs(
:param limit_to_literature: Include only literature sources if True.
Include all available sources otherwise.
:type limit_to_literature: bool
:param df_sizes: List of intermediate sized of the dataset used for debugging.
:type df_sizes: list[list[int], list[int]]
:return: Pandas Dataframe with compound-target pairs based on ChEMBL activity data
aggregated into one entry per compound-target pair.
:rtype: pd.DataFrame
"""
df_mols = get_compound_target_pairs_with_pchembl(
chembl_con, limit_to_literature, df_sizes
chembl_con,
limit_to_literature,
)

# Summarise the information for binding and functional assays
Expand Down
Loading

0 comments on commit c7e76b5

Please sign in to comment.