Group arguments together in dataclasses

- add CalculationArgs dataclass with arguments related to how the dataset is calculated
- add OutputArgs dataclass with arguments related to the output
chembl · Feb 15, 2024 · c7e76b5 · c7e76b5
1 parent 6982ee6
commit c7e76b5
Show file tree

Hide file tree

Showing 6 changed files with 458 additions and 565 deletions.
diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py
@@ -5,6 +5,7 @@
 import pandas as pd
 
 import write_subsets
+from arguments import OutputArgs, CalculationArgs
 
 
 ########### Add Target Class Annotations Based on ChEMBL Data ###########
@@ -81,12 +82,8 @@ def get_target_class_table(
 def add_chembl_target_class_annotations(
     df_combined: pd.DataFrame,
     chembl_con: sqlite3.Connection,
-    output_path: str,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
-    chembl_version: str,
-    limited_flag: str,
+    args: CalculationArgs,
+    out: OutputArgs,
 ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """
     Add level 1 and 2 target class annotations. 
@@ -101,19 +98,10 @@ def add_chembl_target_class_annotations(
     :type df_combined: pd.DataFrame
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
-    :param output_path: Path to write the targets with more than one target class assignment to
-    :type output_path: str
-    :param write_to_csv: True if output should be written to csv
-    :type write_to_csv: bool
-    :param write_to_excel: True if output should be written to excel
-    :type write_to_excel: bool
-    :param delimiter: Delimiter in csv-output
-    :type delimiter: str
-    :param chembl_version: Version of ChEMBL for output files
-    :type chembl_version: str
-    :param limited_flag: Document suffix indicating 
-        whether the dataset was limited to literature sources
-    :type limited_flag: str
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
     :return: - Pandas DataFrame with added target class annotations \\
         - Pandas DataFrame with mapping from target id to level 1 target class \\
         - Pandas DataFrame with mapping from target id to level 2 target class
@@ -199,15 +187,14 @@ def add_chembl_target_class_annotations(
     )
 
     name_more_than_one_tclass = os.path.join(
-        output_path,
-        f"ChEMBL{chembl_version}_CTI_{limited_flag}_targets_w_more_than_one_tclass",
+        out.output_path,
+        f"ChEMBL{args.chembl_version}_"
+        f"CTI_{args.limited_flag}_targets_w_more_than_one_tclass",
     )
     write_subsets.write_output(
         more_than_one_tclass,
         name_more_than_one_tclass,
-        write_to_csv,
-        write_to_excel,
-        delimiter,
+        out,
     )
 
     return df_combined, target_classes_level1, target_classes_level2
diff --git a/src/arguments.py b/src/arguments.py
@@ -0,0 +1,172 @@
+import argparse
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class CalculationArgs:
+    """
+    Collection of arguments related to how to calculate the dataset.
+
+    chembl_version:         Version of ChEMBL for output file names
+    calculate_rdkit:        True if RDKit-based compound properties should be calculated
+    limit_to_literature:    Include only literature sources if True
+    limited_flag:           String version of limit_to_literature used in file names
+    min_nof_cpds_bf:        Minimum number of compounds per target for the BF subset
+    min_nof_cpds_b:         Minimum number of compounds per target for the B subset
+    """
+
+    chembl_version: str
+    calculate_rdkit: bool
+    limit_to_literature: bool
+    limited_flag: str
+    min_nof_cpds_bf: int
+    min_nof_cpds_b: int
+
+
+@dataclass(frozen=True)
+class OutputArgs:
+    """
+    Collection of arguments related to how to output the dataset.
+
+    output_path:        Path to write output files to
+    delimiter:          Delimiter in csv-output
+    write_to_csv:       True if output should be written to csv
+    write_to_excel:     True if output should be written to excel
+    write_full_dataset: True if the full dataset should be written to output
+    write_bf:           True if subsets based on binding+functional data should be written to output
+    write_b:            True if subsets based on binding data only should be written to output
+    """
+
+    output_path: str
+    delimiter: str
+    write_to_csv: bool
+    write_to_excel: bool
+    write_full_dataset: bool
+    write_bf: bool
+    write_b: bool
+
+
+def parse_args() -> argparse.Namespace:
+    """
+    Get arguments with argparse.
+
+    :return: Populated argparse.Namespace
+    :rtype: argparse.Namespace
+    """
+    parser = argparse.ArgumentParser(
+        description="Extract the compound-target pairs dataset from ChEMBL. \
+            The full dataset plus filtering columns for binding vs. binding+functional data \
+            will always be written to csv. \
+            Additional outputs and output types can be chosen with the parameters below."
+    )
+
+    parser.add_argument(
+        "--chembl",
+        "-v",
+        dest="chembl_version",
+        metavar="<version>",
+        type=str,
+        default=None,
+        help="ChEMBL version. \
+            Latest version if None. \
+            Required if a path to a SQLite database is provided, \
+            i.e., if --sqlite is set. (default: None)",
+    )
+    parser.add_argument(
+        "--sqlite",
+        "-s",
+        metavar="<path>",
+        type=str,
+        default=None,
+        help="Path to SQLite database. \
+            ChEMBL is downloaded as an SQLite database \
+            and handled by chembl_downloader if None. (default: None)",
+    )
+    parser.add_argument(
+        "--output",
+        "-o",
+        dest="output_path",
+        metavar="<path>",
+        type=str,
+        required=True,
+        help="Path to write the output file(s) to. (required)",
+    )
+    parser.add_argument(
+        "--delimiter",
+        "-d",
+        metavar="<delimiter>",
+        type=str,
+        default=";",
+        help="Delimiter in output csv-files.  (default: ;)",
+    )
+    parser.add_argument(
+        "--all_sources",
+        action="store_true",
+        help="If this is set, the dataset is calculated based on all sources in ChEMBL. \
+            This includes data from BindingDB which may skew the results. \
+            Default (not set): the dataset is calculated based on only literature data.",
+    )
+    parser.add_argument(
+        "--rdkit",
+        dest="calculate_rdkit",
+        action="store_true",
+        help="Calculate RDKit-based compound properties.",
+    )
+    parser.add_argument(
+        "--excel",
+        dest="write_to_excel",
+        action="store_true",
+        help="Write the results to excel. Note: this may fail if the output is too large.",
+    )
+    parser.add_argument(
+        "--BF",
+        dest="write_bf",
+        action="store_true",
+        help="Write binding+functional data subsets.",
+    )
+    parser.add_argument(
+        "--B", dest="write_b", action="store_true", help="Write binding data subsets."
+    )
+    parser.add_argument(
+        "--debug", action="store_true", help="Log additional debugging information."
+    )
+    args = parser.parse_args()
+
+    return args
+
+
+def get_args() -> tuple[argparse.Namespace, CalculationArgs, OutputArgs]:
+    """
+    Get parsed and default arguments.
+
+    :return: parsed arguments,
+        arguments related to how to calculate the dataset as CalculationArgs,
+        arguments related to how to output the dataset as OutputArgs
+    :rtype: tuple[argparse.Namespace, CalculationArgs, OutputArgs]
+    """
+    args = parse_args()
+
+    calc_args = CalculationArgs(
+        chembl_version=args.chembl_version,
+        calculate_rdkit=args.calculate_rdkit,
+        limit_to_literature=not args.all_sources,
+        # used in file names
+        limited_flag="literature_only" if not args.all_sources else "all_sources",
+        min_nof_cpds_bf=100,
+        min_nof_cpds_b=100,
+    )
+
+    output_args = OutputArgs(
+        output_path=args.output_path,
+        delimiter=args.delimiter,
+        # Always write the results to csv.
+        write_to_csv=True,
+        write_to_excel=args.write_to_excel,
+        # Always write the full dataset plus filtering columns
+        # for binding vs. binding+functional data.
+        write_full_dataset=True,
+        write_bf=args.write_bf,
+        write_b=args.write_b,
+    )
+
+    return args, calc_args, output_args
diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py
@@ -1,17 +1,13 @@
-import logging
 import sqlite3
 
 import numpy as np
 import pandas as pd
 
-import get_stats
-
 
 ########### Get Initial Compound-Target Data From ChEMBL ###########
 def get_compound_target_pairs_with_pchembl(
     chembl_con: sqlite3.Connection,
     limit_to_literature: bool,
-    df_sizes: list[list[int], list[int]],
 ) -> pd.DataFrame:
     """
     Query ChEMBL activities and related assay for compound-target pairs
@@ -27,8 +23,6 @@ def get_compound_target_pairs_with_pchembl(
     :param limit_to_literature: Include only literature sources if True.
         Include all available sources otherwise.
     :type limit_to_literature: bool
-    :param df_sizes: List of intermediate sized of the dataset used for debugging.
-    :type df_sizes: list[list[int], list[int]]
     :return: Pandas DataFrame with compound-target pairs with a pchembl value.
     :rtype: pd.DataFrame
     """
@@ -84,9 +78,6 @@ def get_compound_target_pairs_with_pchembl(
         f"{a}_{b}" for a, b in zip(df_mols["parent_molregno"], df_mols["tid_mutation"])
     ]
 
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_mols, "initial query", df_sizes)
-
     return df_mols
 
 
@@ -173,7 +164,6 @@ def get_average_info(df: pd.DataFrame, suffix: str) -> pd.DataFrame:
 def get_aggregated_activity_ct_pairs(
     chembl_con: sqlite3.Connection,
     limit_to_literature: bool,
-    df_sizes: list[list[int], list[int]],
 ) -> pd.DataFrame:
     """
     Get dataset of compound target-pairs with an associated pchembl value
@@ -194,14 +184,13 @@ def get_aggregated_activity_ct_pairs(
     :param limit_to_literature: Include only literature sources if True.
         Include all available sources otherwise.
     :type limit_to_literature: bool
-    :param df_sizes: List of intermediate sized of the dataset used for debugging.
-    :type df_sizes: list[list[int], list[int]]
     :return: Pandas Dataframe with compound-target pairs based on ChEMBL activity data
         aggregated into one entry per compound-target pair.
     :rtype: pd.DataFrame
     """
     df_mols = get_compound_target_pairs_with_pchembl(
-        chembl_con, limit_to_literature, df_sizes
+        chembl_con,
+        limit_to_literature,
     )
 
     # Summarise the information for binding and functional assays