minor improvements

yazdanimehdi · Oct 22, 2024 · b03e668 · b03e668
1 parent b9930ad
commit b03e668
Show file tree

Hide file tree

Showing 14 changed files with 192,386 additions and 110 deletions.
diff --git a/.gitignore b/.gitignore
@@ -179,4 +179,5 @@ drug_b.py
 **/._.DS_Store
 **/**/pdb/
 *.pdb
-.VSCodeCounter/
+.VSCodeCounter/
+*.pth
diff --git a/data/drugbank_ddi/drugbank_DDI.tab b/data/drugbank_ddi/drugbank_DDI.tab
diff --git a/davis_correct.py b/davis_correct.py
@@ -0,0 +1,6 @@
+import pandas as pd
+import numpy as np
+# Load the data
+data = pd.read_csv('data/davis/davis.txt', sep=',')
+data["Y"]  = data["Y"].apply(lambda x: -np.log10(x/1e9))
+data.to_csv('data/davis/davis.txt', index=False)
diff --git a/deepdrugdomain/configs/fx_ddi.json b/deepdrugdomain/configs/fx_ddi.json
@@ -0,0 +1,121 @@
+{
+    "model": {
+        "default": {
+            "protein_graph_conv_layer": [
+                "dgl_tag",
+                "dgl_tag",
+                "dgl_gat"
+            ],
+            "ligand_graph_conv_layer": [
+                "dgl_tag",
+                "dgl_tag",
+                "dgl_gat"
+            ],
+            "protein_input_size": 74,
+            "ligand_input_size": 74,
+            "protein_graph_conv_dims": [
+                74,
+                128
+            ],
+            "ligand_graph_conv_dims": [
+                74,
+                128
+            ],
+            "protein_conv_dropout_rate": [
+                0.05,
+                0.05,
+                0.05
+            ],
+            "protein_conv_normalization": [
+                false,
+                false,
+                false
+            ],
+            "ligand_conv_dropout_rate": [
+                0.05,
+                0.05,
+                0.05
+            ],
+            "ligand_conv_normalization": [
+                false,
+                false,
+                false
+            ],
+            "head_dropout_rate": 0.05,
+            "head_activation_fn": [
+                null
+            ],
+            "head_normalization": [
+                "layer_norm"
+            ],
+            "protein_graph_conv_kwargs": [
+                {
+                    "k": 4
+                },
+                {
+                    "k": 4
+                },
+                {
+                    "num_heads": 2
+                }
+            ],
+            "ligand_graph_conv_kwargs": [
+                {
+                    "k": 8
+                },
+                {
+                    "k": 8
+                },
+                {
+                    "num_heads": 2
+                }
+            ],
+            "ligand_graph_pooling_kwargs": {},
+            "protein_graph_pooling_kwargs": {},
+            "embedding_dim": 256,
+            "ligand_graph_pooling": null,
+            "protein_graph_pooling": null,
+            "self_attention_depth": 4,
+            "self_attention_num_heads": 4,
+            "self_attention_mlp_ratio": 4,
+            "self_attention_qkv_bias": true,
+            "self_attention_qk_scale": null,
+            "self_attention_drop_rate": 0.4,
+            "self_attn_drop_rate": 0.0,
+            "self_drop_path_rate": 0.4,
+            "self_norm_layer": "layer_norm",
+            "input_norm_layer": "layer_norm",
+            "output_norm_layer": "layer_norm",
+            "block_layers": "transformer_attention_block",
+            "input_block_layers": "transformer_cross_attention_block",
+            "output_block_layers": "transformer_cross_attention_block",
+            "self_act_layer": "gelu",
+            "input_act_layer": "gelu",
+            "output_act_layer": "gelu",
+            "attention_block": "transformer_attention",
+            "self_mlp_block": "transformer_mlp",
+            "input_mlp_block": "transformer_mlp",
+            "output_mlp_block": "transformer_mlp",
+            "input_cross_att_block": "transformer_cross_attention",
+            "output_cross_att_block": "transformer_cross_attention",
+            "input_cross_attention_num_heads": 4,
+            "input_cross_attention_mlp_ratio": 4,
+            "input_cross_attention_qkv_bias": true,
+            "input_cross_attention_qk_scale": null,
+            "input_cross_attention_drop_rate": 0.1,
+            "input_cross_attn_drop_rate": 0.0,
+            "input_cross_drop_path_rate": 0.1,
+            "output_cross_attention_num_heads": 4,
+            "output_cross_attention_mlp_ratio": 4,
+            "output_cross_attention_qkv_bias": true,
+            "output_cross_attention_qk_scale": null,
+            "output_cross_attention_drop_rate": 0.4,
+            "output_cross_attn_drop_rate": 0.0,
+            "output_cross_drop_path_rate": 0.4,
+            "input_stages": 3,
+            "output_stages": 3,
+            "latent_space": 300,
+            "head_dims": []
+        }
+    }
+}
diff --git a/deepdrugdomain/data/datasets/DDI_datasets/__init__.py b/deepdrugdomain/data/datasets/DDI_datasets/__init__.py
@@ -0,0 +1 @@
+from .drugbank import DrugBankDDIDataset
diff --git a/deepdrugdomain/data/datasets/DDI_datasets/drugbank.py b/deepdrugdomain/data/datasets/DDI_datasets/drugbank.py
@@ -0,0 +1,65 @@
+import os
+from typing import Dict, List, Optional, Tuple, Union
+from deepdrugdomain.data.utils import CustomDataset
+from deepdrugdomain.data.preprocessing.utils.preprocessing_data_struct import PreprocessingObject
+from ..factory import DatasetFactory
+
+
+@DatasetFactory.register('drugbank_ddi')
+class DrugBankDDIDataset(CustomDataset):
+    """
+    Dataset class for DrugBank drug-drug interaction data.
+
+    This class extends CustomDataset to provide a structured way to load and preprocess the DrugBank interaction 
+    datasets. It supports the integration of drug and drug data, along with their corresponding labels, for tasks 
+    such as interaction prediction.
+
+    Parameters:
+        file_paths (str): Directory path where data files are stored or to be downloaded.
+        preprocesses (PreprocessingObject): Preprocessing configuration(s) for drug, protein, and label data.
+        save_directory (Optional[str]): The directory to save processed files, defaults to `file_paths` if None.
+        urls (Optional[Union[List[str], str]]): URLs to download the dataset files if not present at `file_paths`.
+        common_columns (Optional[Union[Dict[str, str], List[Dict[str, str]]]]): Mapping of common column names to the 
+            expected format.
+        separators (Union[List[str], str], optional): List of separators used in the data files.
+        associated_model (Optional[str]): The name of the model associated with the dataset, if any.
+        threads (int, optional): Number of threads to use for data processing.
+
+    Example:
+        >>> dataset = DrugBankDataset(
+        ...     file_paths='/data/drugbank/',
+        ...     drug_preprocess_type=('canonical_smiles', {'remove_hydrogens': True}),
+        ...     protein_preprocess_type=('sequence', {'tokenization': 'char'}),
+        ...     protein_attributes='sequence',
+        ...     in_memory_preprocessing_protein=True,
+        ... )
+        >>> train_dataset, val_dataset, test_dataset = dataset.split(splits=[0.8, 0.1, 0.1], return_df=False) 
+        >>> # Preprocess and split the dataset into train, validation, and test sets and prepare data for training or analysis
+        >>> drugbank_dataframe = dataset.to_dataframe()  # Get the raw dataset as a pandas DataFrame
+
+    Note:
+        The class automatically downloads the necessary files if they are not available in the given `file_paths` during 
+        initialization, using the provided `urls` for data source.
+    """
+
+    def __init__(self, file_paths: str,
+                 preprocesses: PreprocessingObject,
+                 save_directory: str | None = None,
+                 # Edit the URL
+                 urls: List[str] | str | None = ['https://github.com/khodabandeh-ali/D3-NewTasks/blob/main/data/drugbank/drugbank_DDI.tab'],
+                 common_columns: Dict[str,
+                                      str] | List[Dict[str, str]] | None = {},
+                 separators: List[str] | str = ['\t'],
+                 associated_model: str | None = None,
+                 threads: int = 4) -> None:
+
+        self.file_paths = file_paths
+        drugbank_data_path = os.path.join(self.file_paths, 'drugbank_DDI.tab')
+
+        file_paths = [drugbank_data_path]
+        save_directory = self.file_paths if save_directory is None else save_directory
+        super().__init__(file_paths, preprocesses, save_directory, urls,
+                         common_columns, separators, associated_model, None, threads)
+
+        if not os.path.exists(drugbank_data_path):
+            self.download()
diff --git a/deepdrugdomain/data/datasets/__init__.py b/deepdrugdomain/data/datasets/__init__.py
@@ -1,3 +1,4 @@
 from .DTI_datasets import *
 from .DTA_datasets import *
 from .factory import DatasetFactory
+from .DDI_datasets import *
diff --git a/deepdrugdomain/data/preprocessing/drug/smile_to_dgl_graph.py b/deepdrugdomain/data/preprocessing/drug/smile_to_dgl_graph.py
@@ -85,7 +85,7 @@ def preprocess(self, data: str) -> Optional[dgl.DGLGraph]:
                     return None
                 smile_graphs = [smiles_to_bigraph(
                     f, add_self_loop=True, node_featurizer=self.node_featurizer, edge_featurizer=self.edge_featurizer) for f in frags]
-                constructed_graphs = dgl.batch(smile_graphs)
+                constructed_graphs = smile_graphs
 
             except Exception as e:
                 constructed_graphs = None
@@ -109,7 +109,13 @@ def preprocess(self, data: str) -> Optional[dgl.DGLGraph]:
         return constructed_graphs
 
     def save_data(self, data: dgl.DGLGraph, path: str) -> None:
-        dgl.save_graphs(path, [data])
+        if not isinstance(data, dgl.DGLGraph):  # e.g. the list of per-fragment graphs that preprocess() builds
+            super().save_data(data, path)  # anything that is not a single DGLGraph is delegated to the parent class
+        else:
+            dgl.save_graphs(path, [data])  # a single graph is wrapped in a one-element list for dgl.save_graphs
 
     def load_data(self, path: str) -> dgl.DGLGraph:
-        return dgl.load_graphs(path)[0][0]
+        if self.fragment:  # NOTE(review): save_data branches on the data's type, not on this flag — confirm the two always agree
+            return super().load_data(path)  # presumably reads back what the parent's save_data wrote — TODO confirm
+        else:
+            return dgl.load_graphs(path)[0][0]  # [0] selects the graph list, the second [0] the single stored graph
diff --git a/deepdrugdomain/layers/graph_layers/dgl_layers.py b/deepdrugdomain/layers/graph_layers/dgl_layers.py
@@ -120,7 +120,7 @@ def forward(self, g: dgl.DGLGraph) -> dgl.DGLGraph:
         features = self.dropout(features)
 
         new_g = g
-        new_g.ndata['h'] = features
+        new_g.ndata['h'] = torch.mean(features, dim=1)
 
         return new_g
 

diff --git a/deepdrugdomain/models/DDI/__init__.py b/deepdrugdomain/models/DDI/__init__.py
@@ -0,0 +1 @@
+from .fx_ddi import FragXSiteDDI