ran black

rsinghlab · Nov 2, 2024 · 3dc3687 · 3dc3687
1 parent 3485f77
commit 3dc3687
Show file tree

Hide file tree

Showing 87 changed files with 240 additions and 163 deletions.
diff --git a/Analysis.ipynb b/Analysis.ipynb
@@ -137,9 +137,7 @@
    "source": [
     "# Convert the data to a DataFrame for easier manipulation\n",
     "adata.obs = adata.obs.reset_index(drop=False)\n",
-    "df_expression = pd.DataFrame(\n",
-    "    adata.X.todense(), columns=adata.var_names\n",
-    ")\n",
+    "df_expression = pd.DataFrame(adata.X.todense(), columns=adata.var_names)\n",
     "df_expression[\"sex\"] = adata.obs[\"sex\"]\n",
     "df_expression[\"age\"] = adata.obs[\"age\"]\n",
     "\n",
@@ -200,7 +198,6 @@
     }
    ],
    "source": [
-    "\n",
     "# Plot gene expression over age by sex in a subplot grid\n",
     "import math\n",
     "\n",
@@ -212,7 +209,7 @@
     "rows = math.ceil(num_genes / cols)\n",
     "\n",
     "# Create subplots\n",
-    "fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 4*rows))\n",
+    "fig, axes = plt.subplots(rows, cols, figsize=(5 * cols, 4 * rows))\n",
     "axes = axes.flatten()\n",
     "\n",
     "# Colors for sexes\n",
@@ -387,7 +384,7 @@
     "rows = math.ceil(num_genes / cols)\n",
     "\n",
     "# Create subplots\n",
-    "fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows))\n",
+    "fig, axes = plt.subplots(rows, cols, figsize=(4 * cols, 4 * rows))\n",
     "axes = axes.flatten()  # Flatten in case of single row\n",
     "\n",
     "for idx, gene in enumerate(valid_matched_genes):\n",
@@ -421,7 +418,9 @@
     "\n",
     "# Calculate total variance explained\n",
     "total_variance_explained = sum(adata_vis.uns[\"pca\"][\"variance_ratio\"])\n",
-    "print(f\"Total variance explained by the first 50 PCs: {total_variance_explained*100:.2f}%\")\n",
+    "print(\n",
+    "    f\"Total variance explained by the first 50 PCs: {total_variance_explained*100:.2f}%\"\n",
+    ")\n",
     "\n",
     "# Compute the neighborhood graph and UMAP\n",
     "sc.pp.neighbors(adata_vis, n_pcs=50)\n",

diff --git a/Analysis/uncorrected/head/CNN/age/full_data/celltype_all/sextype_all/Results/Stats.JSON b/Analysis/uncorrected/head/CNN/age/full_data/celltype_all/sextype_all/Results/Stats.JSON
@@ -1,15 +1,15 @@
 {
     "Test": {
-        "Accuracy": "68.33%",
-        "Precision": "70.42%",
-        "Recall": "64.02%",
-        "F1": "64.33%",
-        "AUC": "87.49%"
+        "Accuracy": "94.37%",
+        "Precision": "93.74%",
+        "Recall": "93.94%",
+        "F1": "93.81%",
+        "AUC": "99.45%"
     },
     "Baseline": {
-        "Accuracy": "33.33%",
-        "Precision": "8.33%",
+        "Accuracy": "33.67%",
+        "Precision": "8.42%",
         "Recall": "25.00%",
-        "F1": "12.50%"
+        "F1": "12.60%"
     }
 }
diff --git a/...ed/head/CNN/age/full_data/celltype_all/sextype_all/Results/confusion_matrix.png b/...ed/head/CNN/age/full_data/celltype_all/sextype_all/Results/confusion_matrix.png
diff --git a/...corrected/head/CNN/age/full_data/celltype_all/sextype_all/Results/roc_curve.png b/...corrected/head/CNN/age/full_data/celltype_all/sextype_all/Results/roc_curve.png
diff --git a/...ed/head/CNN/age/full_data/celltype_all/sextype_all/Results/training_metrics.png b/...ed/head/CNN/age/full_data/celltype_all/sextype_all/Results/training_metrics.png
diff --git a/Analysis/uncorrected/head/CNN/age/no_sex/celltype_all/sextype_all/Results/Stats.JSON b/Analysis/uncorrected/head/CNN/age/no_sex/celltype_all/sextype_all/Results/Stats.JSON
@@ -1,15 +1,15 @@
 {
     "Test": {
-        "Accuracy": "71.08%",
-        "Precision": "69.68%",
-        "Recall": "67.44%",
-        "F1": "67.79%",
-        "AUC": "90.37%"
+        "Accuracy": "79.62%",
+        "Precision": "77.77%",
+        "Recall": "78.26%",
+        "F1": "77.67%",
+        "AUC": "95.17%"
     },
     "Baseline": {
-        "Accuracy": "34.50%",
-        "Precision": "8.62%",
+        "Accuracy": "33.67%",
+        "Precision": "8.42%",
         "Recall": "25.00%",
-        "F1": "12.83%"
+        "F1": "12.60%"
     }
 }
diff --git a/Analysis/uncorrected/head/CNN/age/no_sex/celltype_all/sextype_female/Results/Stats.JSON b/Analysis/uncorrected/head/CNN/age/no_sex/celltype_all/sextype_female/Results/Stats.JSON
@@ -0,0 +1,15 @@
+{
+    "Test": {
+        "Accuracy": "81.55%",
+        "Precision": "79.06%",
+        "Recall": "79.09%",
+        "F1": "78.97%",
+        "AUC": "95.50%"
+    },
+    "Baseline": {
+        "Accuracy": "33.69%",
+        "Precision": "8.42%",
+        "Recall": "25.00%",
+        "F1": "12.60%"
+    }
+}
diff --git a/Analysis/uncorrected/head/CNN/age/no_sex/celltype_all/sextype_male/Results/Stats.JSON b/Analysis/uncorrected/head/CNN/age/no_sex/celltype_all/sextype_male/Results/Stats.JSON
@@ -0,0 +1,15 @@
+{
+    "Test": {
+        "Accuracy": "86.26%",
+        "Precision": "83.79%",
+        "Recall": "83.10%",
+        "F1": "83.41%",
+        "AUC": "97.07%"
+    },
+    "Baseline": {
+        "Accuracy": "34.78%",
+        "Precision": "8.69%",
+        "Recall": "25.00%",
+        "F1": "12.90%"
+    }
+}
diff --git a/Analysis/uncorrected/head/CNN/age/no_sex/celltype_cns neuron/sextype_all/Results/Stats.JSON b/Analysis/uncorrected/head/CNN/age/no_sex/celltype_cns neuron/sextype_all/Results/Stats.JSON
@@ -0,0 +1,15 @@
+{
+    "Test": {
+        "Accuracy": "81.66%",
+        "Precision": "80.38%",
+        "Recall": "78.06%",
+        "F1": "78.83%",
+        "AUC": "95.51%"
+    },
+    "Baseline": {
+        "Accuracy": "37.57%",
+        "Precision": "9.39%",
+        "Recall": "25.00%",
+        "F1": "13.65%"
+    }
+}
diff --git a/...sis/uncorrected/head/CNN/age/no_sex/celltype_cns neuron/sextype_female/Results/Stats.JSON b/...sis/uncorrected/head/CNN/age/no_sex/celltype_cns neuron/sextype_female/Results/Stats.JSON
@@ -0,0 +1,15 @@
+{
+    "Test": {
+        "Accuracy": "82.06%",
+        "Precision": "78.87%",
+        "Recall": "78.81%",
+        "F1": "78.82%",
+        "AUC": "95.24%"
+    },
+    "Baseline": {
+        "Accuracy": "37.26%",
+        "Precision": "9.32%",
+        "Recall": "25.00%",
+        "F1": "13.57%"
+    }
+}
diff --git a/Analysis/uncorrected/head/CNN/age/no_sex/celltype_cns neuron/sextype_male/Results/Stats.JSON b/Analysis/uncorrected/head/CNN/age/no_sex/celltype_cns neuron/sextype_male/Results/Stats.JSON
@@ -0,0 +1,15 @@
+{
+    "Test": {
+        "Accuracy": "85.34%",
+        "Precision": "83.14%",
+        "Recall": "79.25%",
+        "F1": "80.52%",
+        "AUC": "95.90%"
+    },
+    "Baseline": {
+        "Accuracy": "37.87%",
+        "Precision": "9.47%",
+        "Recall": "25.00%",
+        "F1": "13.73%"
+    }
+}
diff --git a/...is/uncorrected/head/CNN/age/no_sex/celltype_sensory neuron/sextype_all/Results/Stats.JSON b/...is/uncorrected/head/CNN/age/no_sex/celltype_sensory neuron/sextype_all/Results/Stats.JSON
@@ -0,0 +1,15 @@
+{
+    "Test": {
+        "Accuracy": "81.05%",
+        "Precision": "80.31%",
+        "Recall": "80.16%",
+        "F1": "80.10%",
+        "AUC": "95.60%"
+    },
+    "Baseline": {
+        "Accuracy": "30.93%",
+        "Precision": "7.73%",
+        "Recall": "25.00%",
+        "F1": "11.81%"
+    }
+}
diff --git a/Code/__pycache__/Config.cpython-39.pyc b/Code/__pycache__/Config.cpython-39.pyc
diff --git a/Code/__pycache__/Model.cpython-39.pyc b/Code/__pycache__/Model.cpython-39.pyc
diff --git a/Code/__pycache__/Utilities.cpython-39.pyc b/Code/__pycache__/Utilities.cpython-39.pyc
diff --git a/Code/__pycache__/eda.cpython-39.pyc b/Code/__pycache__/eda.cpython-39.pyc
diff --git a/Code/__pycache__/interpreter.cpython-39.pyc b/Code/__pycache__/interpreter.cpython-39.pyc
diff --git a/Code/__pycache__/pipeline_manager.cpython-39.pyc b/Code/__pycache__/pipeline_manager.cpython-39.pyc
diff --git a/Code/__pycache__/preprocess.cpython-39.pyc b/Code/__pycache__/preprocess.cpython-39.pyc
diff --git a/Code/__pycache__/visuals.cpython-39.pyc b/Code/__pycache__/visuals.cpython-39.pyc
diff --git a/Code/config.py b/Code/config.py
@@ -18,7 +18,7 @@
             "tissue": "head",  # Options: 'head', 'body', 'all'
             "model_type": "CNN",  # Options: 'CNN', 'MLP', 'XGBoost', 'RandomForest', 'LogisticRegression'
             "encoding_variable": "age",  # Options: 'sex_age', 'sex', 'age'
-            "cell_type": "all",  # Options: 'all', 'CNS neuron', 'sensory neuron', 'epithelial cell', 'fat cell', 'glial cell', 'muscle cell'
+            "cell_type": "sensory neuron",  # Options: 'all', 'CNS neuron', 'sensory neuron', 'epithelial cell', 'fat cell', 'glial cell', 'muscle cell'
             "sex_type": "all",         # Options: 'all', 'male', 'female'
         },
         "Sampling": {
@@ -62,16 +62,16 @@
     },
     "GenePreprocessing": {
         "GeneFiltering": {
-            "remove_sex_genes": False,          # Options: True, False
+            "remove_sex_genes": True,          # Options: True, False
             "remove_autosomal_genes": False,    # Options: True, False
             "only_keep_lnc_genes": False,       # Options: True, False
             "remove_lnc_genes": False,          # Options: True, False
-            "remove_unaccounted_genes": False,  # Options: True, False
+            "remove_unaccounted_genes": True,  # Options: True, False
             "select_batch_genes": False,        # Options: True, False #need to create directories for this
             "highly_variable_genes": False,     # Options: True, False #need to create directories for this
         },
         "GeneBalancing": {
-            "balance_genes": False,             # Options: True, False
+            "balance_genes": True,             # Options: True, False
             "balance_lnc_genes": False,         # Options: True, False
         },
         "GeneShuffle": {
@@ -80,16 +80,16 @@
         },
     },
     "FeatureImportanceAndVisualizations": {
-        "run_visualization": True,       # Options: True, False
-        "run_interpreter": True,        # Options: True, False (SHAP)
+        "run_visualization": False,       # Options: True, False
+        "run_interpreter": False,        # Options: True, False (SHAP)
         "load_SHAP": False,              # Options: True to load SHAP values, False to compute them, only works if run_interpreter is True
         "reference_size": 5000,          # Reference data size for SHAP
         "save_predictions": False,        # Options: True, False; (Model predictions csv file)
     },
     "DataSplit": {
         "validation_split": 0.1,           # Fraction of data for validation
         "test_split": 0.1,                 # Fraction of data for testing
-        "random_state": 42,               # Random state for reproducibility
+        "random_state": 11,               # Random state for reproducibility
     },
     "Training": {
         "epochs": 15,                      # Number of epochs for training
@@ -104,7 +104,7 @@
             "dropout_rate": 0.3,               # Dropout rate
             "learning_rate": 0.0006,           # Learning rate
             "activation_function": "relu",     # Activation function
-            "reference_size": 1000,            # Reference data size for SHAP
+
         },
         "CNN_Model": {
             "filters": [32, 64, 128],          # Number of filters in each convolutional layer

diff --git a/Code/interpreter.py b/Code/interpreter.py
@@ -134,10 +134,10 @@ def __init__(
         """
         self.config = config
         self.model = model
-        self.test_data = test_data  
-        self.test_labels = test_labels  
+        self.test_data = test_data
+        self.test_labels = test_labels
         self.label_encoder = label_encoder
-        self.reference_data = reference_data 
+        self.reference_data = reference_data
         self.path_manager = path_manager
 
         self.shap_dir = self.path_manager.get_visualization_directory(
@@ -213,25 +213,40 @@ def compute_shap_values(self):
 
         # Adjust SHAP values and test data shapes based on the system type
         device = self.config.Device.processor.lower()
-        if device == 'm':
+        if device == "m":
             # Adjust SHAP values for macOS
             if isinstance(shap_values, list):
-                squeezed_shap_values = [np.squeeze(val, axis=1) if val.ndim >= 3 else val for val in shap_values]
+                squeezed_shap_values = [
+                    np.squeeze(val, axis=1) if val.ndim >= 3 else val
+                    for val in shap_values
+                ]
             else:
-                squeezed_shap_values = (np.squeeze(shap_values, axis=1) if shap_values.ndim >= 3 else shap_values)
+                squeezed_shap_values = (
+                    np.squeeze(shap_values, axis=1)
+                    if shap_values.ndim >= 3
+                    else shap_values
+                )
 
         else:
             # Adjust SHAP values for Windows
             if isinstance(shap_values, list):
-                squeezed_shap_values = [np.squeeze(val, axis=1) if val.ndim > 3 else val for val in shap_values]
+                squeezed_shap_values = [
+                    np.squeeze(val, axis=1) if val.ndim > 3 else val
+                    for val in shap_values
+                ]
             else:
-                squeezed_shap_values = (np.squeeze(shap_values, axis=1) if shap_values.ndim > 3 else shap_values)
+                squeezed_shap_values = (
+                    np.squeeze(shap_values, axis=1)
+                    if shap_values.ndim > 3
+                    else shap_values
+                )
 
             # Convert the SHAP values to a list of arrays for compatibility with the rest of the code
             squeezed_shap_values = [
-                squeezed_shap_values[:, :, i] for i in range(squeezed_shap_values.shape[2])
+                squeezed_shap_values[:, :, i]
+                for i in range(squeezed_shap_values.shape[2])
             ]
-            
+
         return squeezed_shap_values, squeezed_test_data
 
     def save_shap_values(self, shap_values):
@@ -250,7 +265,9 @@ def save_shap_values(self, shap_values):
             ),
             "model_weights_hash": model_weights_hash,
             "test_data_hash": self.compute_sha256_hash(self.test_data.tobytes()),
-            "reference_data_hash": self.compute_sha256_hash(self.reference_data.tobytes()),
+            "reference_data_hash": self.compute_sha256_hash(
+                self.reference_data.tobytes()
+            ),
         }
 
         # Save SHAP values, metadata, and the data
@@ -590,7 +607,9 @@ def save_predictions_to_csv(self, file_name_template="{}_{}_predictions.csv"):
             )
 
             # Determine the relevant train and test attributes based on the method
-            method = self.config.DataParameters.TrainTestSplit.method  # This could be 'sex', 'tissue', etc.
+            method = (
+                self.config.DataParameters.TrainTestSplit.method
+            )  # This could be 'sex', 'tissue', etc.
             train_attribute = self.config.DataParameters.TrainTestSplit.train.get(
                 method, "unknown"
             )
@@ -621,4 +640,4 @@ def compute_metrics(self):
         """
         self._evaluate_model_performance()
         self._calculate_and_save_metrics()
-        self.save_predictions_to_csv()
+        self.save_predictions_to_csv()