Add method: CellMapper (#10)

Marius1311 · web-flow · commit 11beedc73fa7 · 2025-08-07T08:54:15.000+02:00
* Add linear version of CellMapper

* Update object naming

* Add CellMapper scVI variant

* Use modality-dependent scvi models

* Improve logging and docs

* Add cellmapper to workflow config files

* Update the Changelog

* Add clr normalization for adt counts

* Add CLR normalization for ADT data

* Correctly set the path to the utils file

* Remove accidentally committed __pycache__ file and improve .gitignore
diff --git a/.gitignore b/.gitignore
@@ -8,4 +8,7 @@
 /output
 trace-*
 .ipynb_checkpoints
-/temp
+/temp
+__pycache__/
+*.pyc
+*.pyo
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ## NEW FUNCTIONALITY
 
+* Added CellMapper method (two variants: simple PCA/CCA fallback and modality-specific scvi-tools models for joint mod1 representation) (PR #10)
+
 * Added Novel method (PR #2).
 
 * Added Simple MLP method (PR #3).
diff --git a/src/methods/cellmapper_linear/config.vsh.yaml b/src/methods/cellmapper_linear/config.vsh.yaml
@@ -0,0 +1,73 @@
+__merge__: ../../api/comp_method.yaml
+name: cellmapper_linear
+label: CellMapper+PCA/CCA
+summary: "Modality prediction in a PCA/CCA space using CellMapper"
+description: |
+  CellMapper is a general framework for k-NN based mapping tasks in single-cell and spatial genomics.
+  This variant uses CellMapper to project modalities from a reference dataset (train) onto a query dataset (test) in a PCA/CCA latent space.
+references:
+  doi: 
+    - 10.5281/zenodo.15683594
+links:
+  documentation: https://cellmapper.readthedocs.io/en/latest/
+  repository: https://github.com/quadbio/cellmapper
+info:
+  preferred_normalization: log_cp10k
+  variants: 
+    cellmapper-pca:
+      fallback_representation: joint_pca
+      mask_var: None
+      kernel_method: hnoca
+    cellmapper-pca-hvg:
+      fallback_representation: joint_pca
+      mask_var: "hvg"
+      kernel_method: hnoca
+    cellmapper-pca-hvg-gauss:
+      fallback_representation: joint_pca
+      mask_var: "hvg"
+      kernel_method: gauss
+    cellmapper-cca:
+      fallback_representation: fast_cca
+      mask_var: None
+      kernel_method: hnoca
+    cellmapper-cca-hvg:
+      fallback_representation: fast_cca
+      mask_var: "hvg"
+      kernel_method: hnoca
+    cellmapper-cca-hvg-gauss:
+      fallback_representation: fast_cca
+      mask_var: "hvg"
+      kernel_method: gauss
+arguments:
+  - name: "--fallback_representation"
+    type: "string"
+    choices: ["joint_pca", "fast_cca"]
+    default: "fast_cca"
+    description: Fallback representation to use for k-NN mapping (computed if use_rep is None).
+  - name: "--mask_var"
+    type: "string"
+    description: Variable to mask for fallback representation.
+  - name: "--kernel_method"
+    type: "string"
+    choices: ["hnoca", "gauss"]
+    default: "hnoca"
+    description: Kernel function to compute k-NN edge weights. 
+  - name: "--n_neighbors"
+    type: "integer"
+    default: 30
+    description: Number of neighbors to consider for k-NN graph construction.
+resources:
+  - type: python_script
+    path: script.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    setup:
+      - type: python
+        packages: 
+          - cellmapper>=0.2.2
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/methods/cellmapper_linear/script.py b/src/methods/cellmapper_linear/script.py
@@ -0,0 +1,60 @@
+import anndata as ad
+import cellmapper as cm
+from scipy.sparse import csc_matrix
+
+## VIASH START
+# Note: this section is auto-generated by viash at runtime. To edit it, make changes
+# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
+par = {
+    'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad',
+    'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad',
+    'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/normal/test_mod1.h5ad',
+    'output': 'output.h5ad',
+    'fallback_representation': 'joint_pca',  # or None for fallback_representation
+    'n_neighbors': 30,
+    'kernel_method': 'gauss',
+    'mask_var': "hvg"  # variable to mask for fallback representation
+}
+meta = {
+  'name': 'cellmapper_linear',
+}
+## VIASH END
+
+print('Reading input files', flush=True)
+input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
+input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
+input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])
+
+print('Prepare the data', flush=True)
+# Make sure we have normalized data in .X for mod1
+input_train_mod1.X = input_train_mod1.layers["normalized"].copy()
+input_test_mod1.X = input_test_mod1.layers["normalized"].copy()
+
+# copy the normalized layer to obsm for mod2
+input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"] 
+
+print("Set up and prepare Cellmapper", flush=True)
+cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1)
+cmap.compute_neighbors(
+    use_rep=None,
+    fallback_representation=par['fallback_representation'],
+    n_neighbors=par['n_neighbors'],
+    fallback_kwargs={"mask_var": par['mask_var']},
+    )
+cmap.compute_mapping_matrix(kernel_method=par['kernel_method'])
+
+print("Predict on test data", flush=True)
+cmap.map_obsm(key="mod2", prediction_postfix="pred")
+mod2_pred = csc_matrix(cmap.query.obsm["mod2_pred"])
+
+print("Write output AnnData to file", flush=True)
+output = ad.AnnData(
+    layers={"normalized": mod2_pred},
+    obs=input_test_mod1.obs,
+    var=input_train_mod2.var,
+    uns={
+        'dataset_id': input_train_mod1.uns['dataset_id'],
+        'method_id': meta["name"],
+    },
+)
+output.write_h5ad(par['output'], compression='gzip')
diff --git a/src/methods/cellmapper_scvi/config.vsh.yaml b/src/methods/cellmapper_scvi/config.vsh.yaml
@@ -0,0 +1,84 @@
+__merge__: ../../api/comp_method.yaml
+name: cellmapper_scvi
+label: CellMapper+scVI
+summary: "Modality prediction in an scVI latent space using CellMapper"
+description: |
+  CellMapper is a general framework for k-NN based mapping tasks in single-cell and spatial genomics.
+  This variant uses CellMapper to project modalities from a reference dataset (train) onto a query dataset 
+  (test) in a modality-specific latent space computed with suitable scvi-tools models. For gene expression data, 
+  we use the scVI model on raw counts (nb likelihood), for ADT data, we use the scVI models on normalized counts 
+  (gaussian likelihood), and for ATAC data, we use the PeakVI model on raw counts. The actual CellMapper pipeline is 
+  modality-agnostic. 
+references:
+  doi: 
+    - 10.5281/zenodo.15683594
+links:
+  documentation: https://cellmapper.readthedocs.io/en/latest/
+  repository: https://github.com/quadbio/cellmapper
+info:
+  preferred_normalization: log_cp10k
+  variants: 
+    cellmapper_hnoca_hvg:
+      kernel_method: hnoca
+      use_hvg: true
+      adt_normalization: clr
+    cellmapper_hnoca_all_genes:
+      kernel_method: hnoca
+      use_hvg: false
+      adt_normalization: clr
+    cellmapper_gauss_hvg:
+      kernel_method: gauss
+      use_hvg: true
+      adt_normalization: clr
+    cellmapper_gauss_hvg_log_cp10k:
+      kernel_method: gauss
+      use_hvg: true
+      adt_normalization: log_cp10k
+    cellmapper_gauss_all_genes:
+      kernel_method: gauss
+      use_hvg: false
+      adt_normalization: clr
+
+arguments:
+  - name: "--kernel_method"
+    type: "string"
+    choices: ["hnoca", "gauss"]
+    default: "hnoca"
+    description: Kernel function to compute k-NN edge weights (CellMapper parameter). 
+  - name: "--n_neighbors"
+    type: "integer"
+    default: 30
+    description: Number of neighbors to consider for k-NN graph construction (CellMapper parameter).
+  - name: "--use_hvg"
+    type: boolean
+    default: true
+    description: Whether to use highly variable genes (HVG) for the mapping (Generic analysis parameter).
+  - name: "--adt_normalization"
+    type: "string"
+    choices: ["clr", "log_cp10k"]
+    default: "clr"
+    description: Normalization method for ADT data, clr = centered log ratio. 
+  - name: "--plot_umap"
+    type: boolean
+    default: false
+    description: Whether to plot the UMAP embedding of the latent space (for diagnoscic purposes)
+resources:
+  - type: python_script
+    path: script.py
+  - path: utils.py
+    dest: utils.py 
+engines:
+  - type: docker
+    image: openproblems/base_pytorch_nvidia:1.0.0
+    setup:
+      - type: python
+        packages: 
+          - cellmapper>=0.2.2
+          - scvi-tools>=1.3.0
+          - muon>=0.1.6
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu,gpu]
diff --git a/src/methods/cellmapper_scvi/script.py b/src/methods/cellmapper_scvi/script.py
@@ -0,0 +1,80 @@
+import sys
+import anndata as ad
+import cellmapper as cm
+from scipy.sparse import csc_matrix
+
+## VIASH START
+# Note: this section is auto-generated by viash at runtime. To edit it, make changes
+# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
+par = {
+    'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad',
+    'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad',
+    'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad',
+    'output': 'output.h5ad',
+    'n_neighbors': 30, 
+    'kernel_method': 'hnoca',
+    'use_hvg': False,
+    'adt_normalization': 'clr',  # Normalization method for ADT data
+    'plot_umap': True,
+
+}
+meta = {
+  'name': 'cellmapper_scvi',
+  'resources_dir': 'target/executable/methods/cellmapper_scvi',
+}
+## VIASH END
+
+sys.path.append(meta['resources_dir'])
+from utils import get_representation
+
+print('Reading input files', flush=True)
+input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
+input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
+input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])
+
+mod1 = input_train_mod1.uns['modality']
+mod2 = input_train_mod2.uns['modality']
+print(f"Modality 1: {mod1}, n_features: {input_train_mod1.n_vars}", flush=True)
+print(f"Modality 2: {mod2}, n_features: {input_train_mod2.n_vars}", flush=True)
+
+print("Concatenating train and test data", flush=True)
+adata = ad.concat(
+    [input_train_mod1, input_test_mod1], merge = "same", label="split", keys=["train", "test"]
+    )
+
+# Compute a latent representation using an appropriate model based on the modality
+print("Get latent representation", flush=True)
+adata = get_representation(
+    adata=adata, modality=mod1, use_hvg=par['use_hvg'], adt_normalization=par['adt_normalization'], plot_umap=par['plot_umap']
+    )
+
+# Place the representation back into individual objects
+input_train_mod1.obsm["X_scvi"] = adata[adata.obs["split"] == "train"].obsm["X_scvi"].copy()
+input_test_mod1.obsm["X_scvi"] = adata[adata.obs["split"] == "test"].obsm["X_scvi"].copy()
+
+# copy the normalized layer to obsm for mod2
+input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"] 
+
+print('Setup and prepare Cellmapper', flush=True)
+cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1)
+cmap.compute_neighbors(
+    use_rep="X_scvi",
+    n_neighbors=par['n_neighbors'], 
+    )
+cmap.compute_mapping_matrix(kernel_method=par['kernel_method'])
+
+print("Predict on test data", flush=True)
+cmap.map_obsm(key="mod2", prediction_postfix="pred")
+mod2_pred = csc_matrix(cmap.query.obsm["mod2_pred"])
+
+print("Write output AnnData to file", flush=True)
+output = ad.AnnData(
+    layers={"normalized": mod2_pred},
+    obs=input_test_mod1.obs,
+    var=input_train_mod2.var,
+    uns={
+        'dataset_id': input_train_mod1.uns['dataset_id'],
+        'method_id': meta["name"],
+    },
+)
+output.write_h5ad(par['output'], compression='gzip')
diff --git a/src/methods/cellmapper_scvi/utils.py b/src/methods/cellmapper_scvi/utils.py
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf