Skip to content

Commit 11beedc

Browse files
authored
Add method: CellMapper (#10)
* Add linear version of CellMapper * Update object naming * Add CellMapper scVI variant * Use modality-dependent scvi models * Improve logging and docs * Add cellmapper to workflow config files * Update the Changelog * Add clr normalization for adt counts * Add CLR normalization for ADT data * Correctly set the path to the utils file * Remove accidentally committed __pycache__ file and improve .gitignore
1 parent 99fb09a commit 11beedc

File tree

9 files changed

+407
-1
lines changed

9 files changed

+407
-1
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,7 @@
88
/output
99
trace-*
1010
.ipynb_checkpoints
11-
/temp
11+
/temp
12+
__pycache__/
13+
*.pyc
14+
*.pyo

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
## NEW FUNCTIONALITY
44

5+
* Added CellMapper method (two variants: simple PCA/CCA fallback and modality-specific scvi-tools models for joint mod1 representation) (PR #10)
6+
57
* Added Novel method (PR #2).
68

79
* Added Simple MLP method (PR #3).
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
__merge__: ../../api/comp_method.yaml
2+
name: cellmapper_linear
3+
label: CellMapper+PCA/CCA
4+
summary: "Modality prediction in a PCA/CCA space using CellMapper"
5+
description: |
6+
CellMapper is a general framework for k-NN based mapping tasks in single-cell and spatial genomics.
7+
This variant uses CellMapper to project modalities from a reference dataset (train) onto a query dataset (test) in a PCA/CCA latent space.
8+
references:
9+
doi:
10+
- 10.5281/zenodo.15683594
11+
links:
12+
documentation: https://cellmapper.readthedocs.io/en/latest/
13+
repository: https://github.com/quadbio/cellmapper
14+
info:
15+
preferred_normalization: log_cp10k
16+
variants:
17+
cellmapper-pca:
18+
fallback_representation: joint_pca
19+
mask_var: None
20+
kernel_method: hnoca
21+
cellmapper-pca-hvg:
22+
fallback_representation: joint_pca
23+
mask_var: "hvg"
24+
kernel_method: hnoca
25+
cellmapper-pca-hvg-gauss:
26+
fallback_representation: joint_pca
27+
mask_var: "hvg"
28+
kernel_method: gauss
29+
cellmapper-cca:
30+
fallback_representation: fast_cca
31+
mask_var: None
32+
kernel_method: hnoca
33+
cellmapper-cca-hvg:
34+
fallback_representation: fast_cca
35+
mask_var: "hvg"
36+
kernel_method: hnoca
37+
cellmapper-cca-hvg-gauss:
38+
fallback_representation: fast_cca
39+
mask_var: "hvg"
40+
kernel_method: gauss
41+
arguments:
42+
- name: "--fallback_representation"
43+
type: "string"
44+
choices: ["joint_pca", "fast_cca"]
45+
default: "fast_cca"
46+
description: Fallback representation to use for k-NN mapping (computed if use_rep is None).
47+
- name: "--mask_var"
48+
type: "string"
49+
description: Variable to mask for fallback representation.
50+
- name: "--kernel_method"
51+
type: "string"
52+
choices: ["hnoca", "gauss"]
53+
default: "hnoca"
54+
description: Kernel function to compute k-NN edge weights.
55+
- name: "--n_neighbors"
56+
type: "integer"
57+
default: 30
58+
description: Number of neighbors to consider for k-NN graph construction.
59+
resources:
60+
- type: python_script
61+
path: script.py
62+
engines:
63+
- type: docker
64+
image: openproblems/base_python:1
65+
setup:
66+
- type: python
67+
packages:
68+
- cellmapper>=0.2.2
69+
runners:
70+
- type: executable
71+
- type: nextflow
72+
directives:
73+
label: [midtime,midmem,midcpu]
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import anndata as ad
2+
import cellmapper as cm
3+
from scipy.sparse import csc_matrix
4+
5+
## VIASH START
6+
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
7+
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
8+
par = {
9+
'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad',
10+
'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad',
11+
'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/normal/test_mod1.h5ad',
12+
'output': 'output.h5ad',
13+
'fallback_representation': 'joint_pca', # or None for fallback_representation
14+
'n_neighbors': 30,
15+
'kernel_method': 'gauss',
16+
'mask_var': "hvg" # variable to mask for fallback representation
17+
}
18+
meta = {
19+
'name': 'cellmapper_linear',
20+
}
21+
## VIASH END
22+
23+
print('Reading input files', flush=True)
24+
input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
25+
input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
26+
input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])
27+
28+
print('Prepare the data', flush=True)
29+
# Make sure we have normalized data in .X for mod1
30+
input_train_mod1.X = input_train_mod1.layers["normalized"].copy()
31+
input_test_mod1.X = input_test_mod1.layers["normalized"].copy()
32+
33+
# copy the normalized layer to obsm for mod2
34+
input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"]
35+
36+
print("Set up and prepare Cellmapper", flush=True)
37+
cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1)
38+
cmap.compute_neighbors(
39+
use_rep=None,
40+
fallback_representation=par['fallback_representation'],
41+
n_neighbors=par['n_neighbors'],
42+
fallback_kwargs={"mask_var": par['mask_var']},
43+
)
44+
cmap.compute_mapping_matrix(kernel_method=par['kernel_method'])
45+
46+
print("Predict on test data", flush=True)
47+
cmap.map_obsm(key="mod2", prediction_postfix="pred")
48+
mod2_pred = csc_matrix(cmap.query.obsm["mod2_pred"])
49+
50+
print("Write output AnnData to file", flush=True)
51+
output = ad.AnnData(
52+
layers={"normalized": mod2_pred},
53+
obs=input_test_mod1.obs,
54+
var=input_train_mod2.var,
55+
uns={
56+
'dataset_id': input_train_mod1.uns['dataset_id'],
57+
'method_id': meta["name"],
58+
},
59+
)
60+
output.write_h5ad(par['output'], compression='gzip')
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
__merge__: ../../api/comp_method.yaml
2+
name: cellmapper_scvi
3+
label: CellMapper+scVI
4+
summary: "Modality prediction in an scVI latent space using CellMapper"
5+
description: |
6+
CellMapper is a general framework for k-NN based mapping tasks in single-cell and spatial genomics.
7+
This variant uses CellMapper to project modalities from a reference dataset (train) onto a query dataset
8+
(test) in a modality-specific latent space computed with suitable scvi-tools models. For gene expression data,
9+
we use the scVI model on raw counts (nb likelihood), for ADT data, we use the scVI models on normalized counts
10+
(gaussian likelihood), and for ATAC data, we use the PeakVI model on raw counts. The actual CellMapper pipeline is
11+
modality-agnostic.
12+
references:
13+
doi:
14+
- 10.5281/zenodo.15683594
15+
links:
16+
documentation: https://cellmapper.readthedocs.io/en/latest/
17+
repository: https://github.com/quadbio/cellmapper
18+
info:
19+
preferred_normalization: log_cp10k
20+
variants:
21+
cellmapper_hnoca_hvg:
22+
kernel_method: hnoca
23+
use_hvg: true
24+
adt_normalization: clr
25+
cellmapper_hnoca_all_genes:
26+
kernel_method: hnoca
27+
use_hvg: false
28+
adt_normalization: clr
29+
cellmapper_gauss_hvg:
30+
kernel_method: gauss
31+
use_hvg: true
32+
adt_normalization: clr
33+
cellmapper_gauss_hvg_log_cp10k:
34+
kernel_method: gauss
35+
use_hvg: true
36+
adt_normalization: log_cp10k
37+
cellmapper_gauss_all_genes:
38+
kernel_method: gauss
39+
use_hvg: false
40+
adt_normalization: clr
41+
42+
arguments:
43+
- name: "--kernel_method"
44+
type: "string"
45+
choices: ["hnoca", "gauss"]
46+
default: "hnoca"
47+
description: Kernel function to compute k-NN edge weights (CellMapper parameter).
48+
- name: "--n_neighbors"
49+
type: "integer"
50+
default: 30
51+
description: Number of neighbors to consider for k-NN graph construction (CellMapper parameter).
52+
- name: "--use_hvg"
53+
type: boolean
54+
default: true
55+
description: Whether to use highly variable genes (HVG) for the mapping (Generic analysis parameter).
56+
- name: "--adt_normalization"
57+
type: "string"
58+
choices: ["clr", "log_cp10k"]
59+
default: "clr"
60+
description: Normalization method for ADT data, clr = centered log ratio.
61+
- name: "--plot_umap"
62+
type: boolean
63+
default: false
64+
description: Whether to plot the UMAP embedding of the latent space (for diagnoscic purposes)
65+
resources:
66+
- type: python_script
67+
path: script.py
68+
- path: utils.py
69+
dest: utils.py
70+
engines:
71+
- type: docker
72+
image: openproblems/base_pytorch_nvidia:1.0.0
73+
setup:
74+
- type: python
75+
packages:
76+
- cellmapper>=0.2.2
77+
- scvi-tools>=1.3.0
78+
- muon>=0.1.6
79+
80+
runners:
81+
- type: executable
82+
- type: nextflow
83+
directives:
84+
label: [midtime,midmem,midcpu,gpu]
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import sys
2+
import anndata as ad
3+
import cellmapper as cm
4+
from scipy.sparse import csc_matrix
5+
6+
## VIASH START
7+
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
8+
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
9+
par = {
10+
'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad',
11+
'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad',
12+
'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad',
13+
'output': 'output.h5ad',
14+
'n_neighbors': 30,
15+
'kernel_method': 'hnoca',
16+
'use_hvg': False,
17+
'adt_normalization': 'clr', # Normalization method for ADT data
18+
'plot_umap': True,
19+
20+
}
21+
meta = {
22+
'name': 'cellmapper_scvi',
23+
'resources_dir': 'target/executable/methods/cellmapper_scvi',
24+
}
25+
## VIASH END
26+
27+
sys.path.append(meta['resources_dir'])
28+
from utils import get_representation
29+
30+
print('Reading input files', flush=True)
31+
input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
32+
input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
33+
input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])
34+
35+
mod1 = input_train_mod1.uns['modality']
36+
mod2 = input_train_mod2.uns['modality']
37+
print(f"Modality 1: {mod1}, n_features: {input_train_mod1.n_vars}", flush=True)
38+
print(f"Modality 2: {mod2}, n_features: {input_train_mod2.n_vars}", flush=True)
39+
40+
print("Concatenating train and test data", flush=True)
41+
adata = ad.concat(
42+
[input_train_mod1, input_test_mod1], merge = "same", label="split", keys=["train", "test"]
43+
)
44+
45+
# Compute a latent representation using an appropriate model based on the modality
46+
print("Get latent representation", flush=True)
47+
adata = get_representation(
48+
adata=adata, modality=mod1, use_hvg=par['use_hvg'], adt_normalization=par['adt_normalization'], plot_umap=par['plot_umap']
49+
)
50+
51+
# Place the representation back into individual objects
52+
input_train_mod1.obsm["X_scvi"] = adata[adata.obs["split"] == "train"].obsm["X_scvi"].copy()
53+
input_test_mod1.obsm["X_scvi"] = adata[adata.obs["split"] == "test"].obsm["X_scvi"].copy()
54+
55+
# copy the normalized layer to obsm for mod2
56+
input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"]
57+
58+
print('Setup and prepare Cellmapper', flush=True)
59+
cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1)
60+
cmap.compute_neighbors(
61+
use_rep="X_scvi",
62+
n_neighbors=par['n_neighbors'],
63+
)
64+
cmap.compute_mapping_matrix(kernel_method=par['kernel_method'])
65+
66+
print("Predict on test data", flush=True)
67+
cmap.map_obsm(key="mod2", prediction_postfix="pred")
68+
mod2_pred = csc_matrix(cmap.query.obsm["mod2_pred"])
69+
70+
print("Write output AnnData to file", flush=True)
71+
output = ad.AnnData(
72+
layers={"normalized": mod2_pred},
73+
obs=input_test_mod1.obs,
74+
var=input_train_mod2.var,
75+
uns={
76+
'dataset_id': input_train_mod1.uns['dataset_id'],
77+
'method_id': meta["name"],
78+
},
79+
)
80+
output.write_h5ad(par['output'], compression='gzip')

0 commit comments

Comments
 (0)