Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/methods/cellmapper_linear/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,20 @@ arguments:
- name: "--fallback_representation"
type: "string"
choices: ["joint_pca", "fast_cca"]
default: "fast_cca"
default: "joint_pca"
description: Fallback representation to use for k-NN mapping (computed if use_rep is None).
- name: "--mask_var"
type: "string"
default: "hvg"
description: Variable to mask for fallback representation.
- name: "--kernel_method"
type: "string"
choices: ["hnoca", "gauss"]
default: "hnoca"
default: "gauss"
description: Kernel function to compute k-NN edge weights.
- name: "--n_neighbors"
type: "integer"
default: 30
default: 50
description: Number of neighbors to consider for k-NN graph construction.
resources:
- type: python_script
Expand Down
4 changes: 4 additions & 0 deletions src/methods/cellmapper_linear/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,14 @@
# copy the normalized layer to obsm for mod2
input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"]

# choose the kNN method based on total cell number
n_obs = input_test_mod1.n_obs + input_train_mod1.n_obs

print("Set up and prepare Cellmapper", flush=True)
cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1)
cmap.compute_neighbors(
use_rep=None,
knn_method="sklearn" if n_obs < 60000 else "pynndescent",
fallback_representation=par['fallback_representation'],
n_neighbors=par['n_neighbors'],
fallback_kwargs={"mask_var": par['mask_var']},
Expand Down
6 changes: 3 additions & 3 deletions src/methods/cellmapper_scvi/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@ arguments:
- name: "--kernel_method"
type: "string"
choices: ["hnoca", "gauss"]
default: "hnoca"
default: "gauss"
description: Kernel function to compute k-NN edge weights (CellMapper parameter).
- name: "--n_neighbors"
type: "integer"
default: 30
default: 50
description: Number of neighbors to consider for k-NN graph construction (CellMapper parameter).
- name: "--use_hvg"
type: boolean
default: true
default: false
description: Whether to use highly variable genes (HVG) for the mapping (Generic analysis parameter).
- name: "--adt_normalization"
type: "string"
Expand Down
9 changes: 7 additions & 2 deletions src/methods/cellmapper_scvi/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad',
'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad',
'output': 'output.h5ad',
'n_neighbors': 30,
'kernel_method': 'hnoca',
'n_neighbors': 50,
'kernel_method': 'gauss',
'use_hvg': False,
'adt_normalization': 'clr', # Normalization method for ADT data
'plot_umap': True,
Expand Down Expand Up @@ -41,6 +41,7 @@
adata = ad.concat(
[input_train_mod1, input_test_mod1], merge = "same", label="split", keys=["train", "test"]
)
adata.X = adata.layers["counts"]

# Compute a latent representation using an appropriate model based on the modality
print("Get latent representation", flush=True)
Expand All @@ -55,10 +56,14 @@
# copy the normalized layer to obsm for mod2
input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"]

# choose the kNN method based on total cell number
n_obs = input_test_mod1.n_obs + input_train_mod1.n_obs

print('Setup and prepare Cellmapper', flush=True)
cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1)
cmap.compute_neighbors(
use_rep="X_scvi",
knn_method="sklearn" if n_obs < 60000 else "pynndescent",
n_neighbors=par['n_neighbors'],
)
cmap.compute_mapping_matrix(kernel_method=par['kernel_method'])
Expand Down
106 changes: 94 additions & 12 deletions src/methods/cellmapper_scvi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,75 @@
from scipy.sparse import issparse, csr_matrix, csc_matrix
import muon
import scanpy as sc
import numpy as np



def preprocess_features(
adata: ad.AnnData,
modality: Literal["GEX", "ADT", "ATAC"],
use_hvg: bool,
min_cells_fraction: float,
feature_filter_threshold: int,
) -> ad.AnnData:
"""
Preprocess features with optional filtering and HVG selection.

Parameters
----------
adata
AnnData object to preprocess
modality
The modality type, affects filtering strategy
use_hvg
Whether to apply HVG selection (applies to all modalities if requested)
min_cells_fraction
Minimum fraction of cells a feature must be detected in to be kept
feature_filter_threshold
Only apply feature filtering if n_vars > this threshold

Returns
-------
Preprocessed AnnData object
"""
print(f"Starting preprocessing for {modality} with {adata.n_obs} cells and {adata.n_vars} features")

# Feature filtering: only apply if we have many features (mainly for ATAC)
if adata.n_vars > feature_filter_threshold:
min_cells = int(adata.n_obs * min_cells_fraction)
print(f"Applying feature filtering: removing features detected in <{min_cells} cells ({min_cells_fraction:.1%} threshold)")

n_features_before = adata.n_vars
sc.pp.filter_genes(adata, min_cells=min_cells)
n_features_after = adata.n_vars

print(f"Features after filtering: {n_features_after} (removed {n_features_before - n_features_after})")
else:
print(f"Skipping feature filtering (only {adata.n_vars} features, threshold is {feature_filter_threshold})")

# HVG selection: apply if requested
if use_hvg:
n_hvg = adata.var["hvg"].sum()
print(f"Applying HVG selection: subsetting to {n_hvg} highly variable features ({n_hvg/adata.n_vars:.2%})")
adata = adata[:, adata.var["hvg"]].copy()
else:
print("HVG selection disabled, using all remaining features")

# Critical check: ensure no cells have zero counts after filtering
cell_counts = np.array(adata.layers['counts'].sum(axis=1)).flatten()
zero_count_cells = (cell_counts == 0).sum()

if zero_count_cells > 0:
raise ValueError(
f"After preprocessing, {zero_count_cells} cells have zero counts! "
"This would prevent generating latent representations for these cells. "
"Consider relaxing filtering parameters or checking data quality."
)

print(f"Final dataset: {adata.n_obs} cells × {adata.n_vars} features")
print(f"Mean counts per cell: {cell_counts.mean():.1f}")

return adata


def get_representation(
Expand All @@ -12,6 +81,8 @@ def get_representation(
use_hvg: bool = True,
adt_normalization: Literal["clr", "log_cp10k"] = "clr",
plot_umap: bool = False,
min_cells_fraction: float = 0.01,
feature_filter_threshold: int = 20000,
) -> ad.AnnData:
"""
Get a joint latent space representation of the data based on the modality.
Expand All @@ -25,7 +96,7 @@ def get_representation(

- "GEX": scVI model for gene expression data with ZINB likelihood on raw counts.
- "ADT": scVI model for ADT data (surface proteins) with Gaussian likelihood on normalized data.
- "ATAC": PeakVI model for ATAC data with Bernoulli likelihood on binarized count data.
- "ATAC": PeakVI model for ATAC data with Bernoulli likelihood on count data.

We assume that regardless of the modality, the raw data will be stored in the `counts` layer
(e.g. UMI counts for GEX and peak counts for ATAC), and the normalized data in the `normalized` layer.
Expand All @@ -37,20 +108,26 @@ def get_representation(
- "log_cp10k" (normalization to 10k counts per cell and logarithm transformation)
plot_umap
Purely for diagnostic purposes, to see whether the data integration looks ok, this optionally computes
a UMAP in shared latent space and stores a plot.
a UMAP in shared latent space and stores a plot.
min_cells_fraction
Minimum fraction of cells a feature must be detected in to be kept during filtering.
feature_filter_threshold
Only apply feature filtering if n_vars > this threshold. This automatically separates
ATAC data (many peaks) from other modalities (fewer features).

Returns
-------
ad.AnnData
AnnData object with the latent representation in `obsm["X_scvi"]`, regardless of the modality.
"""
# Subset to highly variable features
if "hvg" in adata.var.columns and use_hvg:
n_hvg = adata.var["hvg"].sum()
print(f"Subsetting to {n_hvg} highly variable features ({n_hvg/adata.n_vars:.2%})", flush=True)
adata = adata[:, adata.var["hvg"]].copy()
else:
print("Training on all available features", flush=True)
# Preprocess features (filtering and HVG selection)
adata = preprocess_features(
adata,
modality=modality,
use_hvg=use_hvg,
min_cells_fraction=min_cells_fraction,
feature_filter_threshold=feature_filter_threshold
)

# Setup the AnnData object for scVI
if modality == "GEX":
Expand All @@ -73,9 +150,14 @@ def get_representation(
scvi.model.SCVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"])
model = scvi.model.SCVI(adata, gene_likelihood="normal", n_layers=1, n_latent=10)
elif modality == "ATAC":
layer = "counts"
scvi.model.PEAKVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"])
model = scvi.model.PEAKVI(adata)

print("Converting read counts to fragment counts")
scvi.data.reads_to_fragments(adata, read_layer="counts")
print(f"One counts: {(adata.layers['fragments'] == 1).sum()}, Two counts: {(adata.layers['fragments'] == 2).sum()}")
layer = "fragments"

scvi.external.POISSONVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"])
model = scvi.external.POISSONVI(adata)
else:
raise ValueError(f"Unknown modality: {modality}")

Expand Down