openproblems-bio · rcannood · Aug 20, 2025 · Aug 20, 2025 · Sep 25, 2025 · lazappi
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@
 * Added `methods/stacas` new method (PR #58).
     - Add non-supervised version of STACAS tool for integration of single-cell transcriptomics data. This functionality enables correction of batch effects while preserving biological variability without requiring prior cell type annotations.
 * Added `method/drvi` component (PR #61).
+
 * Added `ARI_batch` and `NMI_batch` to `metrics/clustering_overlap` (PR #68).
 
 * Added `metrics/cilisi` new metric component (PR #57).
@@ -14,6 +15,8 @@
         overcorrected datasets with removed cell type signals.
         We propose adding this metric to substitute iLISI.
 
+* Added `method/seurat_cca` and `method/seurat_rpca` components (PR #77).
+
 ## Minor changes
 
 * Un-pin the scPRINT version and update parameters (PR #51)

diff --git a/src/methods/seurat_cca/config.vsh.yaml b/src/methods/seurat_cca/config.vsh.yaml
@@ -0,0 +1,58 @@
+__merge__: /src/api/comp_method.yaml
+name: seurat_cca
+label: Seurat CCA
+summary: Integration using Seurat's anchor-based CCA integration
+description: |
+  Seurat's Canonical Correlation Analysis (CCA) integration method identifies shared 
+  correlation structures across datasets to perform batch correction. This method is 
+  effective for datasets with shared cell types across conditions/batches.
+
+  The method works by:
+  1. Finding highly variable features for each dataset
+  2. Identifying integration anchors using CCA 
+  3. Using anchors to harmonize datasets
+  4. Generating integrated low-dimensional embedding
+references:
+  # Stuart, T., Butler, A., Hoffman, P. et al.
+  # Comprehensive Integration of Single-Cell Data.
+  # Cell 177, 1888-1902.e21 (2019). https://doi.org/10.1016/j.cell.2019.05.031
+  doi: 10.1016/j.cell.2019.05.031
+links:
+  repository: https://github.com/satijalab/seurat
+  documentation: https://satijalab.org/seurat/articles/seurat5_integration.html
+info:
+  method_types: [embedding]
+  preferred_normalization: log_cp10k
+arguments:
+  - name: --dims
+    type: integer
+    description: Number of dimensions to use for integration.
+    default: 30
+  - name: --k_anchor
+    type: integer
+    description: Number of neighbors to use when picking anchors.
+    default: 5
+  - name: --k_filter
+    type: integer
+    description: Number of neighbors to use when filtering anchors.
+    default: 200
+  - name: --k_score
+    type: integer  
+    description: Number of neighbors to use when scoring anchors.
+    default: 30
+resources:
+  - type: r_script
+    path: script.R
+engines:
+  - type: docker
+    image: openproblems/base_r:1
+    setup:
+      - type: r
+        cran:
+          - Seurat
+          - SeuratObject
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [lowcpu, highmem, hightime]
diff --git a/src/methods/seurat_cca/script.R b/src/methods/seurat_cca/script.R
@@ -0,0 +1,112 @@
+cat("Loading dependencies\n")
+suppressPackageStartupMessages({
+  requireNamespace("anndata", quietly = TRUE)
+  library(Matrix, warn.conflicts = FALSE)
+  library(Seurat, warn.conflicts = FALSE)
+  library(SeuratObject, warn.conflicts = FALSE)
+})
+
+## VIASH START
+par <- list(
+  input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad',
+  output = 'output.h5ad',
+  dims = 30L,
+  k_anchor = 5L,
+  k_filter = 200L,
+  k_score = 30L
+)
+meta <- list(
+  name = "seurat_cca"
+)
+## VIASH END
+
+cat("Read input\n")
+adata <- anndata::read_h5ad(par$input)
+
+cat("Create Seurat object using precomputed data\n")
+# Extract preprocessed data
+norm_data <- t(adata$layers[["normalized"]])
+obs <- adata$obs
+var <- adata$var
+
+# Convert to dgCMatrix if needed (Seurat v5 compatibility)
+if (inherits(norm_data, "dgRMatrix")) {
+  dense_temp <- as.matrix(norm_data)
+  norm_data <- as(dense_temp, "dgCMatrix")
+}
+
+# Ensure proper dimnames for other matrix types
+rownames(norm_data) <- rownames(var)
+colnames(norm_data) <- rownames(obs)
+
+# Create Seurat object
+seurat_obj <- CreateSeuratObject(
+  counts = norm_data,
+  meta.data = obs,
+  assay = "RNA"
+)
+
+# In Seurat v5, we need to set the data layer for normalized data
+seurat_obj[["RNA"]]$data <- norm_data
+
+cat("Set highly variable genes from input\n")
+hvg_genes <- rownames(adata$var)[adata$var$hvg]
+cat("Using", length(hvg_genes), "HVGs from input dataset\n")
+VariableFeatures(seurat_obj) <- hvg_genes
+
+cat("Split by batch and perform CCA integration\n")
+# Split the object by batch
+seurat_list <- SplitObject(seurat_obj, split.by = "batch")
+
+# Find integration anchors using CCA
+anchors <- FindIntegrationAnchors(
+  object.list = seurat_list,
+  anchor.features = hvg_genes,
+  dims = seq_len(par$dims),
+  k.anchor = par$k_anchor,
+  k.filter = par$k_filter,
+  k.score = par$k_score,
+  verbose = FALSE
+)
+
+# Integrate the data
+integrated <- IntegrateData(
+  anchorset = anchors,
+  dims = seq_len(par$dims),
+  verbose = FALSE
+)
+
+cat("Scale integrated data and run PCA\n")
+DefaultAssay(integrated) <- "integrated"
+integrated <- ScaleData(integrated, verbose = FALSE)
+integrated <- RunPCA(integrated, npcs = par$dims, verbose = FALSE)
+
+cat("Generate UMAP embedding\n")
+integrated <- RunUMAP(
+  integrated,
+  reduction = "pca",
+  dims = seq_len(par$dims),
+  verbose = FALSE
+)
+
+cat("Extract embedding\n")
+embedding <- Embeddings(integrated, reduction = "umap")
-cat("Generate UMAP embedding\n")
-integrated <- RunUMAP(
-  integrated,
-  reduction = "pca",
-  dims = seq_len(par$dims),
-  verbose = FALSE
-)
-
-cat("Extract embedding\n")
-embedding <- Embeddings(integrated, reduction = "umap")
+cat("Extract embedding\n")
+embedding <- Embeddings(integrated, reduction = "pca")
-cat("Generate UMAP embedding\n")
-integrated <- RunUMAP(
-  integrated,
-  reduction = "pca",
-  dims = seq_len(par$dims),
-  verbose = FALSE
-)
-
-cat("Extract embedding\n")
-embedding <- Embeddings(integrated, reduction = "umap")
+cat("Extract embedding\n")
+embedding <- Embeddings(integrated, reduction = "pca")
+
+cat("Store outputs\n")
+output <- anndata::AnnData(
+  obs = adata$obs,
+  var = adata$var,
+  obsm = list(
+    X_emb = embedding
+  ),
+  uns = list(
+    dataset_id = adata$uns[["dataset_id"]],
+    normalization_id = adata$uns[["normalization_id"]],
+    method_id = meta$name
+  )
+)
+
+cat("Write output to file\n")
+zzz <- output$write_h5ad(par$output, compression = "gzip")
+
+cat("Finished\n")
diff --git a/src/methods/seurat_rpca/config.vsh.yaml b/src/methods/seurat_rpca/config.vsh.yaml
@@ -0,0 +1,59 @@
+__merge__: /src/api/comp_method.yaml
+name: seurat_rpca
+label: Seurat RPCA
+summary: Integration using Seurat's anchor-based RPCA integration
+description: |
+  Seurat's Reciprocal PCA (RPCA) integration method represents a faster and more 
+  conservative (less correction) approach compared to CCA integration. This method 
+  is especially useful for large datasets or datasets with similar biological 
+  composition across batches.
+
+  The method works by:
+  1. Finding highly variable features for each dataset
+  2. Running PCA on each dataset
+  3. Identifying integration anchors using RPCA
+  4. Using anchors to harmonize datasets with more conservative correction
+references:
+  # Stuart, T., Butler, A., Hoffman, P. et al.
+  # Comprehensive Integration of Single-Cell Data.
+  # Cell 177, 1888-1902.e21 (2019). https://doi.org/10.1016/j.cell.2019.05.031
+  doi: 10.1016/j.cell.2019.05.031
+links:
+  repository: https://github.com/satijalab/seurat
+  documentation: https://satijalab.org/seurat/articles/seurat5_integration.html
+info:
+  method_types: [embedding]
+  preferred_normalization: log_cp10k
+arguments:
+  - name: --dims
+    type: integer
+    description: Number of dimensions to use for integration.
+    default: 30
+  - name: --k_anchor
+    type: integer
+    description: Number of neighbors to use when picking anchors.
+    default: 5
+  - name: --k_filter
+    type: integer
+    description: Number of neighbors to use when filtering anchors.
+    default: 200
+  - name: --k_score
+    type: integer  
+    description: Number of neighbors to use when scoring anchors.
+    default: 30
+resources:
+  - type: r_script
+    path: script.R
+engines:
+  - type: docker
+    image: openproblems/base_r:1
+    setup:
+      - type: r
+        cran:
+          - Seurat
+          - SeuratObject
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [lowcpu, highmem, hightime]
diff --git a/src/methods/seurat_rpca/script.R b/src/methods/seurat_rpca/script.R
@@ -0,0 +1,136 @@
+cat("Loading dependencies\n")
+suppressPackageStartupMessages({
+  requireNamespace("anndata", quietly = TRUE)
+  library(Matrix, warn.conflicts = FALSE)
+  library(Seurat, warn.conflicts = FALSE)
+  library(SeuratObject, warn.conflicts = FALSE)
+})
+
+## VIASH START
+par <- list(
+  input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad',
+  output = 'output.h5ad',
+  dims = 30L,
+  k_anchor = 5L,
+  k_filter = 200L,
+  k_score = 30L
+)
+meta <- list(
+  name = "seurat_rpca"
+)
+## VIASH END
+
+cat("Read input\n")
+adata <- anndata::read_h5ad(par$input)
+
+cat("Create Seurat object\n")
+# Extract preprocessed data
+counts_data <- t(adata$layers[["counts"]])
+norm_data <- t(adata$layers[["normalized"]])
+obs <- adata$obs
+var <- adata$var
+
+# Convert to dgCMatrix if needed (Seurat v5 compatibility)
+if (inherits(counts_data, "dgRMatrix")) {
+  dense_temp <- as.matrix(counts_data)
+  counts_data <- as(dense_temp, "dgCMatrix")
+}
+
+if (inherits(norm_data, "dgRMatrix")) {
+  dense_temp <- as.matrix(norm_data)
+  norm_data <- as(dense_temp, "dgCMatrix")
+}
+
+# Ensure proper dimnames
+rownames(counts_data) <- rownames(var)
+colnames(counts_data) <- rownames(obs)
+rownames(norm_data) <- rownames(var)
+colnames(norm_data) <- rownames(obs)
+
+# Create Seurat object from counts
+seurat_obj <- CreateSeuratObject(
+  counts = counts_data,
+  meta.data = obs,
+  assay = "RNA"
+)
+
+# Add normalized data layer
+LayerData(seurat_obj, layer = "data") <- norm_data
+
+# Use existing HVGs from the dataset
+hvg_genes <- rownames(adata$var)[adata$var$hvg]
+cat("Using", length(hvg_genes), "HVGs from input dataset\n")
+VariableFeatures(seurat_obj) <- hvg_genes
+
+# Use existing PCA from input dataset
+pca_embeddings <- adata$obsm[["X_pca"]]
+rownames(pca_embeddings) <- colnames(seurat_obj)
+colnames(pca_embeddings) <- paste0("PC_", seq_len(ncol(pca_embeddings)))
+
+seurat_obj[["pca"]] <- CreateDimReducObject(
+  embeddings = pca_embeddings,
+  key = "PC_",
+  assay = "RNA"
+)
+
+cat("Split object by batch\n")
+# Split the object by batch for traditional integration
+seurat_list <- SplitObject(seurat_obj, split.by = "batch")
+
+# For RPCA, we need to scale and run PCA on each dataset
+cat("Scale data and run PCA for RPCA integration\n")
+seurat_list <- lapply(seurat_list, function(x) {
+  x <- ScaleData(x, features = hvg_genes, verbose = FALSE)
+  x <- RunPCA(x, features = hvg_genes, npcs = par$dims, verbose = FALSE)
+  return(x)
+})
+
+cat("Perform RPCA integration\n")
+# Find integration anchors using RPCA
+anchors <- FindIntegrationAnchors(
+  object.list = seurat_list,
+  anchor.features = hvg_genes,
+  reduction = "rpca",
+  dims = seq_len(par$dims),
+  k.anchor = par$k_anchor,
+  k.filter = par$k_filter,
+  k.score = par$k_score,
+  verbose = FALSE
+)
+
+# Integrate the data
+integrated <- IntegrateData(
+  anchorset = anchors,
+  dims = seq_len(par$dims),
+  verbose = FALSE
+)
+
+cat("Scale and run PCA on integrated data\n")
+DefaultAssay(integrated) <- "integrated"
+integrated <- ScaleData(integrated, verbose = FALSE)
+integrated <- RunPCA(integrated, npcs = par$dims, verbose = FALSE)
+
+cat("Generate UMAP embedding\n")
+integrated <- RunUMAP(integrated, reduction = "pca", dims = seq_len(par$dims), verbose = FALSE)
+
+cat("Extract embedding\n")
+embedding <- Embeddings(integrated, reduction = "umap")
-cat("Generate UMAP embedding\n")
-integrated <- RunUMAP(integrated, reduction = "pca", dims = seq_len(par$dims), verbose = FALSE)
-
-cat("Extract embedding\n")
-embedding <- Embeddings(integrated, reduction = "umap")
+cat("Extract embedding\n")
+embedding <- Embeddings(integrated, reduction = "pca")
-cat("Generate UMAP embedding\n")
-integrated <- RunUMAP(integrated, reduction = "pca", dims = seq_len(par$dims), verbose = FALSE)
-
-cat("Extract embedding\n")
-embedding <- Embeddings(integrated, reduction = "umap")
+cat("Extract embedding\n")
+embedding <- Embeddings(integrated, reduction = "pca")
+
+cat("Store outputs\n")
+output <- anndata::AnnData(
+  obs = adata$obs[, c()],
+  var = adata$var[, c()],
+  obsm = list(
+    X_emb = embedding
+  ),
+  uns = list(
+    dataset_id = adata$uns[["dataset_id"]],
+    normalization_id = adata$uns[["normalization_id"]],
+    method_id = meta$name
+  )
+)
+
+cat("Write output to file\n")
+zzz <- output$write_h5ad(par$output, compression = "gzip")
+
+cat("Finished\n")