diff --git a/CHANGELOG.md b/CHANGELOG.md index 890c4eb7..4f49447e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * Added `methods/stacas` new method (PR #58). - Add non-supervised version of STACAS tool for integration of single-cell transcriptomics data. This functionality enables correction of batch effects while preserving biological variability without requiring prior cell type annotations. * Added `method/drvi` component (PR #61). + * Added `ARI_batch` and `NMI_batch` to `metrics/clustering_overlap` (PR #68). * Added `metrics/cilisi` new metric component (PR #57). @@ -14,6 +15,8 @@ overcorrected datasets with removed cell type signals. We propose adding this metric to substitute iLISI. +* Added `method/seurat_cca` and `method/seurat_rpca` components (PR #77). + ## Minor changes * Un-pin the scPRINT version and update parameters (PR #51) diff --git a/src/methods/seurat_cca/config.vsh.yaml b/src/methods/seurat_cca/config.vsh.yaml new file mode 100644 index 00000000..f689d6fa --- /dev/null +++ b/src/methods/seurat_cca/config.vsh.yaml @@ -0,0 +1,58 @@ +__merge__: /src/api/comp_method.yaml +name: seurat_cca +label: Seurat CCA +summary: Integration using Seurat's anchor-based CCA integration +description: | + Seurat's Canonical Correlation Analysis (CCA) integration method identifies shared + correlation structures across datasets to perform batch correction. This method is + effective for datasets with shared cell types across conditions/batches. + + The method works by: + 1. Finding highly variable features for each dataset + 2. Identifying integration anchors using CCA + 3. Using anchors to harmonize datasets + 4. Generating integrated low-dimensional embedding +references: + # Stuart, T., Butler, A., Hoffman, P. et al. + # Comprehensive Integration of Single-Cell Data. + # Cell 177, 1888-1902.e21 (2019). https://doi.org/10.1016/j.cell.2019.05.031 + doi: 10.1016/j.cell.2019.05.031 +links: + repository: https://github.com/satijalab/seurat + documentation: https://satijalab.org/seurat/articles/seurat5_integration.html +info: + method_types: [embedding] + preferred_normalization: log_cp10k +arguments: + - name: --dims + type: integer + description: Number of dimensions to use for integration. + default: 30 + - name: --k_anchor + type: integer + description: Number of neighbors to use when picking anchors. + default: 5 + - name: --k_filter + type: integer + description: Number of neighbors to use when filtering anchors. + default: 200 + - name: --k_score + type: integer + description: Number of neighbors to use when scoring anchors. + default: 30 +resources: + - type: r_script + path: script.R +engines: + - type: docker + image: openproblems/base_r:1 + setup: + - type: r + cran: + - Seurat + - SeuratObject +runners: + - type: executable + - type: nextflow + directives: + label: [lowcpu, highmem, hightime] diff --git a/src/methods/seurat_cca/script.R b/src/methods/seurat_cca/script.R new file mode 100644 index 00000000..d57df0ee --- /dev/null +++ b/src/methods/seurat_cca/script.R @@ -0,0 +1,112 @@ +cat("Loading dependencies\n") +suppressPackageStartupMessages({ + requireNamespace("anndata", quietly = TRUE) + library(Matrix, warn.conflicts = FALSE) + library(Seurat, warn.conflicts = FALSE) + library(SeuratObject, warn.conflicts = FALSE) +}) + +## VIASH START +par <- list( + input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', + output = 'output.h5ad', + dims = 30L, + k_anchor = 5L, + k_filter = 200L, + k_score = 30L +) +meta <- list( + name = "seurat_cca" +) +## VIASH END + +cat("Read input\n") +adata <- anndata::read_h5ad(par$input) + +cat("Create Seurat object using precomputed data\n") +# Extract preprocessed data +norm_data <- t(adata$layers[["normalized"]]) +obs <- adata$obs +var <- adata$var + +# Convert to dgCMatrix if needed (Seurat v5 compatibility) +if (inherits(norm_data, "dgRMatrix")) { + dense_temp <- as.matrix(norm_data) + norm_data <- as(dense_temp, "dgCMatrix") +} + +# Ensure proper dimnames for other matrix types +rownames(norm_data) <- rownames(var) +colnames(norm_data) <- rownames(obs) + +# Create Seurat object +seurat_obj <- CreateSeuratObject( + counts = norm_data, + meta.data = obs, + assay = "RNA" +) + +# In Seurat v5, we need to set the data layer for normalized data +seurat_obj[["RNA"]]$data <- norm_data + +cat("Set highly variable genes from input\n") +hvg_genes <- rownames(adata$var)[adata$var$hvg] +cat("Using", length(hvg_genes), "HVGs from input dataset\n") +VariableFeatures(seurat_obj) <- hvg_genes + +cat("Split by batch and perform CCA integration\n") +# Split the object by batch +seurat_list <- SplitObject(seurat_obj, split.by = "batch") + +# Find integration anchors using CCA +anchors <- FindIntegrationAnchors( + object.list = seurat_list, + anchor.features = hvg_genes, + dims = seq_len(par$dims), + k.anchor = par$k_anchor, + k.filter = par$k_filter, + k.score = par$k_score, + verbose = FALSE +) + +# Integrate the data +integrated <- IntegrateData( + anchorset = anchors, + dims = seq_len(par$dims), + verbose = FALSE +) + +cat("Scale integrated data and run PCA\n") +DefaultAssay(integrated) <- "integrated" +integrated <- ScaleData(integrated, verbose = FALSE) +integrated <- RunPCA(integrated, npcs = par$dims, verbose = FALSE) + +cat("Generate UMAP embedding\n") +integrated <- RunUMAP( + integrated, + reduction = "pca", + dims = seq_len(par$dims), + verbose = FALSE +) + +cat("Extract embedding\n") +embedding <- Embeddings(integrated, reduction = "umap") + +cat("Store outputs\n") +output <- anndata::AnnData( + obs = adata$obs, + var = adata$var, + obsm = list( + X_emb = embedding + ), + uns = list( + dataset_id = adata$uns[["dataset_id"]], + normalization_id = adata$uns[["normalization_id"]], + method_id = meta$name + ) +) + +cat("Write output to file\n") +zzz <- output$write_h5ad(par$output, compression = "gzip") + +cat("Finished\n") diff --git a/src/methods/seurat_rpca/config.vsh.yaml b/src/methods/seurat_rpca/config.vsh.yaml new file mode 100644 index 00000000..fa9df05c --- /dev/null +++ b/src/methods/seurat_rpca/config.vsh.yaml @@ -0,0 +1,59 @@ +__merge__: /src/api/comp_method.yaml +name: seurat_rpca +label: Seurat RPCA +summary: Integration using Seurat's anchor-based RPCA integration +description: | + Seurat's Reciprocal PCA (RPCA) integration method represents a faster and more + conservative (less correction) approach compared to CCA integration. This method + is especially useful for large datasets or datasets with similar biological + composition across batches. + + The method works by: + 1. Finding highly variable features for each dataset + 2. Running PCA on each dataset + 3. Identifying integration anchors using RPCA + 4. Using anchors to harmonize datasets with more conservative correction +references: + # Stuart, T., Butler, A., Hoffman, P. et al. + # Comprehensive Integration of Single-Cell Data. + # Cell 177, 1888-1902.e21 (2019). https://doi.org/10.1016/j.cell.2019.05.031 + doi: 10.1016/j.cell.2019.05.031 +links: + repository: https://github.com/satijalab/seurat + documentation: https://satijalab.org/seurat/articles/seurat5_integration.html +info: + method_types: [embedding] + preferred_normalization: log_cp10k +arguments: + - name: --dims + type: integer + description: Number of dimensions to use for integration. + default: 30 + - name: --k_anchor + type: integer + description: Number of neighbors to use when picking anchors. + default: 5 + - name: --k_filter + type: integer + description: Number of neighbors to use when filtering anchors. + default: 200 + - name: --k_score + type: integer + description: Number of neighbors to use when scoring anchors. + default: 30 +resources: + - type: r_script + path: script.R +engines: + - type: docker + image: openproblems/base_r:1 + setup: + - type: r + cran: + - Seurat + - SeuratObject +runners: + - type: executable + - type: nextflow + directives: + label: [lowcpu, highmem, hightime] diff --git a/src/methods/seurat_rpca/script.R b/src/methods/seurat_rpca/script.R new file mode 100644 index 00000000..302db792 --- /dev/null +++ b/src/methods/seurat_rpca/script.R @@ -0,0 +1,136 @@ +cat("Loading dependencies\n") +suppressPackageStartupMessages({ + requireNamespace("anndata", quietly = TRUE) + library(Matrix, warn.conflicts = FALSE) + library(Seurat, warn.conflicts = FALSE) + library(SeuratObject, warn.conflicts = FALSE) +}) + +## VIASH START +par <- list( + input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', + output = 'output.h5ad', + dims = 30L, + k_anchor = 5L, + k_filter = 200L, + k_score = 30L +) +meta <- list( + name = "seurat_rpca" +) +## VIASH END + +cat("Read input\n") +adata <- anndata::read_h5ad(par$input) + +cat("Create Seurat object\n") +# Extract preprocessed data +counts_data <- t(adata$layers[["counts"]]) +norm_data <- t(adata$layers[["normalized"]]) +obs <- adata$obs +var <- adata$var + +# Convert to dgCMatrix if needed (Seurat v5 compatibility) +if (inherits(counts_data, "dgRMatrix")) { + dense_temp <- as.matrix(counts_data) + counts_data <- as(dense_temp, "dgCMatrix") +} + +if (inherits(norm_data, "dgRMatrix")) { + dense_temp <- as.matrix(norm_data) + norm_data <- as(dense_temp, "dgCMatrix") +} + +# Ensure proper dimnames +rownames(counts_data) <- rownames(var) +colnames(counts_data) <- rownames(obs) +rownames(norm_data) <- rownames(var) +colnames(norm_data) <- rownames(obs) + +# Create Seurat object from counts +seurat_obj <- CreateSeuratObject( + counts = counts_data, + meta.data = obs, + assay = "RNA" +) + +# Add normalized data layer +LayerData(seurat_obj, layer = "data") <- norm_data + +# Use existing HVGs from the dataset +hvg_genes <- rownames(adata$var)[adata$var$hvg] +cat("Using", length(hvg_genes), "HVGs from input dataset\n") +VariableFeatures(seurat_obj) <- hvg_genes + +# Use existing PCA from input dataset +pca_embeddings <- adata$obsm[["X_pca"]] +rownames(pca_embeddings) <- colnames(seurat_obj) +colnames(pca_embeddings) <- paste0("PC_", seq_len(ncol(pca_embeddings))) + +seurat_obj[["pca"]] <- CreateDimReducObject( + embeddings = pca_embeddings, + key = "PC_", + assay = "RNA" +) + +cat("Split object by batch\n") +# Split the object by batch for traditional integration +seurat_list <- SplitObject(seurat_obj, split.by = "batch") + +# For RPCA, we need to scale and run PCA on each dataset +cat("Scale data and run PCA for RPCA integration\n") +seurat_list <- lapply(seurat_list, function(x) { + x <- ScaleData(x, features = hvg_genes, verbose = FALSE) + x <- RunPCA(x, features = hvg_genes, npcs = par$dims, verbose = FALSE) + return(x) +}) + +cat("Perform RPCA integration\n") +# Find integration anchors using RPCA +anchors <- FindIntegrationAnchors( + object.list = seurat_list, + anchor.features = hvg_genes, + reduction = "rpca", + dims = seq_len(par$dims), + k.anchor = par$k_anchor, + k.filter = par$k_filter, + k.score = par$k_score, + verbose = FALSE +) + +# Integrate the data +integrated <- IntegrateData( + anchorset = anchors, + dims = seq_len(par$dims), + verbose = FALSE +) + +cat("Scale and run PCA on integrated data\n") +DefaultAssay(integrated) <- "integrated" +integrated <- ScaleData(integrated, verbose = FALSE) +integrated <- RunPCA(integrated, npcs = par$dims, verbose = FALSE) + +cat("Generate UMAP embedding\n") +integrated <- RunUMAP(integrated, reduction = "pca", dims = seq_len(par$dims), verbose = FALSE) + +cat("Extract embedding\n") +embedding <- Embeddings(integrated, reduction = "umap") + +cat("Store outputs\n") +output <- anndata::AnnData( + obs = adata$obs[, c()], + var = adata$var[, c()], + obsm = list( + X_emb = embedding + ), + uns = list( + dataset_id = adata$uns[["dataset_id"]], + normalization_id = adata$uns[["normalization_id"]], + method_id = meta$name + ) +) + +cat("Write output to file\n") +zzz <- output$write_h5ad(par$output, compression = "gzip") + +cat("Finished\n")