From 3e5c8b9fa9ce05151bdd7dbcf769337b761ca891 Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Wed, 4 Jun 2025 22:40:06 +0200 Subject: [PATCH 01/10] working unsupervised scmerge2. Need to clear out the comments --- .../unsupervised_scmerge2/config.vsh.yaml | 85 +++++++++++++++++++ src/methods/unsupervised_scmerge2/script.R | 81 ++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 src/methods/unsupervised_scmerge2/config.vsh.yaml create mode 100644 src/methods/unsupervised_scmerge2/script.R diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml new file mode 100644 index 00000000..bd732295 --- /dev/null +++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml @@ -0,0 +1,85 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_method.yaml + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: unsupervised_scmerge2 +# A relatively short label, used when rendering visualisations (required) +label: unsupervised Scmerge2 +# A one sentence summary of how this method works (required). Used when +# rendering summary tables. +summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication." +# A multi-line description of how this component works (required). Used +# when rendering reference documentation. +description: | + scMerge works by integrating multiple single-cell RNA-seq datasets while correcting for batch effects and preserving biological signals. + It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets. + Then, it uses a factor analysis model on these SEGs to estimate and remove unwanted variation. + To improve accuracy, scMerge creates pseudo-replicates which serve as anchors for alignment. + Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis.. +references: + doi: + - 10.1073/pnas.1820006116 +# bibtex: +# - | +# @article{foo, +# title={Foo}, +# author={Bar}, +# journal={Baz}, +# year={2024} +# } +links: + # URL to the documentation for this method (required). + documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html + # URL to the code repository for this method (required). + repository: https://github.com/SydneyBioX/scMerge + + + +# Metadata for your component +info: + # Which normalisation method this component prefers to use (required). + preferred_normalization: log_cpm + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: r_script + path: script.R + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_r:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + setup: + - type: apt + packages: cmake + - type: r + bioc: + - scmerge + - org.Mm.eg.db + - org.Hs.eg.db + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/methods/unsupervised_scmerge2/script.R b/src/methods/unsupervised_scmerge2/script.R new file mode 100644 index 00000000..d6ed10a9 --- /dev/null +++ b/src/methods/unsupervised_scmerge2/script.R @@ -0,0 +1,81 @@ +cat(">> Load dependencies\n") +requireNamespace("anndata", quietly = TRUE) +library(scMerge) +library(org.Hs.eg.db) +library(org.Mm.eg.db) + +## VIASH START +par <- list( + input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", + output = "output.h5ad" +) +meta <- list( + name = "unsupervised_scmerge2" +) +## VIASH END + +cat("Reading input files\n") +adata <- anndata::read_h5ad(par$input) +adata$obs["batch"] <- sub("\\+", "plus", adata$obs[["batch"]]) # Replace "+"" characters in batch names + +anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = FALSE) { + exprsMat_all <- t(as.matrix(adata$layers[[layer]])) + batch_all <- as.character(adata$obs$batch) + + valid_cells <- !is.na(batch_all) + exprsMat <- exprsMat_all[, valid_cells, drop = FALSE] + batch <- batch_all[valid_cells] + + ctl_flat <- unlist(seg_list, recursive = FALSE) + ctl_matches <- ctl_flat[grepl("scSEG$", names(ctl_flat))] + if (length(ctl_matches) == 0) { + stop("No stably expressed gene (scSEG) list found in the provided seg_list.") + } + ctl <- ctl_matches[[1]] + + scMerge2_res <- scMerge2( + exprsMat = exprsMat, + batch = batch, + ctl = ctl, + verbose = verbose + ) + + return(scMerge2_res) +} + +data("segList_ensemblGeneID") # only for human and mouse- is that okay? + +cat("Run scMerge2\n") + +scMerge2_res <- anndataToScMerge2( + adata = adata, + seg_list = segList_ensemblGeneID, + layer = "normalized", + verbose = TRUE +) + + +cat("Store output\n") +corrected_mat <- scMerge2_res$newY + +# PCA as embedding - is this right? +embedding <- prcomp(t(corrected_mat))$x[, 1:10] + +rownames(embedding) <- colnames(corrected_mat) + +output <- anndata::AnnData( + X = NULL, + obs = adata$obs[, c()], + var = NULL, + obsm = list( + X_emb = embedding[rownames(adata), , drop = FALSE] # match input cells + ), + uns = list( + dataset_id = adata$uns[["dataset_id"]], + normalization_id = adata$uns[["normalization_id"]], + method_id = meta$name + ), + shape = adata$shape +) +cat("Write output AnnData to file\n") +output$write_h5ad(par[["output"]], compression = "gzip") From 1cd2b42b75f0058dfa2302feb06c113bb2abf4bd Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Fri, 6 Jun 2025 18:00:13 +0200 Subject: [PATCH 02/10] raise error for unmatched species --- src/methods/unsupervised_scmerge2/script.R | 33 ++++++++++++++++------ 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/methods/unsupervised_scmerge2/script.R b/src/methods/unsupervised_scmerge2/script.R index d6ed10a9..3c766444 100644 --- a/src/methods/unsupervised_scmerge2/script.R +++ b/src/methods/unsupervised_scmerge2/script.R @@ -25,13 +25,31 @@ anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = F valid_cells <- !is.na(batch_all) exprsMat <- exprsMat_all[, valid_cells, drop = FALSE] batch <- batch_all[valid_cells] - - ctl_flat <- unlist(seg_list, recursive = FALSE) - ctl_matches <- ctl_flat[grepl("scSEG$", names(ctl_flat))] - if (length(ctl_matches) == 0) { - stop("No stably expressed gene (scSEG) list found in the provided seg_list.") + + # Check overlap with human/mouse scSEG lists + gene_ids <- rownames(exprsMat) + species <- NULL + best_match <- 0 + + for (organism in names(seg_list)) { + scseg_name <- paste0(organism, "_scSEG") + seg_genes <- seg_list[[organism]][[scseg_name]] + overlap <- length(intersect(gene_ids, seg_genes)) + + if (overlap > best_match) { + best_match <- overlap + species <- organism + } } - ctl <- ctl_matches[[1]] + + if (is.null(species) || best_match == 0) { + stop("No match found between gene IDs in exprsMat and scSEG lists for human or mouse. ", + "Please ensure you're using Ensembl IDs for human or mouse, or provide a custom SEG list.") + } + + message("Detected species: ", species, " (matched ", best_match, " genes)") + + ctl <- seg_list[[species]][[paste0(species, "_scSEG")]] scMerge2_res <- scMerge2( exprsMat = exprsMat, @@ -43,7 +61,7 @@ anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = F return(scMerge2_res) } -data("segList_ensemblGeneID") # only for human and mouse- is that okay? +data("segList_ensemblGeneID") cat("Run scMerge2\n") @@ -58,7 +76,6 @@ scMerge2_res <- anndataToScMerge2( cat("Store output\n") corrected_mat <- scMerge2_res$newY -# PCA as embedding - is this right? embedding <- prcomp(t(corrected_mat))$x[, 1:10] rownames(embedding) <- colnames(corrected_mat) From 68c3b98f31bf83a5f56bbf102a907b742bb03b46 Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Wed, 25 Jun 2025 14:55:21 +0200 Subject: [PATCH 03/10] clean unsupervised scmerge2 --- src/methods/unsupervised_scmerge2/config.vsh.yaml | 4 +--- src/methods/unsupervised_scmerge2/script.R | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml index bd732295..6fc3f07e 100644 --- a/src/methods/unsupervised_scmerge2/config.vsh.yaml +++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml @@ -20,7 +20,7 @@ description: | It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets. Then, it uses a factor analysis model on these SEGs to estimate and remove unwanted variation. To improve accuracy, scMerge creates pseudo-replicates which serve as anchors for alignment. - Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis.. + Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis. references: doi: - 10.1073/pnas.1820006116 @@ -73,8 +73,6 @@ engines: - type: r bioc: - scmerge - - org.Mm.eg.db - - org.Hs.eg.db runners: # This platform allows running the component natively diff --git a/src/methods/unsupervised_scmerge2/script.R b/src/methods/unsupervised_scmerge2/script.R index 3c766444..7d682e00 100644 --- a/src/methods/unsupervised_scmerge2/script.R +++ b/src/methods/unsupervised_scmerge2/script.R @@ -1,8 +1,6 @@ cat(">> Load dependencies\n") requireNamespace("anndata", quietly = TRUE) library(scMerge) -library(org.Hs.eg.db) -library(org.Mm.eg.db) ## VIASH START par <- list( From 9836495559a853a5692b204ae2f64f97209ecab2 Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Wed, 25 Jun 2025 14:55:37 +0200 Subject: [PATCH 04/10] working semi-supervised scmerge2 --- .../semisupervised_scmerge2/config.vsh.yaml | 86 ++++++++++++++++ src/methods/semisupervised_scmerge2/script.R | 99 +++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 src/methods/semisupervised_scmerge2/config.vsh.yaml create mode 100644 src/methods/semisupervised_scmerge2/script.R diff --git a/src/methods/semisupervised_scmerge2/config.vsh.yaml b/src/methods/semisupervised_scmerge2/config.vsh.yaml new file mode 100644 index 00000000..34e3990e --- /dev/null +++ b/src/methods/semisupervised_scmerge2/config.vsh.yaml @@ -0,0 +1,86 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_method.yaml + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: semisupervised_scmerge2 +# A relatively short label, used when rendering visualisations (required) +label: Semi-supervised Scmerge2 +# A one sentence summary of how this method works (required). Used when +# rendering summary tables. +summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication." +# A multi-line description of how this component works (required). Used +# when rendering reference documentation. +description: | + When cell type information are known (e.g. results from cell type classification using reference), + scMerge2 can use this information to construct pseudo-replicates and identify mutual nearest groups with cellTypes input. + scMerge works by integrating multiple single-cell RNA-seq datasets while correcting for batch effects and preserving biological signals. + It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets. + Then, it uses a factor analysis model on these SEGs to estimate and remove unwanted variation. + To improve accuracy, scMerge creates pseudo-replicates which serve as anchors for alignment. + Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis. +references: + doi: + - 10.1073/pnas.1820006116 +# bibtex: +# - | +# @article{foo, +# title={Foo}, +# author={Bar}, +# journal={Baz}, +# year={2024} +# } +links: + # URL to the documentation for this method (required). + documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html + # URL to the code repository for this method (required). + repository: https://github.com/SydneyBioX/scMerge + + + +# Metadata for your component +info: + # Which normalisation method this component prefers to use (required). + preferred_normalization: log_cpm + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: r_script + path: script.R + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_r:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + setup: + - type: apt + packages: cmake + - type: r + bioc: + - scmerge + + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/methods/semisupervised_scmerge2/script.R b/src/methods/semisupervised_scmerge2/script.R new file mode 100644 index 00000000..0b54fda2 --- /dev/null +++ b/src/methods/semisupervised_scmerge2/script.R @@ -0,0 +1,99 @@ +library(anndata) +library(scMerge) + +## VIASH START +par <- list( + input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", + output = "output.h5ad" +) +meta <- list( + name = "semisupervised_scmerge2" +) +## VIASH END + +cat("Reading input files\n") +adata <- anndata::read_h5ad(par$input) +adata$obs["batch"] <- sub("\\+", "plus", adata$obs[["batch"]]) # Replace "+"" characters in batch names + +anndataToSemiSupervisedScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = FALSE) { + exprsMat_all <- t(as.matrix(adata$layers[[layer]])) + batch_all <- as.character(adata$obs$batch) + celltypes_all <- as.character(adata$obs$cell_type) + + valid_cells <- !is.na(batch_all) + exprsMat <- exprsMat_all[, valid_cells, drop = FALSE] + batch <- batch_all[valid_cells] + cellTypes <- celltypes_all[valid_cells] + + # Check overlap with human/mouse scSEG lists + gene_ids <- rownames(exprsMat) + species <- NULL + best_match <- 0 + + for (organism in names(seg_list)) { + scseg_name <- paste0(organism, "_scSEG") + seg_genes <- seg_list[[organism]][[scseg_name]] + overlap <- length(intersect(gene_ids, seg_genes)) + + if (overlap > best_match) { + best_match <- overlap + species <- organism + } + } + + if (is.null(species) || best_match == 0) { + stop("No match found between gene IDs in exprsMat and scSEG lists for human or mouse. ", + "Please ensure you're using Ensembl IDs for human or mouse, or provide a custom SEG list.") + } + + message("Detected species: ", species, " (matched ", best_match, " genes)") + + ctl <- seg_list[[species]][[paste0(species, "_scSEG")]] + + scMerge2_res <- scMerge2( + exprsMat = exprsMat, + batch = batch, + cellTypes = cellTypes, + ctl = ctl, + verbose = verbose + ) + + return(scMerge2_res) +} + +data("segList_ensemblGeneID") + +cat("Run semi-supervised scMerge2\n") + +scMerge2_res <- anndataToSemiSupervisedScMerge2( + adata = adata, + seg_list = segList_ensemblGeneID, + layer = "normalized", + verbose = TRUE +) + + +cat("Store output\n") +corrected_mat <- scMerge2_res$newY + +embedding <- prcomp(t(corrected_mat))$x[, 1:10] + +rownames(embedding) <- colnames(corrected_mat) + +output <- anndata::AnnData( + X = NULL, + obs = adata$obs[, c()], + var = NULL, + obsm = list( + X_emb = embedding[rownames(adata), , drop = FALSE] # match input cells + ), + uns = list( + dataset_id = adata$uns[["dataset_id"]], + normalization_id = adata$uns[["normalization_id"]], + method_id = meta$name + ), + shape = adata$shape +) + +cat("Write output AnnData to file\n") +output$write_h5ad(par[["output"]], compression = "gzip") From e4a1daa2ea4ac041fd221aca1b2054e4e0dc95d9 Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Wed, 23 Jul 2025 13:19:36 +0200 Subject: [PATCH 05/10] remove comments --- .../semisupervised_scmerge2/config.vsh.yaml | 49 ------------------- .../unsupervised_scmerge2/config.vsh.yaml | 48 ------------------ 2 files changed, 97 deletions(-) diff --git a/src/methods/semisupervised_scmerge2/config.vsh.yaml b/src/methods/semisupervised_scmerge2/config.vsh.yaml index 34e3990e..b8ed095b 100644 --- a/src/methods/semisupervised_scmerge2/config.vsh.yaml +++ b/src/methods/semisupervised_scmerge2/config.vsh.yaml @@ -1,20 +1,7 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test __merge__: ../../api/comp_method.yaml - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. name: semisupervised_scmerge2 -# A relatively short label, used when rendering visualisations (required) label: Semi-supervised Scmerge2 -# A one sentence summary of how this method works (required). Used when -# rendering summary tables. summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication." -# A multi-line description of how this component works (required). Used -# when rendering reference documentation. description: | When cell type information are known (e.g. results from cell type classification using reference), scMerge2 can use this information to construct pseudo-replicates and identify mutual nearest groups with cellTypes input. @@ -26,61 +13,25 @@ description: | references: doi: - 10.1073/pnas.1820006116 -# bibtex: -# - | -# @article{foo, -# title={Foo}, -# author={Bar}, -# journal={Baz}, -# year={2024} -# } links: - # URL to the documentation for this method (required). documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html - # URL to the code repository for this method (required). repository: https://github.com/SydneyBioX/scMerge - - - -# Metadata for your component info: - # Which normalisation method this component prefers to use (required). preferred_normalization: log_cpm - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. - -# Resources required to run the component resources: - # The script of your component (required) - type: r_script path: script.R - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - engines: - # Specifications for the Docker image for this component. - type: docker image: openproblems/base_r:1.0.0 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . setup: - type: apt packages: cmake - type: r bioc: - scmerge - - runners: - # This platform allows running the component natively - type: executable - # Allows turning the component into a Nextflow module / pipeline. - type: nextflow directives: label: [midtime,midmem,midcpu] diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml index 6fc3f07e..12ef3ab4 100644 --- a/src/methods/unsupervised_scmerge2/config.vsh.yaml +++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml @@ -1,20 +1,7 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test __merge__: ../../api/comp_method.yaml - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. name: unsupervised_scmerge2 -# A relatively short label, used when rendering visualisations (required) label: unsupervised Scmerge2 -# A one sentence summary of how this method works (required). Used when -# rendering summary tables. summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication." -# A multi-line description of how this component works (required). Used -# when rendering reference documentation. description: | scMerge works by integrating multiple single-cell RNA-seq datasets while correcting for batch effects and preserving biological signals. It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets. @@ -24,60 +11,25 @@ description: | references: doi: - 10.1073/pnas.1820006116 -# bibtex: -# - | -# @article{foo, -# title={Foo}, -# author={Bar}, -# journal={Baz}, -# year={2024} -# } links: - # URL to the documentation for this method (required). documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html - # URL to the code repository for this method (required). repository: https://github.com/SydneyBioX/scMerge - - - -# Metadata for your component info: - # Which normalisation method this component prefers to use (required). preferred_normalization: log_cpm - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. - -# Resources required to run the component resources: - # The script of your component (required) - type: r_script path: script.R - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - engines: - # Specifications for the Docker image for this component. - type: docker image: openproblems/base_r:1.0.0 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . setup: - type: apt packages: cmake - type: r bioc: - scmerge - runners: - # This platform allows running the component natively - type: executable - # Allows turning the component into a Nextflow module / pipeline. - type: nextflow directives: label: [midtime,midmem,midcpu] From d5eb2d12300c401a7e2e8888351cca889ae06345 Mon Sep 17 00:00:00 2001 From: seo <159482645+seohyonkim@users.noreply.github.com> Date: Wed, 23 Jul 2025 17:33:08 +0200 Subject: [PATCH 06/10] Update src/methods/semisupervised_scmerge2/config.vsh.yaml Co-authored-by: Luke Zappia --- src/methods/semisupervised_scmerge2/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/semisupervised_scmerge2/config.vsh.yaml b/src/methods/semisupervised_scmerge2/config.vsh.yaml index b8ed095b..b435f660 100644 --- a/src/methods/semisupervised_scmerge2/config.vsh.yaml +++ b/src/methods/semisupervised_scmerge2/config.vsh.yaml @@ -23,7 +23,7 @@ resources: path: script.R engines: - type: docker - image: openproblems/base_r:1.0.0 + image: openproblems/base_r:1 setup: - type: apt packages: cmake From 26712b611c395505acfaaaaa856defc3fbbc25f2 Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Wed, 23 Jul 2025 17:43:29 +0200 Subject: [PATCH 07/10] change image --- src/methods/unsupervised_scmerge2/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml index 12ef3ab4..23c28030 100644 --- a/src/methods/unsupervised_scmerge2/config.vsh.yaml +++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml @@ -21,7 +21,7 @@ resources: path: script.R engines: - type: docker - image: openproblems/base_r:1.0.0 + image: openproblems/base_r:1 setup: - type: apt packages: cmake From d0437b951c8e04d4533c36d309afeb5fb5224174 Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Tue, 9 Sep 2025 01:07:25 +0200 Subject: [PATCH 08/10] fixed scMerge2 --- .../semisupervised_scmerge2/config.vsh.yaml | 2 + src/methods/semisupervised_scmerge2/script.R | 60 +++++------------ .../unsupervised_scmerge2/config.vsh.yaml | 2 + src/methods/unsupervised_scmerge2/script.R | 64 ++++++------------- 4 files changed, 40 insertions(+), 88 deletions(-) diff --git a/src/methods/semisupervised_scmerge2/config.vsh.yaml b/src/methods/semisupervised_scmerge2/config.vsh.yaml index b435f660..fb514571 100644 --- a/src/methods/semisupervised_scmerge2/config.vsh.yaml +++ b/src/methods/semisupervised_scmerge2/config.vsh.yaml @@ -28,6 +28,8 @@ engines: - type: apt packages: cmake - type: r + cran: + - Matrix bioc: - scmerge runners: diff --git a/src/methods/semisupervised_scmerge2/script.R b/src/methods/semisupervised_scmerge2/script.R index 0b54fda2..13f9efd8 100644 --- a/src/methods/semisupervised_scmerge2/script.R +++ b/src/methods/semisupervised_scmerge2/script.R @@ -1,5 +1,7 @@ library(anndata) library(scMerge) +library(Matrix) +library(stats) ## VIASH START par <- list( @@ -13,42 +15,22 @@ meta <- list( cat("Reading input files\n") adata <- anndata::read_h5ad(par$input) -adata$obs["batch"] <- sub("\\+", "plus", adata$obs[["batch"]]) # Replace "+"" characters in batch names -anndataToSemiSupervisedScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = FALSE) { - exprsMat_all <- t(as.matrix(adata$layers[[layer]])) - batch_all <- as.character(adata$obs$batch) - celltypes_all <- as.character(adata$obs$cell_type) +anndataToSemiSupervisedScMerge2 <- function(adata, top_n = 1000, verbose = TRUE) { + counts <- t(as.matrix(adata$layers[["counts"]])) + rownames(counts) <- as.character(adata$var_names) + colnames(counts) <- as.character(adata$obs_names) - valid_cells <- !is.na(batch_all) - exprsMat <- exprsMat_all[, valid_cells, drop = FALSE] - batch <- batch_all[valid_cells] - cellTypes <- celltypes_all[valid_cells] + seg_df <- scSEGIndex(exprs_mat = counts) + seg_df <- seg_df[order(seg_df$segIdx, decreasing = TRUE), , drop = FALSE] + ctl <- rownames(seg_df)[seq_len(min(top_n, nrow(seg_df)))] - # Check overlap with human/mouse scSEG lists - gene_ids <- rownames(exprsMat) - species <- NULL - best_match <- 0 + exprsMat <- t(as.matrix(adata$layers[["normalized"]])) + rownames(exprsMat) <- as.character(adata$var_names) + colnames(exprsMat) <- as.character(adata$obs_names) - for (organism in names(seg_list)) { - scseg_name <- paste0(organism, "_scSEG") - seg_genes <- seg_list[[organism]][[scseg_name]] - overlap <- length(intersect(gene_ids, seg_genes)) - - if (overlap > best_match) { - best_match <- overlap - species <- organism - } - } - - if (is.null(species) || best_match == 0) { - stop("No match found between gene IDs in exprsMat and scSEG lists for human or mouse. ", - "Please ensure you're using Ensembl IDs for human or mouse, or provide a custom SEG list.") - } - - message("Detected species: ", species, " (matched ", best_match, " genes)") - - ctl <- seg_list[[species]][[paste0(species, "_scSEG")]] + batch <- as.character(adata$obs$batch) + cellTypes <- as.character(adata$obs$cell_type) scMerge2_res <- scMerge2( exprsMat = exprsMat, @@ -61,23 +43,15 @@ anndataToSemiSupervisedScMerge2 <- function(adata, seg_list, layer = "normalized return(scMerge2_res) } -data("segList_ensemblGeneID") cat("Run semi-supervised scMerge2\n") -scMerge2_res <- anndataToSemiSupervisedScMerge2( - adata = adata, - seg_list = segList_ensemblGeneID, - layer = "normalized", - verbose = TRUE -) +scMerge2_res <- anndataToSemiSupervisedScMerge2(adata, top_n = 1000, verbose = TRUE) cat("Store output\n") corrected_mat <- scMerge2_res$newY - -embedding <- prcomp(t(corrected_mat))$x[, 1:10] - +embedding <- prcomp(t(corrected_mat))$x[, 1:10, drop = FALSE] rownames(embedding) <- colnames(corrected_mat) output <- anndata::AnnData( @@ -85,7 +59,7 @@ output <- anndata::AnnData( obs = adata$obs[, c()], var = NULL, obsm = list( - X_emb = embedding[rownames(adata), , drop = FALSE] # match input cells + X_emb = embedding[as.character(adata$obs_names), , drop = FALSE] # match input cells ), uns = list( dataset_id = adata$uns[["dataset_id"]], diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml index 23c28030..8e921dc1 100644 --- a/src/methods/unsupervised_scmerge2/config.vsh.yaml +++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml @@ -26,6 +26,8 @@ engines: - type: apt packages: cmake - type: r + cran: + - Matrix bioc: - scmerge runners: diff --git a/src/methods/unsupervised_scmerge2/script.R b/src/methods/unsupervised_scmerge2/script.R index 7d682e00..503a1d6b 100644 --- a/src/methods/unsupervised_scmerge2/script.R +++ b/src/methods/unsupervised_scmerge2/script.R @@ -1,6 +1,7 @@ -cat(">> Load dependencies\n") -requireNamespace("anndata", quietly = TRUE) +library(anndata) library(scMerge) +library(Matrix) +library(stats) ## VIASH START par <- list( @@ -14,40 +15,22 @@ meta <- list( cat("Reading input files\n") adata <- anndata::read_h5ad(par$input) -adata$obs["batch"] <- sub("\\+", "plus", adata$obs[["batch"]]) # Replace "+"" characters in batch names -anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = FALSE) { - exprsMat_all <- t(as.matrix(adata$layers[[layer]])) - batch_all <- as.character(adata$obs$batch) +anndataToUnsupervisedScMerge2 <- function(adata, top_n = 1000, verbose = TRUE) { + counts <- t(as.matrix(adata$layers[["counts"]])) + rownames(counts) <- as.character(adata$var_names) + colnames(counts) <- as.character(adata$obs_names) - valid_cells <- !is.na(batch_all) - exprsMat <- exprsMat_all[, valid_cells, drop = FALSE] - batch <- batch_all[valid_cells] + seg_df <- scSEGIndex(exprs_mat = counts) + seg_df <- seg_df[order(seg_df$segIdx, decreasing = TRUE), , drop = FALSE] + ctl <- rownames(seg_df)[seq_len(min(top_n, nrow(seg_df)))] - # Check overlap with human/mouse scSEG lists - gene_ids <- rownames(exprsMat) - species <- NULL - best_match <- 0 + exprsMat <- t(as.matrix(adata$layers[["normalized"]])) + rownames(exprsMat) <- as.character(adata$var_names) + colnames(exprsMat) <- as.character(adata$obs_names) - for (organism in names(seg_list)) { - scseg_name <- paste0(organism, "_scSEG") - seg_genes <- seg_list[[organism]][[scseg_name]] - overlap <- length(intersect(gene_ids, seg_genes)) - - if (overlap > best_match) { - best_match <- overlap - species <- organism - } - } - - if (is.null(species) || best_match == 0) { - stop("No match found between gene IDs in exprsMat and scSEG lists for human or mouse. ", - "Please ensure you're using Ensembl IDs for human or mouse, or provide a custom SEG list.") - } - - message("Detected species: ", species, " (matched ", best_match, " genes)") - - ctl <- seg_list[[species]][[paste0(species, "_scSEG")]] + batch <- as.character(adata$obs$batch) + cellTypes <- as.character(adata$obs$cell_type) scMerge2_res <- scMerge2( exprsMat = exprsMat, @@ -59,23 +42,14 @@ anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = F return(scMerge2_res) } -data("segList_ensemblGeneID") +cat("Run unsupervised scMerge2\n") -cat("Run scMerge2\n") - -scMerge2_res <- anndataToScMerge2( - adata = adata, - seg_list = segList_ensemblGeneID, - layer = "normalized", - verbose = TRUE -) +scMerge2_res <- anndataToUnsupervisedScMerge2(adata, top_n = 1000L, verbose = TRUE) cat("Store output\n") corrected_mat <- scMerge2_res$newY - -embedding <- prcomp(t(corrected_mat))$x[, 1:10] - +embedding <- prcomp(t(corrected_mat))$x[, 1:10, drop = FALSE] rownames(embedding) <- colnames(corrected_mat) output <- anndata::AnnData( @@ -83,7 +57,7 @@ output <- anndata::AnnData( obs = adata$obs[, c()], var = NULL, obsm = list( - X_emb = embedding[rownames(adata), , drop = FALSE] # match input cells + X_emb = embedding[as.character(adata$obs_names), , drop = FALSE] # match input cells ), uns = list( dataset_id = adata$uns[["dataset_id"]], From 423057eb80d7a182a2272f69c263fd99b5912adc Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Tue, 9 Sep 2025 01:17:12 +0200 Subject: [PATCH 09/10] add method_types to config --- src/methods/semisupervised_scmerge2/config.vsh.yaml | 1 + src/methods/unsupervised_scmerge2/config.vsh.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/src/methods/semisupervised_scmerge2/config.vsh.yaml b/src/methods/semisupervised_scmerge2/config.vsh.yaml index fb514571..723a3058 100644 --- a/src/methods/semisupervised_scmerge2/config.vsh.yaml +++ b/src/methods/semisupervised_scmerge2/config.vsh.yaml @@ -17,6 +17,7 @@ links: documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html repository: https://github.com/SydneyBioX/scMerge info: + method_types: [feature] preferred_normalization: log_cpm resources: - type: r_script diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml index 8e921dc1..9bff630d 100644 --- a/src/methods/unsupervised_scmerge2/config.vsh.yaml +++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml @@ -15,6 +15,7 @@ links: documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html repository: https://github.com/SydneyBioX/scMerge info: + method_types: [feature] preferred_normalization: log_cpm resources: - type: r_script From dd6b1ce115b9afdc85d9faf58dfe7720496587ba Mon Sep 17 00:00:00 2001 From: seohyonkim Date: Tue, 9 Sep 2025 01:18:35 +0200 Subject: [PATCH 10/10] add to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 09d672d0..1f799b71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## New functionality * Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #52). +* Added `methods/semisupervised_scmerge2` and `methods/unsupervised_scmerge2` components (PR #63). ## Minor changes