From 3e5c8b9fa9ce05151bdd7dbcf769337b761ca891 Mon Sep 17 00:00:00 2001
From: seohyonkim <seohyon.l.kim@gmail.com>
Date: Wed, 4 Jun 2025 22:40:06 +0200
Subject: [PATCH 01/10] Add working unsupervised scMerge2 method; template
 comments still need to be removed

---
 .../unsupervised_scmerge2/config.vsh.yaml     | 85 +++++++++++++++++++
 src/methods/unsupervised_scmerge2/script.R    | 81 ++++++++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100644 src/methods/unsupervised_scmerge2/config.vsh.yaml
 create mode 100644 src/methods/unsupervised_scmerge2/script.R

diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml
new file mode 100644
index 00000000..bd732295
--- /dev/null
+++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml
@@ -0,0 +1,85 @@
+# The API specifies which type of component this is.
+# It contains specifications for:
+#   - The input/output files
+#   - Common parameters
+#   - A unit test
+__merge__: ../../api/comp_method.yaml
+
+# A unique identifier for your component (required).
+# Can contain only lowercase letters or underscores.
+name: unsupervised_scmerge2
+# A relatively short label, used when rendering visualisations (required)
+label: unsupervised Scmerge2
+# A one sentence summary of how this method works (required). Used when 
+# rendering summary tables.
+summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication."
+# A multi-line description of how this component works (required). Used
+# when rendering reference documentation.
+description: |
+  scMerge works by integrating multiple single-cell RNA-seq datasets while correcting for batch effects and preserving biological signals.
+  It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets.
+  Then, it uses a factor analysis model on these SEGs to estimate and remove unwanted variation.
+  To improve accuracy, scMerge creates pseudo-replicates which serve as anchors for alignment.
+  Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis..
+references:
+  doi: 
+    - 10.1073/pnas.1820006116
+#   bibtex:
+#     - |
+#       @article{foo,
+#         title={Foo},
+#         author={Bar},
+#         journal={Baz},
+#         year={2024}
+#       }
+links:
+  # URL to the documentation for this method (required).
+  documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html
+  # URL to the code repository for this method (required).
+  repository: https://github.com/SydneyBioX/scMerge
+
+
+
+# Metadata for your component
+info:
+  # Which normalisation method this component prefers to use (required).
+  preferred_normalization: log_cpm
+
+# Component-specific parameters (optional)
+# arguments:
+#   - name: "--n_neighbors"
+#     type: "integer"
+#     default: 5
+#     description: Number of neighbors to use.
+
+# Resources required to run the component
+resources:
+  # The script of your component (required)
+  - type: r_script
+    path: script.R
+  # Additional resources your script needs (optional)
+  # - type: file
+  #   path: weights.pt
+
+engines:
+  # Specifications for the Docker image for this component.
+  - type: docker
+    image: openproblems/base_r:1.0.0
+    # Add custom dependencies here (optional). For more information, see
+    # https://viash.io/reference/config/engines/docker/#setup .
+    setup:
+      - type: apt
+        packages: cmake
+      - type: r
+        bioc: 
+        - scmerge
+        - org.Mm.eg.db
+        - org.Hs.eg.db
+
+runners:
+  # This platform allows running the component natively
+  - type: executable
+  # Allows turning the component into a Nextflow module / pipeline.
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/methods/unsupervised_scmerge2/script.R b/src/methods/unsupervised_scmerge2/script.R
new file mode 100644
index 00000000..d6ed10a9
--- /dev/null
+++ b/src/methods/unsupervised_scmerge2/script.R
@@ -0,0 +1,81 @@
+cat(">> Load dependencies\n")
+requireNamespace("anndata", quietly = TRUE)
+library(scMerge)
+library(org.Hs.eg.db)
+library(org.Mm.eg.db)
+
+## VIASH START
+par <- list(
+  input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
+  output = "output.h5ad"
+)
+meta <- list(
+  name = "unsupervised_scmerge2"
+)
+## VIASH END
+
+cat("Reading input files\n")
+adata <- anndata::read_h5ad(par$input)
+adata$obs["batch"] <- sub("\\+", "plus", adata$obs[["batch"]]) # Replace "+"" characters in batch names
+
+anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = FALSE) {
+  exprsMat_all <- t(as.matrix(adata$layers[[layer]]))
+  batch_all <- as.character(adata$obs$batch)
+
+  valid_cells <- !is.na(batch_all)
+  exprsMat <- exprsMat_all[, valid_cells, drop = FALSE]
+  batch <- batch_all[valid_cells]
+  
+  ctl_flat <- unlist(seg_list, recursive = FALSE)
+  ctl_matches <- ctl_flat[grepl("scSEG$", names(ctl_flat))]
+  if (length(ctl_matches) == 0) {
+    stop("No stably expressed gene (scSEG) list found in the provided seg_list.")
+  }
+  ctl <- ctl_matches[[1]]
+
+  scMerge2_res <- scMerge2(
+    exprsMat = exprsMat,
+    batch = batch,
+    ctl = ctl,
+    verbose = verbose
+  )
+
+  return(scMerge2_res)
+}
+
+data("segList_ensemblGeneID") # only for human and mouse- is that okay?
+
+cat("Run scMerge2\n")
+
+scMerge2_res <- anndataToScMerge2(
+  adata = adata,
+  seg_list = segList_ensemblGeneID,
+  layer = "normalized",
+  verbose = TRUE
+)
+
+
+cat("Store output\n")
+corrected_mat <- scMerge2_res$newY
+
+# PCA as embedding - is this right?
+embedding <- prcomp(t(corrected_mat))$x[, 1:10]
+
+rownames(embedding) <- colnames(corrected_mat)
+
+output <- anndata::AnnData(
+  X = NULL,
+  obs = adata$obs[, c()],
+  var = NULL,
+  obsm = list(
+    X_emb = embedding[rownames(adata), , drop = FALSE]  # match input cells
+  ),
+  uns = list(
+    dataset_id = adata$uns[["dataset_id"]],
+    normalization_id = adata$uns[["normalization_id"]],
+    method_id = meta$name 
+  ),
+  shape = adata$shape
+)
+cat("Write output AnnData to file\n")
+output$write_h5ad(par[["output"]], compression = "gzip")

From 1cd2b42b75f0058dfa2302feb06c113bb2abf4bd Mon Sep 17 00:00:00 2001
From: seohyonkim <seohyon.l.kim@gmail.com>
Date: Fri, 6 Jun 2025 18:00:13 +0200
Subject: [PATCH 02/10] raise error for unmatched species

---
 src/methods/unsupervised_scmerge2/script.R | 33 ++++++++++++++++------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/methods/unsupervised_scmerge2/script.R b/src/methods/unsupervised_scmerge2/script.R
index d6ed10a9..3c766444 100644
--- a/src/methods/unsupervised_scmerge2/script.R
+++ b/src/methods/unsupervised_scmerge2/script.R
@@ -25,13 +25,31 @@ anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = F
   valid_cells <- !is.na(batch_all)
   exprsMat <- exprsMat_all[, valid_cells, drop = FALSE]
   batch <- batch_all[valid_cells]
-  
-  ctl_flat <- unlist(seg_list, recursive = FALSE)
-  ctl_matches <- ctl_flat[grepl("scSEG$", names(ctl_flat))]
-  if (length(ctl_matches) == 0) {
-    stop("No stably expressed gene (scSEG) list found in the provided seg_list.")
+
+  # Check overlap with human/mouse scSEG lists
+  gene_ids <- rownames(exprsMat)
+  species <- NULL
+  best_match <- 0
+
+  for (organism in names(seg_list)) {
+    scseg_name <- paste0(organism, "_scSEG")
+    seg_genes <- seg_list[[organism]][[scseg_name]]
+    overlap <- length(intersect(gene_ids, seg_genes))
+
+    if (overlap > best_match) {
+      best_match <- overlap
+      species <- organism
+    }
   }
-  ctl <- ctl_matches[[1]]
+
+  if (is.null(species) || best_match == 0) {
+    stop("No match found between gene IDs in exprsMat and scSEG lists for human or mouse. ",
+         "Please ensure you're using Ensembl IDs for human or mouse, or provide a custom SEG list.")
+  }
+
+  message("Detected species: ", species, " (matched ", best_match, " genes)")
+
+  ctl <- seg_list[[species]][[paste0(species, "_scSEG")]]
 
   scMerge2_res <- scMerge2(
     exprsMat = exprsMat,
@@ -43,7 +61,7 @@ anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = F
   return(scMerge2_res)
 }
 
-data("segList_ensemblGeneID") # only for human and mouse- is that okay?
+data("segList_ensemblGeneID")
 
 cat("Run scMerge2\n")
 
@@ -58,7 +76,6 @@ scMerge2_res <- anndataToScMerge2(
 cat("Store output\n")
 corrected_mat <- scMerge2_res$newY
 
-# PCA as embedding - is this right?
 embedding <- prcomp(t(corrected_mat))$x[, 1:10]
 
 rownames(embedding) <- colnames(corrected_mat)

From 68c3b98f31bf83a5f56bbf102a907b742bb03b46 Mon Sep 17 00:00:00 2001
From: seohyonkim <seohyon.l.kim@gmail.com>
Date: Wed, 25 Jun 2025 14:55:21 +0200
Subject: [PATCH 03/10] clean unsupervised scmerge2

---
 src/methods/unsupervised_scmerge2/config.vsh.yaml | 4 +---
 src/methods/unsupervised_scmerge2/script.R        | 2 --
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml
index bd732295..6fc3f07e 100644
--- a/src/methods/unsupervised_scmerge2/config.vsh.yaml
+++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml
@@ -20,7 +20,7 @@ description: |
   It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets.
   Then, it uses a factor analysis model on these SEGs to estimate and remove unwanted variation.
   To improve accuracy, scMerge creates pseudo-replicates which serve as anchors for alignment.
-  Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis..
+  Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis.
 references:
   doi: 
     - 10.1073/pnas.1820006116
@@ -73,8 +73,6 @@ engines:
       - type: r
         bioc: 
         - scmerge
-        - org.Mm.eg.db
-        - org.Hs.eg.db
 
 runners:
   # This platform allows running the component natively
diff --git a/src/methods/unsupervised_scmerge2/script.R b/src/methods/unsupervised_scmerge2/script.R
index 3c766444..7d682e00 100644
--- a/src/methods/unsupervised_scmerge2/script.R
+++ b/src/methods/unsupervised_scmerge2/script.R
@@ -1,8 +1,6 @@
 cat(">> Load dependencies\n")
 requireNamespace("anndata", quietly = TRUE)
 library(scMerge)
-library(org.Hs.eg.db)
-library(org.Mm.eg.db)
 
 ## VIASH START
 par <- list(

From 9836495559a853a5692b204ae2f64f97209ecab2 Mon Sep 17 00:00:00 2001
From: seohyonkim <seohyon.l.kim@gmail.com>
Date: Wed, 25 Jun 2025 14:55:37 +0200
Subject: [PATCH 04/10] working semi-supervised scmerge2

---
 .../semisupervised_scmerge2/config.vsh.yaml   | 86 ++++++++++++++++
 src/methods/semisupervised_scmerge2/script.R  | 99 +++++++++++++++++++
 2 files changed, 185 insertions(+)
 create mode 100644 src/methods/semisupervised_scmerge2/config.vsh.yaml
 create mode 100644 src/methods/semisupervised_scmerge2/script.R

diff --git a/src/methods/semisupervised_scmerge2/config.vsh.yaml b/src/methods/semisupervised_scmerge2/config.vsh.yaml
new file mode 100644
index 00000000..34e3990e
--- /dev/null
+++ b/src/methods/semisupervised_scmerge2/config.vsh.yaml
@@ -0,0 +1,86 @@
+# The API specifies which type of component this is.
+# It contains specifications for:
+#   - The input/output files
+#   - Common parameters
+#   - A unit test
+__merge__: ../../api/comp_method.yaml
+
+# A unique identifier for your component (required).
+# Can contain only lowercase letters or underscores.
+name: semisupervised_scmerge2
+# A relatively short label, used when rendering visualisations (required)
+label: Semi-supervised Scmerge2
+# A one sentence summary of how this method works (required). Used when 
+# rendering summary tables.
+summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication."
+# A multi-line description of how this component works (required). Used
+# when rendering reference documentation.
+description: |
+  When cell type information are known (e.g. results from cell type classification using reference),
+  scMerge2 can use this information to construct pseudo-replicates and identify mutual nearest groups with cellTypes input.
+  scMerge works by integrating multiple single-cell RNA-seq datasets while correcting for batch effects and preserving biological signals.
+  It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets.
+  Then, it uses a factor analysis model on these SEGs to estimate and remove unwanted variation.
+  To improve accuracy, scMerge creates pseudo-replicates which serve as anchors for alignment.
+  Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis.
+references:
+  doi: 
+    - 10.1073/pnas.1820006116
+#   bibtex:
+#     - |
+#       @article{foo,
+#         title={Foo},
+#         author={Bar},
+#         journal={Baz},
+#         year={2024}
+#       }
+links:
+  # URL to the documentation for this method (required).
+  documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html
+  # URL to the code repository for this method (required).
+  repository: https://github.com/SydneyBioX/scMerge
+
+
+
+# Metadata for your component
+info:
+  # Which normalisation method this component prefers to use (required).
+  preferred_normalization: log_cpm
+
+# Component-specific parameters (optional)
+# arguments:
+#   - name: "--n_neighbors"
+#     type: "integer"
+#     default: 5
+#     description: Number of neighbors to use.
+
+# Resources required to run the component
+resources:
+  # The script of your component (required)
+  - type: r_script
+    path: script.R
+  # Additional resources your script needs (optional)
+  # - type: file
+  #   path: weights.pt
+
+engines:
+  # Specifications for the Docker image for this component.
+  - type: docker
+    image: openproblems/base_r:1.0.0
+    # Add custom dependencies here (optional). For more information, see
+    # https://viash.io/reference/config/engines/docker/#setup .
+    setup:
+      - type: apt
+        packages: cmake
+      - type: r
+        bioc: 
+        - scmerge
+
+
+runners:
+  # This platform allows running the component natively
+  - type: executable
+  # Allows turning the component into a Nextflow module / pipeline.
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/methods/semisupervised_scmerge2/script.R b/src/methods/semisupervised_scmerge2/script.R
new file mode 100644
index 00000000..0b54fda2
--- /dev/null
+++ b/src/methods/semisupervised_scmerge2/script.R
@@ -0,0 +1,99 @@
+library(anndata)
+library(scMerge)
+
+## VIASH START
+par <- list(
+  input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
+  output = "output.h5ad"
+)
+meta <- list(
+  name = "semisupervised_scmerge2"
+)
+## VIASH END
+
+cat("Reading input files\n")
+adata <- anndata::read_h5ad(par$input)
+adata$obs["batch"] <- sub("\\+", "plus", adata$obs[["batch"]]) # Replace "+"" characters in batch names
+
+anndataToSemiSupervisedScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = FALSE) {
+  exprsMat_all <- t(as.matrix(adata$layers[[layer]]))
+  batch_all <- as.character(adata$obs$batch)
+  celltypes_all <- as.character(adata$obs$cell_type)
+
+  valid_cells <- !is.na(batch_all)
+  exprsMat <- exprsMat_all[, valid_cells, drop = FALSE]
+  batch <- batch_all[valid_cells]
+  cellTypes <- celltypes_all[valid_cells]
+
+  # Check overlap with human/mouse scSEG lists
+  gene_ids <- rownames(exprsMat)
+  species <- NULL
+  best_match <- 0
+
+  for (organism in names(seg_list)) {
+    scseg_name <- paste0(organism, "_scSEG")
+    seg_genes <- seg_list[[organism]][[scseg_name]]
+    overlap <- length(intersect(gene_ids, seg_genes))
+
+    if (overlap > best_match) {
+      best_match <- overlap
+      species <- organism
+    }
+  }
+
+  if (is.null(species) || best_match == 0) {
+    stop("No match found between gene IDs in exprsMat and scSEG lists for human or mouse. ",
+         "Please ensure you're using Ensembl IDs for human or mouse, or provide a custom SEG list.")
+  }
+
+  message("Detected species: ", species, " (matched ", best_match, " genes)")
+
+  ctl <- seg_list[[species]][[paste0(species, "_scSEG")]]
+
+  scMerge2_res <- scMerge2(
+    exprsMat = exprsMat,
+    batch = batch,
+    cellTypes = cellTypes,
+    ctl = ctl,
+    verbose = verbose
+  )
+
+  return(scMerge2_res)
+}
+
+data("segList_ensemblGeneID")
+
+cat("Run semi-supervised scMerge2\n")
+
+scMerge2_res <- anndataToSemiSupervisedScMerge2(
+  adata = adata,
+  seg_list = segList_ensemblGeneID,
+  layer = "normalized",
+  verbose = TRUE
+)
+
+
+cat("Store output\n")
+corrected_mat <- scMerge2_res$newY
+
+embedding <- prcomp(t(corrected_mat))$x[, 1:10]
+
+rownames(embedding) <- colnames(corrected_mat)
+
+output <- anndata::AnnData(
+  X = NULL,
+  obs = adata$obs[, c()],
+  var = NULL,
+  obsm = list(
+    X_emb = embedding[rownames(adata), , drop = FALSE]  # match input cells
+  ),
+  uns = list(
+    dataset_id = adata$uns[["dataset_id"]],
+    normalization_id = adata$uns[["normalization_id"]],
+    method_id = meta$name 
+  ),
+  shape = adata$shape
+)
+
+cat("Write output AnnData to file\n")
+output$write_h5ad(par[["output"]], compression = "gzip")

From e4a1daa2ea4ac041fd221aca1b2054e4e0dc95d9 Mon Sep 17 00:00:00 2001
From: seohyonkim <seohyon.l.kim@gmail.com>
Date: Wed, 23 Jul 2025 13:19:36 +0200
Subject: [PATCH 05/10] remove comments

---
 .../semisupervised_scmerge2/config.vsh.yaml   | 49 -------------------
 .../unsupervised_scmerge2/config.vsh.yaml     | 48 ------------------
 2 files changed, 97 deletions(-)

diff --git a/src/methods/semisupervised_scmerge2/config.vsh.yaml b/src/methods/semisupervised_scmerge2/config.vsh.yaml
index 34e3990e..b8ed095b 100644
--- a/src/methods/semisupervised_scmerge2/config.vsh.yaml
+++ b/src/methods/semisupervised_scmerge2/config.vsh.yaml
@@ -1,20 +1,7 @@
-# The API specifies which type of component this is.
-# It contains specifications for:
-#   - The input/output files
-#   - Common parameters
-#   - A unit test
 __merge__: ../../api/comp_method.yaml
-
-# A unique identifier for your component (required).
-# Can contain only lowercase letters or underscores.
 name: semisupervised_scmerge2
-# A relatively short label, used when rendering visualisations (required)
 label: Semi-supervised Scmerge2
-# A one sentence summary of how this method works (required). Used when 
-# rendering summary tables.
 summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication."
-# A multi-line description of how this component works (required). Used
-# when rendering reference documentation.
 description: |
   When cell type information are known (e.g. results from cell type classification using reference),
   scMerge2 can use this information to construct pseudo-replicates and identify mutual nearest groups with cellTypes input.
@@ -26,61 +13,25 @@ description: |
 references:
   doi: 
     - 10.1073/pnas.1820006116
-#   bibtex:
-#     - |
-#       @article{foo,
-#         title={Foo},
-#         author={Bar},
-#         journal={Baz},
-#         year={2024}
-#       }
 links:
-  # URL to the documentation for this method (required).
   documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html
-  # URL to the code repository for this method (required).
   repository: https://github.com/SydneyBioX/scMerge
-
-
-
-# Metadata for your component
 info:
-  # Which normalisation method this component prefers to use (required).
   preferred_normalization: log_cpm
-
-# Component-specific parameters (optional)
-# arguments:
-#   - name: "--n_neighbors"
-#     type: "integer"
-#     default: 5
-#     description: Number of neighbors to use.
-
-# Resources required to run the component
 resources:
-  # The script of your component (required)
   - type: r_script
     path: script.R
-  # Additional resources your script needs (optional)
-  # - type: file
-  #   path: weights.pt
-
 engines:
-  # Specifications for the Docker image for this component.
   - type: docker
     image: openproblems/base_r:1.0.0
-    # Add custom dependencies here (optional). For more information, see
-    # https://viash.io/reference/config/engines/docker/#setup .
     setup:
       - type: apt
         packages: cmake
       - type: r
         bioc: 
         - scmerge
-
-
 runners:
-  # This platform allows running the component natively
   - type: executable
-  # Allows turning the component into a Nextflow module / pipeline.
   - type: nextflow
     directives:
       label: [midtime,midmem,midcpu]
diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml
index 6fc3f07e..12ef3ab4 100644
--- a/src/methods/unsupervised_scmerge2/config.vsh.yaml
+++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml
@@ -1,20 +1,7 @@
-# The API specifies which type of component this is.
-# It contains specifications for:
-#   - The input/output files
-#   - Common parameters
-#   - A unit test
 __merge__: ../../api/comp_method.yaml
-
-# A unique identifier for your component (required).
-# Can contain only lowercase letters or underscores.
 name: unsupervised_scmerge2
-# A relatively short label, used when rendering visualisations (required)
 label: unsupervised Scmerge2
-# A one sentence summary of how this method works (required). Used when 
-# rendering summary tables.
 summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication."
-# A multi-line description of how this component works (required). Used
-# when rendering reference documentation.
 description: |
   scMerge works by integrating multiple single-cell RNA-seq datasets while correcting for batch effects and preserving biological signals.
   It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets.
@@ -24,60 +11,25 @@ description: |
 references:
   doi: 
     - 10.1073/pnas.1820006116
-#   bibtex:
-#     - |
-#       @article{foo,
-#         title={Foo},
-#         author={Bar},
-#         journal={Baz},
-#         year={2024}
-#       }
 links:
-  # URL to the documentation for this method (required).
   documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html
-  # URL to the code repository for this method (required).
   repository: https://github.com/SydneyBioX/scMerge
-
-
-
-# Metadata for your component
 info:
-  # Which normalisation method this component prefers to use (required).
   preferred_normalization: log_cpm
-
-# Component-specific parameters (optional)
-# arguments:
-#   - name: "--n_neighbors"
-#     type: "integer"
-#     default: 5
-#     description: Number of neighbors to use.
-
-# Resources required to run the component
 resources:
-  # The script of your component (required)
   - type: r_script
     path: script.R
-  # Additional resources your script needs (optional)
-  # - type: file
-  #   path: weights.pt
-
 engines:
-  # Specifications for the Docker image for this component.
   - type: docker
     image: openproblems/base_r:1.0.0
-    # Add custom dependencies here (optional). For more information, see
-    # https://viash.io/reference/config/engines/docker/#setup .
     setup:
       - type: apt
         packages: cmake
       - type: r
         bioc: 
         - scmerge
-
 runners:
-  # This platform allows running the component natively
   - type: executable
-  # Allows turning the component into a Nextflow module / pipeline.
   - type: nextflow
     directives:
       label: [midtime,midmem,midcpu]

From d5eb2d12300c401a7e2e8888351cca889ae06345 Mon Sep 17 00:00:00 2001
From: seo <159482645+seohyonkim@users.noreply.github.com>
Date: Wed, 23 Jul 2025 17:33:08 +0200
Subject: [PATCH 06/10] Update
 src/methods/semisupervised_scmerge2/config.vsh.yaml

Co-authored-by: Luke Zappia <lazappi@users.noreply.github.com>
---
 src/methods/semisupervised_scmerge2/config.vsh.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/methods/semisupervised_scmerge2/config.vsh.yaml b/src/methods/semisupervised_scmerge2/config.vsh.yaml
index b8ed095b..b435f660 100644
--- a/src/methods/semisupervised_scmerge2/config.vsh.yaml
+++ b/src/methods/semisupervised_scmerge2/config.vsh.yaml
@@ -23,7 +23,7 @@ resources:
     path: script.R
 engines:
   - type: docker
-    image: openproblems/base_r:1.0.0
+    image: openproblems/base_r:1
     setup:
       - type: apt
         packages: cmake

From 26712b611c395505acfaaaaa856defc3fbbc25f2 Mon Sep 17 00:00:00 2001
From: seohyonkim <seohyon.l.kim@gmail.com>
Date: Wed, 23 Jul 2025 17:43:29 +0200
Subject: [PATCH 07/10] change Docker image to openproblems/base_r:1

---
 src/methods/unsupervised_scmerge2/config.vsh.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml
index 12ef3ab4..23c28030 100644
--- a/src/methods/unsupervised_scmerge2/config.vsh.yaml
+++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml
@@ -21,7 +21,7 @@ resources:
     path: script.R
 engines:
   - type: docker
-    image: openproblems/base_r:1.0.0
+    image: openproblems/base_r:1
     setup:
       - type: apt
         packages: cmake

From d0437b951c8e04d4533c36d309afeb5fb5224174 Mon Sep 17 00:00:00 2001
From: seohyonkim <seohyon.l.kim@gmail.com>
Date: Tue, 9 Sep 2025 01:07:25 +0200
Subject: [PATCH 08/10] fix scMerge2: compute SEGs with scSEGIndex

---
 .../semisupervised_scmerge2/config.vsh.yaml   |  2 +
 src/methods/semisupervised_scmerge2/script.R  | 60 +++++------------
 .../unsupervised_scmerge2/config.vsh.yaml     |  2 +
 src/methods/unsupervised_scmerge2/script.R    | 64 ++++++-------------
 4 files changed, 40 insertions(+), 88 deletions(-)

diff --git a/src/methods/semisupervised_scmerge2/config.vsh.yaml b/src/methods/semisupervised_scmerge2/config.vsh.yaml
index b435f660..fb514571 100644
--- a/src/methods/semisupervised_scmerge2/config.vsh.yaml
+++ b/src/methods/semisupervised_scmerge2/config.vsh.yaml
@@ -28,6 +28,8 @@ engines:
       - type: apt
         packages: cmake
       - type: r
+        cran:
+        - Matrix
         bioc: 
         - scmerge
 runners:
diff --git a/src/methods/semisupervised_scmerge2/script.R b/src/methods/semisupervised_scmerge2/script.R
index 0b54fda2..13f9efd8 100644
--- a/src/methods/semisupervised_scmerge2/script.R
+++ b/src/methods/semisupervised_scmerge2/script.R
@@ -1,5 +1,7 @@
 library(anndata)
 library(scMerge)
+library(Matrix)
+library(stats)
 
 ## VIASH START
 par <- list(
@@ -13,42 +15,22 @@ meta <- list(
 
 cat("Reading input files\n")
 adata <- anndata::read_h5ad(par$input)
-adata$obs["batch"] <- sub("\\+", "plus", adata$obs[["batch"]]) # Replace "+"" characters in batch names
 
-anndataToSemiSupervisedScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = FALSE) {
-  exprsMat_all <- t(as.matrix(adata$layers[[layer]]))
-  batch_all <- as.character(adata$obs$batch)
-  celltypes_all <- as.character(adata$obs$cell_type)
+anndataToSemiSupervisedScMerge2 <- function(adata, top_n = 1000, verbose = TRUE) {
+  counts <- t(as.matrix(adata$layers[["counts"]]))
+  rownames(counts) <- as.character(adata$var_names)
+  colnames(counts) <- as.character(adata$obs_names)
 
-  valid_cells <- !is.na(batch_all)
-  exprsMat <- exprsMat_all[, valid_cells, drop = FALSE]
-  batch <- batch_all[valid_cells]
-  cellTypes <- celltypes_all[valid_cells]
+  seg_df <- scSEGIndex(exprs_mat = counts)
+  seg_df <- seg_df[order(seg_df$segIdx, decreasing = TRUE), , drop = FALSE]
+  ctl <- rownames(seg_df)[seq_len(min(top_n, nrow(seg_df)))]
 
-  # Check overlap with human/mouse scSEG lists
-  gene_ids <- rownames(exprsMat)
-  species <- NULL
-  best_match <- 0
+  exprsMat <- t(as.matrix(adata$layers[["normalized"]]))
+  rownames(exprsMat) <- as.character(adata$var_names)
+  colnames(exprsMat) <- as.character(adata$obs_names)
 
-  for (organism in names(seg_list)) {
-    scseg_name <- paste0(organism, "_scSEG")
-    seg_genes <- seg_list[[organism]][[scseg_name]]
-    overlap <- length(intersect(gene_ids, seg_genes))
-
-    if (overlap > best_match) {
-      best_match <- overlap
-      species <- organism
-    }
-  }
-
-  if (is.null(species) || best_match == 0) {
-    stop("No match found between gene IDs in exprsMat and scSEG lists for human or mouse. ",
-         "Please ensure you're using Ensembl IDs for human or mouse, or provide a custom SEG list.")
-  }
-
-  message("Detected species: ", species, " (matched ", best_match, " genes)")
-
-  ctl <- seg_list[[species]][[paste0(species, "_scSEG")]]
+  batch     <- as.character(adata$obs$batch)
+  cellTypes <- as.character(adata$obs$cell_type)
 
   scMerge2_res <- scMerge2(
     exprsMat = exprsMat,
@@ -61,23 +43,15 @@ anndataToSemiSupervisedScMerge2 <- function(adata, seg_list, layer = "normalized
   return(scMerge2_res)
 }
 
-data("segList_ensemblGeneID")
 
 cat("Run semi-supervised scMerge2\n")
 
-scMerge2_res <- anndataToSemiSupervisedScMerge2(
-  adata = adata,
-  seg_list = segList_ensemblGeneID,
-  layer = "normalized",
-  verbose = TRUE
-)
+scMerge2_res <- anndataToSemiSupervisedScMerge2(adata, top_n = 1000, verbose = TRUE)
 
 
 cat("Store output\n")
 corrected_mat <- scMerge2_res$newY
-
-embedding <- prcomp(t(corrected_mat))$x[, 1:10]
-
+embedding <- prcomp(t(corrected_mat))$x[, 1:10, drop = FALSE]
 rownames(embedding) <- colnames(corrected_mat)
 
 output <- anndata::AnnData(
@@ -85,7 +59,7 @@ output <- anndata::AnnData(
   obs = adata$obs[, c()],
   var = NULL,
   obsm = list(
-    X_emb = embedding[rownames(adata), , drop = FALSE]  # match input cells
+    X_emb = embedding[as.character(adata$obs_names), , drop = FALSE]  # match input cells
   ),
   uns = list(
     dataset_id = adata$uns[["dataset_id"]],
diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml
index 23c28030..8e921dc1 100644
--- a/src/methods/unsupervised_scmerge2/config.vsh.yaml
+++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml
@@ -26,6 +26,8 @@ engines:
       - type: apt
         packages: cmake
       - type: r
+        cran:
+        - Matrix
         bioc: 
         - scmerge
 runners:
diff --git a/src/methods/unsupervised_scmerge2/script.R b/src/methods/unsupervised_scmerge2/script.R
index 7d682e00..503a1d6b 100644
--- a/src/methods/unsupervised_scmerge2/script.R
+++ b/src/methods/unsupervised_scmerge2/script.R
@@ -1,6 +1,7 @@
-cat(">> Load dependencies\n")
-requireNamespace("anndata", quietly = TRUE)
+library(anndata)
 library(scMerge)
+library(Matrix)
+library(stats)
 
 ## VIASH START
 par <- list(
@@ -14,40 +15,22 @@ meta <- list(
 
 cat("Reading input files\n")
 adata <- anndata::read_h5ad(par$input)
-adata$obs["batch"] <- sub("\\+", "plus", adata$obs[["batch"]]) # Replace "+"" characters in batch names
 
-anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = FALSE) {
-  exprsMat_all <- t(as.matrix(adata$layers[[layer]]))
-  batch_all <- as.character(adata$obs$batch)
+anndataToUnsupervisedScMerge2 <- function(adata, top_n = 1000, verbose = TRUE) {
+  counts <- t(as.matrix(adata$layers[["counts"]]))
+  rownames(counts) <- as.character(adata$var_names)
+  colnames(counts) <- as.character(adata$obs_names)
 
-  valid_cells <- !is.na(batch_all)
-  exprsMat <- exprsMat_all[, valid_cells, drop = FALSE]
-  batch <- batch_all[valid_cells]
+  seg_df <- scSEGIndex(exprs_mat = counts)
+  seg_df <- seg_df[order(seg_df$segIdx, decreasing = TRUE), , drop = FALSE]
+  ctl <- rownames(seg_df)[seq_len(min(top_n, nrow(seg_df)))]
 
-  # Check overlap with human/mouse scSEG lists
-  gene_ids <- rownames(exprsMat)
-  species <- NULL
-  best_match <- 0
+  exprsMat <- t(as.matrix(adata$layers[["normalized"]]))
+  rownames(exprsMat) <- as.character(adata$var_names)
+  colnames(exprsMat) <- as.character(adata$obs_names)
 
-  for (organism in names(seg_list)) {
-    scseg_name <- paste0(organism, "_scSEG")
-    seg_genes <- seg_list[[organism]][[scseg_name]]
-    overlap <- length(intersect(gene_ids, seg_genes))
-
-    if (overlap > best_match) {
-      best_match <- overlap
-      species <- organism
-    }
-  }
-
-  if (is.null(species) || best_match == 0) {
-    stop("No match found between gene IDs in exprsMat and scSEG lists for human or mouse. ",
-         "Please ensure you're using Ensembl IDs for human or mouse, or provide a custom SEG list.")
-  }
-
-  message("Detected species: ", species, " (matched ", best_match, " genes)")
-
-  ctl <- seg_list[[species]][[paste0(species, "_scSEG")]]
+  batch     <- as.character(adata$obs$batch)
+  cellTypes <- as.character(adata$obs$cell_type)
 
   scMerge2_res <- scMerge2(
     exprsMat = exprsMat,
@@ -59,23 +42,14 @@ anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = F
   return(scMerge2_res)
 }
 
-data("segList_ensemblGeneID")
+cat("Run unsupervised scMerge2\n")
 
-cat("Run scMerge2\n")
-
-scMerge2_res <- anndataToScMerge2(
-  adata = adata,
-  seg_list = segList_ensemblGeneID,
-  layer = "normalized",
-  verbose = TRUE
-)
+scMerge2_res <- anndataToUnsupervisedScMerge2(adata, top_n = 1000L, verbose = TRUE)
 
 
 cat("Store output\n")
 corrected_mat <- scMerge2_res$newY
-
-embedding <- prcomp(t(corrected_mat))$x[, 1:10]
-
+embedding <- prcomp(t(corrected_mat), rank. = 10L)$x  # keep at most 10 PCs; fewer if the data has lower rank
 rownames(embedding) <- colnames(corrected_mat)
 
 output <- anndata::AnnData(
@@ -83,7 +57,7 @@ output <- anndata::AnnData(
   obs = adata$obs[, c()],
   var = NULL,
   obsm = list(
-    X_emb = embedding[rownames(adata), , drop = FALSE]  # match input cells
+    X_emb = embedding[as.character(adata$obs_names), , drop = FALSE]  # match input cells
   ),
   uns = list(
     dataset_id = adata$uns[["dataset_id"]],

From 423057eb80d7a182a2272f69c263fd99b5912adc Mon Sep 17 00:00:00 2001
From: seohyonkim <seohyon.l.kim@gmail.com>
Date: Tue, 9 Sep 2025 01:17:12 +0200
Subject: [PATCH 09/10] add method_types to config

---
 src/methods/semisupervised_scmerge2/config.vsh.yaml | 1 +
 src/methods/unsupervised_scmerge2/config.vsh.yaml   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/methods/semisupervised_scmerge2/config.vsh.yaml b/src/methods/semisupervised_scmerge2/config.vsh.yaml
index fb514571..723a3058 100644
--- a/src/methods/semisupervised_scmerge2/config.vsh.yaml
+++ b/src/methods/semisupervised_scmerge2/config.vsh.yaml
@@ -17,6 +17,7 @@ links:
   documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html
   repository: https://github.com/SydneyBioX/scMerge
 info:
+  method_types: [feature]
   preferred_normalization: log_cpm
 resources:
   - type: r_script
diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml
index 8e921dc1..9bff630d 100644
--- a/src/methods/unsupervised_scmerge2/config.vsh.yaml
+++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml
@@ -15,6 +15,7 @@ links:
   documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html
   repository: https://github.com/SydneyBioX/scMerge
 info:
+  method_types: [feature]
   preferred_normalization: log_cpm
 resources:
   - type: r_script

From dd6b1ce115b9afdc85d9faf58dfe7720496587ba Mon Sep 17 00:00:00 2001
From: seohyonkim <seohyon.l.kim@gmail.com>
Date: Tue, 9 Sep 2025 01:18:35 +0200
Subject: [PATCH 10/10] add to changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 09d672d0..1f799b71 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## New functionality
 
 * Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #52).
+* Added `methods/semisupervised_scmerge2` and `methods/unsupervised_scmerge2` components (PR #63).
 
 ## Minor changes