From 2e7f6306007dafa428f83e2fd87250c0690b114f Mon Sep 17 00:00:00 2001
From: Tejas Kumar <157981948+patturtejaskumar@users.noreply.github.com>
Date: Tue, 18 Mar 2025 23:36:18 +0100
Subject: [PATCH 1/5] Create bidcell.py

---
 bidcell.py | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 bidcell.py

diff --git a/bidcell.py b/bidcell.py
new file mode 100644
index 000000000..09caf1804
--- /dev/null
+++ b/bidcell.py
@@ -0,0 +1,125 @@
+import spatialdata as sd
+import spatialdata_plot as pl
+import matplotlib.pyplot as plt
+import numpy as np
+import tifffile
+import cv2
+import dask.dataframe as dd  #dask.dataframe should only be imported after spatial data to avoid prevent dependency conflicts
+import scanpy as sc
+import pandas as pd
+import natsort
+import os 
+from bidcell import BIDCellModel
+
+
+# preprocessing test data to get required input files for BIDcell
+sdata = sd.read_zarr("dataset.zarr")
+sdata_genes = sdata['transcripts']["feature_name"].unique().compute().sort_values().tolist()
+# Extracting DAPI image from dataset.zarr
+image_pyramid = []
+img = sdata["morphology_mip"]["/scale0"]["image"].values  # Convert dask array to numpy
+img = np.squeeze(img)  # Remove singleton channel dimension (c:1)
+image_pyramid.append(img)
+
+with tifffile.TiffWriter( "morphology_mip_pyramidal.tiff", bigtiff=True) as tiff:
+    for img in image_pyramid:
+        tiff.write(img, photometric="minisblack", resolution=(1, 1))
+
+
+#Converting h5ad single cell reference to .csv
+adata = sc.read_h5ad("dataset.h5ad")
+shared_genes = [g for g in sdata_genes if g in adata.var["feature_name"].values] #crucial to avoid key error due to discrepencies in the genes present
+adata = adata[:, adata.var["feature_name"].isin(shared_genes)]
+
+# Set var_names to feature_name
+adata.var_names = adata.var["feature_name"].astype(str)
+sc_ref = pd.DataFrame(
+    data=adata[:, shared_genes].layers["normalized"].toarray(), 
+    columns=shared_genes, 
+    index=range(adata.n_obs)
+)
+celltypes = adata.obs['cell_type'].unique().tolist()
+cell_type_col = adata.obs['cell_type'].astype('category')
+sc_ref["ct_idx"] = cell_type_col.cat.codes.values
+sc_ref["cell_type"] = cell_type_col.values
+sc_ref["atlas"] = "custom"
+sc_ref.to_csv( "scref.csv")
+
+# generating transcript map .csv from test data
+transcript = sdata["transcripts"].compute()
+transcript=pd.DataFrame(transcript)
+#transcript.to_csv(filename="transcript.csv.gz", single_file=True, compression='gzip') #to generate transcript file without the filter
+transcript[transcript["feature_name"].isin(shared_genes)].to_csv("transcript.csv.gz",compression='gzip'
+)
+
+
+#generating positive and negative marker files from the single cell reference
+# defining the function generate_markers
+
+def generate_markers(ref_df, max_overlaps_pos=4, max_overlaps_neg=15):
+    """
+    Generate positive and negative marker dataframes from reference data.
+    
+    Args:
+        ref_df (pd.DataFrame): Reference dataframe with gene expression data and cell type info
+        max_overlaps_pos (int): Maximum number of cell types that can share a positive marker
+        max_overlaps_neg (int): Maximum number of cell types that can share a negative marker
+    
+    Returns:
+        tuple: (df_pos, df_neg) - DataFrames containing positive and negative markers
+    """
+
+    n_genes = ref_df.shape[1] - 3
+    cell_types = natsort.natsorted(list(set(ref_df["cell_type"].tolist())))
+    n_cell_types = len(cell_types)
+    
+    ref_expr = ref_df.iloc[:, :n_genes].to_numpy()
+    gene_names = ref_df.columns[:n_genes]
+    ct_idx = ref_df["ct_idx"].to_numpy()
+
+    # Generate negative markers
+    pct_10 = np.percentile(ref_expr, 10, axis=1, keepdims=True)
+    pct_10 = np.tile(pct_10, (1, n_genes))
+    low_expr_true = np.zeros(pct_10.shape)
+    low_expr_true[ref_expr <= pct_10] = 1
+
+    low_expr_true_agg = np.zeros((n_cell_types, n_genes))
+    for ct in range(n_cell_types):
+        rows = np.where(ct_idx == ct)[0]
+        low_expr_true_ct = low_expr_true[rows]
+        low_expr_true_agg[ct, :] = np.prod(low_expr_true_ct, axis=0)
+
+    overlaps = np.sum(low_expr_true_agg, 0)
+    too_many = np.where(overlaps > max_overlaps_neg)[0]
+    low_expr_true_agg[:, too_many] = 0
+    df_neg = pd.DataFrame(low_expr_true_agg, index=cell_types, columns=gene_names)
+
+    # Generate positive markers
+    pct_90 = np.percentile(ref_expr, 90, axis=1, keepdims=True)
+    pct_90 = np.tile(pct_90, (1, n_genes))
+    high_expr_true = np.zeros(pct_90.shape)
+    high_expr_true[ref_expr >= pct_90] = 1
+
+    high_expr_true_agg = np.zeros((n_cell_types, n_genes))
+    for ct in range(n_cell_types):
+        rows = np.where(ct_idx == ct)[0]
+        high_expr_true_ct = high_expr_true[rows]
+        high_expr_true_agg[ct, :] = np.prod(high_expr_true_ct, axis=0)
+
+    overlaps = np.sum(high_expr_true_agg, 0)
+    too_many = np.where(overlaps > max_overlaps_pos)[0]
+    high_expr_true_agg[:, too_many] = 0
+    df_pos = pd.DataFrame(high_expr_true_agg, index=cell_types, columns=gene_names)
+
+    return df_pos, df_neg
+
+# generate positive and negative marker files 
+df_pos, df_neg = generate_markers(sc_ref, max_overlaps_pos=4, max_overlaps_neg=15)
+df_pos.to_csv( "pos_marker.csv")
+df_neg.to_csv("neg_marker.csv")
+
+
+# Setting up and running BIDCell
+model = BIDCellModel("testdata.yaml")
+model.run_pipeline() 
+

From f1f5b9194dec8621cea84ed27a9aecc70008b1b1 Mon Sep 17 00:00:00 2001
From: LouisK92 <louiskuemmerle@googlemail.com>
Date: Fri, 27 Jun 2025 18:14:00 +0200
Subject: [PATCH 2/5] Reduce test crop size to 1000 x 1000

---
 .../2023_10x_mouse_brain_xenium_rep1.sh                   | 8 ++++----
 scripts/create_test_resources/test_pipeline.sh            | 7 +++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/scripts/create_test_resources/2023_10x_mouse_brain_xenium_rep1.sh b/scripts/create_test_resources/2023_10x_mouse_brain_xenium_rep1.sh
index b48d1b01d..519e19b1a 100755
--- a/scripts/create_test_resources/2023_10x_mouse_brain_xenium_rep1.sh
+++ b/scripts/create_test_resources/2023_10x_mouse_brain_xenium_rep1.sh
@@ -8,8 +8,8 @@ cd "$REPO_ROOT"
 
 set -e
 
-if [ ! -d temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium ]; then
-  mkdir -p temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium
+if [ ! -d temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1 ]; then
+  mkdir -p temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1
 fi
 if [ ! -f temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip ]; then
   wget -O temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip \
@@ -29,9 +29,9 @@ param_list:
     dataset_description: Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1).
     dataset_organism: mus_musculus
     crop_region_min_x: 10000
-    crop_region_max_x: 12000
+    crop_region_max_x: 11000
     crop_region_min_y: 10000
-    crop_region_max_y: 12000
+    crop_region_max_y: 11000
 
 publish_dir: resources_test/common
 output_dataset: '\$id/dataset.zarr'
diff --git a/scripts/create_test_resources/test_pipeline.sh b/scripts/create_test_resources/test_pipeline.sh
index 538cdc467..6ffba9110 100755
--- a/scripts/create_test_resources/test_pipeline.sh
+++ b/scripts/create_test_resources/test_pipeline.sh
@@ -12,6 +12,13 @@ mkdir -p $OUT_DIR
 
 # run dataset preprocessor
 viash run src/data_processors/process_dataset/config.vsh.yaml -- \
+  --dataset_id mouse_brain_combined \
+  --dataset_name "Test data mouse brain combined 2023 tenx Xenium replicate 1 2023 Yao scRNAseq" \
+  --dataset_url "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717" \
+  --dataset_reference "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;10.1038/s41586-023-06812-z" \
+  --dataset_summary "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1);A high-resolution scRNAseq atlas of cell types in the whole mouse brain" \
+  --dataset_description "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1). Replicate results demonstrate the high reproducibility of data generated by the platform. 10x Genomics obtained tissue from a C57BL/6 mouse from Charles River Laboratories. Three adjacent 10µm sections were placed on the same slide. Tissues were prepared following the demonstrated protocols Xenium In Situ for Fresh Frozen Tissues - Tissue Preparation Guide (CG000579) and Xenium In Situ for Fresh Frozen Tissues - Fixation & Permeabilization (CG000581).;See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset." \
+  --dataset_organism "mus_musculus" \
   --input_sc $SC_DIR/dataset.h5ad \
   --input_sp $SP_DIR/dataset.zarr \
   --output_sc $OUT_DIR/scrnaseq_reference.h5ad \

From 153c058e576f7794bdf797075f9368a1a414c856 Mon Sep 17 00:00:00 2001
From: LouisK92 <louiskuemmerle@googlemail.com>
Date: Wed, 2 Jul 2025 09:44:30 +0200
Subject: [PATCH 3/5] Adjust dependency in wu_human_breast_cancer dataset to
 enable viash ns build

---
 .../process_wu_human_breast_cancer_sc/config.vsh.yaml       | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/datasets/workflows/process_wu_human_breast_cancer_sc/config.vsh.yaml b/src/datasets/workflows/process_wu_human_breast_cancer_sc/config.vsh.yaml
index 93c33e23b..c0fdf6f52 100644
--- a/src/datasets/workflows/process_wu_human_breast_cancer_sc/config.vsh.yaml
+++ b/src/datasets/workflows/process_wu_human_breast_cancer_sc/config.vsh.yaml
@@ -88,8 +88,10 @@ dependencies:
     repository: openproblems
   - name: datasets/processors/knn
     repository: openproblems
-  - name: h5ad/extract_uns_metadata
-    repository: core
+  - name: utils/extract_uns_metadata
+    repository: openproblems
+#  - name: h5ad/extract_uns_metadata
+#    repository: core
 
 runners:
   - type: nextflow

From 1f935c77545c366b9070a9b5020a50978b1316ee Mon Sep 17 00:00:00 2001
From: LouisK92 <louiskuemmerle@googlemail.com>
Date: Wed, 2 Jul 2025 11:24:18 +0200
Subject: [PATCH 4/5] Reduce memory for tenx_xenium loader and processing to
 enable local generation of test data

---
 src/datasets/loaders/tenx_xenium/config.vsh.yaml           | 2 +-
 src/datasets/workflows/process_tenx_xenium/config.vsh.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/datasets/loaders/tenx_xenium/config.vsh.yaml b/src/datasets/loaders/tenx_xenium/config.vsh.yaml
index f16032a9b..d8a1dd367 100644
--- a/src/datasets/loaders/tenx_xenium/config.vsh.yaml
+++ b/src/datasets/loaders/tenx_xenium/config.vsh.yaml
@@ -67,4 +67,4 @@ runners:
   - type: executable
   - type: nextflow
     directives:
-      label: [highmem, midcpu, midtime]
+      label: [midmem, midcpu, midtime]
diff --git a/src/datasets/workflows/process_tenx_xenium/config.vsh.yaml b/src/datasets/workflows/process_tenx_xenium/config.vsh.yaml
index e9507cb65..c4fec772b 100644
--- a/src/datasets/workflows/process_tenx_xenium/config.vsh.yaml
+++ b/src/datasets/workflows/process_tenx_xenium/config.vsh.yaml
@@ -82,4 +82,4 @@ dependencies:
 runners:
   - type: nextflow
     directives:
-      label: [highcpu, highmem, hightime]
+      label: [highcpu, midmem, hightime]

From 84d91328aa71ba169aabc143053773394ec2317b Mon Sep 17 00:00:00 2001
From: LouisK92 <louiskuemmerle@googlemail.com>
Date: Fri, 4 Jul 2025 19:39:29 +0200
Subject: [PATCH 5/5] Remove bidcell script in root dir

---
 bidcell.py | 125 -----------------------------------------------------
 1 file changed, 125 deletions(-)
 delete mode 100644 bidcell.py

diff --git a/bidcell.py b/bidcell.py
deleted file mode 100644
index 09caf1804..000000000
--- a/bidcell.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import spatialdata as sd
-import spatialdata_plot as pl
-import matplotlib.pyplot as plt
-import numpy as np
-import tifffile
-import cv2
-import dask.dataframe as dd  #dask.dataframe should only be imported after spatial data to avoid prevent dependency conflicts
-import scanpy as sc
-import pandas as pd
-import natsort
-import os 
-from bidcell import BIDCellModel
-
-
-# preprocessing test data to get required input files for BIDcell
-sdata = sd.read_zarr("dataset.zarr")
-sdata_genes = sdata['transcripts']["feature_name"].unique().compute().sort_values().tolist()
-# Extracting DAPI image from dataset.zarr
-image_pyramid = []
-img = sdata["morphology_mip"]["/scale0"]["image"].values  # Convert dask array to numpy
-img = np.squeeze(img)  # Remove singleton channel dimension (c:1)
-image_pyramid.append(img)
-
-with tifffile.TiffWriter( "morphology_mip_pyramidal.tiff", bigtiff=True) as tiff:
-    for img in image_pyramid:
-        tiff.write(img, photometric="minisblack", resolution=(1, 1))
-
-
-#Converting h5ad single cell reference to .csv
-adata = sc.read_h5ad("dataset.h5ad")
-shared_genes = [g for g in sdata_genes if g in adata.var["feature_name"].values] #crucial to avoid key error due to discrepencies in the genes present
-adata = adata[:, adata.var["feature_name"].isin(shared_genes)]
-
-# Set var_names to feature_name
-adata.var_names = adata.var["feature_name"].astype(str)
-sc_ref = pd.DataFrame(
-    data=adata[:, shared_genes].layers["normalized"].toarray(), 
-    columns=shared_genes, 
-    index=range(adata.n_obs)
-)
-celltypes = adata.obs['cell_type'].unique().tolist()
-cell_type_col = adata.obs['cell_type'].astype('category')
-sc_ref["ct_idx"] = cell_type_col.cat.codes.values
-sc_ref["cell_type"] = cell_type_col.values
-sc_ref["atlas"] = "custom"
-sc_ref.to_csv( "scref.csv")
-
-# generating transcript map .csv from test data
-transcript = sdata["transcripts"].compute()
-transcript=pd.DataFrame(transcript)
-#transcript.to_csv(filename="transcript.csv.gz", single_file=True, compression='gzip') #to generate transcript file without the filter
-transcript[transcript["feature_name"].isin(shared_genes)].to_csv("transcript.csv.gz",compression='gzip'
-)
-
-
-#generating positive and negative marker files from the single cell reference
-# defining the function generate_markers
-
-def generate_markers(ref_df, max_overlaps_pos=4, max_overlaps_neg=15):
-    """
-    Generate positive and negative marker dataframes from reference data.
-    
-    Args:
-        ref_df (pd.DataFrame): Reference dataframe with gene expression data and cell type info
-        max_overlaps_pos (int): Maximum number of cell types that can share a positive marker
-        max_overlaps_neg (int): Maximum number of cell types that can share a negative marker
-    
-    Returns:
-        tuple: (df_pos, df_neg) - DataFrames containing positive and negative markers
-    """
-
-    n_genes = ref_df.shape[1] - 3
-    cell_types = natsort.natsorted(list(set(ref_df["cell_type"].tolist())))
-    n_cell_types = len(cell_types)
-    
-    ref_expr = ref_df.iloc[:, :n_genes].to_numpy()
-    gene_names = ref_df.columns[:n_genes]
-    ct_idx = ref_df["ct_idx"].to_numpy()
-
-    # Generate negative markers
-    pct_10 = np.percentile(ref_expr, 10, axis=1, keepdims=True)
-    pct_10 = np.tile(pct_10, (1, n_genes))
-    low_expr_true = np.zeros(pct_10.shape)
-    low_expr_true[ref_expr <= pct_10] = 1
-
-    low_expr_true_agg = np.zeros((n_cell_types, n_genes))
-    for ct in range(n_cell_types):
-        rows = np.where(ct_idx == ct)[0]
-        low_expr_true_ct = low_expr_true[rows]
-        low_expr_true_agg[ct, :] = np.prod(low_expr_true_ct, axis=0)
-
-    overlaps = np.sum(low_expr_true_agg, 0)
-    too_many = np.where(overlaps > max_overlaps_neg)[0]
-    low_expr_true_agg[:, too_many] = 0
-    df_neg = pd.DataFrame(low_expr_true_agg, index=cell_types, columns=gene_names)
-
-    # Generate positive markers
-    pct_90 = np.percentile(ref_expr, 90, axis=1, keepdims=True)
-    pct_90 = np.tile(pct_90, (1, n_genes))
-    high_expr_true = np.zeros(pct_90.shape)
-    high_expr_true[ref_expr >= pct_90] = 1
-
-    high_expr_true_agg = np.zeros((n_cell_types, n_genes))
-    for ct in range(n_cell_types):
-        rows = np.where(ct_idx == ct)[0]
-        high_expr_true_ct = high_expr_true[rows]
-        high_expr_true_agg[ct, :] = np.prod(high_expr_true_ct, axis=0)
-
-    overlaps = np.sum(high_expr_true_agg, 0)
-    too_many = np.where(overlaps > max_overlaps_pos)[0]
-    high_expr_true_agg[:, too_many] = 0
-    df_pos = pd.DataFrame(high_expr_true_agg, index=cell_types, columns=gene_names)
-
-    return df_pos, df_neg
-
-# generate positive and negative marker files 
-df_pos, df_neg = generate_markers(sc_ref, max_overlaps_pos=4, max_overlaps_neg=15)
-df_pos.to_csv( "pos_marker.csv")
-df_neg.to_csv("neg_marker.csv")
-
-
-# Setting up and running BIDCell
-model = BIDCellModel("testdata.yaml")
-model.run_pipeline() 
-