From 2e7f6306007dafa428f83e2fd87250c0690b114f Mon Sep 17 00:00:00 2001 From: Tejas Kumar <157981948+patturtejaskumar@users.noreply.github.com> Date: Tue, 18 Mar 2025 23:36:18 +0100 Subject: [PATCH 1/5] Create bidcell.py --- bidcell.py | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 bidcell.py diff --git a/bidcell.py b/bidcell.py new file mode 100644 index 000000000..09caf1804 --- /dev/null +++ b/bidcell.py @@ -0,0 +1,125 @@ +import spatialdata as sd +import spatialdata_plot as pl +import matplotlib.pyplot as plt +import numpy as np +import tifffile +import cv2 +import dask.dataframe as dd #dask.dataframe should only be imported after spatial data to avoid prevent dependency conflicts +import scanpy as sc +import pandas as pd +import natsort +import os +from bidcell import BIDCellModel + + +# preprocessing test data to get required input files for BIDcell +sdata = sd.read_zarr("dataset.zarr") +sdata_genes = sdata['transcripts']["feature_name"].unique().compute().sort_values().tolist() +# Extracting DAPI image from dataset.zarr +image_pyramid = [] +img = sdata["morphology_mip"]["/scale0"]["image"].values # Convert dask array to numpy +img = np.squeeze(img) # Remove singleton channel dimension (c:1) +image_pyramid.append(img) + +with tifffile.TiffWriter( "morphology_mip_pyramidal.tiff", bigtiff=True) as tiff: + for img in image_pyramid: + tiff.write(img, photometric="minisblack", resolution=(1, 1)) + + +#Converting h5ad single cell reference to .csv +adata = sc.read_h5ad("dataset.h5ad") +shared_genes = [g for g in sdata_genes if g in adata.var["feature_name"].values] #crucial to avoid key error due to discrepencies in the genes present +adata = adata[:, adata.var["feature_name"].isin(shared_genes)] + +# Set var_names to feature_name +adata.var_names = adata.var["feature_name"].astype(str) +sc_ref = pd.DataFrame( + data=adata[:, shared_genes].layers["normalized"].toarray(), + columns=shared_genes, + index=range(adata.n_obs) +) +celltypes = adata.obs['cell_type'].unique().tolist() +cell_type_col = adata.obs['cell_type'].astype('category') +sc_ref["ct_idx"] = cell_type_col.cat.codes.values +sc_ref["cell_type"] = cell_type_col.values +sc_ref["atlas"] = "custom" +sc_ref.to_csv( "scref.csv") + +# generating transcript map .csv from test data +transcript = sdata["transcripts"].compute() +transcript=pd.DataFrame(transcript) +#transcript.to_csv(filename="transcript.csv.gz", single_file=True, compression='gzip') #to generate transcript file without the filter +transcript[transcript["feature_name"].isin(shared_genes)].to_csv("transcript.csv.gz",compression='gzip' +) + + +#generating positive and negative marker files from the single cell reference +# defining the function generate_markers + +def generate_markers(ref_df, max_overlaps_pos=4, max_overlaps_neg=15): + """ + Generate positive and negative marker dataframes from reference data. + + Args: + ref_df (pd.DataFrame): Reference dataframe with gene expression data and cell type info + max_overlaps_pos (int): Maximum number of cell types that can share a positive marker + max_overlaps_neg (int): Maximum number of cell types that can share a negative marker + + Returns: + tuple: (df_pos, df_neg) - DataFrames containing positive and negative markers + """ + + n_genes = ref_df.shape[1] - 3 + cell_types = natsort.natsorted(list(set(ref_df["cell_type"].tolist()))) + n_cell_types = len(cell_types) + + ref_expr = ref_df.iloc[:, :n_genes].to_numpy() + gene_names = ref_df.columns[:n_genes] + ct_idx = ref_df["ct_idx"].to_numpy() + + # Generate negative markers + pct_10 = np.percentile(ref_expr, 10, axis=1, keepdims=True) + pct_10 = np.tile(pct_10, (1, n_genes)) + low_expr_true = np.zeros(pct_10.shape) + low_expr_true[ref_expr <= pct_10] = 1 + + low_expr_true_agg = np.zeros((n_cell_types, n_genes)) + for ct in range(n_cell_types): + rows = np.where(ct_idx == ct)[0] + low_expr_true_ct = low_expr_true[rows] + low_expr_true_agg[ct, :] = np.prod(low_expr_true_ct, axis=0) + + overlaps = np.sum(low_expr_true_agg, 0) + too_many = np.where(overlaps > max_overlaps_neg)[0] + low_expr_true_agg[:, too_many] = 0 + df_neg = pd.DataFrame(low_expr_true_agg, index=cell_types, columns=gene_names) + + # Generate positive markers + pct_90 = np.percentile(ref_expr, 90, axis=1, keepdims=True) + pct_90 = np.tile(pct_90, (1, n_genes)) + high_expr_true = np.zeros(pct_90.shape) + high_expr_true[ref_expr >= pct_90] = 1 + + high_expr_true_agg = np.zeros((n_cell_types, n_genes)) + for ct in range(n_cell_types): + rows = np.where(ct_idx == ct)[0] + high_expr_true_ct = high_expr_true[rows] + high_expr_true_agg[ct, :] = np.prod(high_expr_true_ct, axis=0) + + overlaps = np.sum(high_expr_true_agg, 0) + too_many = np.where(overlaps > max_overlaps_pos)[0] + high_expr_true_agg[:, too_many] = 0 + df_pos = pd.DataFrame(high_expr_true_agg, index=cell_types, columns=gene_names) + + return df_pos, df_neg + +# generate positive and negative marker files +df_pos, df_neg = generate_markers(sc_ref, max_overlaps_pos=4, max_overlaps_neg=15) +df_pos.to_csv( "pos_marker.csv") +df_neg.to_csv("neg_marker.csv") + + +# Setting up and running BIDCell +model = BIDCellModel("testdata.yaml") +model.run_pipeline() + From f1f5b9194dec8621cea84ed27a9aecc70008b1b1 Mon Sep 17 00:00:00 2001 From: LouisK92 Date: Fri, 27 Jun 2025 18:14:00 +0200 Subject: [PATCH 2/5] Reduce test crop size to 1000 x 1000 --- .../2023_10x_mouse_brain_xenium_rep1.sh | 8 ++++---- scripts/create_test_resources/test_pipeline.sh | 7 +++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/scripts/create_test_resources/2023_10x_mouse_brain_xenium_rep1.sh b/scripts/create_test_resources/2023_10x_mouse_brain_xenium_rep1.sh index b48d1b01d..519e19b1a 100755 --- a/scripts/create_test_resources/2023_10x_mouse_brain_xenium_rep1.sh +++ b/scripts/create_test_resources/2023_10x_mouse_brain_xenium_rep1.sh @@ -8,8 +8,8 @@ cd "$REPO_ROOT" set -e -if [ ! -d temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium ]; then - mkdir -p temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium +if [ ! -d temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1 ]; then + mkdir -p temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1 fi if [ ! -f temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip ]; then wget -O temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium_rep1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip \ @@ -29,9 +29,9 @@ param_list: dataset_description: Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1). dataset_organism: mus_musculus crop_region_min_x: 10000 - crop_region_max_x: 12000 + crop_region_max_x: 11000 crop_region_min_y: 10000 - crop_region_max_y: 12000 + crop_region_max_y: 11000 publish_dir: resources_test/common output_dataset: '\$id/dataset.zarr' diff --git a/scripts/create_test_resources/test_pipeline.sh b/scripts/create_test_resources/test_pipeline.sh index 538cdc467..6ffba9110 100755 --- a/scripts/create_test_resources/test_pipeline.sh +++ b/scripts/create_test_resources/test_pipeline.sh @@ -12,6 +12,13 @@ mkdir -p $OUT_DIR # run dataset preprocessor viash run src/data_processors/process_dataset/config.vsh.yaml -- \ + --dataset_id mouse_brain_combined \ + --dataset_name "Test data mouse brain combined 2023 tenx Xenium replicate 1 2023 Yao scRNAseq" \ + --dataset_url "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717" \ + --dataset_reference "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;10.1038/s41586-023-06812-z" \ + --dataset_summary "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1);A high-resolution scRNAseq atlas of cell types in the whole mouse brain" \ + --dataset_description "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1). Replicate results demonstrate the high reproducibility of data generated by the platform. 10x Genomics obtained tissue from a C57BL/6 mouse from Charles River Laboratories. Three adjacent 10µm sections were placed on the same slide. Tissues were prepared following the demonstrated protocols Xenium In Situ for Fresh Frozen Tissues - Tissue Preparation Guide (CG000579) and Xenium In Situ for Fresh Frozen Tissues - Fixation & Permeabilization (CG000581).;See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset." \ + --dataset_organism "mus_musculus" \ --input_sc $SC_DIR/dataset.h5ad \ --input_sp $SP_DIR/dataset.zarr \ --output_sc $OUT_DIR/scrnaseq_reference.h5ad \ From 153c058e576f7794bdf797075f9368a1a414c856 Mon Sep 17 00:00:00 2001 From: LouisK92 Date: Wed, 2 Jul 2025 09:44:30 +0200 Subject: [PATCH 3/5] Adjust dependency in wu_human_breast_cancer dataset to enable viash ns build --- .../process_wu_human_breast_cancer_sc/config.vsh.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/datasets/workflows/process_wu_human_breast_cancer_sc/config.vsh.yaml b/src/datasets/workflows/process_wu_human_breast_cancer_sc/config.vsh.yaml index 93c33e23b..c0fdf6f52 100644 --- a/src/datasets/workflows/process_wu_human_breast_cancer_sc/config.vsh.yaml +++ b/src/datasets/workflows/process_wu_human_breast_cancer_sc/config.vsh.yaml @@ -88,8 +88,10 @@ dependencies: repository: openproblems - name: datasets/processors/knn repository: openproblems - - name: h5ad/extract_uns_metadata - repository: core + - name: utils/extract_uns_metadata + repository: openproblems +# - name: h5ad/extract_uns_metadata +# repository: core runners: - type: nextflow From 1f935c77545c366b9070a9b5020a50978b1316ee Mon Sep 17 00:00:00 2001 From: LouisK92 Date: Wed, 2 Jul 2025 11:24:18 +0200 Subject: [PATCH 4/5] Reduce memory for tenx_xenium loader and processing to enable local generation of test data --- src/datasets/loaders/tenx_xenium/config.vsh.yaml | 2 +- src/datasets/workflows/process_tenx_xenium/config.vsh.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/loaders/tenx_xenium/config.vsh.yaml b/src/datasets/loaders/tenx_xenium/config.vsh.yaml index f16032a9b..d8a1dd367 100644 --- a/src/datasets/loaders/tenx_xenium/config.vsh.yaml +++ b/src/datasets/loaders/tenx_xenium/config.vsh.yaml @@ -67,4 +67,4 @@ runners: - type: executable - type: nextflow directives: - label: [highmem, midcpu, midtime] + label: [midmem, midcpu, midtime] diff --git a/src/datasets/workflows/process_tenx_xenium/config.vsh.yaml b/src/datasets/workflows/process_tenx_xenium/config.vsh.yaml index e9507cb65..c4fec772b 100644 --- a/src/datasets/workflows/process_tenx_xenium/config.vsh.yaml +++ b/src/datasets/workflows/process_tenx_xenium/config.vsh.yaml @@ -82,4 +82,4 @@ dependencies: runners: - type: nextflow directives: - label: [highcpu, highmem, hightime] + label: [highcpu, midmem, hightime] From 84d91328aa71ba169aabc143053773394ec2317b Mon Sep 17 00:00:00 2001 From: LouisK92 Date: Fri, 4 Jul 2025 19:39:29 +0200 Subject: [PATCH 5/5] Remove bidcell script in root dir --- bidcell.py | 125 ----------------------------------------------------- 1 file changed, 125 deletions(-) delete mode 100644 bidcell.py diff --git a/bidcell.py b/bidcell.py deleted file mode 100644 index 09caf1804..000000000 --- a/bidcell.py +++ /dev/null @@ -1,125 +0,0 @@ -import spatialdata as sd -import spatialdata_plot as pl -import matplotlib.pyplot as plt -import numpy as np -import tifffile -import cv2 -import dask.dataframe as dd #dask.dataframe should only be imported after spatial data to avoid prevent dependency conflicts -import scanpy as sc -import pandas as pd -import natsort -import os -from bidcell import BIDCellModel - - -# preprocessing test data to get required input files for BIDcell -sdata = sd.read_zarr("dataset.zarr") -sdata_genes = sdata['transcripts']["feature_name"].unique().compute().sort_values().tolist() -# Extracting DAPI image from dataset.zarr -image_pyramid = [] -img = sdata["morphology_mip"]["/scale0"]["image"].values # Convert dask array to numpy -img = np.squeeze(img) # Remove singleton channel dimension (c:1) -image_pyramid.append(img) - -with tifffile.TiffWriter( "morphology_mip_pyramidal.tiff", bigtiff=True) as tiff: - for img in image_pyramid: - tiff.write(img, photometric="minisblack", resolution=(1, 1)) - - -#Converting h5ad single cell reference to .csv -adata = sc.read_h5ad("dataset.h5ad") -shared_genes = [g for g in sdata_genes if g in adata.var["feature_name"].values] #crucial to avoid key error due to discrepencies in the genes present -adata = adata[:, adata.var["feature_name"].isin(shared_genes)] - -# Set var_names to feature_name -adata.var_names = adata.var["feature_name"].astype(str) -sc_ref = pd.DataFrame( - data=adata[:, shared_genes].layers["normalized"].toarray(), - columns=shared_genes, - index=range(adata.n_obs) -) -celltypes = adata.obs['cell_type'].unique().tolist() -cell_type_col = adata.obs['cell_type'].astype('category') -sc_ref["ct_idx"] = cell_type_col.cat.codes.values -sc_ref["cell_type"] = cell_type_col.values -sc_ref["atlas"] = "custom" -sc_ref.to_csv( "scref.csv") - -# generating transcript map .csv from test data -transcript = sdata["transcripts"].compute() -transcript=pd.DataFrame(transcript) -#transcript.to_csv(filename="transcript.csv.gz", single_file=True, compression='gzip') #to generate transcript file without the filter -transcript[transcript["feature_name"].isin(shared_genes)].to_csv("transcript.csv.gz",compression='gzip' -) - - -#generating positive and negative marker files from the single cell reference -# defining the function generate_markers - -def generate_markers(ref_df, max_overlaps_pos=4, max_overlaps_neg=15): - """ - Generate positive and negative marker dataframes from reference data. - - Args: - ref_df (pd.DataFrame): Reference dataframe with gene expression data and cell type info - max_overlaps_pos (int): Maximum number of cell types that can share a positive marker - max_overlaps_neg (int): Maximum number of cell types that can share a negative marker - - Returns: - tuple: (df_pos, df_neg) - DataFrames containing positive and negative markers - """ - - n_genes = ref_df.shape[1] - 3 - cell_types = natsort.natsorted(list(set(ref_df["cell_type"].tolist()))) - n_cell_types = len(cell_types) - - ref_expr = ref_df.iloc[:, :n_genes].to_numpy() - gene_names = ref_df.columns[:n_genes] - ct_idx = ref_df["ct_idx"].to_numpy() - - # Generate negative markers - pct_10 = np.percentile(ref_expr, 10, axis=1, keepdims=True) - pct_10 = np.tile(pct_10, (1, n_genes)) - low_expr_true = np.zeros(pct_10.shape) - low_expr_true[ref_expr <= pct_10] = 1 - - low_expr_true_agg = np.zeros((n_cell_types, n_genes)) - for ct in range(n_cell_types): - rows = np.where(ct_idx == ct)[0] - low_expr_true_ct = low_expr_true[rows] - low_expr_true_agg[ct, :] = np.prod(low_expr_true_ct, axis=0) - - overlaps = np.sum(low_expr_true_agg, 0) - too_many = np.where(overlaps > max_overlaps_neg)[0] - low_expr_true_agg[:, too_many] = 0 - df_neg = pd.DataFrame(low_expr_true_agg, index=cell_types, columns=gene_names) - - # Generate positive markers - pct_90 = np.percentile(ref_expr, 90, axis=1, keepdims=True) - pct_90 = np.tile(pct_90, (1, n_genes)) - high_expr_true = np.zeros(pct_90.shape) - high_expr_true[ref_expr >= pct_90] = 1 - - high_expr_true_agg = np.zeros((n_cell_types, n_genes)) - for ct in range(n_cell_types): - rows = np.where(ct_idx == ct)[0] - high_expr_true_ct = high_expr_true[rows] - high_expr_true_agg[ct, :] = np.prod(high_expr_true_ct, axis=0) - - overlaps = np.sum(high_expr_true_agg, 0) - too_many = np.where(overlaps > max_overlaps_pos)[0] - high_expr_true_agg[:, too_many] = 0 - df_pos = pd.DataFrame(high_expr_true_agg, index=cell_types, columns=gene_names) - - return df_pos, df_neg - -# generate positive and negative marker files -df_pos, df_neg = generate_markers(sc_ref, max_overlaps_pos=4, max_overlaps_neg=15) -df_pos.to_csv( "pos_marker.csv") -df_neg.to_csv("neg_marker.csv") - - -# Setting up and running BIDCell -model = BIDCellModel("testdata.yaml") -model.run_pipeline() -