diff --git a/src/methods_transcript_assignment/fastreseg/config.vsh.yaml b/src/methods_transcript_assignment/fastreseg/config.vsh.yaml new file mode 100644 index 00000000..8e66e88c --- /dev/null +++ b/src/methods_transcript_assignment/fastreseg/config.vsh.yaml @@ -0,0 +1,81 @@ +__merge__: /src/api/comp_method_transcript_assignment.yaml + +name: fastreseg +label: "fastReseg transcript assignment" +summary: "Spatial segmentation correction using fastReseg method" +description: | + fastReseg is an R package designed to enhance the precision of cell segmentation in spatial transcriptomics by + leveraging transcriptomic data to correct and refine initial image-based segmentation results. +links: + documentation: "https://github.com/openproblems-bio/task_ist_preprocessing" + repository: "https://github.com/openproblems-bio/task_ist_preprocessing" +references: + doi: "10.1038/s41598-025-08733-5" + + +arguments: + - name: --transcripts_key + type: string + default: "transcripts" + description: "Key for transcripts in the points layer" + - name: --coordinate_system + type: string + default: "global" + description: "Coordinate system for the transcripts" + + +resources: + - type: bash_script + path: orchestrator.sh + - type: python_script + path: input.py + - type: file + path: fastreseg.yml + dest: environment.yml + - type: r_script + path: script.R + - type: python_script + path: output.py + +engines: + - type: docker + image: openproblems/base_python:1 + setup: + - type: apt + packages: + - libv8-dev + - libudunits2-dev + - libabsl-dev + - gdal-bin + - libgdal-dev + - r-base + - type: r + packages: + - devtools + - terra + - Matrix + - dbscan + - igraph + - matrixStats + - codetools + - data.table + # Add other CRAN packages here + github: + # Add GitHub packages here if needed + - Nanostring-Biostats/FastReseg + #- type: python + # github: + # - openproblems-bio/core#subdirectory=packages/python/openproblems + __merge__: + - /src/base/setup_txsim_partial.yaml + - /src/base/setup_spatialdata_partial.yaml + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [ midtime, midcpu, midmem ] + + +##macOS workaround: export DOCKER_DEFAULT_PLATFORM=linux/amd64 \ No newline at end of file diff --git a/src/methods_transcript_assignment/fastreseg/fastreseg.yml b/src/methods_transcript_assignment/fastreseg/fastreseg.yml new file mode 100644 index 00000000..49577fda --- /dev/null +++ b/src/methods_transcript_assignment/fastreseg/fastreseg.yml @@ -0,0 +1,13 @@ +name: fastreseg +channels: + - anaconda + - conda-forge + - defaults +dependencies: + - r-base=4.5.1 + - r-devtools + - pip + - ipykernel + - jupyter_client + - r-irkernel +prefix: /opt/miniconda3/envs/fastreseg diff --git a/src/methods_transcript_assignment/fastreseg/input.py b/src/methods_transcript_assignment/fastreseg/input.py new file mode 100644 index 00000000..0928cce6 --- /dev/null +++ b/src/methods_transcript_assignment/fastreseg/input.py @@ -0,0 +1,116 @@ +import spatialdata as sd +from tifffile import imwrite +import sys +import numpy as np +import pandas as pd +import xarray as xr +import dask +import txsim as tx +import anndata as ad +import argparse + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Process input files and generate output files for FastReseg input" + ) + parser.add_argument('input_path_ist', help='Path to the input file') + parser.add_argument('input_segmentation_path', help='Path to the input segmentation file') + parser.add_argument('input_sc_reference_path', help='Path to the input single-cell reference file') + parser.add_argument('output_path_counts', help='Path for the output TSV file') + parser.add_argument('output_path_transcripts', help='Path for the output TIF file') + parser.add_argument('output_path_cell_type', help='Output cell type specification') + + return parser.parse_args() + +### parsing arguments +args = parse_arguments() +print("args:") +print(args) +input_path = args.input_path_ist +input_segmentation_path = args.input_segmentation_path +input_sc_reference_path = args.input_sc_reference_path +print("path") +print(input_sc_reference_path) +output_path_counts = args.output_path_counts +output_path_transcripts = args.output_path_transcripts +output_path_cell_type = args.output_path_cell_type + +## potential other parameters (TODO - make configurable) +um_per_pixel = 0.5 +sc_celltype_key = 'cell_type' + +### reading the data in +sdata = sd.read_zarr(input_path) + +### reading in basic segmentation +sdata_segm = sd.read_zarr(input_segmentation_path) +segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys() + +# In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates +trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)['global'].inverse() + +transcripts = sd.transform(sdata['transcripts'], to_coordinate_system='global') +transcripts = sd.transform(transcripts, trans, 'global') + +print('Assigning transcripts to cell ids', flush=True) +y_coords = transcripts.y.compute().to_numpy(dtype=np.int64) +x_coords = transcripts.x.compute().to_numpy(dtype=np.int64) +if isinstance(sdata_segm["segmentation"], xr.DataTree): + label_image = sdata_segm["segmentation"]["scale0"].image.to_numpy() +else: + label_image = sdata_segm["segmentation"].to_numpy() +cell_id_dask_series = dask.dataframe.from_dask_array( + dask.array.from_array( + label_image[y_coords, x_coords], chunks=tuple(sdata['transcripts'].map_partitions(len).compute()) + ), + index=sdata['transcripts'].index +) +sdata['transcripts']["cell_id"] = cell_id_dask_series + +### extracting transcript ids +print('Transforming transcripts coordinates', flush=True) +transcripts = sd.transform(sdata['transcripts'], to_coordinate_system='global') + +transcripts_df = transcripts.compute() +transcripts_df.rename(columns = {'feature_name': 'target', +'transcript_id': 'UMI_transID', 'cell_id': 'UMI_cellID'}, inplace = True) + +transcripts_df = transcripts_df.loc[:, ['target', 'x', 'y', 'z', 'UMI_transID', 'UMI_cellID']] +transcripts_df.to_csv(output_path_transcripts) + + +#### aggregating counts per transcript, based on +df = sdata['transcripts'].compute() +df.feature_name = df.feature_name.astype(str) + +adata_sp = tx.preprocessing.generate_adata(df, cell_id_col='cell_id', gene_col='feature_name') #TODO: x and y refers to a specific coordinate system. Decide which space we want to use here. (probably should be handled in the previous assignment step) +adata_sp.layers['counts'] = adata_sp.layers['raw_counts'] +del adata_sp.layers['raw_counts'] +adata_sp.var["gene_name"] = adata_sp.var_names +print(adata_sp.var_names[1:10]) + +# currently the function also saves the transcripts in the adata object, but this is not necessary here +del adata_sp.uns['spots'] +del adata_sp.uns['pct_noise'] + + +count_df = pd.DataFrame(adata_sp.X.toarray(), + index=adata_sp.obs_names, + columns=adata_sp.var_names) +count_df.to_csv(output_path_counts) + +#### run cell annotation with ssam +adata_sc = ad.read_h5ad(input_sc_reference_path) +adata_sc.X = adata_sc.layers["normalized"] +print(adata_sc.var_names[1:10]) + +shared_genes = [g for g in adata_sc.var_names if g in adata_sp.var_names] +adata_sp = adata_sp[:,shared_genes] + +print('Annotating cell types', flush=True) +adata_sp = tx.preprocessing.run_ssam( + adata_sp, transcripts.compute(), adata_sc, um_p_px=um_per_pixel, + cell_id_col='cell_id', gene_col='feature_name', sc_ct_key=sc_celltype_key +) +cell_type_df = adata_sp.obs["ct_ssam"].astype(str) +cell_type_df.to_csv(output_path_cell_type, header=True) \ No newline at end of file diff --git a/src/methods_transcript_assignment/fastreseg/orchestrator.sh b/src/methods_transcript_assignment/fastreseg/orchestrator.sh new file mode 100644 index 00000000..ebb65742 --- /dev/null +++ b/src/methods_transcript_assignment/fastreseg/orchestrator.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +## VIASH START +# The following code has been auto-generated by Viash. +par_input_ist='resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr' +par_input_segmentation='resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr' +par_input_scrnaseq='resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad' +par_sc_cell_type_key='cell_type' +par_output='resources_test/task_ist_preprocessing/mouse_brain_combined/transcript_assignments.zarr' +par_transcripts_key='transcripts' +par_coordinate_system='global' +meta_name='fastreseg' +meta_functionality_name='fastreseg' +meta_resources_dir='/private/tmp/viash_inject_fastreseg18170326436127140412' +meta_executable='/private/tmp/viash_inject_fastreseg18170326436127140412/fastreseg' +meta_config='/private/tmp/viash_inject_fastreseg18170326436127140412/.config.vsh.yaml' +meta_temp_dir='/var/folders/fq/ymt0vml175s4yvqxzbmlmpz80000gn/T/' +meta_cpus='123' +meta_memory_b='123' +meta_memory_kb='123' +meta_memory_mb='123' +meta_memory_gb='123' +meta_memory_tb='123' +meta_memory_pb='123' +meta_memory_kib='123' +meta_memory_mib='123' +meta_memory_gib='123' +meta_memory_tib='123' +meta_memory_pib='123' + +## VIASH END + +par_intermediate_dir=$(mktemp -d -p "$(pwd)" tmp-processing-XXXXXXXX) +# Access the YAML file +CONDA_ENV_FILE="$meta_resources_dir/environment.yml" +echo $CONDA_ENV_FILE +echo "running FastReseg orchestrator" + +# Create intermediate directory +mkdir -p "$par_intermediate_dir" +echo $(date +%T) + +# Step 1: Run Python script to reformat input in the first Python environment +python "input.py" \ + "$par_input_ist" \ + "$par_input_segmentation" \ + "$par_input_scrnaseq" \ + "$par_intermediate_dir/counts.tsv" \ + "$par_intermediate_dir/transcripts.tsv" \ + "$par_intermediate_dir/cell_types.tsv" + +head $par_intermediate_dir/cell_types.tsv + +# Step 2: RunFastReseg +## making conda environment with r-base and fastReseg +#export CONDA_DIR=/opt/conda +#wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ +# /bin/bash ~/miniconda.sh -b -p /opt/conda +# Put conda in path so we can use conda activate +#export PATH=$CONDA_DIR/bin:$PATH + +# Create and activate the second Python environment +# Initialize conda for bash +#eval "$(/opt/conda/bin/conda shell.bash hook)" +#conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main +#conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r +#conda env create -f $CONDA_ENV_FILE + +#conda activate fastreseg +#conda install conda-forge::r-concaveman conda-forge::r-data.table conda-forge::r-ggplot2 conda-forge::r-devtools conda-forge::r-terra +#conda install conda-forge::r-codetools conda-forge::r-matrix conda-forge::r-dbscan conda-forge::r-igraph conda-forge::r-matrixstats + +##running the R script +Rscript script.R "$par_intermediate_dir/counts.tsv" \ + "$par_intermediate_dir/transcripts.tsv" \ + "$par_intermediate_dir/cell_types.tsv" \ + "$par_intermediate_dir/cell_ids.csv" \ + "$par_intermediate_dir/gene_names.csv" \ + "$par_intermediate_dir/transcripts_out.csv" + +## python output +python "output.py" \ + "$par_intermediate_dir/cell_ids.csv" \ + "$par_intermediate_dir/gene_names.csv" \ + "$par_intermediate_dir/transcripts_out.csv" \ + "$par_output" + +echo $(date +%T) \ No newline at end of file diff --git a/src/methods_transcript_assignment/fastreseg/output.py b/src/methods_transcript_assignment/fastreseg/output.py new file mode 100644 index 00000000..3072e304 --- /dev/null +++ b/src/methods_transcript_assignment/fastreseg/output.py @@ -0,0 +1,54 @@ +import spatialdata as sd +from tifffile import imwrite +import sys +import numpy as np +import pandas as pd +import xarray as xr +from scipy.sparse import coo_matrix +from pathlib import Path +import os +import dask +import anndata as ad + +def convert_to_lower_dtype(arr): + max_val = arr.max() + if max_val <= np.iinfo(np.uint8).max: + new_dtype = np.uint8 + elif max_val <= np.iinfo(np.uint16).max: + new_dtype = np.uint16 + elif max_val <= np.iinfo(np.uint32).max: + new_dtype = np.uint32 + else: + new_dtype = np.uint64 + + return arr.astype(new_dtype) + +path_to_cell_ids_out = sys.argv[1] +path_to_gene_names_out = sys.argv[2] +path_to_transcripts_out = sys.argv[3] + +output_zarr_path = sys.argv[4] + +cell_df = pd.read_csv(path_to_cell_ids_out, index_col=0) +var_names = pd.read_csv(path_to_gene_names_out, index_col=0) +df = pd.read_csv(path_to_transcripts_out, index_col=0) +print(df.head()) + +##converting to dask transcript output for spatialdata format +df = df.loc[:,["x", "y", "z", "target", "updated_cellID", "updated_celltype", "UMI_transID"]] +df.rename(columns={"updated_cellID": "cell_id", "target": "feature_name", "UMI_transID": "transcript_id"}, inplace=True) +transcripts_dask = dask.dataframe.from_pandas(df, npartitions = 1) + + +sdata_transcripts_only = sd.SpatialData( + points={ + "transcripts": sd.models.PointsModel.parse(transcripts_dask) + }, + tables={ + "table": ad.AnnData( + obs=cell_df.loc[:,['updated_cellID', 'updated_celltype']], + var=pd.DataFrame(index=var_names['x']) + ) + } +) +sdata_transcripts_only.write(output_zarr_path) \ No newline at end of file diff --git a/src/methods_transcript_assignment/fastreseg/script.R b/src/methods_transcript_assignment/fastreseg/script.R new file mode 100644 index 00000000..f1e67810 --- /dev/null +++ b/src/methods_transcript_assignment/fastreseg/script.R @@ -0,0 +1,91 @@ +#install.packages("devtools") +#devtools::install_github("Nanostring-Biostats/FastReseg", build_vignettes = FALSE, ref = "main") + +library(FastReseg) +library(data.table) +library(dplyr) + +args <- commandArgs(trailingOnly = TRUE) + +# Check if arguments are provided +if (length(args) == 0) { + stop("No arguments provided") +} + +# Access individual arguments +path_to_counts <- args[1] +path_to_transcripts <- args[2] +path_to_cell_annot <- args[3] + +path_to_cell_ids_out <- args[4] +path_to_gene_names_out <- args[5] +path_to_transcripts_out <- args[6] + + +### reading in data +count_df <- as.matrix(read.csv(path_to_counts, row.names = 1)) +head(count_df) + +cell_df <- read.csv(path_to_cell_annot, row.names = 1) +cell_annot <- cell_df[row.names(count_df),] +cell_annot <- setNames(cell_annot, row.names(count_df)) + +transcriptDF <- read.csv(path_to_transcripts) +head(transcriptDF) + + +##run pipeline +refineAll_res_one_FOC <- fastReseg_full_pipeline( + counts = count_df, + clust = cell_annot, + + # Similar to `runPreprocess()`, one can use `clust = NULL` if providing `refProfiles` + + transcript_df = transcriptDF, # + transDF_fileInfo = NULL, + pixel_size = 0.18, + zstep_size = 0.8, + transID_coln = NULL, + transGene_coln = "target", + cellID_coln = "UMI_cellID", + spatLocs_colns = c("x","y","z"), + extracellular_cellID = c(-1), + + # Similar to `runPreprocess()`, one can set various cutoffs to NULL for automatic calculation from input data + + # distance cutoff for neighborhood searching at molecular and cellular levels, respectively + molecular_distance_cutoff = 2.7, + cellular_distance_cutoff = NULL, + + # cutoffs for transcript scores and number for cells under each cell type + score_baseline = NULL, + lowerCutoff_transNum = NULL, + higherCutoff_transNum= NULL, + imputeFlag_missingCTs = TRUE, + + # Settings for error detection and correction, refer to `runSegRefinement()` for more details + flagCell_lrtest_cutoff = 5, # cutoff to flag for cells with strong spatial dependcy in transcript score profiles + svmClass_score_cutoff = -2, # cutoff of transcript score to separate between high and low score classes + groupTranscripts_method = "dbscan", + spatialMergeCheck_method = "leidenCut", + cutoff_spatialMerge = 0.5, # spatial constraint cutoff for a valid merge event + + path_to_output = "res2_multiFiles", + save_intermediates = TRUE, # flag to return and write intermediate results to disk + return_perCellData = TRUE, # flag to return per cell level outputs from updated segmentation + combine_extra = FALSE # flag to include trimmed and extracellular transcripts in the exported `updated_transDF.csv` files +) + +if(is.null(refineAll_res_one_FOC$updated_transDF_list[[1]])){ + print("no transcripts assigned after refinement") + cell_df$UMI_cellID <- as.integer(row.names(cell_df)) + transcriptDF <- left_join(cell_df, transcriptDF) + names(transcriptDF)[names(transcriptDF) == "UMI_cellID"] <- "updated_cellID" + names(transcriptDF)[names(transcriptDF) == "ct_ssam"] <- "updated_celltype" + write.csv(transcriptDF, file = path_to_transcripts_out) +} else { + write.csv(refineAll_res_one_FOC$updated_transDF_list[[1]], file = path_to_transcripts_out) +} +### export outputs +write.csv(row.names(refineAll_res_one_FOC$updated_perCellExprs), file = path_to_gene_names_out) +write.csv(refineAll_res_one_FOC$updated_perCellDT, file = path_to_cell_ids_out)