Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions src/methods_transcript_assignment/fastreseg/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
__merge__: /src/api/comp_method_transcript_assignment.yaml

name: fastreseg
label: "fastReseg transcript assignment"
summary: "Spatial segmentation correction using fastReseg method"
description: |
fastReseg is an R package designed to enhance the precision of cell segmentation in spatial transcriptomics by
leveraging transcriptomic data to correct and refine initial image-based segmentation results.
links:
documentation: "https://github.com/openproblems-bio/task_ist_preprocessing"
repository: "https://github.com/openproblems-bio/task_ist_preprocessing"
references:
doi: "10.1038/s41598-025-08733-5"


arguments:
- name: --transcripts_key
type: string
default: "transcripts"
description: "Key for transcripts in the points layer"
- name: --coordinate_system
type: string
default: "global"
description: "Coordinate system for the transcripts"


resources:
- type: bash_script
path: orchestrator.sh
- type: python_script
path: input.py
- type: file
path: fastreseg.yml
dest: environment.yml
- type: r_script
path: script.R
- type: python_script
path: output.py

engines:
- type: docker
image: openproblems/base_python:1
setup:
- type: apt
packages:
- libv8-dev
- libudunits2-dev
- libabsl-dev
- gdal-bin
- libgdal-dev
- r-base
- type: r
packages:
- devtools
- terra
- Matrix
- dbscan
- igraph
- matrixStats
- codetools
- data.table
# Add other CRAN packages here
github:
# Add GitHub packages here if needed
- Nanostring-Biostats/FastReseg
#- type: python
# github:
# - openproblems-bio/core#subdirectory=packages/python/openproblems
__merge__:
- /src/base/setup_txsim_partial.yaml
- /src/base/setup_spatialdata_partial.yaml
- type: native

runners:
- type: executable
- type: nextflow
directives:
label: [ midtime, midcpu, midmem ]


##macOS workaround: export DOCKER_DEFAULT_PLATFORM=linux/amd64
13 changes: 13 additions & 0 deletions src/methods_transcript_assignment/fastreseg/fastreseg.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: fastreseg
channels:
- anaconda
- conda-forge
- defaults
dependencies:
- r-base=4.5.1
- r-devtools
- pip
- ipykernel
- jupyter_client
- r-irkernel
prefix: /opt/miniconda3/envs/fastreseg
116 changes: 116 additions & 0 deletions src/methods_transcript_assignment/fastreseg/input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import spatialdata as sd
from tifffile import imwrite
import sys
import numpy as np
import pandas as pd
import xarray as xr
import dask
import txsim as tx
import anndata as ad
import argparse

def parse_arguments():
parser = argparse.ArgumentParser(
description="Process input files and generate output files for FastReseg input"
)
parser.add_argument('input_path_ist', help='Path to the input file')
parser.add_argument('input_segmentation_path', help='Path to the input segmentation file')
parser.add_argument('input_sc_reference_path', help='Path to the input single-cell reference file')
parser.add_argument('output_path_counts', help='Path for the output TSV file')
parser.add_argument('output_path_transcripts', help='Path for the output TIF file')
parser.add_argument('output_path_cell_type', help='Output cell type specification')

return parser.parse_args()

### parsing arguments
args = parse_arguments()
print("args:")
print(args)
input_path = args.input_path_ist
input_segmentation_path = args.input_segmentation_path
input_sc_reference_path = args.input_sc_reference_path
print("path")
print(input_sc_reference_path)
output_path_counts = args.output_path_counts
output_path_transcripts = args.output_path_transcripts
output_path_cell_type = args.output_path_cell_type

## potential other parameters (TODO - make configurable)
um_per_pixel = 0.5
sc_celltype_key = 'cell_type'

### reading the data in
sdata = sd.read_zarr(input_path)

### reading in basic segmentation
sdata_segm = sd.read_zarr(input_segmentation_path)
segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys()

# In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates
trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)['global'].inverse()

transcripts = sd.transform(sdata['transcripts'], to_coordinate_system='global')
transcripts = sd.transform(transcripts, trans, 'global')

print('Assigning transcripts to cell ids', flush=True)
y_coords = transcripts.y.compute().to_numpy(dtype=np.int64)
x_coords = transcripts.x.compute().to_numpy(dtype=np.int64)
if isinstance(sdata_segm["segmentation"], xr.DataTree):
label_image = sdata_segm["segmentation"]["scale0"].image.to_numpy()
else:
label_image = sdata_segm["segmentation"].to_numpy()
cell_id_dask_series = dask.dataframe.from_dask_array(
dask.array.from_array(
label_image[y_coords, x_coords], chunks=tuple(sdata['transcripts'].map_partitions(len).compute())
),
index=sdata['transcripts'].index
)
sdata['transcripts']["cell_id"] = cell_id_dask_series

### extracting transcript ids
print('Transforming transcripts coordinates', flush=True)
transcripts = sd.transform(sdata['transcripts'], to_coordinate_system='global')

transcripts_df = transcripts.compute()
transcripts_df.rename(columns = {'feature_name': 'target',
'transcript_id': 'UMI_transID', 'cell_id': 'UMI_cellID'}, inplace = True)

transcripts_df = transcripts_df.loc[:, ['target', 'x', 'y', 'z', 'UMI_transID', 'UMI_cellID']]
transcripts_df.to_csv(output_path_transcripts)


#### aggregating counts per transcript, based on
df = sdata['transcripts'].compute()
df.feature_name = df.feature_name.astype(str)

adata_sp = tx.preprocessing.generate_adata(df, cell_id_col='cell_id', gene_col='feature_name') #TODO: x and y refers to a specific coordinate system. Decide which space we want to use here. (probably should be handled in the previous assignment step)
adata_sp.layers['counts'] = adata_sp.layers['raw_counts']
del adata_sp.layers['raw_counts']
adata_sp.var["gene_name"] = adata_sp.var_names
print(adata_sp.var_names[1:10])

# currently the function also saves the transcripts in the adata object, but this is not necessary here
del adata_sp.uns['spots']
del adata_sp.uns['pct_noise']


count_df = pd.DataFrame(adata_sp.X.toarray(),
index=adata_sp.obs_names,
columns=adata_sp.var_names)
count_df.to_csv(output_path_counts)

#### run cell annotation with ssam
adata_sc = ad.read_h5ad(input_sc_reference_path)
adata_sc.X = adata_sc.layers["normalized"]
print(adata_sc.var_names[1:10])

shared_genes = [g for g in adata_sc.var_names if g in adata_sp.var_names]
adata_sp = adata_sp[:,shared_genes]

print('Annotating cell types', flush=True)
adata_sp = tx.preprocessing.run_ssam(
adata_sp, transcripts.compute(), adata_sc, um_p_px=um_per_pixel,
cell_id_col='cell_id', gene_col='feature_name', sc_ct_key=sc_celltype_key
)
cell_type_df = adata_sp.obs["ct_ssam"].astype(str)
cell_type_df.to_csv(output_path_cell_type, header=True)
88 changes: 88 additions & 0 deletions src/methods_transcript_assignment/fastreseg/orchestrator.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/bin/bash

## VIASH START
# The following code has been auto-generated by Viash.
par_input_ist='resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr'
par_input_segmentation='resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr'
par_input_scrnaseq='resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad'
par_sc_cell_type_key='cell_type'
par_output='resources_test/task_ist_preprocessing/mouse_brain_combined/transcript_assignments.zarr'
par_transcripts_key='transcripts'
par_coordinate_system='global'
meta_name='fastreseg'
meta_functionality_name='fastreseg'
meta_resources_dir='/private/tmp/viash_inject_fastreseg18170326436127140412'
meta_executable='/private/tmp/viash_inject_fastreseg18170326436127140412/fastreseg'
meta_config='/private/tmp/viash_inject_fastreseg18170326436127140412/.config.vsh.yaml'
meta_temp_dir='/var/folders/fq/ymt0vml175s4yvqxzbmlmpz80000gn/T/'
meta_cpus='123'
meta_memory_b='123'
meta_memory_kb='123'
meta_memory_mb='123'
meta_memory_gb='123'
meta_memory_tb='123'
meta_memory_pb='123'
meta_memory_kib='123'
meta_memory_mib='123'
meta_memory_gib='123'
meta_memory_tib='123'
meta_memory_pib='123'

## VIASH END

par_intermediate_dir=$(mktemp -d -p "$(pwd)" tmp-processing-XXXXXXXX)
# Access the YAML file
CONDA_ENV_FILE="$meta_resources_dir/environment.yml"
echo $CONDA_ENV_FILE
echo "running FastReseg orchestrator"

# Create intermediate directory
mkdir -p "$par_intermediate_dir"
echo $(date +%T)

# Step 1: Run Python script to reformat input in the first Python environment
python "input.py" \
"$par_input_ist" \
"$par_input_segmentation" \
"$par_input_scrnaseq" \
"$par_intermediate_dir/counts.tsv" \
"$par_intermediate_dir/transcripts.tsv" \
"$par_intermediate_dir/cell_types.tsv"

head $par_intermediate_dir/cell_types.tsv

# Step 2: RunFastReseg
## making conda environment with r-base and fastReseg
#export CONDA_DIR=/opt/conda
#wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
# /bin/bash ~/miniconda.sh -b -p /opt/conda
# Put conda in path so we can use conda activate
#export PATH=$CONDA_DIR/bin:$PATH

# Create and activate the second Python environment
# Initialize conda for bash
#eval "$(/opt/conda/bin/conda shell.bash hook)"
#conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main
#conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
#conda env create -f $CONDA_ENV_FILE

#conda activate fastreseg
#conda install conda-forge::r-concaveman conda-forge::r-data.table conda-forge::r-ggplot2 conda-forge::r-devtools conda-forge::r-terra
#conda install conda-forge::r-codetools conda-forge::r-matrix conda-forge::r-dbscan conda-forge::r-igraph conda-forge::r-matrixstats

##running the R script
Rscript script.R "$par_intermediate_dir/counts.tsv" \
"$par_intermediate_dir/transcripts.tsv" \
"$par_intermediate_dir/cell_types.tsv" \
"$par_intermediate_dir/cell_ids.csv" \
"$par_intermediate_dir/gene_names.csv" \
"$par_intermediate_dir/transcripts_out.csv"

## python output
python "output.py" \
"$par_intermediate_dir/cell_ids.csv" \
"$par_intermediate_dir/gene_names.csv" \
"$par_intermediate_dir/transcripts_out.csv" \
"$par_output"

echo $(date +%T)
54 changes: 54 additions & 0 deletions src/methods_transcript_assignment/fastreseg/output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import spatialdata as sd
from tifffile import imwrite
import sys
import numpy as np
import pandas as pd
import xarray as xr
from scipy.sparse import coo_matrix
from pathlib import Path
import os
import dask
import anndata as ad

def convert_to_lower_dtype(arr):
max_val = arr.max()
if max_val <= np.iinfo(np.uint8).max:
new_dtype = np.uint8
elif max_val <= np.iinfo(np.uint16).max:
new_dtype = np.uint16
elif max_val <= np.iinfo(np.uint32).max:
new_dtype = np.uint32
else:
new_dtype = np.uint64

return arr.astype(new_dtype)

path_to_cell_ids_out = sys.argv[1]
path_to_gene_names_out = sys.argv[2]
path_to_transcripts_out = sys.argv[3]

output_zarr_path = sys.argv[4]

cell_df = pd.read_csv(path_to_cell_ids_out, index_col=0)
var_names = pd.read_csv(path_to_gene_names_out, index_col=0)
df = pd.read_csv(path_to_transcripts_out, index_col=0)
print(df.head())

##converting to dask transcript output for spatialdata format
df = df.loc[:,["x", "y", "z", "target", "updated_cellID", "updated_celltype", "UMI_transID"]]
df.rename(columns={"updated_cellID": "cell_id", "target": "feature_name", "UMI_transID": "transcript_id"}, inplace=True)
transcripts_dask = dask.dataframe.from_pandas(df, npartitions = 1)


sdata_transcripts_only = sd.SpatialData(
points={
"transcripts": sd.models.PointsModel.parse(transcripts_dask)
},
tables={
"table": ad.AnnData(
obs=cell_df.loc[:,['updated_cellID', 'updated_celltype']],
var=pd.DataFrame(index=var_names['x'])
)
}
)
sdata_transcripts_only.write(output_zarr_path)
Loading
Loading