diff --git a/Bidcell_outputanalysis b/Bidcell_outputanalysis new file mode 100644 index 000000000..c5eb719e1 --- /dev/null +++ b/Bidcell_outputanalysis @@ -0,0 +1,84 @@ +#Analysis and visualisation of BIDcell output +#resizing BIDcell segmentation .tiff w.r.t DAPI image +dapi_image = tiff.imread("morphology_mip_pyramidal.tiff") + +# Load the BIDCell segmentation mask (ensure the BIDCell segmentation is in the correct path;e.g., 'epoch_10_step_60.tif' or similar) +segmentation_mask = tiff.imread("epoch_10_step_60_connected.tif") +# Get the dimensions of the DAPI image (width and height) +h_dapi, w_dapi = dapi_image.shape + +# Resize the BIDCell segmentation mask to match the DAPI image dimensions +segmentation_mask_resized = cv2.resize(segmentation_mask.astype('float32'), (w_dapi, h_dapi), interpolation=cv2.INTER_NEAREST) + +# Convert the resized segmentation mask back to uint32 for storing the cell IDs +segmentation_mask_resized = segmentation_mask_resized.astype(np.uint32) +segmentation_mask_resized = segmentation_mask_resized.transpose(1, 0) +# Save the resized segmentation mask to a .tif file +tiff.imwrite("bidcellresult_resized.tif", segmentation_mask_resized) + + +#creating bidcelloutput.zarr + +# Load the image +image = tifffile.imread("morphology_mip_pyramidal.tiff") +# Add a fake channel dimension (1 channel) +image_with_channel = np.expand_dims(image, axis=0) +# Parse the image with 'c', 'x', and 'y' dimensions +# Load the image +label_image = tifffile.imread("bidcellresult_resized.tif") +labels = sd.models.Labels2DModel.parse(label_image, dims=('y', 'x')) +# Convert 'x', 'y', and 'z' columns to float +# Check the columns in transcript +transcript_processed=pd.read_csv("/Volumes/Tejas_Kumar/testrunbidcell5/model_outputs/2025_03_17_09_36_19/test_output/transcript.csv.gz") +transcript_processed['x'] = transcript_processed['x'].astype(float) +transcript_processed['y'] = transcript_processed['y'].astype(float) +transcript_processed['z'] = transcript_processed['z'].astype(float) +transcript_processed['feature_name'] = transcript_processed['feature_name'].astype(str) +transcript_processed=pd.DataFrame(transcript_processed) + + +#creating images,labels and points for the bidcelloutput.zarr +images = sd.models.Image2DModel.parse(image_with_channel, dims=('c', 'x', 'y')) +labels = sd.models.Labels2DModel.parse(label_image, dims=('y', 'x')) +points = sd.models.PointsModel.parse(transcript_processed) + + +outputsdata = sd.SpatialData( + images={'DAPI': images}, + labels={'segmentation_mask_labels': labels}, + points={'transcripts': points} +) +outputsdata.write("Bidcelloutput.zarr",overwrite=True) + + +#read the output zarr file and visualise segmentations +outdata=sd.read_zarr("Bidcelloutput.zarr") +extents = sd.get_extent(outdata) +# Crop a specific region for visualization +crop0 = lambda x: sd.bounding_box_query( + x, + min_coordinate=[400,500], # Min coordinates + max_coordinate=[900,1000], # Max coordinates + #min_coordinate=[extents["x"][0], extents["y"][0]], # Min coordinates + #max_coordinate=[extents["x"][1], extents["y"][1]], # Max coordinates + #min_coordinate=[2125, 2125], # Min coordinates + #max_coordinate=[2550, 2550], # Max coordinates + axes=("x", "y"), # Include the z-axis + target_coordinate_system="global", +) +# NOTE: this solves the error, somehow the cropping currently doesn't support sdata tables +if "table" in outdata.tables: + del outdata.tables["table"] +# Apply the cropping function to the spatial data +sdata_crop = crop0(outdata) +fig, axs = plt.subplots(ncols=3, figsize=(12, 3)) +sdata_crop.pl.render_images().pl.show(ax=axs[0]) +axs[0].set_title('Raw_DAPI_image') +sdata_crop.pl.render_images().pl.render_labels(color="cell_id").pl.show(ax=axs[1]) +axs[1].set_title('BIDcell segmentations overlaid on DAPI') +sdata_crop.pl.render_labels(color="cell_id").pl.show(ax=axs[2]) +axs[2].set_title('BIDcell segmentations') +plt.tight_layout() +plt.show() + + diff --git a/bidcell.py b/bidcell.py new file mode 100644 index 000000000..09caf1804 --- /dev/null +++ b/bidcell.py @@ -0,0 +1,125 @@ +import spatialdata as sd +import spatialdata_plot as pl +import matplotlib.pyplot as plt +import numpy as np +import tifffile +import cv2 +import dask.dataframe as dd #dask.dataframe should only be imported after spatial data to avoid prevent dependency conflicts +import scanpy as sc +import pandas as pd +import natsort +import os +from bidcell import BIDCellModel + + +# preprocessing test data to get required input files for BIDcell +sdata = sd.read_zarr("dataset.zarr") +sdata_genes = sdata['transcripts']["feature_name"].unique().compute().sort_values().tolist() +# Extracting DAPI image from dataset.zarr +image_pyramid = [] +img = sdata["morphology_mip"]["/scale0"]["image"].values # Convert dask array to numpy +img = np.squeeze(img) # Remove singleton channel dimension (c:1) +image_pyramid.append(img) + +with tifffile.TiffWriter( "morphology_mip_pyramidal.tiff", bigtiff=True) as tiff: + for img in image_pyramid: + tiff.write(img, photometric="minisblack", resolution=(1, 1)) + + +#Converting h5ad single cell reference to .csv +adata = sc.read_h5ad("dataset.h5ad") +shared_genes = [g for g in sdata_genes if g in adata.var["feature_name"].values] #crucial to avoid key error due to discrepencies in the genes present +adata = adata[:, adata.var["feature_name"].isin(shared_genes)] + +# Set var_names to feature_name +adata.var_names = adata.var["feature_name"].astype(str) +sc_ref = pd.DataFrame( + data=adata[:, shared_genes].layers["normalized"].toarray(), + columns=shared_genes, + index=range(adata.n_obs) +) +celltypes = adata.obs['cell_type'].unique().tolist() +cell_type_col = adata.obs['cell_type'].astype('category') +sc_ref["ct_idx"] = cell_type_col.cat.codes.values +sc_ref["cell_type"] = cell_type_col.values +sc_ref["atlas"] = "custom" +sc_ref.to_csv( "scref.csv") + +# generating transcript map .csv from test data +transcript = sdata["transcripts"].compute() +transcript=pd.DataFrame(transcript) +#transcript.to_csv(filename="transcript.csv.gz", single_file=True, compression='gzip') #to generate transcript file without the filter +transcript[transcript["feature_name"].isin(shared_genes)].to_csv("transcript.csv.gz",compression='gzip' +) + + +#generating positive and negative marker files from the single cell reference +# defining the function generate_markers + +def generate_markers(ref_df, max_overlaps_pos=4, max_overlaps_neg=15): + """ + Generate positive and negative marker dataframes from reference data. + + Args: + ref_df (pd.DataFrame): Reference dataframe with gene expression data and cell type info + max_overlaps_pos (int): Maximum number of cell types that can share a positive marker + max_overlaps_neg (int): Maximum number of cell types that can share a negative marker + + Returns: + tuple: (df_pos, df_neg) - DataFrames containing positive and negative markers + """ + + n_genes = ref_df.shape[1] - 3 + cell_types = natsort.natsorted(list(set(ref_df["cell_type"].tolist()))) + n_cell_types = len(cell_types) + + ref_expr = ref_df.iloc[:, :n_genes].to_numpy() + gene_names = ref_df.columns[:n_genes] + ct_idx = ref_df["ct_idx"].to_numpy() + + # Generate negative markers + pct_10 = np.percentile(ref_expr, 10, axis=1, keepdims=True) + pct_10 = np.tile(pct_10, (1, n_genes)) + low_expr_true = np.zeros(pct_10.shape) + low_expr_true[ref_expr <= pct_10] = 1 + + low_expr_true_agg = np.zeros((n_cell_types, n_genes)) + for ct in range(n_cell_types): + rows = np.where(ct_idx == ct)[0] + low_expr_true_ct = low_expr_true[rows] + low_expr_true_agg[ct, :] = np.prod(low_expr_true_ct, axis=0) + + overlaps = np.sum(low_expr_true_agg, 0) + too_many = np.where(overlaps > max_overlaps_neg)[0] + low_expr_true_agg[:, too_many] = 0 + df_neg = pd.DataFrame(low_expr_true_agg, index=cell_types, columns=gene_names) + + # Generate positive markers + pct_90 = np.percentile(ref_expr, 90, axis=1, keepdims=True) + pct_90 = np.tile(pct_90, (1, n_genes)) + high_expr_true = np.zeros(pct_90.shape) + high_expr_true[ref_expr >= pct_90] = 1 + + high_expr_true_agg = np.zeros((n_cell_types, n_genes)) + for ct in range(n_cell_types): + rows = np.where(ct_idx == ct)[0] + high_expr_true_ct = high_expr_true[rows] + high_expr_true_agg[ct, :] = np.prod(high_expr_true_ct, axis=0) + + overlaps = np.sum(high_expr_true_agg, 0) + too_many = np.where(overlaps > max_overlaps_pos)[0] + high_expr_true_agg[:, too_many] = 0 + df_pos = pd.DataFrame(high_expr_true_agg, index=cell_types, columns=gene_names) + + return df_pos, df_neg + +# generate positive and negative marker files +df_pos, df_neg = generate_markers(sc_ref, max_overlaps_pos=4, max_overlaps_neg=15) +df_pos.to_csv( "pos_marker.csv") +df_neg.to_csv("neg_marker.csv") + + +# Setting up and running BIDCell +model = BIDCellModel("testdata.yaml") +model.run_pipeline() +