diff --git a/docs/source/api.md b/docs/source/api.md index 2816f24..45f2ee6 100644 --- a/docs/source/api.md +++ b/docs/source/api.md @@ -13,7 +13,7 @@ .. autosummary:: :toctree: generated - load_hest + iter_hest ``` ## Run HEST-Benchmark @@ -44,6 +44,8 @@ ## Pooling of transcripts, binning +Methods used to pool Xenium transcripts and Visium-HD bins into square bins of custom size + ```{eval-rst} .. currentmodule:: hest.readers @@ -72,7 +74,7 @@ correct_batch_effect ``` -## Resolving gene name aliases +## Gene names manipulation ```{eval-rst} .. currentmodule:: hest.HESTData @@ -81,12 +83,13 @@ :toctree: generated unify_gene_names + ensembl_id_to_gene ``` -## Readers to augment HEST-1k +## Readers to expand HEST-1k -Readers to create new HEST-1k samples. +Readers to expand HEST-1k with additional samples. ```{eval-rst} .. currentmodule:: hest.readers @@ -103,7 +106,7 @@ Readers to create new HEST-1k samples. ## CellViT segmentation -Nuclei segmentation methods +Simplified API for nuclei segmentation ```{eval-rst} diff --git a/docs/source/index.md b/docs/source/index.md index 1b5b508..0037c5d 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -3,7 +3,7 @@ Welcome to hest's documentation! `hest` is a python library for H&E/ST pairs manipulation. It was used to assemble the HEST-1k dataset. -For the documentations of core WSI manipulations methods please visit the [hestcore documentation](https://hestcore.readthedocs.io/en/latest/) +For the documentations of core WSI manipulations methods please visit the [hestcore documentation](https://hestcore.readthedocs.io/en/latest/) (work in progress) ```{eval-rst} .. card:: Installation diff --git a/src/hest/HESTData.py b/src/hest/HESTData.py index 78fa469..178c07c 100644 --- a/src/hest/HESTData.py +++ b/src/hest/HESTData.py @@ -681,8 +681,8 @@ def read_hest_wsi(wsi: WSI, width, height): return SpatialData(tables=new_table, images=images, shapes=shapes) - def ensembleID_to_gene(self): - ensembleID_to_gene(self) + def ensembl_id_to_gene(self): + ensembl_id_to_gene(self) class VisiumHESTData(HESTData): @@ -1058,6 +1058,15 @@ def __len__(self): return len(self.id_list) def iter_hest(hest_dir: str, id_list: List[str] = None, **read_kwargs) -> HESTIterator: + """ Iterate through the HEST samples contained in `hest_dir` + + Args: + hest_dir (str): hest directory containing folders: st, wsis, metadata, tissue_seg (optional) + id_list (List[str], Optional): list of ids to read (ex: ['TENX96', 'TENX99']), pass None to read all available samples. Default to None + + Returns: + HESTIterator: HESTData iterator + """ return HESTIterator(hest_dir, id_list, **read_kwargs) def _read_st(hest_dir, st_filename, load_transcripts=False): @@ -1247,7 +1256,7 @@ def unify_gene_names(adata: sc.AnnData, species="human", drop=False) -> sc.AnnDa # TODO return dict map of renamed, and remaining return adata -def ensembleID_to_gene(st: HESTData, filter_na = False) -> HESTData: +def ensembl_id_to_gene(st: HESTData, filter_na = False) -> HESTData: """ Converts ensemble gene IDs of a HESTData object using Biomart annotations and filter out genes with no matching Ensembl ID diff --git a/src/hest/__init__.py b/src/hest/__init__.py index 7f32c34..ab37d26 100644 --- a/src/hest/__init__.py +++ b/src/hest/__init__.py @@ -3,7 +3,7 @@ from .utils import tiff_save, find_pixel_size_from_spot_coords, write_10X_h5, get_k_genes, SpotPacking from .autoalign import autoalign_visium from .readers import * -from .HESTData import HESTData, read_HESTData, load_hest, iter_hest, ensembleID_to_gene +from .HESTData import HESTData, read_HESTData, load_hest, iter_hest, ensembl_id_to_gene from .segmentation.cell_segmenters import segment_cellvit __all__ = [ @@ -21,5 +21,5 @@ 'write_10X_h5', 'HESTData', 'segment_cellvit', - 'ensembleID_to_gene' + 'ensembl_id_to_gene' ] diff --git a/src/hest/readers.py b/src/hest/readers.py index 92cfbdd..74c3bdb 100644 --- a/src/hest/readers.py +++ b/src/hest/readers.py @@ -903,7 +903,8 @@ def read( cell_bound_path: str = None, dapi_path = None, load_img=True, - use_dask=False + use_dask=False, + spot_size_um=100. ) -> XeniumHESTData: """ Read a Xenium sample @@ -919,6 +920,7 @@ def read( dapi_path (_type_, optional): path to a `morphology_focus_0000.ome.tif`/`morphology_focus.ome.tif` file. Defaults to None. load_img (bool, optional): whenever to load the WSI. Defaults to True. use_dask (bool, optional): whenever to load the transcript dataframe with DASK (recommended if the transcript dataframe does not fit into the RAM). Defaults to False. + spot_size_um (float, optional): transcripts are pooled into squares of spot_size_um x spot_size_um mirometers and then stored in `HESTData.adata` Returns: XeniumHESTData: Xenium sample @@ -963,11 +965,12 @@ def read( transcript_df, dict['pixel_size_um_estimated'], key_x='he_x', - key_y='he_y' + key_y='he_y', + spot_size_um=spot_size_um ) - dict['spot_diameter'] = 55. - dict['inter_spot_dist'] = 100. + dict['spot_diameter'] = spot_size_um + dict['inter_spot_dist'] = spot_size_um dict['spots_under_tissue'] = len(adata.obs) else: @@ -1044,12 +1047,12 @@ def pool_transcripts_xenium( key_x='he_x', key_y='he_y', ) -> sc.AnnData: # type: ignore - """ Pool a xenium transcript dataframe by square spots of size 100um + """ Pool a xenium transcript dataframe by square spots of `spot_size_um` micrometers. Args: - df (pd.DataFrame): xenium transcipts dataframe containing columns: - - 'he_x' and 'he_y' indicating the pixel coordinates of each transcripts in the morphology image - - 'feature_name' indicating the transcript name + df (Union[pd.DataFrame, dd.DataFrame]): xenium transcipts dataframe containing columns: + - 'he_x' and 'he_y' indicating the pixel coordinates of each transcripts in the morphology image + - 'feature_name' indicating the transcript name pixel_size_he (float): pixel_size in um on the he image spot_size_um: pooling rectangle width in um key_x: column name of pixel x coordinate of each transcript in `df` @@ -1057,7 +1060,7 @@ def pool_transcripts_xenium( Returns: - sc.AnnData: AnnData object, each obs represents the sum of transcripts within that bin, coordinates of the center of each bin (in pixel on WSI) are in adata.obsm['spatial'] + sc.AnnData: AnnData object, each row in .obs represents a bin, each row in `.X` represents the sum of transcripts within that bin. Center coordinates of each bin (in pixel on WSI) are in adata.obsm['spatial'] """ import scanpy as sc import dask.dataframe as dd @@ -1129,18 +1132,18 @@ def pool_transcripts_xenium( return adata -def pool_bins_visiumhd(adata: sc.AnnData, pixel_size: float, dst_bin_size_um=128, src_bin_size_um=16, chunk_len=50000) -> sc.AnnData: # type: ignore - """ Pool visium hd bins +def pool_bins_visiumhd(adata: sc.AnnData, pixel_size: float, dst_bin_size_um=128, src_bin_size_um: Literal[2, 8, 16]=16, chunk_len=50000) -> sc.AnnData: # type: ignore + """ Pool a Visium HD (with a source resolution of `src_bin_size_um`) by square spots of `spot_size_um` micrometers. Args: adata (sc.AnnData): adata containing spot center coordiniates in `pxl_row_in_fullres` and `pxl_col_in_fullres` pixel_size (float): pixel size of the WSI in um/px dst_bin_size_um (int, optional): target bin size in um. Defaults to 128. - src_bin_size_um (int, optional): bin size of `adata` in um. Defaults to 16. + src_bin_size_um (Literal[2, 8, 16], optional): bin size of `adata` in um. Defaults to 16. chunk_len (int, optional): chunk size when binning a larger than RAM `adata` (this is for RAM optimization only). Defaults to 50000. Returns: - sc.AnnData: adata with pooled bins + sc.AnnData: AnnData object, each row in .obs represents a bin, each row in `.X` represents the sum of visium-hd sub-bins (of size `src_bin_size_um`) within that larger bin (of size `dst_bin_size_um`). Center coordinates of each bin (in pixel on WSI) are in adata.obsm['spatial'] """ import scanpy as sc diff --git a/tests/hest_tests.py b/tests/hest_tests.py index 484dbe9..151383b 100644 --- a/tests/hest_tests.py +++ b/tests/hest_tests.py @@ -17,7 +17,7 @@ from hest.autoalign import autoalign_visium from hest.readers import VisiumReader -from hest.HESTData import ensembleID_to_gene +from hest.HESTData import ensembl_id_to_gene from hest.utils import load_image @@ -136,7 +136,7 @@ def setUpClass(self): #def test_conversion_ensembleID(self): # for idx, st in enumerate(self.sts): # with self.subTest(st_object=idx): - # ensembleID_to_gene(st) + # ensembl_id_to_gene(st) def test_tissue_seg(self): diff --git a/tutorials/2-Interacting-with-HEST-1k.ipynb b/tutorials/2-Interacting-with-HEST-1k.ipynb index 9e9bae5..685e443 100644 --- a/tutorials/2-Interacting-with-HEST-1k.ipynb +++ b/tutorials/2-Interacting-with-HEST-1k.ipynb @@ -25,9 +25,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\HEST\\HEST\\lib\\site-packages\\hestcore\\wsi.py:27: UserWarning: CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\n", + " warnings.warn(\"CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\")\n", + "d:\\HEST\\HEST\\lib\\site-packages\\scanpy\\preprocessing\\_qc.py:432: RuntimeWarning: invalid value encountered in divide\n", + " return values / sums[:, None]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "* Scanpy adata:\n", + "AnnData object with n_obs × n_vars = 11845 × 541\n", + " obs: 'in_tissue', 'pxl_col_in_fullres', 'pxl_row_in_fullres', 'array_col', 'array_row', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito'\n", + " var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'\n", + " uns: 'spatial'\n", + " obsm: 'spatial'\n", + "\n", + "* WSI:\n", + "\n", + "\n", + "* Shapes:\n", + "[name: cellvit, coord-system: he, , name: xenium_cell, coord-system: he, , name: xenium_nucleus, coord-system: he, ]\n", + "\n", + "* Tissue contours:\n", + " tissue_id geometry\n", + "0 0 POLYGON ((14052 2848, 14025 2874, 13998 2874, ...\n", + "\n", + "* SpatialData conversion:\n", + "SpatialData object\n", + "├── Images\n", + "│ ├── 'ST_downscaled_hires_image': SpatialImage[cyx] (3, 3358, 3023)\n", + "│ └── 'ST_downscaled_lowres_image': SpatialImage[cyx] (3, 1000, 900)\n", + "├── Shapes\n", + "│ ├── 'cellvit': GeoDataFrame shape: (497508, 3) (2D shapes)\n", + "│ ├── 'locations': GeoDataFrame shape: (11845, 2) (2D shapes)\n", + "│ ├── 'tissue_contours': GeoDataFrame shape: (1, 2) (2D shapes)\n", + "│ ├── 'xenium_cell': GeoDataFrame shape: (574852, 1) (2D shapes)\n", + "│ └── 'xenium_nucleus': GeoDataFrame shape: (574852, 1) (2D shapes)\n", + "└── Tables\n", + " └── 'table': AnnData (11845, 541)\n", + "with coordinate systems:\n", + " ▸ 'ST_downscaled_hires', with elements:\n", + " ST_downscaled_hires_image (Images), cellvit (Shapes), locations (Shapes), tissue_contours (Shapes), xenium_cell (Shapes), xenium_nucleus (Shapes)\n", + " ▸ 'ST_downscaled_lowres', with elements:\n", + " ST_downscaled_lowres_image (Images), cellvit (Shapes), locations (Shapes), tissue_contours (Shapes), xenium_cell (Shapes), xenium_nucleus (Shapes)\n" + ] + } + ], "source": [ "from hest import iter_hest\n", "\n", @@ -168,7 +221,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Patching" + "## Changing the Patching/pooling size\n", + "\n", + "### Patching\n", + "You can change the size of patches around the spots with `dump_patches`:" ] }, { @@ -188,6 +244,71 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Changing the Pooling size\n", + "\n", + "You can change the pooling size of Xenium/Visium HD samples stored in `HESTData.adata` with the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\HEST\\HEST\\lib\\site-packages\\hestcore\\wsi.py:27: UserWarning: CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\n", + " warnings.warn(\"CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\")\n" + ] + }, + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", + "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", + "\u001b[1;31mClick here for more info. \n", + "\u001b[1;31mView Jupyter log for further details." + ] + } + ], + "source": [ + "from hest.readers import pool_transcripts_xenium\n", + "from hest import iter_hest\n", + "\n", + "new_spot_size = 200\n", + "\n", + "\n", + "# Iterate through a subset of hest\n", + "for st in iter_hest('../hest_data', id_list=['TENX95'], load_transcripts=True,):\n", + " print(st.transcript_df)\n", + "\n", + " # Feel free to convert st.transcript_df to a Dask DataFrame if you are working with limited RAM.\n", + "\n", + " st.adata = pool_transcripts_xenium(\n", + " st.transcript_df, \n", + " st.pixel_size, \n", + " key_x='he_x',\n", + " key_y='he_y',\n", + " spot_size_um=new_spot_size\n", + " )\n", + "\n", + "\n", + " # We change the spots, so we need to re-extract the patches\n", + " st.dump_patches(\n", + " patch_save_dir,\n", + " name='demo',\n", + " target_patch_size=224, # target patch size in 224\n", + " target_pixel_size=0.5 # pixel size of the patches in um/px after rescaling\n", + " )" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -292,7 +413,7 @@ ], "metadata": { "kernelspec": { - "display_name": "cuml", + "display_name": "HEST", "language": "python", "name": "python3" }, @@ -306,7 +427,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" + "version": "3.10.11" } }, "nbformat": 4,