Update tutorial for Xenium pooling (#80)

* modify visium HD pooling logiic * resolve cell_vit bug on windows * HESTData: provide util to map ensemble ID to gene name (#71) * HESTData: provide util to map ensemble ID to gene name * HESTData: filter_na=False as default for ensemble ID mapping * add unit tests and inplace arg --------- Co-authored-by: Paul Doucet <[email protected]> * add xenium_reader doc * small correction * remove test * improve documentation * add further documentation * update tutorial for xenium pooling
mahmoodlab · Dec 14, 2024 · 0e5bf02 · 0e5bf02
1 parent 87e586f
commit 0e5bf02
Showing 7 changed files with 167 additions and 31 deletions.
diff --git a/docs/source/api.md b/docs/source/api.md
@@ -13,7 +13,7 @@
 .. autosummary::
     :toctree: generated
    
-    load_hest
+    iter_hest
 ```
 
 ## Run HEST-Benchmark
@@ -44,6 +44,8 @@
 
 ## Pooling of transcripts, binning
 
+Methods used to pool Xenium transcripts and Visium-HD bins into square bins of custom size
+
 ```{eval-rst}
 .. currentmodule:: hest.readers
 
@@ -72,7 +74,7 @@
     correct_batch_effect
 ```
 
-## Resolving gene name aliases
+## Gene names manipulation
 
 ```{eval-rst}
 .. currentmodule:: hest.HESTData
@@ -81,12 +83,13 @@
     :toctree: generated
 
     unify_gene_names
+    ensembl_id_to_gene
 ```
 
 
-## Readers to augment HEST-1k
+## Readers to expand HEST-1k
 
-Readers to create new HEST-1k samples.
+Readers to expand HEST-1k with additional samples.
 
 ```{eval-rst}
 .. currentmodule:: hest.readers
@@ -103,7 +106,7 @@ Readers to create new HEST-1k samples.
 
 
 ## CellViT segmentation
-Nuclei segmentation methods
+Simplified API for nuclei segmentation
 
 
 ```{eval-rst}

diff --git a/docs/source/index.md b/docs/source/index.md
@@ -3,7 +3,7 @@ Welcome to hest's documentation!
 
 `hest` is a python library for H&E/ST pairs manipulation. It was used to assemble the <em>HEST-1k</em> dataset.
 
-For the documentations of core WSI manipulations methods please visit the [hestcore documentation](https://hestcore.readthedocs.io/en/latest/)
+For the documentations of core WSI manipulations methods please visit the [hestcore documentation](https://hestcore.readthedocs.io/en/latest/) (work in progress)
 
 ```{eval-rst}
 .. card:: Installation

diff --git a/src/hest/HESTData.py b/src/hest/HESTData.py
@@ -681,8 +681,8 @@ def read_hest_wsi(wsi: WSI, width, height):
 
         return SpatialData(tables=new_table, images=images, shapes=shapes)
 
-    def ensembleID_to_gene(self):
-        ensembleID_to_gene(self)
+    def ensembl_id_to_gene(self):
+        ensembl_id_to_gene(self)
 
 
 class VisiumHESTData(HESTData): 
@@ -1058,6 +1058,15 @@ def __len__(self):
         return len(self.id_list)
 
 def iter_hest(hest_dir: str, id_list: List[str] = None, **read_kwargs) -> HESTIterator:
+    """ Iterate through the HEST samples contained in `hest_dir`
+
+    Args:
+        hest_dir (str): hest directory containing folders: st, wsis, metadata, tissue_seg (optional)
+        id_list (List[str], Optional): list of ids to read (ex: ['TENX96', 'TENX99']), pass None to read all available samples. Default to None
+
+    Returns:
+        HESTIterator: HESTData iterator
+    """
     return HESTIterator(hest_dir, id_list, **read_kwargs)
 
 def _read_st(hest_dir, st_filename, load_transcripts=False):
@@ -1247,7 +1256,7 @@ def unify_gene_names(adata: sc.AnnData, species="human", drop=False) -> sc.AnnDa
     # TODO return dict map of renamed, and remaining
     return adata
 
-def ensembleID_to_gene(st: HESTData, filter_na = False) -> HESTData:
+def ensembl_id_to_gene(st: HESTData, filter_na = False) -> HESTData:
     """
     Converts ensemble gene IDs of a HESTData object using Biomart annotations and filter out genes with no matching Ensembl ID
     

diff --git a/src/hest/__init__.py b/src/hest/__init__.py
@@ -3,7 +3,7 @@
 from .utils import tiff_save, find_pixel_size_from_spot_coords, write_10X_h5, get_k_genes, SpotPacking
 from .autoalign import autoalign_visium
 from .readers import *
-from .HESTData import HESTData, read_HESTData, load_hest, iter_hest, ensembleID_to_gene
+from .HESTData import HESTData, read_HESTData, load_hest, iter_hest, ensembl_id_to_gene
 from .segmentation.cell_segmenters import segment_cellvit
 
 __all__ = [
@@ -21,5 +21,5 @@
     'write_10X_h5',
     'HESTData',
     'segment_cellvit', 
-    'ensembleID_to_gene'
+    'ensembl_id_to_gene'
 ]
diff --git a/src/hest/readers.py b/src/hest/readers.py
@@ -903,7 +903,8 @@ def read(
         cell_bound_path: str = None,
         dapi_path = None,
         load_img=True,
-        use_dask=False
+        use_dask=False,
+        spot_size_um=100.
     ) -> XeniumHESTData:
         """ Read a Xenium sample
 
@@ -919,6 +920,7 @@ def read(
             dapi_path (_type_, optional): path to a `morphology_focus_0000.ome.tif`/`morphology_focus.ome.tif` file. Defaults to None.
             load_img (bool, optional): whenever to load the WSI. Defaults to True.
             use_dask (bool, optional): whenever to load the transcript dataframe with DASK (recommended if the transcript dataframe does not fit into the RAM). Defaults to False.
+            spot_size_um (float, optional): transcripts are pooled into squares of spot_size_um x spot_size_um mirometers and then stored in `HESTData.adata`
 
         Returns:
             XeniumHESTData: Xenium sample
@@ -963,11 +965,12 @@ def read(
                 transcript_df, 
                 dict['pixel_size_um_estimated'], 
                 key_x='he_x',
-                key_y='he_y'
+                key_y='he_y',
+                spot_size_um=spot_size_um
             )
 
-            dict['spot_diameter'] = 55.
-            dict['inter_spot_dist'] = 100.
+            dict['spot_diameter'] = spot_size_um
+            dict['inter_spot_dist'] = spot_size_um
             dict['spots_under_tissue'] = len(adata.obs)
 
         else:
@@ -1044,20 +1047,20 @@ def pool_transcripts_xenium(
     key_x='he_x',
     key_y='he_y',
 ) -> sc.AnnData: # type: ignore
-    """ Pool a xenium transcript dataframe by square spots of size 100um
+    """ Pool a xenium transcript dataframe by square spots of `spot_size_um` micrometers.
 
     Args:
-        df (pd.DataFrame): xenium transcipts dataframe containing columns:
-        - 'he_x' and 'he_y' indicating the pixel coordinates of each transcripts in the morphology image
-        - 'feature_name' indicating the transcript name
+        df (Union[pd.DataFrame, dd.DataFrame]): xenium transcipts dataframe containing columns:
+            - 'he_x' and 'he_y' indicating the pixel coordinates of each transcripts in the morphology image
+            - 'feature_name' indicating the transcript name
         pixel_size_he (float): pixel_size in um on the he image
         spot_size_um: pooling rectangle width in um
         key_x: column name of pixel x coordinate of each transcript in `df`
         key_y: column name of pixel y coordinate of each transcript in `df`
         
 
     Returns:
-        sc.AnnData: AnnData object, each obs represents the sum of transcripts within that bin, coordinates of the center of each bin (in pixel on WSI) are in adata.obsm['spatial']
+        sc.AnnData: AnnData object, each row in .obs represents a bin, each row in `.X` represents the sum of transcripts within that bin. Center coordinates of each bin (in pixel on WSI) are in adata.obsm['spatial']
     """
     import scanpy as sc
     import dask.dataframe as dd
@@ -1129,18 +1132,18 @@ def pool_transcripts_xenium(
     return adata
 
 
-def pool_bins_visiumhd(adata: sc.AnnData, pixel_size: float, dst_bin_size_um=128, src_bin_size_um=16, chunk_len=50000) -> sc.AnnData: # type: ignore
-    """ Pool visium hd bins
+def pool_bins_visiumhd(adata: sc.AnnData, pixel_size: float, dst_bin_size_um=128, src_bin_size_um: Literal[2, 8, 16]=16, chunk_len=50000) -> sc.AnnData: # type: ignore
+    """ Pool a Visium HD (with a source resolution of `src_bin_size_um`) by square spots of `spot_size_um` micrometers.
 
     Args:
         adata (sc.AnnData): adata containing spot center coordiniates in `pxl_row_in_fullres` and `pxl_col_in_fullres`
         pixel_size (float): pixel size of the WSI in um/px
         dst_bin_size_um (int, optional): target bin size in um. Defaults to 128.
-        src_bin_size_um (int, optional): bin size of `adata` in um. Defaults to 16.
+        src_bin_size_um (Literal[2, 8, 16], optional): bin size of `adata` in um. Defaults to 16.
         chunk_len (int, optional): chunk size when binning a larger than RAM `adata` (this is for RAM optimization only). Defaults to 50000.
 
     Returns:
-        sc.AnnData: adata with pooled bins
+        sc.AnnData: AnnData object, each row in .obs represents a bin, each row in `.X` represents the sum of visium-hd sub-bins (of size `src_bin_size_um`) within that larger bin (of size `dst_bin_size_um`). Center coordinates of each bin (in pixel on WSI) are in adata.obsm['spatial']
     """
     import scanpy as sc
 

diff --git a/tests/hest_tests.py b/tests/hest_tests.py
@@ -17,7 +17,7 @@
 
 from hest.autoalign import autoalign_visium
 from hest.readers import VisiumReader
-from hest.HESTData import ensembleID_to_gene
+from hest.HESTData import ensembl_id_to_gene
 from hest.utils import load_image
 
 
@@ -136,7 +136,7 @@ def setUpClass(self):
     #def test_conversion_ensembleID(self):
     #    for idx, st in enumerate(self.sts):
     #        with self.subTest(st_object=idx):
-    #            ensembleID_to_gene(st)
+    #            ensembl_id_to_gene(st)
 
 
     def test_tissue_seg(self):

diff --git a/tutorials/2-Interacting-with-HEST-1k.ipynb b/tutorials/2-Interacting-with-HEST-1k.ipynb
@@ -25,9 +25,62 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "d:\\HEST\\HEST\\lib\\site-packages\\hestcore\\wsi.py:27: UserWarning: CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\n",
+      "  warnings.warn(\"CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\")\n",
+      "d:\\HEST\\HEST\\lib\\site-packages\\scanpy\\preprocessing\\_qc.py:432: RuntimeWarning: invalid value encountered in divide\n",
+      "  return values / sums[:, None]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "* Scanpy adata:\n",
+      "AnnData object with n_obs × n_vars = 11845 × 541\n",
+      "    obs: 'in_tissue', 'pxl_col_in_fullres', 'pxl_row_in_fullres', 'array_col', 'array_row', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito'\n",
+      "    var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'\n",
+      "    uns: 'spatial'\n",
+      "    obsm: 'spatial'\n",
+      "\n",
+      "* WSI:\n",
+      "<width=48376, height=53738, backend=OpenSlideWSI>\n",
+      "\n",
+      "* Shapes:\n",
+      "[name: cellvit, coord-system: he, <not loaded>, name: xenium_cell, coord-system: he, <not loaded>, name: xenium_nucleus, coord-system: he, <not loaded>]\n",
+      "\n",
+      "* Tissue contours:\n",
+      "  tissue_id                                           geometry\n",
+      "0         0  POLYGON ((14052 2848, 14025 2874, 13998 2874, ...\n",
+      "\n",
+      "* SpatialData conversion:\n",
+      "SpatialData object\n",
+      "├── Images\n",
+      "│     ├── 'ST_downscaled_hires_image': SpatialImage[cyx] (3, 3358, 3023)\n",
+      "│     └── 'ST_downscaled_lowres_image': SpatialImage[cyx] (3, 1000, 900)\n",
+      "├── Shapes\n",
+      "│     ├── 'cellvit': GeoDataFrame shape: (497508, 3) (2D shapes)\n",
+      "│     ├── 'locations': GeoDataFrame shape: (11845, 2) (2D shapes)\n",
+      "│     ├── 'tissue_contours': GeoDataFrame shape: (1, 2) (2D shapes)\n",
+      "│     ├── 'xenium_cell': GeoDataFrame shape: (574852, 1) (2D shapes)\n",
+      "│     └── 'xenium_nucleus': GeoDataFrame shape: (574852, 1) (2D shapes)\n",
+      "└── Tables\n",
+      "      └── 'table': AnnData (11845, 541)\n",
+      "with coordinate systems:\n",
+      "    ▸ 'ST_downscaled_hires', with elements:\n",
+      "        ST_downscaled_hires_image (Images), cellvit (Shapes), locations (Shapes), tissue_contours (Shapes), xenium_cell (Shapes), xenium_nucleus (Shapes)\n",
+      "    ▸ 'ST_downscaled_lowres', with elements:\n",
+      "        ST_downscaled_lowres_image (Images), cellvit (Shapes), locations (Shapes), tissue_contours (Shapes), xenium_cell (Shapes), xenium_nucleus (Shapes)\n"
+     ]
+    }
+   ],
    "source": [
     "from hest import iter_hest\n",
     "\n",
@@ -168,7 +221,10 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Patching"
+    "## Changing the Patching/pooling size\n",
+    "\n",
+    "### Patching\n",
+    "You can change the size of patches around the spots with `dump_patches`:"
    ]
   },
   {
@@ -188,6 +244,71 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Changing the Pooling size\n",
+    "\n",
+    "You can change the pooling size of Xenium/Visium HD samples stored in `HESTData.adata` with the following:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "d:\\HEST\\HEST\\lib\\site-packages\\hestcore\\wsi.py:27: UserWarning: CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\n",
+      "  warnings.warn(\"CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\")\n"
+     ]
+    },
+    {
+     "ename": "",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
+      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
+      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
+      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
+     ]
+    }
+   ],
+   "source": [
+    "from hest.readers import pool_transcripts_xenium\n",
+    "from hest import iter_hest\n",
+    "\n",
+    "new_spot_size = 200\n",
+    "\n",
+    "\n",
+    "# Iterate through a subset of hest\n",
+    "for st in iter_hest('../hest_data', id_list=['TENX95'], load_transcripts=True,):\n",
+    "    print(st.transcript_df)\n",
+    "\n",
+    "    # Feel free to convert st.transcript_df to a Dask DataFrame if you are working with limited RAM.\n",
+    "\n",
+    "    st.adata = pool_transcripts_xenium(\n",
+    "                    st.transcript_df, \n",
+    "                    st.pixel_size, \n",
+    "                    key_x='he_x',\n",
+    "                    key_y='he_y',\n",
+    "                    spot_size_um=new_spot_size\n",
+    "                )\n",
+    "\n",
+    "\n",
+    "    # We change the spots, so we need to re-extract the patches\n",
+    "    st.dump_patches(\n",
+    "        patch_save_dir,\n",
+    "        name='demo',\n",
+    "        target_patch_size=224, # target patch size in 224\n",
+    "        target_pixel_size=0.5 # pixel size of the patches in um/px after rescaling\n",
+    "    )"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -292,7 +413,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "cuml",
+   "display_name": "HEST",
    "language": "python",
    "name": "python3"
   },
@@ -306,7 +427,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.19"
+   "version": "3.10.11"
   }
  },
  "nbformat": 4,