Skip to content

Commit

Permalink
Update tutorial for Xenium pooling (#80)
Browse files Browse the repository at this point in the history
* modify visium HD pooling logiic

* resolve cell_vit bug on windows

* HESTData: provide util to map ensemble ID to gene name (#71)

* HESTData: provide util to map ensemble ID to gene name

* HESTData: filter_na=False as default for ensemble ID mapping

* add unit tests and inplace arg

---------

Co-authored-by: Paul Doucet <[email protected]>

* add xenium_reader doc

* small correction

* remove test

* improve documentation

* add further documentation

* update tutorial for xenium pooling
pauldoucet authored Dec 14, 2024
1 parent 87e586f commit 0e5bf02
Showing 7 changed files with 167 additions and 31 deletions.
13 changes: 8 additions & 5 deletions docs/source/api.md
Original file line number Diff line number Diff line change
@@ -13,7 +13,7 @@
.. autosummary::
:toctree: generated
load_hest
iter_hest
```

## Run HEST-Benchmark
@@ -44,6 +44,8 @@

## Pooling of transcripts, binning

Methods used to pool Xenium transcripts and Visium-HD bins into square bins of custom size

```{eval-rst}
.. currentmodule:: hest.readers
@@ -72,7 +74,7 @@
correct_batch_effect
```

## Resolving gene name aliases
## Gene names manipulation

```{eval-rst}
.. currentmodule:: hest.HESTData
@@ -81,12 +83,13 @@
:toctree: generated
unify_gene_names
ensembl_id_to_gene
```


## Readers to augment HEST-1k
## Readers to expand HEST-1k

Readers to create new HEST-1k samples.
Readers to expand HEST-1k with additional samples.

```{eval-rst}
.. currentmodule:: hest.readers
@@ -103,7 +106,7 @@ Readers to create new HEST-1k samples.


## CellViT segmentation
Nuclei segmentation methods
Simplified API for nuclei segmentation


```{eval-rst}
2 changes: 1 addition & 1 deletion docs/source/index.md
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@ Welcome to hest's documentation!

`hest` is a python library for H&E/ST pairs manipulation. It was used to assemble the <em>HEST-1k</em> dataset.

For the documentations of core WSI manipulations methods please visit the [hestcore documentation](https://hestcore.readthedocs.io/en/latest/)
For the documentations of core WSI manipulations methods please visit the [hestcore documentation](https://hestcore.readthedocs.io/en/latest/) (work in progress)

```{eval-rst}
.. card:: Installation
15 changes: 12 additions & 3 deletions src/hest/HESTData.py
Original file line number Diff line number Diff line change
@@ -681,8 +681,8 @@ def read_hest_wsi(wsi: WSI, width, height):

return SpatialData(tables=new_table, images=images, shapes=shapes)

def ensembleID_to_gene(self):
ensembleID_to_gene(self)
def ensembl_id_to_gene(self):
ensembl_id_to_gene(self)


class VisiumHESTData(HESTData):
@@ -1058,6 +1058,15 @@ def __len__(self):
return len(self.id_list)

def iter_hest(hest_dir: str, id_list: List[str] = None, **read_kwargs) -> HESTIterator:
""" Iterate through the HEST samples contained in `hest_dir`
Args:
hest_dir (str): hest directory containing folders: st, wsis, metadata, tissue_seg (optional)
id_list (List[str], Optional): list of ids to read (ex: ['TENX96', 'TENX99']), pass None to read all available samples. Default to None
Returns:
HESTIterator: HESTData iterator
"""
return HESTIterator(hest_dir, id_list, **read_kwargs)

def _read_st(hest_dir, st_filename, load_transcripts=False):
@@ -1247,7 +1256,7 @@ def unify_gene_names(adata: sc.AnnData, species="human", drop=False) -> sc.AnnDa
# TODO return dict map of renamed, and remaining
return adata

def ensembleID_to_gene(st: HESTData, filter_na = False) -> HESTData:
def ensembl_id_to_gene(st: HESTData, filter_na = False) -> HESTData:
"""
Converts ensemble gene IDs of a HESTData object using Biomart annotations and filter out genes with no matching Ensembl ID
4 changes: 2 additions & 2 deletions src/hest/__init__.py
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@
from .utils import tiff_save, find_pixel_size_from_spot_coords, write_10X_h5, get_k_genes, SpotPacking
from .autoalign import autoalign_visium
from .readers import *
from .HESTData import HESTData, read_HESTData, load_hest, iter_hest, ensembleID_to_gene
from .HESTData import HESTData, read_HESTData, load_hest, iter_hest, ensembl_id_to_gene
from .segmentation.cell_segmenters import segment_cellvit

__all__ = [
@@ -21,5 +21,5 @@
'write_10X_h5',
'HESTData',
'segment_cellvit',
'ensembleID_to_gene'
'ensembl_id_to_gene'
]
29 changes: 16 additions & 13 deletions src/hest/readers.py
Original file line number Diff line number Diff line change
@@ -903,7 +903,8 @@ def read(
cell_bound_path: str = None,
dapi_path = None,
load_img=True,
use_dask=False
use_dask=False,
spot_size_um=100.
) -> XeniumHESTData:
""" Read a Xenium sample
@@ -919,6 +920,7 @@ def read(
dapi_path (_type_, optional): path to a `morphology_focus_0000.ome.tif`/`morphology_focus.ome.tif` file. Defaults to None.
load_img (bool, optional): whenever to load the WSI. Defaults to True.
use_dask (bool, optional): whenever to load the transcript dataframe with DASK (recommended if the transcript dataframe does not fit into the RAM). Defaults to False.
spot_size_um (float, optional): transcripts are pooled into squares of spot_size_um x spot_size_um mirometers and then stored in `HESTData.adata`
Returns:
XeniumHESTData: Xenium sample
@@ -963,11 +965,12 @@ def read(
transcript_df,
dict['pixel_size_um_estimated'],
key_x='he_x',
key_y='he_y'
key_y='he_y',
spot_size_um=spot_size_um
)

dict['spot_diameter'] = 55.
dict['inter_spot_dist'] = 100.
dict['spot_diameter'] = spot_size_um
dict['inter_spot_dist'] = spot_size_um
dict['spots_under_tissue'] = len(adata.obs)

else:
@@ -1044,20 +1047,20 @@ def pool_transcripts_xenium(
key_x='he_x',
key_y='he_y',
) -> sc.AnnData: # type: ignore
""" Pool a xenium transcript dataframe by square spots of size 100um
""" Pool a xenium transcript dataframe by square spots of `spot_size_um` micrometers.
Args:
df (pd.DataFrame): xenium transcipts dataframe containing columns:
- 'he_x' and 'he_y' indicating the pixel coordinates of each transcripts in the morphology image
- 'feature_name' indicating the transcript name
df (Union[pd.DataFrame, dd.DataFrame]): xenium transcipts dataframe containing columns:
- 'he_x' and 'he_y' indicating the pixel coordinates of each transcripts in the morphology image
- 'feature_name' indicating the transcript name
pixel_size_he (float): pixel_size in um on the he image
spot_size_um: pooling rectangle width in um
key_x: column name of pixel x coordinate of each transcript in `df`
key_y: column name of pixel y coordinate of each transcript in `df`
Returns:
sc.AnnData: AnnData object, each obs represents the sum of transcripts within that bin, coordinates of the center of each bin (in pixel on WSI) are in adata.obsm['spatial']
sc.AnnData: AnnData object, each row in .obs represents a bin, each row in `.X` represents the sum of transcripts within that bin. Center coordinates of each bin (in pixel on WSI) are in adata.obsm['spatial']
"""
import scanpy as sc
import dask.dataframe as dd
@@ -1129,18 +1132,18 @@ def pool_transcripts_xenium(
return adata


def pool_bins_visiumhd(adata: sc.AnnData, pixel_size: float, dst_bin_size_um=128, src_bin_size_um=16, chunk_len=50000) -> sc.AnnData: # type: ignore
""" Pool visium hd bins
def pool_bins_visiumhd(adata: sc.AnnData, pixel_size: float, dst_bin_size_um=128, src_bin_size_um: Literal[2, 8, 16]=16, chunk_len=50000) -> sc.AnnData: # type: ignore
""" Pool a Visium HD (with a source resolution of `src_bin_size_um`) by square spots of `spot_size_um` micrometers.
Args:
adata (sc.AnnData): adata containing spot center coordiniates in `pxl_row_in_fullres` and `pxl_col_in_fullres`
pixel_size (float): pixel size of the WSI in um/px
dst_bin_size_um (int, optional): target bin size in um. Defaults to 128.
src_bin_size_um (int, optional): bin size of `adata` in um. Defaults to 16.
src_bin_size_um (Literal[2, 8, 16], optional): bin size of `adata` in um. Defaults to 16.
chunk_len (int, optional): chunk size when binning a larger than RAM `adata` (this is for RAM optimization only). Defaults to 50000.
Returns:
sc.AnnData: adata with pooled bins
sc.AnnData: AnnData object, each row in .obs represents a bin, each row in `.X` represents the sum of visium-hd sub-bins (of size `src_bin_size_um`) within that larger bin (of size `dst_bin_size_um`). Center coordinates of each bin (in pixel on WSI) are in adata.obsm['spatial']
"""
import scanpy as sc

4 changes: 2 additions & 2 deletions tests/hest_tests.py
Original file line number Diff line number Diff line change
@@ -17,7 +17,7 @@

from hest.autoalign import autoalign_visium
from hest.readers import VisiumReader
from hest.HESTData import ensembleID_to_gene
from hest.HESTData import ensembl_id_to_gene
from hest.utils import load_image


@@ -136,7 +136,7 @@ def setUpClass(self):
#def test_conversion_ensembleID(self):
# for idx, st in enumerate(self.sts):
# with self.subTest(st_object=idx):
# ensembleID_to_gene(st)
# ensembl_id_to_gene(st)


def test_tissue_seg(self):
131 changes: 126 additions & 5 deletions tutorials/2-Interacting-with-HEST-1k.ipynb
Original file line number Diff line number Diff line change
@@ -25,9 +25,62 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\HEST\\HEST\\lib\\site-packages\\hestcore\\wsi.py:27: UserWarning: CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\n",
" warnings.warn(\"CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\")\n",
"d:\\HEST\\HEST\\lib\\site-packages\\scanpy\\preprocessing\\_qc.py:432: RuntimeWarning: invalid value encountered in divide\n",
" return values / sums[:, None]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"* Scanpy adata:\n",
"AnnData object with n_obs × n_vars = 11845 × 541\n",
" obs: 'in_tissue', 'pxl_col_in_fullres', 'pxl_row_in_fullres', 'array_col', 'array_row', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito'\n",
" var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'\n",
" uns: 'spatial'\n",
" obsm: 'spatial'\n",
"\n",
"* WSI:\n",
"<width=48376, height=53738, backend=OpenSlideWSI>\n",
"\n",
"* Shapes:\n",
"[name: cellvit, coord-system: he, <not loaded>, name: xenium_cell, coord-system: he, <not loaded>, name: xenium_nucleus, coord-system: he, <not loaded>]\n",
"\n",
"* Tissue contours:\n",
" tissue_id geometry\n",
"0 0 POLYGON ((14052 2848, 14025 2874, 13998 2874, ...\n",
"\n",
"* SpatialData conversion:\n",
"SpatialData object\n",
"├── Images\n",
"│ ├── 'ST_downscaled_hires_image': SpatialImage[cyx] (3, 3358, 3023)\n",
"│ └── 'ST_downscaled_lowres_image': SpatialImage[cyx] (3, 1000, 900)\n",
"├── Shapes\n",
"│ ├── 'cellvit': GeoDataFrame shape: (497508, 3) (2D shapes)\n",
"│ ├── 'locations': GeoDataFrame shape: (11845, 2) (2D shapes)\n",
"│ ├── 'tissue_contours': GeoDataFrame shape: (1, 2) (2D shapes)\n",
"│ ├── 'xenium_cell': GeoDataFrame shape: (574852, 1) (2D shapes)\n",
"│ └── 'xenium_nucleus': GeoDataFrame shape: (574852, 1) (2D shapes)\n",
"└── Tables\n",
" └── 'table': AnnData (11845, 541)\n",
"with coordinate systems:\n",
" ▸ 'ST_downscaled_hires', with elements:\n",
" ST_downscaled_hires_image (Images), cellvit (Shapes), locations (Shapes), tissue_contours (Shapes), xenium_cell (Shapes), xenium_nucleus (Shapes)\n",
" ▸ 'ST_downscaled_lowres', with elements:\n",
" ST_downscaled_lowres_image (Images), cellvit (Shapes), locations (Shapes), tissue_contours (Shapes), xenium_cell (Shapes), xenium_nucleus (Shapes)\n"
]
}
],
"source": [
"from hest import iter_hest\n",
"\n",
@@ -168,7 +221,10 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Patching"
"## Changing the Patching/pooling size\n",
"\n",
"### Patching\n",
"You can change the size of patches around the spots with `dump_patches`:"
]
},
{
@@ -188,6 +244,71 @@
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Changing the Pooling size\n",
"\n",
"You can change the pooling size of Xenium/Visium HD samples stored in `HESTData.adata` with the following:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\HEST\\HEST\\lib\\site-packages\\hestcore\\wsi.py:27: UserWarning: CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\n",
" warnings.warn(\"CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\")\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"from hest.readers import pool_transcripts_xenium\n",
"from hest import iter_hest\n",
"\n",
"new_spot_size = 200\n",
"\n",
"\n",
"# Iterate through a subset of hest\n",
"for st in iter_hest('../hest_data', id_list=['TENX95'], load_transcripts=True,):\n",
" print(st.transcript_df)\n",
"\n",
" # Feel free to convert st.transcript_df to a Dask DataFrame if you are working with limited RAM.\n",
"\n",
" st.adata = pool_transcripts_xenium(\n",
" st.transcript_df, \n",
" st.pixel_size, \n",
" key_x='he_x',\n",
" key_y='he_y',\n",
" spot_size_um=new_spot_size\n",
" )\n",
"\n",
"\n",
" # We change the spots, so we need to re-extract the patches\n",
" st.dump_patches(\n",
" patch_save_dir,\n",
" name='demo',\n",
" target_patch_size=224, # target patch size in 224\n",
" target_pixel_size=0.5 # pixel size of the patches in um/px after rescaling\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -292,7 +413,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "cuml",
"display_name": "HEST",
"language": "python",
"name": "python3"
},
@@ -306,7 +427,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
"version": "3.10.11"
}
},
"nbformat": 4,

0 comments on commit 0e5bf02

Please sign in to comment.