diff --git a/docs/source/api.md b/docs/source/api.md
index 2816f24..45f2ee6 100644
--- a/docs/source/api.md
+++ b/docs/source/api.md
@@ -13,7 +13,7 @@
.. autosummary::
:toctree: generated
- load_hest
+ iter_hest
```
## Run HEST-Benchmark
@@ -44,6 +44,8 @@
## Pooling of transcripts, binning
+Methods used to pool Xenium transcripts and Visium-HD bins into square bins of custom size
+
```{eval-rst}
.. currentmodule:: hest.readers
@@ -72,7 +74,7 @@
correct_batch_effect
```
-## Resolving gene name aliases
+## Gene name manipulation
```{eval-rst}
.. currentmodule:: hest.HESTData
@@ -81,12 +83,13 @@
:toctree: generated
unify_gene_names
+ ensembl_id_to_gene
```
-## Readers to augment HEST-1k
+## Readers to expand HEST-1k
-Readers to create new HEST-1k samples.
+Readers to expand HEST-1k with additional samples.
```{eval-rst}
.. currentmodule:: hest.readers
@@ -103,7 +106,7 @@ Readers to create new HEST-1k samples.
## CellViT segmentation
-Nuclei segmentation methods
+Simplified API for nuclei segmentation
```{eval-rst}
diff --git a/docs/source/index.md b/docs/source/index.md
index 1b5b508..0037c5d 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -3,7 +3,7 @@ Welcome to hest's documentation!
`hest` is a python library for H&E/ST pairs manipulation. It was used to assemble the HEST-1k dataset.
-For the documentations of core WSI manipulations methods please visit the [hestcore documentation](https://hestcore.readthedocs.io/en/latest/)
+For the documentation of core WSI manipulation methods, please visit the [hestcore documentation](https://hestcore.readthedocs.io/en/latest/) (work in progress).
```{eval-rst}
.. card:: Installation
diff --git a/src/hest/HESTData.py b/src/hest/HESTData.py
index 78fa469..178c07c 100644
--- a/src/hest/HESTData.py
+++ b/src/hest/HESTData.py
@@ -681,8 +681,8 @@ def read_hest_wsi(wsi: WSI, width, height):
return SpatialData(tables=new_table, images=images, shapes=shapes)
- def ensembleID_to_gene(self):
- ensembleID_to_gene(self)
+ def ensembl_id_to_gene(self):
+ ensembl_id_to_gene(self)
class VisiumHESTData(HESTData):
@@ -1058,6 +1058,15 @@ def __len__(self):
return len(self.id_list)
def iter_hest(hest_dir: str, id_list: List[str] = None, **read_kwargs) -> HESTIterator:
+ """ Iterate through the HEST samples contained in `hest_dir`
+
+ Args:
+ hest_dir (str): hest directory containing folders: st, wsis, metadata, tissue_seg (optional)
+ id_list (List[str], Optional): list of ids to read (ex: ['TENX96', 'TENX99']), pass None to read all available samples. Defaults to None.
+
+ Returns:
+ HESTIterator: HESTData iterator
+ """
return HESTIterator(hest_dir, id_list, **read_kwargs)
def _read_st(hest_dir, st_filename, load_transcripts=False):
@@ -1247,7 +1256,7 @@ def unify_gene_names(adata: sc.AnnData, species="human", drop=False) -> sc.AnnDa
# TODO return dict map of renamed, and remaining
return adata
-def ensembleID_to_gene(st: HESTData, filter_na = False) -> HESTData:
+def ensembl_id_to_gene(st: HESTData, filter_na = False) -> HESTData:
"""
Converts ensemble gene IDs of a HESTData object using Biomart annotations and filter out genes with no matching Ensembl ID
diff --git a/src/hest/__init__.py b/src/hest/__init__.py
index 7f32c34..ab37d26 100644
--- a/src/hest/__init__.py
+++ b/src/hest/__init__.py
@@ -3,7 +3,7 @@
from .utils import tiff_save, find_pixel_size_from_spot_coords, write_10X_h5, get_k_genes, SpotPacking
from .autoalign import autoalign_visium
from .readers import *
-from .HESTData import HESTData, read_HESTData, load_hest, iter_hest, ensembleID_to_gene
+from .HESTData import HESTData, read_HESTData, load_hest, iter_hest, ensembl_id_to_gene
from .segmentation.cell_segmenters import segment_cellvit
__all__ = [
@@ -21,5 +21,5 @@
'write_10X_h5',
'HESTData',
'segment_cellvit',
- 'ensembleID_to_gene'
+ 'ensembl_id_to_gene'
]
diff --git a/src/hest/readers.py b/src/hest/readers.py
index 92cfbdd..74c3bdb 100644
--- a/src/hest/readers.py
+++ b/src/hest/readers.py
@@ -903,7 +903,8 @@ def read(
cell_bound_path: str = None,
dapi_path = None,
load_img=True,
- use_dask=False
+ use_dask=False,
+ spot_size_um=100.
) -> XeniumHESTData:
""" Read a Xenium sample
@@ -919,6 +920,7 @@ def read(
dapi_path (_type_, optional): path to a `morphology_focus_0000.ome.tif`/`morphology_focus.ome.tif` file. Defaults to None.
load_img (bool, optional): whenever to load the WSI. Defaults to True.
use_dask (bool, optional): whenever to load the transcript dataframe with DASK (recommended if the transcript dataframe does not fit into the RAM). Defaults to False.
+ spot_size_um (float, optional): transcripts are pooled into squares of spot_size_um x spot_size_um micrometers and then stored in `HESTData.adata`. Defaults to 100.
Returns:
XeniumHESTData: Xenium sample
@@ -963,11 +965,12 @@ def read(
transcript_df,
dict['pixel_size_um_estimated'],
key_x='he_x',
- key_y='he_y'
+ key_y='he_y',
+ spot_size_um=spot_size_um
)
- dict['spot_diameter'] = 55.
- dict['inter_spot_dist'] = 100.
+ dict['spot_diameter'] = spot_size_um
+ dict['inter_spot_dist'] = spot_size_um
dict['spots_under_tissue'] = len(adata.obs)
else:
@@ -1044,12 +1047,12 @@ def pool_transcripts_xenium(
key_x='he_x',
key_y='he_y',
) -> sc.AnnData: # type: ignore
- """ Pool a xenium transcript dataframe by square spots of size 100um
+ """ Pool a xenium transcript dataframe by square spots of `spot_size_um` micrometers.
Args:
- df (pd.DataFrame): xenium transcipts dataframe containing columns:
- - 'he_x' and 'he_y' indicating the pixel coordinates of each transcripts in the morphology image
- - 'feature_name' indicating the transcript name
+ df (Union[pd.DataFrame, dd.DataFrame]): xenium transcripts dataframe containing columns:
+ - 'he_x' and 'he_y' indicating the pixel coordinates of each transcript in the morphology image
+ - 'feature_name' indicating the transcript name
pixel_size_he (float): pixel_size in um on the he image
spot_size_um: pooling rectangle width in um
key_x: column name of pixel x coordinate of each transcript in `df`
@@ -1057,7 +1060,7 @@ def pool_transcripts_xenium(
Returns:
- sc.AnnData: AnnData object, each obs represents the sum of transcripts within that bin, coordinates of the center of each bin (in pixel on WSI) are in adata.obsm['spatial']
+ sc.AnnData: AnnData object, each row in .obs represents a bin, each row in `.X` represents the sum of transcripts within that bin. Center coordinates of each bin (in pixel on WSI) are in adata.obsm['spatial']
"""
import scanpy as sc
import dask.dataframe as dd
@@ -1129,18 +1132,18 @@ def pool_transcripts_xenium(
return adata
-def pool_bins_visiumhd(adata: sc.AnnData, pixel_size: float, dst_bin_size_um=128, src_bin_size_um=16, chunk_len=50000) -> sc.AnnData: # type: ignore
- """ Pool visium hd bins
+def pool_bins_visiumhd(adata: sc.AnnData, pixel_size: float, dst_bin_size_um=128, src_bin_size_um: Literal[2, 8, 16]=16, chunk_len=50000) -> sc.AnnData: # type: ignore
+ """ Pool Visium HD bins (with a source resolution of `src_bin_size_um`) into square bins of `dst_bin_size_um` micrometers.
Args:
adata (sc.AnnData): adata containing spot center coordiniates in `pxl_row_in_fullres` and `pxl_col_in_fullres`
pixel_size (float): pixel size of the WSI in um/px
dst_bin_size_um (int, optional): target bin size in um. Defaults to 128.
- src_bin_size_um (int, optional): bin size of `adata` in um. Defaults to 16.
+ src_bin_size_um (Literal[2, 8, 16], optional): bin size of `adata` in um. Defaults to 16.
chunk_len (int, optional): chunk size when binning a larger than RAM `adata` (this is for RAM optimization only). Defaults to 50000.
Returns:
- sc.AnnData: adata with pooled bins
+ sc.AnnData: AnnData object, each row in .obs represents a bin, each row in `.X` represents the sum of visium-hd sub-bins (of size `src_bin_size_um`) within that larger bin (of size `dst_bin_size_um`). Center coordinates of each bin (in pixel on WSI) are in adata.obsm['spatial']
"""
import scanpy as sc
diff --git a/tests/hest_tests.py b/tests/hest_tests.py
index 484dbe9..151383b 100644
--- a/tests/hest_tests.py
+++ b/tests/hest_tests.py
@@ -17,7 +17,7 @@
from hest.autoalign import autoalign_visium
from hest.readers import VisiumReader
-from hest.HESTData import ensembleID_to_gene
+from hest.HESTData import ensembl_id_to_gene
from hest.utils import load_image
@@ -136,7 +136,7 @@ def setUpClass(self):
#def test_conversion_ensembleID(self):
# for idx, st in enumerate(self.sts):
# with self.subTest(st_object=idx):
- # ensembleID_to_gene(st)
+ # ensembl_id_to_gene(st)
def test_tissue_seg(self):
diff --git a/tutorials/2-Interacting-with-HEST-1k.ipynb b/tutorials/2-Interacting-with-HEST-1k.ipynb
index 9e9bae5..685e443 100644
--- a/tutorials/2-Interacting-with-HEST-1k.ipynb
+++ b/tutorials/2-Interacting-with-HEST-1k.ipynb
@@ -25,9 +25,62 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "d:\\HEST\\HEST\\lib\\site-packages\\hestcore\\wsi.py:27: UserWarning: CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\n",
+ " warnings.warn(\"CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\")\n",
+ "d:\\HEST\\HEST\\lib\\site-packages\\scanpy\\preprocessing\\_qc.py:432: RuntimeWarning: invalid value encountered in divide\n",
+ " return values / sums[:, None]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "* Scanpy adata:\n",
+ "AnnData object with n_obs × n_vars = 11845 × 541\n",
+ " obs: 'in_tissue', 'pxl_col_in_fullres', 'pxl_row_in_fullres', 'array_col', 'array_row', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito'\n",
+ " var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'\n",
+ " uns: 'spatial'\n",
+ " obsm: 'spatial'\n",
+ "\n",
+ "* WSI:\n",
+ "\n",
+ "\n",
+ "* Shapes:\n",
+ "[name: cellvit, coord-system: he, , name: xenium_cell, coord-system: he, , name: xenium_nucleus, coord-system: he, ]\n",
+ "\n",
+ "* Tissue contours:\n",
+ " tissue_id geometry\n",
+ "0 0 POLYGON ((14052 2848, 14025 2874, 13998 2874, ...\n",
+ "\n",
+ "* SpatialData conversion:\n",
+ "SpatialData object\n",
+ "├── Images\n",
+ "│ ├── 'ST_downscaled_hires_image': SpatialImage[cyx] (3, 3358, 3023)\n",
+ "│ └── 'ST_downscaled_lowres_image': SpatialImage[cyx] (3, 1000, 900)\n",
+ "├── Shapes\n",
+ "│ ├── 'cellvit': GeoDataFrame shape: (497508, 3) (2D shapes)\n",
+ "│ ├── 'locations': GeoDataFrame shape: (11845, 2) (2D shapes)\n",
+ "│ ├── 'tissue_contours': GeoDataFrame shape: (1, 2) (2D shapes)\n",
+ "│ ├── 'xenium_cell': GeoDataFrame shape: (574852, 1) (2D shapes)\n",
+ "│ └── 'xenium_nucleus': GeoDataFrame shape: (574852, 1) (2D shapes)\n",
+ "└── Tables\n",
+ " └── 'table': AnnData (11845, 541)\n",
+ "with coordinate systems:\n",
+ " ▸ 'ST_downscaled_hires', with elements:\n",
+ " ST_downscaled_hires_image (Images), cellvit (Shapes), locations (Shapes), tissue_contours (Shapes), xenium_cell (Shapes), xenium_nucleus (Shapes)\n",
+ " ▸ 'ST_downscaled_lowres', with elements:\n",
+ " ST_downscaled_lowres_image (Images), cellvit (Shapes), locations (Shapes), tissue_contours (Shapes), xenium_cell (Shapes), xenium_nucleus (Shapes)\n"
+ ]
+ }
+ ],
"source": [
"from hest import iter_hest\n",
"\n",
@@ -168,7 +221,10 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Patching"
+ "## Changing the Patching/pooling size\n",
+ "\n",
+ "### Patching\n",
+ "You can change the size of patches around the spots with `dump_patches`:"
]
},
{
@@ -188,6 +244,71 @@
")"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Changing the Pooling size\n",
+ "\n",
+ "You can change the pooling size of Xenium/Visium HD samples stored in `HESTData.adata` with the following:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "d:\\HEST\\HEST\\lib\\site-packages\\hestcore\\wsi.py:27: UserWarning: CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\n",
+ " warnings.warn(\"CuImage is not available. Ensure you have a GPU and cucim installed to use GPU acceleration.\")\n"
+ ]
+ },
+ {
+ "ename": "",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
+ "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
+ "\u001b[1;31mClick here for more info. \n",
+ "\u001b[1;31mView Jupyter log for further details."
+ ]
+ }
+ ],
+ "source": [
+ "from hest.readers import pool_transcripts_xenium\n",
+ "from hest import iter_hest\n",
+ "\n",
+ "new_spot_size = 200\n",
+ "\n",
+ "\n",
+ "# Iterate through a subset of hest\n",
+ "for st in iter_hest('../hest_data', id_list=['TENX95'], load_transcripts=True,):\n",
+ " print(st.transcript_df)\n",
+ "\n",
+ " # Feel free to convert st.transcript_df to a Dask DataFrame if you are working with limited RAM.\n",
+ "\n",
+ " st.adata = pool_transcripts_xenium(\n",
+ " st.transcript_df, \n",
+ " st.pixel_size, \n",
+ " key_x='he_x',\n",
+ " key_y='he_y',\n",
+ " spot_size_um=new_spot_size\n",
+ " )\n",
+ "\n",
+ "\n",
+ " # We change the spots, so we need to re-extract the patches\n",
+ " st.dump_patches(\n",
+ " patch_save_dir,\n",
+ " name='demo',\n",
+ " target_patch_size=224, # target patch size in pixels\n",
+ " target_pixel_size=0.5 # pixel size of the patches in um/px after rescaling\n",
+ " )"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -292,7 +413,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "cuml",
+ "display_name": "HEST",
"language": "python",
"name": "python3"
},
@@ -306,7 +427,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.19"
+ "version": "3.10.11"
}
},
"nbformat": 4,