Skip to content

Commit

Permalink
Add patch_url kwarg for signing hrefs and objects (#31)
Browse files Browse the repository at this point in the history
This seems like potentially a better approach than just signing
everything, but it would be great to get a second opinion.

closes #29
  • Loading branch information
jsignell authored Feb 3, 2025
1 parent 9094e71 commit b2d1b85
Show file tree
Hide file tree
Showing 14 changed files with 264,360 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,4 @@ jobs:

- name: run tests
id: status
run: pytest -v --cov xpystac --cov-report term-missing .
run: pytest -v --cov xpystac --cov-report term-missing --block-network .
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ Read from a reference file:
collection = catalog.get_collection("nasa-nex-gddp-cmip6")
asset = collection.assets["ACCESS-CM2.historical"]

xr.open_dataset(asset)
xr.open_dataset(asset, patch_url=planetary_computer.sign)
```
ref: https://planetarycomputer.microsoft.com/dataset/nasa-nex-gddp-cmip6#Example-Notebook

Expand All @@ -58,14 +58,14 @@ Read from a zarr file:
collection = catalog.get_collection("daymet-daily-hi")
asset = collection.assets["zarr-abfs"]

xr.open_dataset(asset)
xr.open_dataset(asset, patch_url=planetary_computer.sign)
```
ref: https://planetarycomputer.microsoft.com/docs/quickstarts/reading-zarr-data/

## Install

```bash
pip install xpystac
pip install git+https://github.com/stac-utils/xpystac
```

## How it works
Expand Down
1 change: 1 addition & 0 deletions environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ dependencies:
# testing
- pytest
- pytest-cov
- pytest-recording
254,575 changes: 254,575 additions & 0 deletions tests/cassettes/test_core/test_to_xarray_reference_file.yaml

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

4,222 changes: 4,222 additions & 0 deletions tests/cassettes/test_core/test_to_xarray_zarr.yaml

Large diffs are not rendered by default.

4,184 changes: 4,184 additions & 0 deletions tests/cassettes/test_core/test_to_xarray_zarr_with_open_kwargs_engine.yaml

Large diffs are not rendered by default.

Large diffs are not rendered by default.

61 changes: 51 additions & 10 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,69 +7,110 @@
from xpystac.core import to_xarray


@pytest.mark.vcr
def test_to_xarray_with_cog_asset(simple_cog):
ds = to_xarray(simple_cog)
assert ds


@pytest.mark.vcr
def test_to_xarray_with_pystac_client_search(simple_search):
ds = to_xarray(simple_search)
assert ds


@pytest.mark.vcr
def test_to_xarray_returns_dask_backed_object(simple_search):
ds = to_xarray(simple_search)
assert isinstance(ds.blue.data, dask.array.Array)
assert ds.blue.data.npartitions > 1


@pytest.mark.vcr
def test_to_xarray_with_pystac_client_search_passes_kwargs_through(simple_search):
ds = to_xarray(simple_search, bands=["red", "green", "blue"], chunks={})
assert list(ds.data_vars) == ["red", "green", "blue"]
assert ds.blue.data.npartitions == 1


@pytest.mark.vcr
@pytest.mark.parametrize("stacking_library", ["odc.stac", "stackstac"])
def test_to_xarray_with_different_stacking_library(simple_search, stacking_library):
ds = to_xarray(simple_search, stacking_library=stacking_library)
assert isinstance(ds, xr.Dataset)
assert "band" not in ds.dims


@requires_planetary_computer
@pytest.mark.vcr
def test_to_xarray_with_pystac_client_search_with_patch_url():
import planetary_computer as pc
from rasterio.errors import RasterioIOError

client = pystac_client.Client.open(STAC_URLS["PLANETARY-COMPUTER"])
search = client.search(
intersects=dict(type="Point", coordinates=[-105.78, 35.79]),
collections=["sentinel-2-l2a"],
datetime="2020-05-01",
)

ds = to_xarray(search, assets=["B4", "B3", "B2"], stacking_library="odc.stac")

with pytest.raises(RasterioIOError, match="HTTP response code: 404"):
ds.B01.max().compute()

ds = to_xarray(
search,
assets=["B4", "B3", "B2"],
stacking_library="odc.stac",
patch_url=pc.sign,
)
assert ds.B01.max().compute() == 11080


@pytest.mark.vcr
def test_to_xarray_with_drop_variables_raises(simple_search):
with pytest.raises(KeyError, match="not implemented for pystac items"):
to_xarray(simple_search, drop_variables=["blue"])


@pytest.mark.vcr
def test_to_xarray_with_bad_type():
with pytest.raises(TypeError):
to_xarray("foo")


@requires_planetary_computer
@pytest.mark.vcr
def test_to_xarray_reference_file():
import planetary_computer
import planetary_computer as pc
from fsspec.implementations.reference import ReferenceNotReachable

client = pystac_client.Client.open(
STAC_URLS["PLANETARY-COMPUTER"], modifier=planetary_computer.sign_inplace
STAC_URLS["PLANETARY-COMPUTER"], modifier=pc.sign_inplace
)
collection = client.get_collection("nasa-nex-gddp-cmip6")
assert collection is not None
kerchunk_asset = collection.assets["ACCESS-CM2.historical"]

ds = to_xarray(kerchunk_asset)
assert ds
with pytest.raises(ReferenceNotReachable):
to_xarray(kerchunk_asset)

ds = to_xarray(kerchunk_asset, patch_url=pc.sign)
assert not ds.lon.isnull().all(), "Coordinates should be populated"

for da in ds.data_vars.values():
if da.ndim >= 2:
assert hasattr(da.data, "dask")


@requires_planetary_computer
@pytest.mark.vcr
def test_to_xarray_zarr():
import planetary_computer
import planetary_computer as pc

catalog = pystac_client.Client.open(
STAC_URLS["PLANETARY-COMPUTER"], modifier=planetary_computer.sign_inplace
STAC_URLS["PLANETARY-COMPUTER"], modifier=pc.sign_inplace
)
collection = catalog.get_collection("daymet-daily-hi")
assert collection is not None
Expand All @@ -82,16 +123,16 @@ def test_to_xarray_zarr():


@requires_planetary_computer
@pytest.mark.vcr
def test_to_xarray_zarr_with_open_kwargs_engine():
import planetary_computer
import planetary_computer as pc

catalog = pystac_client.Client.open(
STAC_URLS["PLANETARY-COMPUTER"], modifier=planetary_computer.sign_inplace
STAC_URLS["PLANETARY-COMPUTER"], modifier=pc.sign_inplace
)
collection = catalog.get_collection("daymet-daily-hi")
assert collection is not None
zarr_asset = collection.assets["zarr-abfs"]
zarr_asset.extra_fields["xarray:open_kwargs"]["engine"] = "zarr"

ds = to_xarray(zarr_asset)
assert ds
to_xarray(zarr_asset)
3 changes: 3 additions & 0 deletions tests/test_xarray_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,19 @@
import xarray


@pytest.mark.vcr
def test_xarray_open_dataset_can_guess_for_pystac_objects(simple_cog):
ds = xarray.open_dataset(simple_cog)
assert ds


@pytest.mark.vcr
def test_xarray_open_dataset_can_guess_for_pystac_client_searchs(simple_search):
ds = xarray.open_dataset(simple_search, assets=["blue", "green", "red"])
assert ds


@pytest.mark.vcr
def test_xarray_open_dataset_with_drop_variables_raises(simple_search):
with pytest.raises(KeyError, match="not implemented for pystac items"):
xarray.open_dataset(simple_search, engine="stac", drop_variables=["B0"])
42 changes: 33 additions & 9 deletions xpystac/core.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import functools
from collections.abc import Mapping
from collections.abc import Callable, Mapping
from typing import Literal

import pystac
Expand All @@ -11,7 +11,9 @@
@functools.singledispatch
def to_xarray(
obj,
*,
stacking_library: Literal["odc.stac", "stackstac"] | None = None,
patch_url: None | Callable[[str], str] = None,
**kwargs,
) -> xarray.Dataset:
"""Given a PySTAC object return an xarray dataset.
Expand All @@ -34,10 +36,20 @@ def to_xarray(
stacking_library : "odc.stac", "stackstac", optional
When stacking multiple items, this argument determines which library
to use. Defaults to ``odc.stac`` if available and otherwise ``stackstac``.
patch_url : Callable, optional
Function that takes a string or pystac object and returns an altered
version. Normally used to sign urls before trying to read data from
them. For instance when working with Planetary Computer this argument
should be set to ``pc.sign``.
"""
if _is_item_search(obj):
item_collection = obj.item_collection()
return to_xarray(item_collection, stacking_library=stacking_library, **kwargs)
return to_xarray(
item_collection,
stacking_library=stacking_library,
patch_url=patch_url,
**kwargs,
)
raise TypeError


Expand All @@ -47,6 +59,7 @@ def _(
obj: pystac.Item | pystac.ItemCollection,
drop_variables: str | list[str] | None = None,
stacking_library: Literal["odc.stac", "stackstac"] | None = None,
patch_url: None | Callable[[str], str] = None,
**kwargs,
) -> xarray.Dataset:
if drop_variables is not None:
Expand All @@ -68,9 +81,17 @@ def _(
items = [obj]
else:
items = [i for i in obj]
return odc_stac.load(items, **{"chunks": {"x": 1024, "y": 1024}, **kwargs})
return odc_stac.load(
items,
**{"chunks": {"x": 1024, "y": 1024}, "patch_url": patch_url, **kwargs},
)
elif stacking_library == "stackstac":
stackstac = _import_optional_dependency("stackstac")
if patch_url:
if isinstance(obj, pystac.STACObject):
obj = patch_url(obj)
else:
obj = [patch_url(o) for o in obj]
da = stackstac.stack(obj, **kwargs)
bands = {}
for band in da.band.values:
Expand All @@ -88,6 +109,7 @@ def _(
def _(
obj: pystac.Asset,
stacking_library: Literal["odc.stac", "stackstac"] | None = None,
patch_url: None | Callable[[str], str] = None,
**kwargs,
) -> xarray.Dataset:
default_kwargs: Mapping = {"chunks": {}}
Expand All @@ -104,12 +126,10 @@ def _(
fsspec = _import_optional_dependency("fsspec")
r = requests.get(obj.href)
r.raise_for_status()
try:
import planetary_computer # type: ignore

refs = planetary_computer.sign(r.json())
except ImportError:
refs = r.json()
refs = r.json()
if patch_url is not None:
refs = patch_url(refs)

mapper = fsspec.get_mapper("reference://", fo=refs)
default_kwargs = {
Expand All @@ -128,5 +148,9 @@ def _(
_import_optional_dependency("zarr")
default_kwargs = {**default_kwargs, "engine": "zarr"}

ds = xarray.open_dataset(obj.href, **{**default_kwargs, **open_kwargs, **kwargs})
href = obj.href
if patch_url is not None:
href = patch_url(href)

ds = xarray.open_dataset(href, **{**default_kwargs, **open_kwargs, **kwargs})
return ds
9 changes: 8 additions & 1 deletion xpystac/xarray_plugin.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from typing import Any, Literal

import pystac
Expand All @@ -18,6 +18,7 @@ def open_dataset(
filename_or_obj: Any,
drop_variables: str | Iterable[str] | None = None,
stacking_library: Literal["odc.stac", "stackstac"] | None = None,
patch_url: None | Callable[[str], str] = None,
**kwargs,
):
"""Given a PySTAC object return an xarray dataset
Expand All @@ -40,11 +41,17 @@ def open_dataset(
stacking_library : "odc.stac", "stackstac", optional
When stacking multiple items, this argument determines which library
to use. Defaults to ``odc.stac`` if available and otherwise ``stackstac``.
patch_url : Callable, optional
Function that takes a string or pystac object and returns an altered
version. Normally used to sign urls before trying to read data from
them. For instance when working with Planetary Computer this argument
should be set to ``pc.sign``.
"""
return to_xarray(
filename_or_obj,
drop_variables=drop_variables,
stacking_library=stacking_library,
patch_url=patch_url,
**kwargs,
)

Expand Down

0 comments on commit b2d1b85

Please sign in to comment.