feat: added long_to_cube utils function

martibosch · martibosch · commit 92ae92ddfba1 · 2025-03-17T12:09:50.000+01:00
diff --git a/meteora/utils.py b/meteora/utils.py
@@ -21,6 +21,12 @@
 
 from meteora import settings
 
+try:
+    import xarray as xr
+    import xvec  # noqa: F401
+except ImportError:
+    xr = None
+
 RegionType = str | Sequence | gpd.GeoSeries | gpd.GeoDataFrame | os.PathLike | IO
 VariablesType = str | int | list[str] | list[int]
 DateTimeType = (
@@ -85,6 +91,58 @@ def long_to_wide(
     )
 
 
+def long_to_cube(
+    ts_df: pd.DataFrame,
+    stations_gdf: gpd.GeoDataFrame,
+    *,
+    stations_gdf_id_col: str | None = None,
+) -> "xr.Dataset":  # quoted: `xr` is None when xarray/xvec are not installed
+    """Convert a time series data frame and station locations to a vector data cube.
+
+    A vector data cube is an n-D array with at least one dimension indexed by vector
+    geometries. In Python, this is represented as an xarray Dataset (or DataArray)
+    object with an indexed dimension with vector geometries set using xvec.
+
+    Parameters
+    ----------
+    ts_df : pd.DataFrame
+        Long form data frame with a time series of measurements (second-level index) at
+        each station (first-level index) for each variable (column).
+    stations_gdf : gpd.GeoDataFrame
+        The stations data as a GeoDataFrame.
+    stations_gdf_id_col : str, optional
+        The column in `stations_gdf` that matches the first-level index of `ts_df`. If
+        None, the first-level index name of `ts_df` is used.
+
+    Returns
+    -------
+    ts_cube : xr.Dataset
+        The vector data cube with the time series of measurements for each station. The
+        stations dimension keeps its name from `ts_df` and is indexed by geometry.
+
+    Raises
+    ------
+    ImportError
+        If the optional xarray and xvec dependencies are not installed.
+    KeyError
+        If the id column or any station of `ts_df` is missing from `stations_gdf`.
+    """
+    if xr is None:
+        raise ImportError("long_to_cube requires the optional xarray and xvec packages")
+    # the stations dimension is named after the first-level index of `ts_df`
+    stations_dim = ts_df.index.names[0]
+    if stations_gdf_id_col is None:
+        stations_gdf_id_col = stations_dim
+    ts_ds = ts_df.to_xarray()
+    # align the (active) geometries to the stations order; missing ids raise KeyError
+    station_geoms = stations_gdf.set_index(stations_gdf_id_col).loc[
+        ts_ds[stations_dim].values
+    ].geometry
+    return ts_ds.assign_coords({stations_dim: station_geoms}).xvec.set_geom_indexes(
+        stations_dim, crs=stations_gdf.crs
+    )
+
+
 ########################################################################################
 # abstract attribute
 # `DummyAttribute` and `abstract_attribute` below are hardcoded from
diff --git a/pyproject.toml b/pyproject.toml
@@ -68,6 +68,10 @@ test = [
   "requests-mock",
   "ruff"
 ]
+xvec = [
+  "xarray",
+  "xvec"
+]
 
 [project.urls]
 Repository = "https://github.com/martibosch/meteora"
@@ -176,7 +180,8 @@ conda_deps = [
 ]
 extras = [
   "ox",
-  "test"
+  "test",
+  "xvec"
 ]
 whitelist_externals = [
   "pytest"
diff --git a/tests/data/stations.gpkg b/tests/data/stations.gpkg
diff --git a/tests/test_meteora.py b/tests/test_meteora.py
@@ -7,9 +7,12 @@
 import unittest
 from os import path
 
+import geopandas as gpd
 import osmnx as ox
 import pandas as pd
+import pytest
 import requests_mock
+import xarray as xr
 from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
 
 from meteora import settings, utils
@@ -68,6 +71,39 @@ def test_utils():
     assert len(wide_ts_df.columns.names) == 1
     assert isinstance(wide_ts_df.index, pd.DatetimeIndex)
 
+    # long to cube (xvec)
+    stations_gdf = gpd.read_file(path.join(tests_data_dir, "stations.gpkg"))
+    # the test data (ts_df and stations_gdf) is from GHCNh, where the stations id column
+    # is "Station_ID" in ts_df and "id" in stations_gdf
+    stations_ts_df_id_col = "Station_ID"
+    stations_gdf_id_col = "id"
+    with pytest.raises(KeyError):
+        # attempting to convert with a mismatching station id column between ts_df and
+        # stations_gdf raises a KeyError
+        utils.long_to_cube(ts_df, stations_gdf)
+        # if stations_gdf does not cover all stations in ts_df a KeyError is also raised
+        utils.long_to_cube(
+            ts_df, stations_gdf.iloc[:2], stations_gdf_id_col=stations_gdf_id_col
+        )
+        # attempting to convert from the wide form also raises a KeyError
+        utils.wide_to_cube(
+            wide_ts_df, stations_gdf, stations_gdf_id_col=stations_gdf_id_col
+        )
+    # test proper conversion
+    ts_cube = utils.long_to_cube(
+        ts_df, stations_gdf, stations_gdf_id_col=stations_gdf_id_col
+    )
+    # test an xarray dataset is returned
+    assert isinstance(ts_cube, xr.Dataset)
+    # test that the time column is in the coordinates
+    assert ts_df.index.names[1] in ts_cube.coords
+    # test that the variable columns are in the data_vars
+    assert all([var in ts_cube.data_vars for var in ts_df.columns])
+    # test that it has a dimension with geometry and that it is labeled using the
+    # stations id column in ts_df
+    assert stations_ts_df_id_col in ts_cube.xvec.geom_coords
+    assert stations_ts_df_id_col in ts_cube.xvec.geom_coords_indexed
+
     # logger
     def test_logging():
         utils.log("test a fake default message")

Original file line number	Diff line number	Diff line change
`@@ -68,6 +68,10 @@ test = [`
`68`	`68`	`"requests-mock",`
`69`	`69`	`"ruff"`
`70`	`70`	`]`
	`71`	`+xvec = [`
	`72`	`+ "xarray",`
	`73`	`+ "xvec"`
	`74`	`+]`
`71`	`75`
`72`	`76`	`[project.urls]`
`73`	`77`	`Repository = "https://github.com/martibosch/meteora"`
`@@ -176,7 +180,8 @@ conda_deps = [`
`176`	`180`	`]`
`177`	`181`	`extras = [`
`178`	`182`	`"ox",`
`179`		`- "test"`
	`183`	`+ "test",`
	`184`	`+ "xvec"`
`180`	`185`	`]`
`181`	`186`	`whitelist_externals = [`
`182`	`187`	`"pytest"`