Skip to content

Commit 92ae92d

Browse files
committed
feat: added long_to_cube utils function
1 parent 201e965 commit 92ae92d

File tree

4 files changed

+100
-1
lines changed

4 files changed

+100
-1
lines changed

meteora/utils.py

+58
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@
2121

2222
from meteora import settings
2323

24+
try:
25+
import xarray as xr
26+
import xvec # noqa: F401
27+
except ImportError:
28+
xr = None
29+
2430
RegionType = str | Sequence | gpd.GeoSeries | gpd.GeoDataFrame | os.PathLike | IO
2531
VariablesType = str | int | list[str] | list[int]
2632
DateTimeType = (
@@ -85,6 +91,58 @@ def long_to_wide(
8591
)
8692

8793

94+
def long_to_cube(
    ts_df: pd.DataFrame,
    stations_gdf: gpd.GeoDataFrame,
    *,
    stations_gdf_id_col: str | None = None,
) -> "xr.Dataset":
    """Convert a time series data frame and station locations to a vector data cube.

    A vector data cube is an n-D array with at least one dimension indexed by vector
    geometries. In Python, this is represented as an xarray Dataset (or DataArray)
    object with an indexed dimension with vector geometries set using xvec.

    Parameters
    ----------
    ts_df : pd.DataFrame
        Long form data frame with a time series of measurements (second-level index) at
        each station (first-level index) for each variable (column).
    stations_gdf : gpd.GeoDataFrame
        The stations data as a GeoDataFrame.
    stations_gdf_id_col : str, optional
        The column in `stations_gdf` that matches the first-level index of `ts_df`. If
        None, the first-level index name of `ts_df` is used (however, it may not be
        an actual column in `stations_gdf`, in which case a KeyError is raised).

    Returns
    -------
    ts_cube : xr.Dataset
        The vector data cube with the time series of measurements for each station. The
        stations are indexed by their geometry.

    Raises
    ------
    ImportError
        If the optional dependencies xarray and xvec are not both installed.
    KeyError
        If `stations_gdf_id_col` is not a column of `stations_gdf`, or if
        `stations_gdf` does not cover all the stations in `ts_df`.
    """
    # `xr` is set to None at import time when xarray or xvec cannot be imported; fail
    # early with an explicit message instead of a late, unrelated error. For the same
    # reason the return annotation is a string: evaluating `xr.Dataset` at definition
    # time would break importing this module when `xr` is None.
    if xr is None:
        raise ImportError(
            "long_to_cube requires the optional dependencies xarray and xvec, "
            'which can be installed with the "xvec" extra.'
        )

    # get the stations id column in the time series data frame
    stations_ts_df_id_col = ts_df.index.names[0]
    # get the stations id column in the GeoDataFrame
    if stations_gdf_id_col is None:
        stations_gdf_id_col = stations_ts_df_id_col
    # convert data frame to xarray
    ts_ds = ts_df.to_xarray()
    # select the geometry of each station in the same order as the stations dimension
    # of the dataset; a KeyError is raised here if the id column does not exist in
    # `stations_gdf` or if any station of `ts_df` is missing from it
    station_geoms = stations_gdf.set_index(stations_gdf_id_col).loc[
        ts_ds[stations_ts_df_id_col].values
    ]["geometry"]
    # assign the stations geometries as indexed dimension (the dimension keeps the
    # name of the stations id column of `ts_df`)
    return ts_ds.assign_coords(
        **{stations_ts_df_id_col: station_geoms}
    ).xvec.set_geom_indexes(stations_ts_df_id_col, crs=stations_gdf.crs)
)
144+
145+
88146
########################################################################################
89147
# abstract attribute
90148
# `DummyAttribute` and `abstract_attribute` below are hardcoded from

pyproject.toml

+6-1
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ test = [
6868
"requests-mock",
6969
"ruff"
7070
]
71+
xvec = [
72+
"xarray",
73+
"xvec"
74+
]
7175

7276
[project.urls]
7377
Repository = "https://github.com/martibosch/meteora"
@@ -176,7 +180,8 @@ conda_deps = [
176180
]
177181
extras = [
178182
"ox",
179-
"test"
183+
"test",
184+
"xvec"
180185
]
181186
whitelist_externals = [
182187
"pytest"

tests/data/stations.gpkg

96 KB
Binary file not shown.

tests/test_meteora.py

+36
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,12 @@
77
import unittest
88
from os import path
99

10+
import geopandas as gpd
1011
import osmnx as ox
1112
import pandas as pd
13+
import pytest
1214
import requests_mock
15+
import xarray as xr
1316
from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
1417

1518
from meteora import settings, utils
@@ -68,6 +71,39 @@ def test_utils():
6871
assert len(wide_ts_df.columns.names) == 1
6972
assert isinstance(wide_ts_df.index, pd.DatetimeIndex)
7073

74+
# long to cube (xvec)
75+
stations_gdf = gpd.read_file(path.join(tests_data_dir, "stations.gpkg"))
76+
# the test data (ts_df and stations_gdf) is from GHCNh, where the stations id column
77+
# is "Station_ID" in ts_df and "id" in stations_gdf
78+
stations_ts_df_id_col = "Station_ID"
79+
stations_gdf_id_col = "id"
80+
with pytest.raises(KeyError):
81+
# attempting to convert with a mismatching station id column between ts_df and
82+
# stations_gdf raises a KeyError
83+
utils.long_to_cube(ts_df, stations_gdf)
84+
# if stations_gdf does not cover all stations in ts_df a KeyError is also raised
85+
utils.long_to_cube(
86+
ts_df, stations_gdf.iloc[:2], stations_gdf_id_col=stations_gdf_id_col
87+
)
88+
# attempting to convert from the wide form also raises a KeyError
89+
utils.wide_to_cube(
90+
wide_ts_df, stations_gdf, stations_gdf_id_col=stations_gdf_id_col
91+
)
92+
# test proper conversion
93+
ts_cube = utils.long_to_cube(
94+
ts_df, stations_gdf, stations_gdf_id_col=stations_gdf_id_col
95+
)
96+
# test an xarray dataset is returned
97+
assert isinstance(ts_cube, xr.Dataset)
98+
# test that the time column is in the coordinates
99+
assert ts_df.index.names[1] in ts_cube.coords
100+
# test that the variable columns are in the data_vars
101+
assert all([var in ts_cube.data_vars for var in ts_df.columns])
102+
# test that it has a dimension with geometry and that it is labeled using the
103+
# stations id column in ts_df
104+
assert stations_ts_df_id_col in ts_cube.xvec.geom_coords
105+
assert stations_ts_df_id_col in ts_cube.xvec.geom_coords_indexed
106+
71107
# logger
72108
def test_logging():
73109
utils.log("test a fake default message")

0 commit comments

Comments
 (0)