* Move JSON equality logic outside of test_arrow
* Refactor to RawBatch and CleanBatch wrapper types
* Move _from_arrow functions to _api
* Update imports
* Fix circular import
* Keep deprecated API
* Add write-read test and fix typing
* Add Parquet tests
* Fix CI
* Initial Delta Lake support
* Manual schema updates
* Add Delta Lake dep
* Fix export
* Fix pyupgrade lint
* Add type hints
* Any typing
Commit e43398b (parent: 0ba3994). Showing 8 changed files with 168 additions and 30 deletions.
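Two of the bullets ("Move _from_arrow functions to _api" and "Keep deprecated API") describe relocating functions while preserving the old import path. That shim is not among the hunks shown below; a minimal sketch of the usual pattern, assuming the old module simply warns and delegates (everything here except parse_stac_ndjson_to_arrow and stac_geoparquet.arrow._api is hypothetical):

```python
# Hypothetical sketch of a deprecated re-export module; the commit's actual
# shim may differ. Only parse_stac_ndjson_to_arrow and its new home in
# stac_geoparquet.arrow._api are confirmed by the hunks below.
import warnings
from typing import Any

from stac_geoparquet.arrow._api import parse_stac_ndjson_to_arrow as _new_impl


def parse_stac_ndjson_to_arrow(*args: Any, **kwargs: Any) -> Any:
    # Point callers of the old import path at the new location, then delegate.
    warnings.warn(
        "This import path is deprecated; use stac_geoparquet.arrow._api",
        DeprecationWarning,
        stacklevel=2,
    )
    return _new_impl(*args, **kwargs)
```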
@@ -0,0 +1,34 @@ (new module; the test file below imports it as stac_geoparquet.arrow._delta_lake)

```python
from __future__ import annotations

import itertools
from pathlib import Path
from typing import TYPE_CHECKING, Any, Iterable

import pyarrow as pa
from deltalake import write_deltalake

from stac_geoparquet.arrow._api import parse_stac_ndjson_to_arrow
from stac_geoparquet.arrow._to_parquet import create_geoparquet_metadata

if TYPE_CHECKING:
    from deltalake import DeltaTable


def parse_stac_ndjson_to_delta_lake(
    input_path: str | Path | Iterable[str | Path],
    table_or_uri: str | Path | DeltaTable,
    *,
    chunk_size: int = 65536,
    schema: pa.Schema | None = None,
    limit: int | None = None,
    **kwargs: Any,
) -> None:
    """Parse newline-delimited STAC JSON and write it to a Delta Lake table.

    Extra keyword arguments are forwarded to deltalake.write_deltalake.
    """
    batches_iter = parse_stac_ndjson_to_arrow(
        input_path, chunk_size=chunk_size, schema=schema, limit=limit
    )
    # Pull the first batch eagerly so GeoParquet metadata can be derived
    # from its schema; the remaining batches stay lazy.
    first_batch = next(batches_iter)
    schema = first_batch.schema.with_metadata(
        create_geoparquet_metadata(pa.Table.from_batches([first_batch]))
    )
    # Re-attach the consumed first batch before streaming to Delta Lake.
    combined_iter = itertools.chain([first_batch], batches_iter)
    write_deltalake(table_or_uri, combined_iter, schema=schema, engine="rust", **kwargs)
```
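A minimal usage sketch for the new entry point; the input and output paths are placeholders, and mode is a standard write_deltalake keyword forwarded through **kwargs:

```python
# Hypothetical usage; "items.ndjson" and "out/stac-delta" are placeholders.
from stac_geoparquet.arrow._delta_lake import parse_stac_ndjson_to_delta_lake

parse_stac_ndjson_to_delta_lake(
    "items.ndjson",     # newline-delimited STAC items
    "out/stac-delta",   # Delta Lake table directory to create
    chunk_size=65536,   # rows per Arrow record batch
    mode="overwrite",   # forwarded to write_deltalake via **kwargs
)
```

Note that only the first batch is materialized up front, just enough to derive the GeoParquet metadata; the rest of the input streams through write_deltalake.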
@@ -0,0 +1,46 @@ (new test file)

```python
import json
from pathlib import Path

import pytest
from deltalake import DeltaTable

from stac_geoparquet.arrow import stac_table_to_items
from stac_geoparquet.arrow._delta_lake import parse_stac_ndjson_to_delta_lake

from .json_equals import assert_json_value_equal

HERE = Path(__file__).parent

TEST_COLLECTIONS = [
    "3dep-lidar-copc",
    # "3dep-lidar-dsm",
    "cop-dem-glo-30",
    "io-lulc-annual-v02",
    # "io-lulc",
    "landsat-c2-l1",
    "landsat-c2-l2",
    "naip",
    "planet-nicfi-analytic",
    "sentinel-1-rtc",
    "sentinel-2-l2a",
    "us-census",
]


@pytest.mark.parametrize("collection_id", TEST_COLLECTIONS)
def test_round_trip_via_delta_lake(collection_id: str, tmp_path: Path):
    path = HERE / "data" / f"{collection_id}-pc.json"
    out_path = tmp_path / collection_id
    parse_stac_ndjson_to_delta_lake(path, out_path)

    # Read back into a table and convert to JSON items
    dt = DeltaTable(out_path)
    table = dt.to_pyarrow_table()
    items_result = list(stac_table_to_items(table))

    # Compare with the original JSON
    with open(HERE / "data" / f"{collection_id}-pc.json") as f:
        items = json.load(f)

    for result, expected in zip(items_result, items):
        assert_json_value_equal(result, expected, precision=0)
```
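The test reads the full table back with to_pyarrow_table(). For larger tables, a column projection can be applied at read time instead; a sketch, where the path is a placeholder and "id"/"geometry" are assumed column names in the STAC Arrow schema:

```python
# Hypothetical read-side projection; the path and column names are assumptions.
from deltalake import DeltaTable

dt = DeltaTable("out/stac-delta")
subset = dt.to_pyarrow_table(columns=["id", "geometry"])
```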