diff --git a/README.md b/README.md index 8bb3f86..77e1c37 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Options: -r, --resolution [0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15] H3 resolution to index [required] -pr, --parent_res [0|1|2|3|4|5|6|7|8|9|10|11|12|13|14|15] - H3 Parent resolution for the output + H3 parent resolution for the output partition. Defaults to resolution - 6 -id, --id_field TEXT Field to use as an ID; defaults to a constructed single 0...n index on the @@ -63,11 +63,11 @@ Options: cell ID and the ID given by the -id field (or the default index ID). -ch, --chunksize INTEGER The number of rows per index partition to - use when spatially partioning. Adjusting + use when spatially partitioning. Adjusting this number will trade off memory use and time. [default: 50; required] -s, --spatial_sorting [hilbert|morton|geohash|none] - Spatial sorting method when perfoming + Spatial sorting method when performing spatial partitioning. [default: none] -crs, --cut_crs INTEGER Set the coordinate reference system (CRS) used for cutting large geometries (see @@ -76,13 +76,13 @@ Options: -c, --cut_threshold FLOAT Cutting up large geometries into smaller geometries based on a target area. Units are assumed to match the input CRS units unless - the `--cut_crs` is also given, in which case + `--cut_crs` is also given, in which case units match the units of the supplied CRS. If left unspecified, the threshold will be the maximum area of a cell at the parent resolution, in square metres or feet according to the CRS. A threshold of 0 will - skip bissection entirely (effectively + skip bisection entirely (effectively ignoring --cut_crs). -t, --threads INTEGER Amount of threads used for operation [default: NUM_CPUS - 1] @@ -111,6 +111,7 @@ Options: -o, --overwrite --version Show the version and exit. --help Show this message and exit. + ``` ## Visualising output diff --git a/tests/classes/a5.py b/tests/classes/a5.py index f2b89eb..6ce10fb 100644 --- a/tests/classes/a5.py +++ b/tests/classes/a5.py @@ -10,206 +10,172 @@ class TestA5(TestRunthrough): """ def test_a5_run(self): - try: - a5( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "17", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"A5 runthrough failed.") + a5( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "17", + ], + standalone_mode=False, + ) def test_a5_run_overwrite(self): - try: - a5( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "17", - ], - standalone_mode=False, - ) - a5( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "17", - "-o", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"A5 runthrough with overwrite failed.") + a5( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "17", + ], + standalone_mode=False, + ) + a5( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "17", + "-o", + ], + standalone_mode=False, + ) def test_a5_cut_crs(self): - try: - a5( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "17", - "-crs", - "3793", - "-c", - "4000", - ], - standalone_mode=False, - ) - - except Exception: - self.fail("A5 run through using actual CRS failed") + a5( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "17", + "-crs", + "3793", + "-c", + "4000", + ], + standalone_mode=False, + ) def test_a5_cut_crs_reproject(self): - try: - a5( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "17", - "-crs", - "4326", - "-c", - "0.005", - ], - standalone_mode=False, - ) - except Exception: - self.fail("A5 run through with reprojected CRS failed") + a5( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "17", + "-crs", + "4326", + "-c", + "0.005", + ], + standalone_mode=False, + ) def test_a5_no_bisection(self): - try: - a5( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "17", - "-c", - "0", - ], - standalone_mode=False, - ) - except Exception: - self.fail("A5 run through without bisection failed") + a5( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "17", + "-c", + "0", + ], + standalone_mode=False, + ) def test_a5_compaction(self): - try: - a5( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "17", - "-co", - "-id", - "LCDB_UID", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"A5 runthrough with compaction failed.") + a5( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "17", + "-co", + "-id", + "LCDB_UID", + ], + standalone_mode=False, + ) def test_a5_geo_point(self): - try: - a5( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "17", - "--geo", - "point", - ], - standalone_mode=False, - ) - except Exception: - self.fail("A5 run through with --geo point failed") + a5( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "17", + "--geo", + "point", + ], + standalone_mode=False, + ) def test_a5_geo_point_compact(self): - try: - a5( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "17", - "--geo", - "point", - "-co", - "-id", - "LCDB_UID", - "-o", - ], - standalone_mode=False, - ) - except Exception: - self.fail("A5 run through with --geo point -co failed") + a5( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "17", + "--geo", + "point", + "-co", + "-id", + "LCDB_UID", + "-o", + ], + standalone_mode=False, + ) def test_a5_geo_polygon(self): - try: - a5( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "17", - "--geo", - "polygon", - ], - standalone_mode=False, - ) - except Exception: - self.fail("A5 run through with --geo polygon failed") + a5( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "17", + "--geo", + "polygon", + ], + standalone_mode=False, + ) def test_a5_geo_polygon_compact(self): - try: - a5( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "17", - "--geo", - "polygon", - "-co", - "-id", - "LCDB_UID", - "-o", - ], - standalone_mode=False, - ) - except Exception: - self.fail("A5 run through with --geo polygon -co failed") + a5( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "17", + "--geo", + "polygon", + "-co", + "-id", + "LCDB_UID", + "-o", + ], + standalone_mode=False, + ) diff --git a/tests/classes/base.py b/tests/classes/base.py index d43777a..fd1ea01 100644 --- a/tests/classes/base.py +++ b/tests/classes/base.py @@ -1,8 +1,3 @@ -""" - -@author: ndemaio -""" - from unittest import * from ..data.datapaths import * diff --git a/tests/classes/errors.py b/tests/classes/errors.py new file mode 100644 index 0000000..42c1111 --- /dev/null +++ b/tests/classes/errors.py @@ -0,0 +1,79 @@ +from unittest import TestCase + +from vector2dggs import common +from vector2dggs.h3 import h3 +from vector2dggs.indexerfactory import indexer_instance + +from .base import TestRunthrough +from ..data.datapaths import * + + +class TestErrors(TestCase): + """ + Error-path unit tests that raise before touching the filesystem, + so no output cleanup is needed. + """ + + def test_parent_res_not_less_than_resolution_raises(self): + with self.assertRaises(common.ParentResolutionException): + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "-pr", + "8", + ], + standalone_mode=False, + ) + + def test_compact_without_id_field_raises(self): + with self.assertRaises(common.IdFieldError): + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "-co", + ], + standalone_mode=False, + ) + + def test_unknown_dggs_raises(self): + with self.assertRaises(ValueError): + indexer_instance("not_a_real_dggs") + + +class TestOverwriteRequired(TestRunthrough): + """Requires a first successful run to create output, then checks the guard.""" + + def test_overwrite_flag_required_on_second_run(self): + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + ], + standalone_mode=False, + ) + with self.assertRaises(FileExistsError): + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + ], + standalone_mode=False, + ) diff --git a/tests/classes/geohash.py b/tests/classes/geohash.py index 4d374ea..0abea84 100644 --- a/tests/classes/geohash.py +++ b/tests/classes/geohash.py @@ -10,186 +10,170 @@ class TestGeohash(TestRunthrough): """ def test_geohash_run(self): - try: - geohash( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "6", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"TestGeohash.test_geohash_run: Geohash runthrough failed.") + geohash( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "6", + ], + standalone_mode=False, + ) def test_geohash_run_overwrite(self): - try: - geohash( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "6", - ], - standalone_mode=False, - ) - geohash( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "6", - "-o", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"geohash runthrough with overwrite failed.") + geohash( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "6", + ], + standalone_mode=False, + ) + geohash( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "6", + "-o", + ], + standalone_mode=False, + ) def test_geohash_cut_crs(self): - try: - geohash( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "6", - "-crs", - "3793", - ], - standalone_mode=False, - ) - - except Exception: - self.fail("geohash run through using actual CRS failed") + geohash( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "6", + "-crs", + "3793", + ], + standalone_mode=False, + ) def test_geohash_cut_crs_reproject(self): - try: - geohash( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "6", - "-crs", - "4326", - "-c", - "0.005", - ], - standalone_mode=False, - ) - except Exception: - self.fail("geohash run through with reprojected CRS failed") + geohash( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "6", + "-crs", + "4326", + "-c", + "0.005", + ], + standalone_mode=False, + ) + + def test_geohash_no_bisection(self): + geohash( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "6", + "-c", + "0", + ], + standalone_mode=False, + ) def test_geohash_compaction(self): - try: - geohash( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "6", - "-co", - "-id", - "LCDB_UID", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"Geohash runthrough failed.") + geohash( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "6", + "-co", + "-id", + "LCDB_UID", + ], + standalone_mode=False, + ) def test_geohash_geo_point(self): - try: - geohash( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "6", - "--geo", - "point", - ], - standalone_mode=False, - ) - except Exception: - self.fail("geohash run through with geo point failed") + geohash( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "6", + "--geo", + "point", + ], + standalone_mode=False, + ) def test_geohash_geo_point_compact(self): - try: - geohash( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "6", - "--geo", - "point", - "-co", - "-id", - "LCDB_UID", - "-o", - ], - standalone_mode=False, - ) - except Exception: - self.fail("geohash run through with geo point compact failed") + geohash( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "6", + "--geo", + "point", + "-co", + "-id", + "LCDB_UID", + "-o", + ], + standalone_mode=False, + ) def test_geohash_geo_polygon(self): - try: - geohash( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "6", - "--geo", - "polygon", - ], - standalone_mode=False, - ) - except Exception: - self.fail("geohash run through with geo polygon failed") + geohash( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "6", + "--geo", + "polygon", + ], + standalone_mode=False, + ) def test_geohash_geo_polygon_compact(self): - try: - geohash( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "6", - "--geo", - "polygon", - "-co", - "-id", - "LCDB_UID", - "-o", - ], - standalone_mode=False, - ) - except Exception: - self.fail("geohash run through with geo polygon compact failed") + geohash( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "6", + "--geo", + "polygon", + "-co", + "-id", + "LCDB_UID", + "-o", + ], + standalone_mode=False, + ) diff --git a/tests/classes/geometry_types.py b/tests/classes/geometry_types.py new file mode 100644 index 0000000..f3cd067 --- /dev/null +++ b/tests/classes/geometry_types.py @@ -0,0 +1,42 @@ +from .base import TestRunthrough +from ..data.datapaths import * + +from vector2dggs.h3 import h3 + + +class TestGeometryTypes(TestRunthrough): + """ + Verifies that LineString and Point geometry types are indexed end-to-end. + Uses H3 as the reference backend. Bisection is disabled (-c 0) since the + fixtures are small and these tests are purely about geometry-type routing. + """ + + def test_h3_linestring(self): + h3( + [ + TEST_LINESTRING_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LINESTRING_LAYER_NAME, + "-r", + "10", + "-c", + "0", + ], + standalone_mode=False, + ) + + def test_h3_point(self): + h3( + [ + TEST_POINT_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_POINT_LAYER_NAME, + "-r", + "10", + "-c", + "0", + ], + standalone_mode=False, + ) diff --git a/tests/classes/h3.py b/tests/classes/h3.py index 36311bb..0efdd5b 100644 --- a/tests/classes/h3.py +++ b/tests/classes/h3.py @@ -10,208 +10,174 @@ class TestH3(TestRunthrough): """ def test_h3_run(self): - try: - h3( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"H3 runthrough failed.") + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + ], + standalone_mode=False, + ) def test_h3_run_overwrite(self): - try: - h3( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - ], - standalone_mode=False, - ) - h3( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "-o", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"H3 runthrough with overwrite failed.") + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + ], + standalone_mode=False, + ) + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "-o", + ], + standalone_mode=False, + ) def test_h3_cut_crs(self): - try: - h3( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "-crs", - "3793", - "-c", - "4000", - ], - standalone_mode=False, - ) - - except Exception: - self.fail("H3 run through using actual CRS failed") + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "-crs", + "3793", + "-c", + "4000", + ], + standalone_mode=False, + ) def test_h3_cut_crs_reproject(self): - try: - h3( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "-crs", - "4326", - "-c", - "0.005", - ], - standalone_mode=False, - ) - except Exception: - self.fail("H3 run through with reprojected CRS failed") + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "-crs", + "4326", + "-c", + "0.005", + ], + standalone_mode=False, + ) def test_h3_no_bisection(self): - try: - h3( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "-c", - "0", - ], - standalone_mode=False, - ) - except Exception: - self.fail("H3 run through without bisection failed") + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "-c", + "0", + ], + standalone_mode=False, + ) def test_h3_compaction(self): - try: - h3( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "-co", - "-id", - "LCDB_UID", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"H3 runthrough failed.") + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "-co", + "-id", + "LCDB_UID", + ], + standalone_mode=False, + ) def test_h3_geo_point(self): - try: - h3( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "--geo", - "point", - "-o", - ], - standalone_mode=False, - ) - except Exception: - self.fail("H3 run through with --geo point failed") + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "--geo", + "point", + "-o", + ], + standalone_mode=False, + ) def test_h3_geo_point_compact(self): - try: - h3( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "--geo", - "point", - "-co", - "-id", - "LCDB_UID", - "-o", - ], - standalone_mode=False, - ) - except Exception: - self.fail("H3 run through with --geo point -co failed") + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "--geo", + "point", + "-co", + "-id", + "LCDB_UID", + "-o", + ], + standalone_mode=False, + ) def test_h3_geo_polygon(self): - try: - h3( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "--geo", - "polygon", - "-o", - ], - standalone_mode=False, - ) - except Exception: - self.fail("H3 run through with --geo polygon failed") + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "--geo", + "polygon", + "-o", + ], + standalone_mode=False, + ) def test_h3_geo_polygon_compact(self): - try: - h3( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "--geo", - "polygon", - "-co", - "-id", - "LCDB_UID", - "-o", - ], - standalone_mode=False, - ) - except Exception: - self.fail("H3 run through with --geo polygon -co failed") + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "--geo", + "polygon", + "-co", + "-id", + "LCDB_UID", + "-o", + ], + standalone_mode=False, + ) diff --git a/tests/classes/katana.py b/tests/classes/katana.py index 225d0c5..159757f 100644 --- a/tests/classes/katana.py +++ b/tests/classes/katana.py @@ -29,9 +29,6 @@ class TestKatana(TestRunthrough): def test_katana(self): area_threshold = 0.05 - try: - for geom in [polygon_a, polygon_b, polygon_c, polygon_d]: - collection = katana(geom, area_threshold) - # print(GeometryCollection(collection)) - except Exception: - self.fail(f"Bisection runthrough failed.") + for geom in [polygon_a, polygon_b, polygon_c, polygon_d]: + collection = katana(geom, area_threshold) + # print(GeometryCollection(collection)) diff --git a/tests/classes/output_validation.py b/tests/classes/output_validation.py new file mode 100644 index 0000000..9faf6ab --- /dev/null +++ b/tests/classes/output_validation.py @@ -0,0 +1,85 @@ +import json + +import pyarrow.parquet as pq + +from .base import TestRunthrough +from ..data.datapaths import * + +from vector2dggs.h3 import h3 + + +class TestOutputValidation(TestRunthrough): + """ + Reads output parquet files back after indexing and asserts structural + correctness. Uses H3 at resolution 8 (default parent_res=2) as the + reference backend throughout. + """ + + def _parquet_files(self): + files = sorted(TEST_OUTPUT_PATH.rglob("*.parquet")) + self.assertTrue(files, "No parquet files written to output") + return files + + def _run_h3(self, extra_args=()): + h3( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + *extra_args, + ], + standalone_mode=False, + ) + + def test_partition_dirs_named_by_parent_res(self): + """Hive partition directories are named h3_02=.""" + self._run_h3() + dirs = [d for d in TEST_OUTPUT_PATH.iterdir() if d.is_dir()] + self.assertTrue(dirs, "No partition directories in output") + for d in dirs: + self.assertTrue( + d.name.startswith("h3_02="), + f"Expected h3_02=… partition dir, got: {d.name}", + ) + + def test_explicit_parent_res_reflected_in_dirs(self): + """--parent-res 3 produces h3_03=… partition directories.""" + self._run_h3(("-pr", "3")) + dirs = [d for d in TEST_OUTPUT_PATH.iterdir() if d.is_dir()] + self.assertTrue(dirs, "No partition directories in output") + for d in dirs: + self.assertTrue( + d.name.startswith("h3_03="), + f"Expected h3_03=… partition dir, got: {d.name}", + ) + + def test_geo_point_output_has_geometry_column(self): + """GeoParquet point output contains a geometry column.""" + self._run_h3(("--geo", "point")) + table = pq.read_table(self._parquet_files()[0]) + self.assertIn("geometry", table.schema.names) + + def test_geo_point_output_has_geoparquet_metadata(self): + """GeoParquet point output carries valid geo metadata.""" + self._run_h3(("--geo", "point")) + table = pq.read_table(self._parquet_files()[0]) + self.assertIn(b"geo", table.schema.metadata) + geo = json.loads(table.schema.metadata[b"geo"]) + self.assertEqual(geo["primary_column"], "geometry") + self.assertIn("geometry", geo["columns"]) + + def test_geo_polygon_output_has_geometry_column(self): + """GeoParquet polygon output contains a geometry column.""" + self._run_h3(("--geo", "polygon")) + table = pq.read_table(self._parquet_files()[0]) + self.assertIn("geometry", table.schema.names) + + def test_keep_attributes_retains_source_columns(self): + """--keep-attributes includes original attribute columns in output.""" + self._run_h3(("-k",)) + table = pq.read_table(self._parquet_files()[0]) + self.assertIn("Name_2018", table.schema.names) + self.assertIn("LCDB_UID", table.schema.names) diff --git a/tests/classes/rHP.py b/tests/classes/rHP.py index 2187d9d..4c2bcf1 100644 --- a/tests/classes/rHP.py +++ b/tests/classes/rHP.py @@ -10,186 +10,170 @@ class TestRHP(TestRunthrough): """ def test_rhp_run(self): - try: - rhp( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"rHP runthrough failed.") + rhp( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + ], + standalone_mode=False, + ) def test_rhp_run_overwrite(self): - try: - rhp( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - ], - standalone_mode=False, - ) - rhp( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "-o", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"rHP runthrough with overwrite failed.") + rhp( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + ], + standalone_mode=False, + ) + rhp( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "-o", + ], + standalone_mode=False, + ) def test_rhp_cut_crs(self): - try: - rhp( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "-crs", - "3793", - ], - standalone_mode=False, - ) - - except Exception: - self.fail("rHP run through using actual CRS failed") + rhp( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "-crs", + "3793", + ], + standalone_mode=False, + ) def test_rhp_cut_crs_reproject(self): - try: - rhp( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "-crs", - "4326", - "-c", - "0.005", - ], - standalone_mode=False, - ) - except Exception: - self.fail("rHP run through with reprojected CRS failed") + rhp( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "-crs", + "4326", + "-c", + "0.005", + ], + standalone_mode=False, + ) + + def test_rhp_no_bisection(self): + rhp( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "-c", + "0", + ], + standalone_mode=False, + ) def test_rhp_compaction(self): - try: - rhp( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "-co", - "-id", - "LCDB_UID", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"rHP runthrough failed.") + rhp( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "-co", + "-id", + "LCDB_UID", + ], + standalone_mode=False, + ) def test_rhp_geo_point(self): - try: - rhp( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "--geo", - "point", - ], - standalone_mode=False, - ) - except Exception: - self.fail("rHP run through with geo point failed") + rhp( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "--geo", + "point", + ], + standalone_mode=False, + ) def test_rhp_geo_point_compact(self): - try: - rhp( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "--geo", - "point", - "-co", - "-id", - "LCDB_UID", - "-o", - ], - standalone_mode=False, - ) - except Exception: - self.fail("rHP run through with geo point compact failed") + rhp( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "--geo", + "point", + "-co", + "-id", + "LCDB_UID", + "-o", + ], + standalone_mode=False, + ) def test_rhp_geo_polygon(self): - try: - rhp( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "--geo", - "polygon", - ], - standalone_mode=False, - ) - except Exception: - self.fail("rHP run through with geo polygon failed") + rhp( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "--geo", + "polygon", + ], + standalone_mode=False, + ) def test_rhp_geo_polygon_compact(self): - try: - rhp( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "8", - "--geo", - "polygon", - "-co", - "-id", - "LCDB_UID", - "-o", - ], - standalone_mode=False, - ) - except Exception: - self.fail("rHP run through with geo polygon compact failed") + rhp( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "8", + "--geo", + "polygon", + "-co", + "-id", + "LCDB_UID", + "-o", + ], + standalone_mode=False, + ) diff --git a/tests/classes/s2.py b/tests/classes/s2.py index 59ae587..8ab4153 100644 --- a/tests/classes/s2.py +++ b/tests/classes/s2.py @@ -10,206 +10,172 @@ class TestS2(TestRunthrough): """ def test_s2_run(self): - try: - s2( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "13", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"S2 runthrough failed.") + s2( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "13", + ], + standalone_mode=False, + ) def test_s2_run_overwrite(self): - try: - s2( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "13", - ], - standalone_mode=False, - ) - s2( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "13", - "-o", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"S2 runthrough with overwrite failed.") + s2( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "13", + ], + standalone_mode=False, + ) + s2( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "13", + "-o", + ], + standalone_mode=False, + ) def test_s2_cut_crs(self): - try: - s2( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "13", - "-crs", - "3793", - "-c", - "4000", - ], - standalone_mode=False, - ) - - except Exception: - self.fail("S2 run through using actual CRS failed") + s2( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "13", + "-crs", + "3793", + "-c", + "4000", + ], + standalone_mode=False, + ) def test_s2_cut_crs_reproject(self): - try: - s2( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "13", - "-crs", - "4326", - "-c", - "0.005", - ], - standalone_mode=False, - ) - except Exception: - self.fail("S2 run through with reprojected CRS failed") + s2( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "13", + "-crs", + "4326", + "-c", + "0.005", + ], + standalone_mode=False, + ) def test_s2_no_bisection(self): - try: - s2( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "13", - "-c", - "0", - ], - standalone_mode=False, - ) - except Exception: - self.fail("S2 run through without bisection failed") + s2( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "13", + "-c", + "0", + ], + standalone_mode=False, + ) def test_s2_compaction(self): - try: - s2( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "13", - "-co", - "-id", - "LCDB_UID", - ], - standalone_mode=False, - ) - - except Exception: - self.fail(f"S2 runthrough failed.") + s2( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "13", + "-co", + "-id", + "LCDB_UID", + ], + standalone_mode=False, + ) def test_s2_geo_point(self): - try: - s2( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "13", - "--geo", - "point", - ], - standalone_mode=False, - ) - except Exception: - self.fail("S2 run through with geo point failed") + s2( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "13", + "--geo", + "point", + ], + standalone_mode=False, + ) def test_s2_geo_point_compact(self): - try: - s2( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "13", - "--geo", - "point", - "-co", - "-id", - "LCDB_UID", - "-o", - ], - standalone_mode=False, - ) - except Exception: - self.fail("S2 run through with geo point compact failed") + s2( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "13", + "--geo", + "point", + "-co", + "-id", + "LCDB_UID", + "-o", + ], + standalone_mode=False, + ) def test_s2_geo_polygon(self): - try: - s2( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "13", - "--geo", - "polygon", - ], - standalone_mode=False, - ) - except Exception: - self.fail("S2 run through with geo polygon failed") + s2( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "13", + "--geo", + "polygon", + ], + standalone_mode=False, + ) def test_s2_geo_polygon_compact(self): - try: - s2( - [ - TEST_FILE_PATH, - str(TEST_OUTPUT_PATH), - "--layer", - TEST_LAYER_NAME, - "-r", - "13", - "--geo", - "polygon", - "-co", - "-id", - "LCDB_UID", - "-o", - ], - standalone_mode=False, - ) - except Exception: - self.fail("S2 run through with geo polygon compact failed") + s2( + [ + TEST_FILE_PATH, + str(TEST_OUTPUT_PATH), + "--layer", + TEST_LAYER_NAME, + "-r", + "13", + "--geo", + "polygon", + "-co", + "-id", + "LCDB_UID", + "-o", + ], + standalone_mode=False, + ) diff --git a/tests/data/datapaths.py b/tests/data/datapaths.py index 34b04af..d6402df 100644 --- a/tests/data/datapaths.py +++ b/tests/data/datapaths.py @@ -1,7 +1,3 @@ -""" -@author: ndemaio -""" - from pathlib import Path DATA_DIR = Path(__file__).resolve().parent @@ -9,3 +5,8 @@ TEST_FILE_PATH = str(DATA_DIR / "se-island.gpkg") TEST_LAYER_NAME = "se_island" TEST_OUTPUT_PATH = DATA_DIR / "output" + +TEST_LINESTRING_FILE_PATH = str(DATA_DIR / "se-island-contours.gpkg") +TEST_LINESTRING_LAYER_NAME = "contours" +TEST_POINT_FILE_PATH = str(DATA_DIR / "se-island-height-pts.gpkg") +TEST_POINT_LAYER_NAME = "nz_chatham_island_height_points_topo_150k" diff --git a/tests/test_runthrough.py b/tests/test_runthrough.py index a87c0f4..031b4f1 100644 --- a/tests/test_runthrough.py +++ b/tests/test_runthrough.py @@ -1,10 +1,9 @@ -""" -@author: ndemaio -""" - from .classes.a5 import TestA5 from .classes.h3 import TestH3 from .classes.rHP import TestRHP from .classes.s2 import TestS2 from .classes.geohash import TestGeohash from .classes.katana import TestKatana +from .classes.geometry_types import TestGeometryTypes +from .classes.output_validation import TestOutputValidation +from .classes.errors import TestErrors, TestOverwriteRequired diff --git a/tests/test_vector2dggs.py b/tests/test_vector2dggs.py index b8d83aa..b29dbf0 100644 --- a/tests/test_vector2dggs.py +++ b/tests/test_vector2dggs.py @@ -1,7 +1,3 @@ -""" -@author: ndemaio -""" - import sys import unittest from pathlib import Path diff --git a/vector2dggs/a5.py b/vector2dggs/a5.py index 82a884d..213372d 100644 --- a/vector2dggs/a5.py +++ b/vector2dggs/a5.py @@ -1,206 +1,4 @@ -import click -import click_log -import tempfile -import pyproj - -from typing import Union -from pathlib import Path - import vector2dggs.constants as const -import vector2dggs.common as common - -from vector2dggs import __version__ - - -@click.command(context_settings={"show_default": True}) -@click_log.simple_verbosity_option(common.LOGGER) -@click.argument("vector_input", required=True, type=click.Path(), nargs=1) -@click.argument("output_directory", required=True, type=click.Path(), nargs=1) -@click.option( - "-r", - "--resolution", - required=True, - type=click.Choice(list(map(str, range(const.MIN_A5, const.MAX_A5 + 1)))), - help="A5 resolution to index", - nargs=1, -) -@click.option( - "-pr", - "--parent_res", - required=False, - type=click.Choice(list(map(str, range(const.MIN_A5, const.MAX_A5 + 1)))), - help="A5 parent resolution for the output partition. Defaults to resolution - 6", -) -@click.option( - "-id", - "--id_field", - required=False, - default=const.DEFAULTS["id"], - type=str, - help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.", - nargs=1, -) -@click.option( - "-k", - "--keep_attributes", - is_flag=True, - show_default=True, - default=const.DEFAULTS["k"], - help="Retain attributes in output. The default is to create an output that only includes A5 cell ID and the ID given by the -id field (or the default index ID).", -) -@click.option( - "-ch", - "--chunksize", - required=True, - type=int, - default=const.DEFAULTS["ch"], - help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.", - nargs=1, -) -@click.option( - "-s", - "--spatial_sorting", - type=click.Choice(const.SPATIAL_SORTING_METHODS), - default=const.DEFAULTS["s"], - help="Spatial sorting method when perfoming spatial partitioning.", -) -@click.option( - "-crs", - "--cut_crs", - required=False, - default=const.DEFAULTS["crs"], - type=int, - help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.", - nargs=1, -) -@click.option( - "-c", - "--cut_threshold", - required=False, - default=const.DEFAULTS["c"], - type=float, - help="Cutting up large geometries into smaller geometries based on a target area. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS. If left unspecified, the threshold will be the maximum area of a cell at the parent resolution, in square metres or feet according to the CRS. A threshold of 0 will skip bissection entirely (effectively ignoring --cut_crs).", - nargs=1, -) -@click.option( - "-t", - "--threads", - required=False, - default=const.DEFAULTS["t"], - type=int, - help="Amount of threads used for operation", - nargs=1, -) -@click.option( - "-cp", - "--compression", - required=False, - default=const.DEFAULTS["cp"], - type=str, - help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.", - nargs=1, -) -@click.option( - "-lyr", - "--layer", - required=False, - default=const.DEFAULTS["lyr"], - type=str, - help="Name of the layer or table to read when using an input that supports layers or tables", - nargs=1, -) -@click.option( - "-g", - "--geom_col", - required=False, - default=const.DEFAULTS["g"], - type=str, - help="Column name to use when using a spatial database connection as input", - nargs=1, -) -@click.option( - "--geo", - required=False, - default=const.DEFAULTS["geo"], - type=click.Choice(const.GEOM_TYPES), - help="Select geometry encoding for the output: 'none' for regular Parquet (no GeoParquet metadata), or 'point'/'polygon' to write GeoParquet (v1.1.0) with the corresponding geometry type.", - nargs=1, -) -@click.option( - "--tempdir", - default=const.DEFAULTS["tempdir"], - type=click.Path(), - help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.", -) -@click.option( - "-co", - "--compact", - is_flag=True, - help="Compact the A5 cells up to the parent resolution. Compaction requires an id_field.", -) -@click.option("-o", "--overwrite", is_flag=True) -@click.version_option(version=__version__) -def a5( - vector_input: Union[str, Path], - output_directory: Union[str, Path], - resolution: str, - parent_res: str, - id_field: str, - keep_attributes: bool, - chunksize: int, - spatial_sorting: str, - cut_crs: int, - cut_threshold: int, - threads: int, - compression: str, - layer: str, - geom_col: str, - geo: str, - tempdir: Union[str, Path], - compact: bool, - overwrite: bool, -): - """ - Ingest a vector dataset and index it to the A5 DGGS. - - VECTOR_INPUT is the path to input vector geospatial data. - OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store. - """ - tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir - - common.check_resolutions(resolution, parent_res) - common.check_compaction_requirements(compact, id_field) - - spatial_sorting = const.SpatialSortingMethod(spatial_sorting).value - geo = const.GeoOutputMode(geo).value - - con, vector_input = common.db_conn_and_input_path(vector_input) - output_directory = common.resolve_output_path(output_directory, overwrite) - - if cut_crs is not None: - cut_crs = pyproj.CRS.from_user_input(cut_crs) +from vector2dggs.cli_factory import make_dggs_command - try: - common.index( - "a5", - vector_input, - output_directory, - int(resolution), - parent_res, - keep_attributes, - chunksize, - spatial_sorting, - cut_threshold, - threads, - compression=compression, - cut_crs=cut_crs, - id_field=id_field, - con=con, - layer=layer, - geom_col=geom_col, - geo=geo, - overwrite=overwrite, - compact=compact, - ) - except: - raise +a5 = make_dggs_command("a5", "a5", "A5", const.MIN_A5, const.MAX_A5) diff --git a/vector2dggs/cli_factory.py b/vector2dggs/cli_factory.py new file mode 100644 index 0000000..080bb19 --- /dev/null +++ b/vector2dggs/cli_factory.py @@ -0,0 +1,214 @@ +import tempfile +from pathlib import Path +from typing import Union + +import click +import click_log +import pyproj + +import vector2dggs.common as common +import vector2dggs.constants as const +from vector2dggs import __version__ + + +def make_dggs_command( + dggs_key: str, + command_name: str, + display_name: str, + min_res: int, + max_res: int, +) -> click.Command: + res_choices = list(map(str, range(min_res, max_res + 1))) + + @click.command(name=command_name, context_settings={"show_default": True}) + @click_log.simple_verbosity_option(common.LOGGER) + @click.argument("vector_input", required=True, type=click.Path(), nargs=1) + @click.argument("output_directory", required=True, type=click.Path(), nargs=1) + @click.option( + "-r", + "--resolution", + required=True, + type=click.Choice(res_choices), + help=f"{display_name} resolution to index", + nargs=1, + ) + @click.option( + "-pr", + "--parent_res", + required=False, + type=click.Choice(res_choices), + help=f"{display_name} parent resolution for the output partition. Defaults to resolution - 6", + ) + @click.option( + "-id", + "--id_field", + required=False, + default=const.DEFAULTS["id"], + type=str, + help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.", + nargs=1, + ) + @click.option( + "-k", + "--keep_attributes", + is_flag=True, + show_default=True, + default=const.DEFAULTS["k"], + help=f"Retain attributes in output. The default is to create an output that only includes {display_name} cell ID and the ID given by the -id field (or the default index ID).", + ) + @click.option( + "-ch", + "--chunksize", + required=True, + type=int, + default=const.DEFAULTS["ch"], + help="The number of rows per index partition to use when spatially partitioning. Adjusting this number will trade off memory use and time.", + nargs=1, + ) + @click.option( + "-s", + "--spatial_sorting", + type=click.Choice(const.SPATIAL_SORTING_METHODS), + default=const.DEFAULTS["s"], + help="Spatial sorting method when performing spatial partitioning.", + ) + @click.option( + "-crs", + "--cut_crs", + required=False, + default=const.DEFAULTS["crs"], + type=int, + help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.", + nargs=1, + ) + @click.option( + "-c", + "--cut_threshold", + required=False, + default=const.DEFAULTS["c"], + type=float, + help="Cutting up large geometries into smaller geometries based on a target area. Units are assumed to match the input CRS units unless `--cut_crs` is also given, in which case units match the units of the supplied CRS. If left unspecified, the threshold will be the maximum area of a cell at the parent resolution, in square metres or feet according to the CRS. A threshold of 0 will skip bisection entirely (effectively ignoring --cut_crs).", + nargs=1, + ) + @click.option( + "-t", + "--threads", + required=False, + default=const.DEFAULTS["t"], + type=int, + help="Amount of threads used for operation", + nargs=1, + ) + @click.option( + "-cp", + "--compression", + required=False, + default=const.DEFAULTS["cp"], + type=str, + help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.", + nargs=1, + ) + @click.option( + "-lyr", + "--layer", + required=False, + default=const.DEFAULTS["lyr"], + type=str, + help="Name of the layer or table to read when using an input that supports layers or tables", + nargs=1, + ) + @click.option( + "-g", + "--geom_col", + required=False, + default=const.DEFAULTS["g"], + type=str, + help="Column name to use when using a spatial database connection as input", + nargs=1, + ) + @click.option( + "--geo", + required=False, + default=const.DEFAULTS["geo"], + type=click.Choice(const.GEOM_TYPES), + help="Select geometry encoding for the output: 'none' for regular Parquet (no GeoParquet metadata), or 'point'/'polygon' to write GeoParquet (v1.1.0) with the corresponding geometry type.", + nargs=1, + ) + @click.option( + "--tempdir", + default=const.DEFAULTS["tempdir"], + type=click.Path(), + help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.", + ) + @click.option( + "-co", + "--compact", + is_flag=True, + help=f"Compact the {display_name} cells up to the parent resolution. Compaction requires an id_field.", + ) + @click.option("-o", "--overwrite", is_flag=True) + @click.version_option(version=__version__) + def command( + vector_input: Union[str, Path], + output_directory: Union[str, Path], + resolution: str, + parent_res: str, + id_field: str, + keep_attributes: bool, + chunksize: int, + spatial_sorting: str, + cut_crs: int, + cut_threshold: float, + threads: int, + compression: str, + layer: str, + geom_col: str, + geo: str, + tempdir: Union[str, Path], + compact: bool, + overwrite: bool, + ): + tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir + + common.check_resolutions(resolution, parent_res) + common.check_compaction_requirements(compact, id_field) + + spatial_sorting = const.SpatialSortingMethod(spatial_sorting).value + geo = const.GeoOutputMode(geo).value + + con, vector_input = common.db_conn_and_input_path(vector_input) + output_directory = common.resolve_output_path(output_directory, overwrite) + + if cut_crs is not None: + cut_crs = pyproj.CRS.from_user_input(cut_crs) + + common.index( + dggs_key, + vector_input, + output_directory, + int(resolution), + parent_res, + keep_attributes, + chunksize, + spatial_sorting, + cut_threshold, + threads, + compression=compression, + cut_crs=cut_crs, + id_field=id_field, + con=con, + layer=layer, + geom_col=geom_col, + geo=geo, + overwrite=overwrite, + compact=compact, + ) + + command.help = ( + f"Ingest a vector dataset and index it to the {display_name} DGGS.\n\n" + "VECTOR_INPUT is the path to input vector geospatial data.\n" + "OUTPUT_DIRECTORY should be a directory, not a file or database table, " + "as it will instead be the write location for an Apache Parquet data store." + ) + + return command diff --git a/vector2dggs/common.py b/vector2dggs/common.py index b7f82e4..2a09a06 100644 --- a/vector2dggs/common.py +++ b/vector2dggs/common.py @@ -142,7 +142,7 @@ def get_parent_res(dggs: str, parent_res: Union[None, str], resolution: int) -> Used for intermediate re-partioning. """ - if not dggs in const.DEFAULT_DGGS_PARENT_RES.keys(): + if dggs not in const.DEFAULT_DGGS_PARENT_RES.keys(): raise RuntimeError( "Unknown dggs {dggs}) - must be one of [ {options} ]".format( dggs=dggs, options=", ".join(const.DEFAULT_DGGS_PARENT_RES.keys()) @@ -163,7 +163,7 @@ def write_partition_as_geoparquet( dggs_col: str, compression: str, ) -> int: - if len(partition_df.index) == 0: + if partition_df.empty: return 0 if ( @@ -336,7 +336,7 @@ def _merge_partition_files(partition_dir: Path, compression: str) -> None: f.unlink() -def parent_partitioning( +def _parent_partitioning( indexer: VectorIndexer, input_dir: Path, output_dir: Path, @@ -428,10 +428,8 @@ def parent_partitioning( LOGGER.debug("Parent cell partitioning complete") - return - -def polyfill( +def _polyfill( indexer: VectorIndexer, pq_in: Path, spatial_sort_col: str, @@ -448,14 +446,14 @@ def polyfill( df = gpd.read_parquet(pq_in).reset_index() if spatial_sort_col != "none": df = df.drop(columns=[spatial_sort_col]) - if len(df.index) == 0: + if df.empty: # Input is empty, nothing to convert return None # DGGS specific conversion df = indexer.polyfill(df, resolution) - if len(df.index) == 0: + if df.empty: # Conversion resulted in empty output (e.g. large cell, small feature) return None @@ -470,8 +468,8 @@ def polyfill( return None -def polyfill_star(args) -> None: - return polyfill(*args) +def _polyfill_star(args) -> None: + return _polyfill(*args) def bisection_preparation( @@ -481,10 +479,10 @@ def bisection_preparation( cut_crs: pyproj.CRS = None, cut_threshold: Union[None, float] = None, ) -> tuple[pd.DataFrame, pyproj.CRS, Union[None, float]]: - cut_threshold = float(cut_threshold) if cut_threshold != None else None + cut_threshold = float(cut_threshold) if cut_threshold is not None else None if cut_threshold and cut_crs: - if df.crs is None and len(df.index) == 0: + if df.crs is None and df.empty: # empty + naive: nothing to transform df = df.set_crs(cut_crs, allow_override=True) elif df.crs is None: @@ -510,7 +508,7 @@ def bisection_preparation( f"Using CRS units for input polygon bisection: {cut_crs.axis_info[0].unit_name}" ) - if cut_threshold == None: + if cut_threshold is None: unit_name = cut_crs.axis_info[0].unit_name cut_threshold_m2 = const.DEFAULT_AREA_THRESHOLD_M2(dggs, (int(parent_res))) if unit_name == "metre": @@ -531,42 +529,21 @@ def bisect_geometry(geometry, cut_threshold): return GeometryCollection(katana.katana(geometry, cut_threshold)) -def index( - dggs: str, +def _read_input( input_file: Union[Path, str], - output_directory: Union[Path, str], - resolution: int, - parent_res: Union[None, int], + layer: str, + con: SQLConnectionType, keep_attributes: bool, - chunksize: int, - spatial_sorting: str, - cut_threshold: Union[None, float], - processes: int, - compression: str = "snappy", - id_field: str = None, - cut_crs: pyproj.CRS = None, - con: SQLConnectionType = None, - layer: str = None, - geom_col: str = "geom", - geo: str = const.GeoOutputMode.NONE.value, - overwrite: bool = False, - compact: bool = True, -) -> Path: - """ - Performs multi-threaded DGGS indexing on geometries (including multipart and collections). - """ - indexer = idxfactory.indexer_instance(dggs) - parent_res = get_parent_res(dggs, parent_res, resolution) - + id_field: str, + geom_col: str, +) -> gpd.GeoDataFrame: if layer and con: - # Database connection with con.connect() as connection: parts = layer.rsplit(".", 1) schema, tbl_name = ( (parts[0], parts[1]) if len(parts) == 2 else (None, parts[0]) ) tbl = sqlalchemy.table(tbl_name, schema=schema) - if keep_attributes: stmt = tbl.select() elif id_field and not keep_attributes: @@ -575,126 +552,176 @@ def index( ).select_from(tbl) else: stmt = sqlalchemy.select(sqlalchemy.column(geom_col)).select_from(tbl) + return gpd.read_postgis( + stmt, connection, geom_col=geom_col + ).rename_geometry("geometry") + return gpd.read_file(input_file, layer=layer) - df = gpd.read_postgis(stmt, connection, geom_col=geom_col).rename_geometry( - "geometry" - ) - else: - # Read file - df = gpd.read_file(input_file, layer=layer) - - if df is None or len(df.index) == 0: - LOGGER.warning( - "Input contained 0 features (layer=%s). Nothing to index; exiting.", - layer if layer else "", - ) - return output_directory - - df, cut_crs, cut_threshold = bisection_preparation( - df, dggs, parent_res, cut_crs, cut_threshold - ) +def _prepare_dataframe( + df: gpd.GeoDataFrame, + id_field: str, + keep_attributes: bool, +) -> gpd.GeoDataFrame: if id_field: df = df.set_index(id_field) else: df = df.reset_index() df = df.rename(columns={"index": "fid"}).set_index("fid") - if not keep_attributes: - # Remove all attributes except the geometry df = df.loc[:, ["geometry"]] + return df - LOGGER.debug("Bisecting large geometries") +def _run_bisection( + df: gpd.GeoDataFrame, + cut_threshold: Union[None, float], + processes: int, +) -> gpd.GeoDataFrame: + LOGGER.debug("Bisecting large geometries") if cut_threshold is not None and cut_threshold > 0: with ThreadPoolExecutor(max_workers=max(1, processes)) as executor: futures = [] - for index, row in df.iterrows(): - future = executor.submit(bisect_geometry, row.geometry, cut_threshold) - futures.append((index, future)) - + for idx, row in df.iterrows(): + futures.append( + (idx, executor.submit(bisect_geometry, row.geometry, cut_threshold)) + ) with tqdm(total=len(futures), desc="Bisection") as pbar: - for index, future in futures: - df.at[index, "geometry"] = future.result() + for idx, future in futures: + df.at[idx, "geometry"] = future.result() pbar.update(1) else: LOGGER.debug("No bisection applied to input.") + return df + +def _clean_geometries(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: LOGGER.debug("Exploding geometry collections and multipolygons") df = ( df.to_crs(4326) .explode(index_parts=False) # Explode from GeometryCollection .explode(index_parts=False) # Explode multipolygons to polygons ).reset_index() + df = drop_condition( + df, + df[df.geometry.is_empty | df.geometry.isna()].index, + "Considering empty or null geometries", + ) + df = drop_condition( + df, + df[ + (df.geometry.geom_type != "Polygon") + & (df.geometry.geom_type != "LineString") + & (df.geometry.geom_type != "Point") + ].index, + "Considering unsupported geometries", + ) + return df - drop_conditions = [ - { - "index": lambda frame: frame[ - (frame.geometry.is_empty | frame.geometry.isna()) - ], - "message": "Considering empty or null geometries", - }, - { - "index": lambda frame: frame[ - (frame.geometry.geom_type != "Polygon") - & (frame.geometry.geom_type != "LineString") - & (frame.geometry.geom_type != "Point") - ], - "message": "Considering unsupported geometries", - }, + +def _run_dggs_indexing( + indexer: VectorIndexer, + filepaths: list, + spatial_sort_col: str, + resolution: int, + parent_res: int, + output_dir: str, + compression: str, + processes: int, +) -> None: + LOGGER.debug("DGGS indexing by spatial partitions with resolution: %d", resolution) + args = [ + ( + indexer, + filepath, + spatial_sort_col, + resolution, + parent_res, + output_dir, + compression, + ) + for filepath in filepaths ] - for condition in drop_conditions: - df = drop_condition(df, condition["index"](df).index, condition["message"]) + with ProcessPoolExecutor(max_workers=processes) as executor: + futures = {executor.submit(_polyfill_star, arg): arg for arg in args} + for future in tqdm( + as_completed(futures), total=len(futures), desc="DGGS indexing" + ): + try: + future.result() + except Exception as e: + LOGGER.error(f"Task failed with {e}") + raise e - ddf = dgpd.from_geopandas(df, chunksize=max(1, chunksize), sort=True) +def index( + dggs: str, + input_file: Union[Path, str], + output_directory: Union[Path, str], + resolution: int, + parent_res: Union[None, int], + keep_attributes: bool, + chunksize: int, + spatial_sorting: str, + cut_threshold: Union[None, float], + processes: int, + compression: str = "snappy", + id_field: str = None, + cut_crs: pyproj.CRS = None, + con: SQLConnectionType = None, + layer: str = None, + geom_col: str = "geom", + geo: str = const.GeoOutputMode.NONE.value, + overwrite: bool = False, + compact: bool = True, +) -> Path: + """ + Performs multi-threaded DGGS indexing on geometries (including multipart and collections). + """ + indexer = idxfactory.indexer_instance(dggs) + parent_res = get_parent_res(dggs, parent_res, resolution) + + df = _read_input(input_file, layer, con, keep_attributes, id_field, geom_col) + if df is None or df.empty: + LOGGER.warning( + "Input contained 0 features (layer=%s). Nothing to index; exiting.", + layer if layer else "", + ) + return output_directory + + df, cut_crs, cut_threshold = bisection_preparation( + df, dggs, parent_res, cut_crs, cut_threshold + ) + df = _prepare_dataframe(df, id_field, keep_attributes) + df = _run_bisection(df, cut_threshold, processes) + df = _clean_geometries(df) + + ddf = dgpd.from_geopandas(df, chunksize=max(1, chunksize), sort=True) if spatial_sorting != "none": LOGGER.debug("Spatially sorting and partitioning (%s)", spatial_sorting) ddf = ddf.spatial_shuffle(by=spatial_sorting) spatial_sort_col = ( spatial_sorting - if (spatial_sorting == "geohash" or spatial_sorting == "none") + if spatial_sorting in ("geohash", "none") else f"{spatial_sorting}_distance" ) with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir: with TqdmCallback(desc="Spatially partitioning"): ddf.to_parquet(tmpdir, overwrite=True) + filepaths = [f.absolute() for f in Path(tmpdir).glob("*")] - filepaths = list(map(lambda f: f.absolute(), Path(tmpdir).glob("*"))) - - # Multithreaded DGGS indexing - LOGGER.debug( - "DGGS indexing by spatial partitions with resolution: %d", - resolution, - ) with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir2: - - args = [ - ( - indexer, - filepath, - spatial_sort_col, - resolution, - parent_res, - tmpdir2, - compression, - ) - for filepath in filepaths - ] - - with ProcessPoolExecutor(max_workers=processes) as executor: - futures = {executor.submit(polyfill_star, arg): arg for arg in args} - - for future in tqdm( - as_completed(futures), total=len(futures), desc="DGGS indexing" - ): - try: - future.result() - except Exception as e: - LOGGER.error(f"Task failed with {e}") - raise (e) - + _run_dggs_indexing( + indexer, + filepaths, + spatial_sort_col, + resolution, + parent_res, + tmpdir2, + compression, + processes, + ) if not any(Path(tmpdir2).glob("*.parquet")): LOGGER.warning( "No features were indexed (resolution %s may be too coarse for the input). Nothing to write; exiting.", @@ -702,7 +729,7 @@ def index( ) return output_directory - parent_partitioning( + _parent_partitioning( indexer, Path(tmpdir2), output_directory, diff --git a/vector2dggs/geohash.py b/vector2dggs/geohash.py index 8ff2579..d0937e1 100644 --- a/vector2dggs/geohash.py +++ b/vector2dggs/geohash.py @@ -1,207 +1,6 @@ -import click -import click_log -import tempfile -import pyproj - -from typing import Union -from pathlib import Path - import vector2dggs.constants as const -import vector2dggs.common as common - -from vector2dggs import __version__ +from vector2dggs.cli_factory import make_dggs_command - -@click.command(context_settings={"show_default": True}) -@click_log.simple_verbosity_option(common.LOGGER) -@click.argument("vector_input", required=True, type=click.Path(), nargs=1) -@click.argument("output_directory", required=True, type=click.Path(), nargs=1) -@click.option( - "-r", - "--resolution", - "level", - required=True, - type=click.Choice(list(map(str, range(const.MIN_GEOHASH, const.MAX_GEOHASH + 1)))), - help="Geohash level to index", - nargs=1, -) -@click.option( - "-pr", - "--parent_res", - "parent_level", - required=False, - type=click.Choice(list(map(str, range(const.MIN_GEOHASH, const.MAX_GEOHASH + 1)))), - help="Geohash parent level for the output partition. Defaults to resolution - 6", -) -@click.option( - "-id", - "--id_field", - required=False, - default=const.DEFAULTS["id"], - type=str, - help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.", - nargs=1, -) -@click.option( - "-k", - "--keep_attributes", - is_flag=True, - show_default=True, - default=const.DEFAULTS["k"], - help="Retain attributes in output. The default is to create an output that only includes Geohash cell ID and the ID given by the -id field (or the default index ID).", -) -@click.option( - "-ch", - "--chunksize", - required=True, - type=int, - default=const.DEFAULTS["ch"], - help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.", - nargs=1, -) -@click.option( - "-s", - "--spatial_sorting", - type=click.Choice(const.SPATIAL_SORTING_METHODS), - default=const.DEFAULTS["s"], - help="Spatial sorting method when perfoming spatial partitioning.", -) -@click.option( - "-crs", - "--cut_crs", - required=False, - default=const.DEFAULTS["crs"], - type=int, - help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.", - nargs=1, -) -@click.option( - "-c", - "--cut_threshold", - required=False, - default=const.DEFAULTS["c"], - type=float, - help="Cutting up large geometries into smaller geometries based on a target area. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS. If left unspecified, the threshold will be the maximum area of a cell at the parent resolution, in square metres or feet according to the CRS. A threshold of 0 will skip bissection entirely (effectively ignoring --cut_crs).", - nargs=1, -) -@click.option( - "-t", - "--threads", - required=False, - default=const.DEFAULTS["t"], - type=int, - help="Amount of threads used for operation", - nargs=1, +geohash = make_dggs_command( + "geohash", "geohash", "Geohash", const.MIN_GEOHASH, const.MAX_GEOHASH ) -@click.option( - "-cp", - "--compression", - required=False, - default=const.DEFAULTS["cp"], - type=str, - help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.", - nargs=1, -) -@click.option( - "-lyr", - "--layer", - required=False, - default=const.DEFAULTS["lyr"], - type=str, - help="Name of the layer or table to read when using an input that supports layers or tables", - nargs=1, -) -@click.option( - "-g", - "--geom_col", - required=False, - default=const.DEFAULTS["g"], - type=str, - help="Select geometr encoding for the output: 'none' for regular Parquet (no GeoParquet metadata), or 'point'/'polygon' to write GeoParquet (v1.1.0) with the corresponding geometry type.", - nargs=1, -) -@click.option( - "--geo", - required=False, - default=const.DEFAULTS["geo"], - type=click.Choice(const.GEOM_TYPES), - help="Write output as a GeoParquet (v1.1.0) with either point or polygon geometry.", -) -@click.option( - "--tempdir", - default=const.DEFAULTS["tempdir"], - type=click.Path(), - help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.", -) -@click.option( - "-co", - "--compact", - is_flag=True, - help="Compact the geohash cells up to the parent resolution. Compaction requires an id_field.", -) -@click.option("-o", "--overwrite", is_flag=True) -@click.version_option(version=__version__) -def geohash( - vector_input: Union[str, Path], - output_directory: Union[str, Path], - level: str, - parent_level: str, - id_field: str, - keep_attributes: bool, - chunksize: int, - spatial_sorting: str, - cut_crs: int, - cut_threshold: int, - threads: int, - compression: str, - layer: str, - geom_col: str, - geo: str, - tempdir: Union[str, Path], - compact: bool, - overwrite: bool, -): - """ - Ingest a vector dataset and index it using the Geohash geocode system. - - VECTOR_INPUT is the path to input vector geospatial data. - OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store. - """ - tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir - - common.check_resolutions(level, parent_level) - common.check_compaction_requirements(compact, id_field) - - spatial_sorting = const.SpatialSortingMethod(spatial_sorting).value - geo = const.GeoOutputMode(geo).value - - con, vector_input = common.db_conn_and_input_path(vector_input) - output_directory = common.resolve_output_path(output_directory, overwrite) - - if cut_crs is not None: - cut_crs = pyproj.CRS.from_user_input(cut_crs) - - try: - common.index( - "geohash", - vector_input, - output_directory, - int(level), - parent_level, - keep_attributes, - chunksize, - spatial_sorting, - cut_threshold, - threads, - compression=compression, - cut_crs=cut_crs, - id_field=id_field, - con=con, - layer=layer, - geom_col=geom_col, - geo=geo, - overwrite=overwrite, - compact=compact, - ) - except: - raise diff --git a/vector2dggs/h3.py b/vector2dggs/h3.py index 209a687..afe691d 100644 --- a/vector2dggs/h3.py +++ b/vector2dggs/h3.py @@ -1,206 +1,4 @@ -import click -import click_log -import tempfile -import pyproj - -from typing import Union -from pathlib import Path - import vector2dggs.constants as const -import vector2dggs.common as common - -from vector2dggs import __version__ - - -@click.command(context_settings={"show_default": True}) -@click_log.simple_verbosity_option(common.LOGGER) -@click.argument("vector_input", required=True, type=click.Path(), nargs=1) -@click.argument("output_directory", required=True, type=click.Path(), nargs=1) -@click.option( - "-r", - "--resolution", - required=True, - type=click.Choice(list(map(str, range(const.MIN_H3, const.MAX_H3 + 1)))), - help="H3 resolution to index", - nargs=1, -) -@click.option( - "-pr", - "--parent_res", - required=False, - type=click.Choice(list(map(str, range(const.MIN_H3, const.MAX_H3 + 1)))), - help="H3 Parent resolution for the output partition. Defaults to resolution - 6", -) -@click.option( - "-id", - "--id_field", - required=False, - default=const.DEFAULTS["id"], - type=str, - help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.", - nargs=1, -) -@click.option( - "-k", - "--keep_attributes", - is_flag=True, - show_default=True, - default=const.DEFAULTS["k"], - help="Retain attributes in output. The default is to create an output that only includes H3 cell ID and the ID given by the -id field (or the default index ID).", -) -@click.option( - "-ch", - "--chunksize", - required=True, - type=int, - default=const.DEFAULTS["ch"], - help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.", - nargs=1, -) -@click.option( - "-s", - "--spatial_sorting", - type=click.Choice(const.SPATIAL_SORTING_METHODS), - default=const.DEFAULTS["s"], - help="Spatial sorting method when perfoming spatial partitioning.", -) -@click.option( - "-crs", - "--cut_crs", - required=False, - default=const.DEFAULTS["crs"], - type=int, - help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.", - nargs=1, -) -@click.option( - "-c", - "--cut_threshold", - required=False, - default=const.DEFAULTS["c"], - type=float, - help="Cutting up large geometries into smaller geometries based on a target area. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS. If left unspecified, the threshold will be the maximum area of a cell at the parent resolution, in square metres or feet according to the CRS. A threshold of 0 will skip bissection entirely (effectively ignoring --cut_crs).", - nargs=1, -) -@click.option( - "-t", - "--threads", - required=False, - default=const.DEFAULTS["t"], - type=int, - help="Amount of threads used for operation", - nargs=1, -) -@click.option( - "-cp", - "--compression", - required=False, - default=const.DEFAULTS["cp"], - type=str, - help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.", - nargs=1, -) -@click.option( - "-lyr", - "--layer", - required=False, - default=const.DEFAULTS["lyr"], - type=str, - help="Name of the layer or table to read when using an input that supports layers or tables", - nargs=1, -) -@click.option( - "-g", - "--geom_col", - required=False, - default=const.DEFAULTS["g"], - type=str, - help="Column name to use when using a spatial database connection as input", - nargs=1, -) -@click.option( - "--geo", - required=False, - default=const.DEFAULTS["geo"], - type=click.Choice(const.GEOM_TYPES), - help="Select geometry encoding for the output: 'none' for regular Parquet (no GeoParquet metadata), or 'point'/'polygon' to write GeoParquet (v1.1.0) with the corresponding geometry type.", - nargs=1, -) -@click.option( - "--tempdir", - default=const.DEFAULTS["tempdir"], - type=click.Path(), - help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.", -) -@click.option( - "-co", - "--compact", - is_flag=True, - help="Compact the H3 cells up to the parent resolution. Compaction requires an id_field.", -) -@click.option("-o", "--overwrite", is_flag=True) -@click.version_option(version=__version__) -def h3( - vector_input: Union[str, Path], - output_directory: Union[str, Path], - resolution: str, - parent_res: str, - id_field: str, - keep_attributes: bool, - chunksize: int, - spatial_sorting: str, - cut_crs: int, - cut_threshold: int, - threads: int, - compression: str, - layer: str, - geom_col: str, - geo: str, - tempdir: Union[str, Path], - compact: bool, - overwrite: bool, -): - """ - Ingest a vector dataset and index it to the H3 DGGS. - - VECTOR_INPUT is the path to input vector geospatial data. - OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store. - """ - tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir - - common.check_resolutions(resolution, parent_res) - common.check_compaction_requirements(compact, id_field) - - spatial_sorting = const.SpatialSortingMethod(spatial_sorting).value - geo = const.GeoOutputMode(geo).value - - con, vector_input = common.db_conn_and_input_path(vector_input) - output_directory = common.resolve_output_path(output_directory, overwrite) - - if cut_crs is not None: - cut_crs = pyproj.CRS.from_user_input(cut_crs) +from vector2dggs.cli_factory import make_dggs_command - try: - common.index( - "h3", - vector_input, - output_directory, - int(resolution), - parent_res, - keep_attributes, - chunksize, - spatial_sorting, - cut_threshold, - threads, - compression=compression, - cut_crs=cut_crs, - id_field=id_field, - con=con, - layer=layer, - geom_col=geom_col, - geo=geo, - overwrite=overwrite, - compact=compact, - ) - except: - raise +h3 = make_dggs_command("h3", "h3", "H3", const.MIN_H3, const.MAX_H3) diff --git a/vector2dggs/indexerfactory.py b/vector2dggs/indexerfactory.py index fa51f45..dfcaa42 100644 --- a/vector2dggs/indexerfactory.py +++ b/vector2dggs/indexerfactory.py @@ -1,28 +1,20 @@ -""" -@author: ndemaio, alpha-beta-soup -""" - from importlib import import_module from typing import Dict, Tuple, Type from vector2dggs.indexers import vectorindexer -INDEXER_LOOKUP: Dict[str, Tuple[str, str, str]] = { - "h3": ("vector2dggs.indexers.h3vectorindexer", "H3VectorIndexer", "h3"), - "rhp": ("vector2dggs.indexers.rhpvectorindexer", "RHPVectorIndexer", "rhp"), - "geohash": ( - "vector2dggs.indexers.geohashvectorindexer", - "GeohashVectorIndexer", - "geohash", - ), - "s2": ("vector2dggs.indexers.s2vectorindexer", "S2VectorIndexer", "s2"), - "a5": ("vector2dggs.indexers.a5vectorindexer", "A5VectorIndexer", "a5"), +INDEXER_LOOKUP: Dict[str, Tuple[str, str]] = { + "h3": ("vector2dggs.indexers.h3vectorindexer", "H3VectorIndexer"), + "rhp": ("vector2dggs.indexers.rhpvectorindexer", "RHPVectorIndexer"), + "geohash": ("vector2dggs.indexers.geohashvectorindexer", "GeohashVectorIndexer"), + "s2": ("vector2dggs.indexers.s2vectorindexer", "S2VectorIndexer"), + "a5": ("vector2dggs.indexers.a5vectorindexer", "A5VectorIndexer"), } def indexer_instance(dggs: str) -> vectorindexer.VectorIndexer: try: - module_name, class_name, extra = INDEXER_LOOKUP[dggs] + module_name, class_name = INDEXER_LOOKUP[dggs] except KeyError as e: raise ValueError( f"Unknown DGGS: '{dggs}'. Options: {sorted(INDEXER_LOOKUP)}" @@ -32,8 +24,8 @@ def indexer_instance(dggs: str) -> vectorindexer.VectorIndexer: module = import_module(module_name) except ModuleNotFoundError as e: raise ImportError( - f"Mising dependency '{e.name}' for backend '{dggs}'.\n" - f"Install optional dependencies: pip install 'vector2dggs[{extra}]' " + f"Missing dependency '{e.name}' for backend '{dggs}'.\n" + f"Install optional dependencies: pip install 'vector2dggs[{dggs}]' " f"(or 'vector2dggs[all]')." ) from e indexer: Type[vectorindexer.VectorIndexer] = getattr(module, class_name) diff --git a/vector2dggs/indexers/a5vectorindexer.py b/vector2dggs/indexers/a5vectorindexer.py index 8d4ff9d..a78ecf9 100644 --- a/vector2dggs/indexers/a5vectorindexer.py +++ b/vector2dggs/indexers/a5vectorindexer.py @@ -11,21 +11,6 @@ class A5VectorIndexer(VectorIndexer): Provides integration for the A5 pentagonal DGGS. """ - @staticmethod - def _geo_to_cells( - df: gpd.GeoDataFrame, resolution: int, cell_fn, geom_col: str - ) -> pd.DataFrame: - return ( - df.assign( - __cells__=df[geom_col].apply(lambda geom: cell_fn(geom, resolution)) - ) - .drop(columns=[geom_col]) - .explode("__cells__") - .dropna(subset=["__cells__"]) - .set_index("__cells__") - .rename_axis(None) - ) - @staticmethod def _polyfill_polygon(geom, resolution: int) -> list: cells = set( diff --git a/vector2dggs/indexers/geohashvectorindexer.py b/vector2dggs/indexers/geohashvectorindexer.py index b68aa4f..5fbcad2 100644 --- a/vector2dggs/indexers/geohashvectorindexer.py +++ b/vector2dggs/indexers/geohashvectorindexer.py @@ -1,8 +1,3 @@ -""" - -@author: ndemaio -""" - from geohash_polygon import polygon_to_geohashes # rusty-polygon-geohasher from geohash import encode, decode, decode_exactly # python-geohash @@ -23,14 +18,13 @@ def __init__(self, dggs): self.GEOHASH_BASE32_SET = set("0123456789bcdefghjkmnpqrstuvwxyz") def polyfill(self, df: gpd.GeoDataFrame, level: int) -> pd.DataFrame: - """ - Implementation of abstract function. - """ - + geom_col = df.geometry.name gh_col = "geohash" + parts = [] + df_polygon = df[df.geom_type == "Polygon"].copy() if not df_polygon.empty: - df_polygon = ( + result = ( df_polygon.assign( **{ gh_col: df_polygon.geometry.apply( @@ -38,26 +32,31 @@ def polyfill(self, df: gpd.GeoDataFrame, level: int) -> pd.DataFrame: ) } ) + .drop(columns=[geom_col]) .explode(gh_col, ignore_index=True) .set_index(gh_col) ) + parts.append(pd.DataFrame(result)) # TODO linestring support - # e.g. JS implementation https://github.com/alrico88/geohashes-along + # e.g. JS implementation https://github.com/alrico88/geohashes-along and https://github.com/alrico88/geohashes-between/blob/master/src/index.ts df_point = df[df.geom_type == "Point"].copy() - if len(df_point.index) > 0: - df_point[gh_col] = df_point.geometry.apply( - lambda geom: encode(geom.y, geom.x, precision=level) + if not df_point.empty: + result = ( + df_point.assign( + **{ + gh_col: df_point.geometry.apply( + lambda geom: encode(geom.y, geom.x, precision=level) + ) + } + ) + .drop(columns=[geom_col]) + .set_index(gh_col) ) - df_point = df_point.set_index(gh_col) + parts.append(pd.DataFrame(result)) - return pd.concat( - map( - lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])), - [df_polygon, df_point], - ) - ) + return pd.concat(parts) if parts else pd.DataFrame() def secondary_index(self, df: pd.DataFrame, parent_level: int) -> pd.DataFrame: """ diff --git a/vector2dggs/indexers/h3vectorindexer.py b/vector2dggs/indexers/h3vectorindexer.py index 79aa519..7e2c931 100644 --- a/vector2dggs/indexers/h3vectorindexer.py +++ b/vector2dggs/indexers/h3vectorindexer.py @@ -11,21 +11,6 @@ class H3VectorIndexer(VectorIndexer): Provides integration for Uber's H3 DGGS. """ - @staticmethod - def _geo_to_cells( - df: gpd.GeoDataFrame, resolution: int, cell_fn, geom_col: str - ) -> pd.DataFrame: - return ( - df.assign( - __cells__=df[geom_col].apply(lambda geom: cell_fn(geom, resolution)) - ) - .drop(columns=[geom_col]) - .explode("__cells__") - .dropna(subset=["__cells__"]) - .set_index("__cells__") - .rename_axis(None) - ) - @staticmethod def _polyfill_polygon(geom, resolution: int) -> list: return h3.geo_to_cells(mapping(geom), resolution) diff --git a/vector2dggs/indexers/rhpvectorindexer.py b/vector2dggs/indexers/rhpvectorindexer.py index 8516228..e45ff24 100644 --- a/vector2dggs/indexers/rhpvectorindexer.py +++ b/vector2dggs/indexers/rhpvectorindexer.py @@ -1,8 +1,3 @@ -""" - -@author: ndemaio -""" - from rhealpixdggs.conversion import compress_order_cells from rhealpixdggs.rhp_wrappers import ( rhp_to_center_child, @@ -27,35 +22,33 @@ class RHPVectorIndexer(VectorIndexer): """ def polyfill(self, df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame: - """ - Implementation of abstract function. - """ + geom_col = df.geometry.name + parts = [] df_polygon = df[df.geom_type == "Polygon"] - if len(df_polygon.index) > 0: - df_polygon = df_polygon.rhp.polyfill_resample( + if not df_polygon.empty: + result = df_polygon.rhp.polyfill_resample( resolution, return_geometry=False, compress=False - ).drop(columns=["index"]) + ).drop(columns=["index", geom_col]) + parts.append(pd.DataFrame(result)) df_linestring = df[df.geom_type == "LineString"] - if len(df_linestring.index) > 0: - df_linestring = ( + if not df_linestring.empty: + result = ( df_linestring.rhp.linetrace(resolution) .explode(COLUMNS["linetrace"]) .set_index(COLUMNS["linetrace"]) + .drop(columns=[geom_col]) ) - df_linestring = df_linestring[~df_linestring.index.duplicated(keep="first")] + result = result[~result.index.duplicated(keep="first")] + parts.append(pd.DataFrame(result)) df_point = df[df.geom_type == "Point"] - if len(df_point.index) > 0: - df_point = df_point.rhp.geo_to_rhp(resolution, set_index=True) + if not df_point.empty: + result = df_point.rhp.geo_to_rhp(resolution, set_index=True) + parts.append(pd.DataFrame(result.drop(columns=[geom_col]))) - return pd.concat( - map( - lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])), - [df_polygon, df_linestring, df_point], - ) - ) + return pd.concat(parts) if parts else pd.DataFrame() def secondary_index(self, df: pd.DataFrame, parent_res: int) -> pd.DataFrame: """ diff --git a/vector2dggs/indexers/s2vectorindexer.py b/vector2dggs/indexers/s2vectorindexer.py index 5ff52de..bfe5c03 100644 --- a/vector2dggs/indexers/s2vectorindexer.py +++ b/vector2dggs/indexers/s2vectorindexer.py @@ -1,8 +1,3 @@ -""" - -@author: ndemaio -""" - from typing import Union from math import ceil @@ -26,38 +21,40 @@ class S2VectorIndexer(VectorIndexer): """ def polyfill(self, df: gpd.GeoDataFrame, level: int) -> pd.DataFrame: - """ - Implementation of abstract function. - """ + geom_col = df.geometry.name + parts = [] df_polygon = df[df.geom_type == "Polygon"].copy() - if len(df_polygon.index) > 0: - df_polygon = ( + if not df_polygon.empty: + result = ( self.polyfill_polygons(df_polygon, level) .explode("s2index") .set_index("s2index") + .drop(columns=[geom_col]) ) + parts.append(pd.DataFrame(result)) df_linestring = df[df.geom_type == "LineString"].copy() - if len(df_linestring.index) > 0: + if not df_linestring.empty: df_linestring["s2index"] = df_linestring.geometry.apply( lambda geom: self.cell_ids_from_linestring(geom, level) ) - df_linestring = df_linestring.explode("s2index").set_index("s2index") + result = ( + df_linestring.drop(columns=[geom_col]) + .explode("s2index") + .set_index("s2index") + ) + parts.append(pd.DataFrame(result)) df_point = df[df.geom_type == "Point"].copy() - if len(df_point.index) > 0: + if not df_point.empty: df_point["s2index"] = df_point.geometry.apply( lambda geom: self.cell_id_from_point(geom, level) ) - df_point = df_point.set_index("s2index") + result = df_point.drop(columns=[geom_col]).set_index("s2index") + parts.append(pd.DataFrame(result)) - return pd.concat( - map( - lambda _df: pd.DataFrame(_df.drop(columns=[_df.geometry.name])), - [df_polygon, df_linestring, df_point], - ) - ) + return pd.concat(parts) if parts else pd.DataFrame() def secondary_index(self, df: pd.DataFrame, parent_level: int) -> pd.DataFrame: """ diff --git a/vector2dggs/indexers/vectorindexer.py b/vector2dggs/indexers/vectorindexer.py index ab8b589..46b8851 100644 --- a/vector2dggs/indexers/vectorindexer.py +++ b/vector2dggs/indexers/vectorindexer.py @@ -1,8 +1,4 @@ -""" - -@author: ndemaio -""" - +from abc import ABC, abstractmethod from uuid import uuid4 from typing import Union, Callable, Iterable @@ -11,32 +7,21 @@ from shapely.geometry import Polygon, Point -class VectorIndexer: +class VectorIndexer(ABC): """ - Provides an abstract base class and interface for all indexers integrating - a specific DGGS. It should never be instantiated directly because some - methods raise a NotImplementedError by design. Those methods should be - implemented by the child classes deriving from this interface instead. + Abstract base class and interface for all DGGS indexers. """ def __init__(self, dggs: str): - """ - Value used across all child classes - """ self.dggs = dggs - def polyfill(self, df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame: - """ - Needs to be implemented by child class - """ - raise NotImplementedError() + @abstractmethod + def polyfill(self, df: gpd.GeoDataFrame, resolution: int) -> pd.DataFrame: ... - def secondary_index(self, df: pd.DataFrame, parent_res: int) -> pd.DataFrame: - """ - Needs to be implemented by child class - """ - raise NotImplementedError() + @abstractmethod + def secondary_index(self, df: pd.DataFrame, parent_res: int) -> pd.DataFrame: ... + @abstractmethod def compaction( self, df: pd.DataFrame, @@ -44,11 +29,30 @@ def compaction( col_order: list, dggs_col: str, id_field: str, + ) -> pd.DataFrame: ... + + @staticmethod + @abstractmethod + def cell_to_point(cell: str) -> Point: ... + + @staticmethod + @abstractmethod + def cell_to_polygon(cell: str) -> Polygon: ... + + @staticmethod + def _geo_to_cells( + df: gpd.GeoDataFrame, resolution: int, cell_fn, geom_col: str ) -> pd.DataFrame: - """ - Needs to be implemented by child class - """ - raise NotImplementedError() + return ( + df.assign( + __cells__=df[geom_col].apply(lambda geom: cell_fn(geom, resolution)) + ) + .drop(columns=[geom_col]) + .explode("__cells__") + .dropna(subset=["__cells__"]) + .set_index("__cells__") + .rename_axis(None) + ) def compaction_common( self, @@ -82,14 +86,12 @@ def compaction_common( } # Get rows that cannot be compressed - mask = pd.Series([False] * len(df), index=df.index) # Init bool mask + mask = pd.Series([False] * len(df), index=df.index) for key, value_set in uncompressable.items(): mask |= (df[id_field] == key) & (df[dggs_col].isin(value_set)) uncompressable_df = df[mask].set_index(dggs_col) - # Get rows that can be compressed - # Convert each compressed (coarser resolution) cell into a cell at - # the original resolution (usu using centre child as reference) + # Get rows that can be compressed; replace fine cell with its compacted parent compression_mapping = { (id, cell_to_child_func(cell, res)): cell for id, cells in compressable.items() @@ -98,22 +100,16 @@ def compaction_common( } mask = pd.Series([False] * len(df), index=df.index) composite_key = f"composite_key_{uuid4()}" - # Update mask for compressible rows and prepare for replacement - get_composite_key = lambda row: (row[id_field], row[dggs_col]) + + def get_composite_key(row): + return (row[id_field], row[dggs_col]) + df[composite_key] = df.apply(get_composite_key, axis=1) mask |= df[composite_key].isin(compression_mapping) compressable_df = df[mask].copy() compressable_df[dggs_col] = compressable_df[composite_key].map( compression_mapping - ) # Replace DGGS cell ID with compressed representation + ) compressable_df = compressable_df.set_index(dggs_col) return pd.concat([compressable_df, uncompressable_df])[col_order] - - @staticmethod - def cell_to_point(cell: str) -> Point: - raise NotImplementedError() - - @staticmethod - def cell_to_polygon(cell: str) -> Polygon: - raise NotImplementedError() diff --git a/vector2dggs/rHP.py b/vector2dggs/rHP.py index bff25a0..e44113a 100644 --- a/vector2dggs/rHP.py +++ b/vector2dggs/rHP.py @@ -1,206 +1,4 @@ -import click -import click_log -import tempfile -import pyproj - -from typing import Union -from pathlib import Path - import vector2dggs.constants as const -import vector2dggs.common as common - -from vector2dggs import __version__ - - -@click.command(context_settings={"show_default": True}) -@click_log.simple_verbosity_option(common.LOGGER) -@click.argument("vector_input", required=True, type=click.Path(), nargs=1) -@click.argument("output_directory", required=True, type=click.Path(), nargs=1) -@click.option( - "-r", - "--resolution", - required=True, - type=click.Choice(list(map(str, range(const.MIN_RHP, const.MAX_RHP + 1)))), - help="rHEALPix resolution to index", - nargs=1, -) -@click.option( - "-pr", - "--parent_res", - required=False, - type=click.Choice(list(map(str, range(const.MIN_RHP, const.MAX_RHP + 1)))), - help="rHEALPix Parent resolution for the output partition. Defaults to resolution - 6", -) -@click.option( - "-id", - "--id_field", - required=False, - default=const.DEFAULTS["id"], - type=str, - help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.", - nargs=1, -) -@click.option( - "-k", - "--keep_attributes", - is_flag=True, - show_default=True, - default=const.DEFAULTS["k"], - help="Retain attributes in output. The default is to create an output that only includes rHEALPix cell ID and the ID given by the -id field (or the default index ID).", -) -@click.option( - "-ch", - "--chunksize", - required=True, - type=int, - default=const.DEFAULTS["ch"], - help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.", - nargs=1, -) -@click.option( - "-s", - "--spatial_sorting", - type=click.Choice(const.SPATIAL_SORTING_METHODS), - default=const.DEFAULTS["s"], - help="Spatial sorting method when perfoming spatial partitioning.", -) -@click.option( - "-crs", - "--cut_crs", - required=False, - default=const.DEFAULTS["crs"], - type=int, - help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cut_threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.", - nargs=1, -) -@click.option( - "-c", - "--cut_threshold", - required=False, - default=const.DEFAULTS["c"], - type=float, - help="Cutting up large geometries into smaller geometries based on a target area. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS. If left unspecified, the threshold will be the maximum area of a cell at the parent resolution, in square metres or feet according to the CRS. A threshold of 0 will skip bissection entirely (effectively ignoring --cut_crs).", - nargs=1, -) -@click.option( - "-t", - "--threads", - required=False, - default=const.DEFAULTS["t"], - type=int, - help="Amount of threads used for operation", - nargs=1, -) -@click.option( - "-cp", - "--compression", - required=False, - default=const.DEFAULTS["cp"], - type=str, - help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.", - nargs=1, -) -@click.option( - "-lyr", - "--layer", - required=False, - default=const.DEFAULTS["lyr"], - type=str, - help="Name of the layer or table to read when using an input that supports layers or tables", - nargs=1, -) -@click.option( - "-g", - "--geom_col", - required=False, - default=const.DEFAULTS["g"], - type=str, - help="Column name to use when using a spatial database connection as input", - nargs=1, -) -@click.option( - "--geo", - required=False, - default=const.DEFAULTS["geo"], - type=click.Choice(const.GEOM_TYPES), - help="Select geometr encoding for the output: 'none' for regular Parquet (no GeoParquet metadata), or 'point'/'polygon' to write GeoParquet (v1.1.0) with the corresponding geometry type.", - nargs=1, -) -@click.option( - "--tempdir", - default=const.DEFAULTS["tempdir"], - type=click.Path(), - help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.", -) -@click.option( - "-co", - "--compact", - is_flag=True, - help="Compact the rHEALPix cells up to the parent resolution. Compaction requires an id_field.", -) -@click.option("-o", "--overwrite", is_flag=True) -@click.version_option(version=__version__) -def rhp( - vector_input: Union[str, Path], - output_directory: Union[str, Path], - resolution: str, - parent_res: str, - id_field: str, - keep_attributes: bool, - chunksize: int, - spatial_sorting: str, - cut_crs: int, - cut_threshold: int, - threads: int, - compression: str, - layer: str, - geom_col: str, - geo: str, - tempdir: Union[str, Path], - compact: bool, - overwrite: bool, -): - """ - Ingest a vector dataset and index it to the rHEALPix DGGS. - - VECTOR_INPUT is the path to input vector geospatial data. - OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store. - """ - tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir - - spatial_sorting = const.SpatialSortingMethod(spatial_sorting).value - geo = const.GeoOutputMode(geo).value - - common.check_resolutions(resolution, parent_res) - common.check_compaction_requirements(compact, id_field) - - con, vector_input = common.db_conn_and_input_path(vector_input) - output_directory = common.resolve_output_path(output_directory, overwrite) - - if cut_crs is not None: - cut_crs = pyproj.CRS.from_user_input(cut_crs) +from vector2dggs.cli_factory import make_dggs_command - try: - common.index( - "rhp", - vector_input, - output_directory, - int(resolution), - parent_res, - keep_attributes, - chunksize, - spatial_sorting, - cut_threshold, - threads, - compression=compression, - cut_crs=cut_crs, - id_field=id_field, - con=con, - layer=layer, - geom_col=geom_col, - geo=geo, - overwrite=overwrite, - compact=compact, - ) - except: - raise +rhp = make_dggs_command("rhp", "rhp", "rHEALPix", const.MIN_RHP, const.MAX_RHP) diff --git a/vector2dggs/s2.py b/vector2dggs/s2.py index 17669e7..c9ee658 100644 --- a/vector2dggs/s2.py +++ b/vector2dggs/s2.py @@ -1,208 +1,4 @@ -import click -import click_log -import tempfile -import pyproj - -from typing import Union -from pathlib import Path - import vector2dggs.constants as const -import vector2dggs.common as common - -from vector2dggs import __version__ - - -@click.command(context_settings={"show_default": True}) -@click_log.simple_verbosity_option(common.LOGGER) -@click.argument("vector_input", required=True, type=click.Path(), nargs=1) -@click.argument("output_directory", required=True, type=click.Path(), nargs=1) -@click.option( - "-r", - "--resolution", - "level", - required=True, - type=click.Choice(list(map(str, range(const.MIN_S2, const.MAX_S2 + 1)))), - help="S2 level to index", - nargs=1, -) -@click.option( - "-pr", - "--parent_res", - "parent_level", - required=False, - type=click.Choice(list(map(str, range(const.MIN_S2, const.MAX_S2 + 1)))), - help="S2 parent level for the output partition. Defaults to resolution - 6", -) -@click.option( - "-id", - "--id_field", - required=False, - default=const.DEFAULTS["id"], - type=str, - help="Field to use as an ID; defaults to a constructed single 0...n index on the original feature order.", - nargs=1, -) -@click.option( - "-k", - "--keep_attributes", - is_flag=True, - show_default=True, - default=const.DEFAULTS["k"], - help="Retain attributes in output. The default is to create an output that only includes S2 cell ID and the ID given by the -id field (or the default index ID).", -) -@click.option( - "-ch", - "--chunksize", - required=True, - type=int, - default=const.DEFAULTS["ch"], - help="The number of rows per index partition to use when spatially partioning. Adjusting this number will trade off memory use and time.", - nargs=1, -) -@click.option( - "-s", - "--spatial_sorting", - type=click.Choice(const.SPATIAL_SORTING_METHODS), - default=const.DEFAULTS["s"], - help="Spatial sorting method when perfoming spatial partitioning.", -) -@click.option( - "-crs", - "--cut_crs", - required=False, - default=const.DEFAULTS["crs"], - type=int, - help="Set the coordinate reference system (CRS) used for cutting large geometries (see `--cur-threshold`). Defaults to the same CRS as the input. Should be a valid EPSG code.", - nargs=1, -) -@click.option( - "-c", - "--cut_threshold", - required=False, - default=const.DEFAULTS["c"], - type=float, - help="Cutting up large geometries into smaller geometries based on a target area. Units are assumed to match the input CRS units unless the `--cut_crs` is also given, in which case units match the units of the supplied CRS. If left unspecified, the threshold will be the maximum area of a cell at the parent resolution, in square metres or feet according to the CRS. A threshold of 0 will skip bissection entirely (effectively ignoring --cut_crs).", - nargs=1, -) -@click.option( - "-t", - "--threads", - required=False, - default=const.DEFAULTS["t"], - type=int, - help="Amount of threads used for operation", - nargs=1, -) -@click.option( - "-cp", - "--compression", - required=False, - default=const.DEFAULTS["cp"], - type=str, - help="Compression method to use for the output Parquet files. Options include 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', etc. Use 'none' for no compression.", - nargs=1, -) -@click.option( - "-lyr", - "--layer", - required=False, - default=const.DEFAULTS["lyr"], - type=str, - help="Name of the layer or table to read when using an input that supports layers or tables", - nargs=1, -) -@click.option( - "-g", - "--geom_col", - required=False, - default=const.DEFAULTS["g"], - type=str, - help="Select geometr encoding for the output: 'none' for regular Parquet (no GeoParquet metadata), or 'point'/'polygon' to write GeoParquet (v1.1.0) with the corresponding geometry type.", - nargs=1, -) -@click.option( - "--geo", - required=False, - default=const.DEFAULTS["geo"], - type=click.Choice(const.GEOM_TYPES), - help="Write output as a GeoParquet (v1.1.0) with either point or polygon geometry.", - nargs=1, -) -@click.option( - "--tempdir", - default=const.DEFAULTS["tempdir"], - type=click.Path(), - help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.", -) -@click.option( - "-co", - "--compact", - is_flag=True, - help="Compact the rHEALPix cells up to the parent resolution. Compaction requires an id_field.", -) -@click.option("-o", "--overwrite", is_flag=True) -@click.version_option(version=__version__) -def s2( - vector_input: Union[str, Path], - output_directory: Union[str, Path], - level: str, - parent_level: str, - id_field: str, - keep_attributes: bool, - chunksize: int, - spatial_sorting: str, - cut_crs: int, - cut_threshold: int, - threads: int, - compression: str, - layer: str, - geom_col: str, - geo: str, - tempdir: Union[str, Path], - compact: bool, - overwrite: bool, -): - """ - Ingest a vector dataset and index it to the S2 DGGS. - - VECTOR_INPUT is the path to input vector geospatial data. - OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store. - """ - tempfile.tempdir = tempdir if tempdir is not None else tempfile.tempdir - - common.check_resolutions(level, parent_level) - common.check_compaction_requirements(compact, id_field) - - spatial_sorting = const.SpatialSortingMethod(spatial_sorting).value - geo = const.GeoOutputMode(geo).value - - con, vector_input = common.db_conn_and_input_path(vector_input) - output_directory = common.resolve_output_path(output_directory, overwrite) - - if cut_crs is not None: - cut_crs = pyproj.CRS.from_user_input(cut_crs) +from vector2dggs.cli_factory import make_dggs_command - try: - common.index( - "s2", - vector_input, - output_directory, - int(level), - parent_level, - keep_attributes, - chunksize, - spatial_sorting, - cut_threshold, - threads, - compression=compression, - cut_crs=cut_crs, - id_field=id_field, - con=con, - layer=layer, - geom_col=geom_col, - geo=geo, - overwrite=overwrite, - compact=compact, - ) - except: - raise +s2 = make_dggs_command("s2", "s2", "S2", const.MIN_S2, const.MAX_S2)