diff --git a/Cargo.lock b/Cargo.lock index 75e8f2d914..8aa6b22b22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -152,6 +152,15 @@ version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" +[[package]] +name = "approx" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" +dependencies = [ + "num-traits", +] + [[package]] name = "arbitrary" version = "1.4.1" @@ -262,7 +271,7 @@ dependencies = [ "chrono", "comfy-table", "half", - "lexical-core", + "lexical-core 1.0.5", "num", "ryu", ] @@ -322,8 +331,8 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap", - "lexical-core", + "indexmap 2.9.0", + "lexical-core 1.0.5", "memchr", "num", "serde", @@ -364,6 +373,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7450c76ab7c5a6805be3440dc2e2096010da58f7cab301fdc996a4ee3ee74e49" dependencies = [ "bitflags 2.9.0", + "serde", + "serde_json", ] [[package]] @@ -1530,7 +1541,7 @@ dependencies = [ "base64", "half", "hashbrown 0.14.5", - "indexmap", + "indexmap 2.9.0", "libc", "log", "object_store", @@ -1708,7 +1719,7 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", - "indexmap", + "indexmap 2.9.0", "paste", "serde_json", "sqlparser", @@ -1722,7 +1733,7 @@ checksum = "422ac9cf3b22bbbae8cdf8ceb33039107fde1b5492693168f13bd566b1bcc839" dependencies = [ "arrow", "datafusion-common", - "indexmap", + "indexmap 2.9.0", "itertools 0.14.0", "paste", ] @@ -1851,7 +1862,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "indexmap", + "indexmap 2.9.0", "itertools 0.14.0", "log", "regex", @@ -1873,7 +1884,7 @@ dependencies = [ "datafusion-physical-expr-common", "half", "hashbrown 0.14.5", - "indexmap", + "indexmap 2.9.0", "itertools 0.14.0", "log", "paste", @@ -1934,7 +1945,7 @@ dependencies = [ "futures", "half", "hashbrown 0.14.5", - "indexmap", + "indexmap 2.9.0", "itertools 0.14.0", "log", "parking_lot", @@ -1976,12 +1987,22 @@ dependencies = [ "bigdecimal", "datafusion-common", "datafusion-expr", - "indexmap", + "indexmap 2.9.0", "log", "regex", "sqlparser", ] +[[package]] +name = "dbase" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "847c0b5d4f3a3d80f9c64db3cb60eb00304b3ea1262c7299dd6274a83e714d24" +dependencies = [ + "byteorder", + "time", +] + [[package]] name = "deranged" version = "0.4.0" @@ -1989,6 +2010,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e" dependencies = [ "powerfmt", + "serde", ] [[package]] @@ -2048,6 +2070,16 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "15401da73a9ed8c80e3b2d4dc05fe10e7b72d7243b9f614e516a44fa99986e88" +[[package]] +name = "earcutr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79127ed59a85d7687c409e9978547cffb7dc79675355ed22da6b66fd5f6ead01" +dependencies = [ + "itertools 0.11.0", + "num-traits", +] + [[package]] name = "either" version = "1.15.0" @@ -2069,6 +2101,18 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "enum-as-inner" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.101", +] + [[package]] name = "enum-iterator" version = "2.1.0" @@ -2271,6 +2315,12 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "float_next_after" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" + [[package]] name = "flume" version = "0.11.1" @@ -2434,6 +2484,178 @@ dependencies = [ "version_check", ] +[[package]] +name = "geo" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4416397671d8997e9a3e7ad99714f4f00a22e9eaa9b966a5985d2194fc9e02e1" +dependencies = [ + "earcutr", + "float_next_after", + "geo-types", + "geographiclib-rs", + "i_overlay", + "log", + "num-traits", + "robust", + "rstar", + "spade", +] + +[[package]] +name = "geo-index" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e6b2a121e60180a118037426b1e93d7f5904ebd95cf6e841a575195a65a31ce" +dependencies = [ + "bytemuck", + "float_next_after", + "geo-traits", + "num-traits", + "thiserror 1.0.69", + "tinyvec", +] + +[[package]] +name = "geo-traits" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b018fc19fa58202b03f1c809aebe654f7d70fd3887dace34c3d05c11aeb474b5" +dependencies = [ + "geo-types", +] + +[[package]] +name = "geo-types" +version = "0.7.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62ddb1950450d67efee2bbc5e429c68d052a822de3aad010d28b351fbb705224" +dependencies = [ + "approx", + "num-traits", + "rayon", + "rstar", + "serde", +] + +[[package]] +name = "geoarrow" +version = "0.4.0-beta.4" +source = "git+https://github.com/geoarrow/geoarrow-rs.git?rev=c241034f#c241034f98eb1709342fc8da6d2cb87140f10cc5" +dependencies = [ + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "chrono", + "dbase", + "enum-as-inner", + "geo", + "geo-index", + "geo-traits", + "geoarrow-schema", + "geozero", + "half", + "indexmap 2.9.0", + "lexical-core 0.8.5", + "num-traits", + "phf", + "rstar", + "serde", + "serde_json", + "shapefile", + "thiserror 1.0.69", + "wkb", + "wkt 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "geoarrow-array" +version = "0.1.0-dev" +source = "git+https://github.com/geoarrow/geoarrow-rs.git?rev=c241034f#c241034f98eb1709342fc8da6d2cb87140f10cc5" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-schema", + "geo-traits", + "geoarrow-schema", + "num-traits", + "serde_json", + "thiserror 1.0.69", + "wkb", + "wkt 0.12.0 (git+https://github.com/georust/wkt?rev=270ffe0eaf5ba5255c364dbade39c451562a9e9b)", +] + +[[package]] +name = "geoarrow-geoparquet" +version = "0.1.0-dev" +source = "git+https://github.com/geoarrow/geoarrow-rs.git?rev=c241034f#c241034f98eb1709342fc8da6d2cb87140f10cc5" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "geo-traits", + "geo-types", + "geoarrow-array", + "geoarrow-schema", + "parquet", + "serde", + "serde_json", + "serde_with", +] + +[[package]] +name = "geoarrow-schema" +version = "0.1.0-dev" +source = "git+https://github.com/geoarrow/geoarrow-rs.git?rev=c241034f#c241034f98eb1709342fc8da6d2cb87140f10cc5" +dependencies = [ + "arrow-schema", + "geo-traits", + "serde", + "serde_json", +] + +[[package]] +name = "geographiclib-rs" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e5ed84f8089c70234b0a8e0aedb6dc733671612ddc0d37c6066052f9781960" +dependencies = [ + "libm", +] + +[[package]] +name = "geojson" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e26f3c45b36fccc9cf2805e61d4da6bc4bbd5a3a9589b01afa3a40eff703bd79" +dependencies = [ + "log", + "serde", + "serde_json", + "thiserror 2.0.12", +] + +[[package]] +name = "geozero" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5f28f34864745eb2f123c990c6ffd92c1584bd39439b3f27ff2a0f4ea5b309b" +dependencies = [ + "geo-types", + "geojson", + "log", + "scroll", + "serde_json", + "thiserror 1.0.69", + "wkt 0.11.1", +] + [[package]] name = "getrandom" version = "0.2.16" @@ -2526,7 +2748,7 @@ dependencies = [ "futures-core", "futures-sink", "http", - "indexmap", + "indexmap 2.9.0", "slab", "tokio", "tokio-util", @@ -2544,6 +2766,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "hash32" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606" +dependencies = [ + "byteorder", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -2583,6 +2814,16 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "heapless" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bfb9eb618601c89945a70e254898da93b13be0388091d42117462b265bb3fad" +dependencies = [ + "hash32", + "stable_deref_trait", +] + [[package]] name = "heck" version = "0.4.1" @@ -2733,6 +2974,50 @@ dependencies = [ "tracing", ] +[[package]] +name = "i_float" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85df3a416829bb955fdc2416c7b73680c8dcea8d731f2c7aa23e1042fe1b8343" +dependencies = [ + "serde", +] + +[[package]] +name = "i_key_sort" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "347c253b4748a1a28baf94c9ce133b6b166f08573157e05afe718812bc599fcd" + +[[package]] +name = "i_overlay" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0542dfef184afdd42174a03dcc0625b6147fb73e1b974b1a08a2a42ac35cee49" +dependencies = [ + "i_float", + "i_key_sort", + "i_shape", + "i_tree", + "rayon", +] + +[[package]] +name = "i_shape" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a38f5a42678726718ff924f6d4a0e79b129776aeed298f71de4ceedbd091bce" +dependencies = [ + "i_float", + "serde", +] + +[[package]] +name = "i_tree" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "155181bc97d770181cf9477da51218a19ee92a8e5be642e796661aee2b601139" + [[package]] name = "iana-time-zone" version = "0.1.63" @@ -2745,7 +3030,7 @@ dependencies = [ "js-sys", "log", "wasm-bindgen", - "windows-core 0.61.0", + "windows-core 0.58.0", ] [[package]] @@ -2902,6 +3187,17 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", + "serde", +] + [[package]] name = "indexmap" version = "2.9.0" @@ -2910,6 +3206,7 @@ checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", "hashbrown 0.15.3", + "serde", ] [[package]] @@ -2983,6 +3280,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.13.0" @@ -3132,17 +3438,41 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "lexical-core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +dependencies = [ + "lexical-parse-float 0.8.5", + "lexical-parse-integer 0.8.6", + "lexical-util 0.8.5", + "lexical-write-float 0.8.5", + "lexical-write-integer 0.8.5", +] + [[package]] name = "lexical-core" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" dependencies = [ - "lexical-parse-float", - "lexical-parse-integer", - "lexical-util", - "lexical-write-float", - "lexical-write-integer", + "lexical-parse-float 1.0.5", + "lexical-parse-integer 1.0.5", + "lexical-util 1.0.6", + "lexical-write-float 1.0.5", + "lexical-write-integer 1.0.5", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +dependencies = [ + "lexical-parse-integer 0.8.6", + "lexical-util 0.8.5", + "static_assertions", ] [[package]] @@ -3151,8 +3481,18 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" dependencies = [ - "lexical-parse-integer", - "lexical-util", + "lexical-parse-integer 1.0.5", + "lexical-util 1.0.6", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +dependencies = [ + "lexical-util 0.8.5", "static_assertions", ] @@ -3162,7 +3502,16 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" dependencies = [ - "lexical-util", + "lexical-util 1.0.6", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +dependencies = [ "static_assertions", ] @@ -3175,14 +3524,35 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "lexical-write-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +dependencies = [ + "lexical-util 0.8.5", + "lexical-write-integer 0.8.5", + "static_assertions", +] + [[package]] name = "lexical-write-float" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" dependencies = [ - "lexical-util", - "lexical-write-integer", + "lexical-util 1.0.6", + "lexical-write-integer 1.0.5", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +dependencies = [ + "lexical-util 0.8.5", "static_assertions", ] @@ -3192,7 +3562,7 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" dependencies = [ - "lexical-util", + "lexical-util 1.0.6", "static_assertions", ] @@ -3964,7 +4334,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ "fixedbitset", - "indexmap", + "indexmap 2.9.0", ] [[package]] @@ -3973,6 +4343,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" dependencies = [ + "phf_macros", "phf_shared", ] @@ -3996,6 +4367,19 @@ dependencies = [ "rand 0.8.5", ] +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn 2.0.101", +] + [[package]] name = "phf_shared" version = "0.11.3" @@ -4760,6 +5144,23 @@ dependencies = [ "byteorder", ] +[[package]] +name = "robust" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbf4a6aa5f6d6888f39e980649f3ad6b666acdce1d78e95b8a2cb076e687ae30" + +[[package]] +name = "rstar" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "421400d13ccfd26dfa5858199c30a5d76f9c54e0dba7575273025b43c5175dbb" +dependencies = [ + "heapless", + "num-traits", + "smallvec", +] + [[package]] name = "rstest" version = "0.25.0" @@ -4961,6 +5362,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "scroll" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04c565b551bafbef4157586fa379538366e4385d42082f255bfd96e4fe8519da" + [[package]] name = "seahash" version = "4.1.0" @@ -5087,6 +5494,46 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_with" +version = "3.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6b6f7f2fcb69f747921f79f3926bd1e203fce4fef62c268dd3abfb6d86029aa" +dependencies = [ + "base64", + "chrono", + "hex", + "indexmap 1.9.3", + "indexmap 2.9.0", + "serde", + "serde_derive", + "serde_json", + "serde_with_macros", + "time", +] + +[[package]] +name = "serde_with_macros" +version = "3.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d00caa5193a3c8362ac2b73be6b9e768aa5a4b2f721d8f4b339600c3cb51f8e" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.101", +] + +[[package]] +name = "shapefile" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79d5472e932503059d02779ad2c1b96258980940c6923e49f427fbe80eb3053c" +dependencies = [ + "byteorder", + "dbase", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -5221,6 +5668,18 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "spade" +version = "2.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ece03ff43cd2a9b57ebf776ea5e78bd30b3b4185a619f041079f4109f385034" +dependencies = [ + "hashbrown 0.15.3", + "num-traits", + "robust", + "smallvec", +] + [[package]] name = "spin" version = "0.9.8" @@ -5700,7 +6159,7 @@ version = "0.19.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" dependencies = [ - "indexmap", + "indexmap 2.9.0", "toml_datetime", "winnow 0.5.40", ] @@ -5711,7 +6170,7 @@ version = "0.22.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "310068873db2c5b3e7659d2cc35d21855dbafa50d1ce336397c666e3cb08137e" dependencies = [ - "indexmap", + "indexmap 2.9.0", "toml_datetime", "winnow 0.7.9", ] @@ -6220,11 +6679,14 @@ name = "vortex-dtype" version = "0.32.0" dependencies = [ "arbitrary", + "arcref", "arrow-schema", "flatbuffers", "half", + "inventory", "itertools 0.14.0", "jiff", + "log", "num-traits", "num_enum 0.7.3", "prost", @@ -6431,6 +6893,32 @@ dependencies = [ "vortex-scalar", ] +[[package]] +name = "vortex-geo" +version = "0.32.0" +dependencies = [ + "anyhow", + "arcref", + "arrow-array", + "arrow-buffer", + "arrow-ipc", + "arrow-schema", + "clap", + "geoarrow", + "geoarrow-geoparquet", + "geoarrow-schema", + "serde_json", + "tokio", + "vortex-array", + "vortex-btrblocks", + "vortex-dtype", + "vortex-error", + "vortex-expr", + "vortex-file", + "vortex-layout", + "vortex-scalar", +] + [[package]] name = "vortex-io" version = "0.32.0" @@ -6621,6 +7109,7 @@ dependencies = [ "taffy", "tokio", "vortex", + "vortex-geo", "vortex-layout", ] @@ -6866,19 +7355,6 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] -name = "windows-core" -version = "0.61.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4763c1de310c86d75a878046489e2e5ba02c649d185f21c67d4cf8a56d098980" -dependencies = [ - "windows-implement 0.60.0", - "windows-interface 0.59.1", - "windows-link", - "windows-result 0.3.2", - "windows-strings 0.4.0", -] - [[package]] name = "windows-implement" version = "0.57.0" @@ -6901,17 +7377,6 @@ dependencies = [ "syn 2.0.101", ] -[[package]] -name = "windows-implement" -version = "0.60.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.101", -] - [[package]] name = "windows-interface" version = "0.57.0" @@ -6934,17 +7399,6 @@ dependencies = [ "syn 2.0.101", ] -[[package]] -name = "windows-interface" -version = "0.59.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.101", -] - [[package]] name = "windows-link" version = "0.1.1" @@ -7008,15 +7462,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-strings" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2ba9642430ee452d5a7aa78d72907ebe8cfda358e8cb7918a2050581322f97" -dependencies = [ - "windows-link", -] - [[package]] name = "windows-sys" version = "0.45.0" @@ -7269,6 +7714,54 @@ dependencies = [ "serde-value", ] +[[package]] +name = "wkb" +version = "0.8.0" +source = "git+https://github.com/georust/wkb?rev=5a2027995997017bcd531e6be7e5cf126db1d4c1#5a2027995997017bcd531e6be7e5cf126db1d4c1" +dependencies = [ + "byteorder", + "geo-traits", + "num_enum 0.7.3", + "thiserror 1.0.69", +] + +[[package]] +name = "wkt" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54f7f1ff4ea4c18936d6cd26a6fd24f0003af37e951a8e0e8b9e9a2d0bd0a46d" +dependencies = [ + "geo-types", + "log", + "num-traits", + "thiserror 1.0.69", +] + +[[package]] +name = "wkt" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1c591649bd1c9d4e28459758bbb5fb5c0edc7a67060b52422f4761c94ffe961" +dependencies = [ + "geo-traits", + "geo-types", + "log", + "num-traits", + "thiserror 1.0.69", +] + +[[package]] +name = "wkt" +version = "0.12.0" +source = "git+https://github.com/georust/wkt?rev=270ffe0eaf5ba5255c364dbade39c451562a9e9b#270ffe0eaf5ba5255c364dbade39c451562a9e9b" +dependencies = [ + "geo-traits", + "geo-types", + "log", + "num-traits", + "thiserror 1.0.69", +] + [[package]] name = "worker" version = "0.5.0" diff --git a/Cargo.toml b/Cargo.toml index 4191943589..8a5f7240fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ members = [ "bench-vortex", "encodings/*", + "extensions/*", "fuzz", "pyvortex", "vortex", @@ -61,6 +62,7 @@ arrow-arith = "55" arrow-array = "55" arrow-buffer = "55" arrow-cast = "55" +arrow-ipc = "55" arrow-ord = "55" arrow-schema = "55" arrow-select = "55" @@ -72,7 +74,7 @@ bit-vec = "0.8.0" bytes = "1.10" bzip2 = "0.5.0" cfg-if = "1" -chrono = "0.4.40" +chrono = "0.4.41" clap = "4.5" compio = { version = "0.14", features = ["io-uring"], default-features = false } crossterm = "0.28" @@ -188,6 +190,7 @@ vortex-fastlanes = { version = "0.32.0", path = "./encodings/fastlanes", default vortex-file = { version = "0.32.0", path = "./vortex-file", default-features = false } vortex-flatbuffers = { version = "0.32.0", path = "./vortex-flatbuffers", default-features = false } vortex-fsst = { version = "0.32.0", path = "./encodings/fsst", default-features = false } +vortex-geo = { version = "0.32.0", path = "./extensions/geo" } vortex-io = { version = "0.32.0", path = "./vortex-io", default-features = false } vortex-ipc = { version = "0.32.0", path = "./vortex-ipc", default-features = false } vortex-layout = { version = "0.32.0", path = "./vortex-layout", default-features = false } diff --git a/extensions/geo/Cargo.toml b/extensions/geo/Cargo.toml new file mode 100644 index 0000000000..1dfb7cbab2 --- /dev/null +++ b/extensions/geo/Cargo.toml @@ -0,0 +1,50 @@ +[package] +name = "vortex-geo" +description = "Geospatial extensions for Vortex" +authors.workspace = true +categories.workspace = true +edition.workspace = true +homepage.workspace = true +include.workspace = true +keywords.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true + +[dependencies] +arcref = { workspace = true } +arrow-array = { workspace = true } +arrow-buffer = { workspace = true } +arrow-schema = { workspace = true } +geoarrow = { git = "https://github.com/geoarrow/geoarrow-rs.git", rev = "c241034f" } +geoarrow-schema = { git = "https://github.com/geoarrow/geoarrow-rs.git", rev = "c241034f" } +serde_json = { workspace = true } +vortex-array = { workspace = true } +vortex-dtype = { workspace = true } +vortex-error = { workspace = true } +vortex-expr = { workspace = true, optional = true } +vortex-layout = { workspace = true, optional = true } +clap = { version = "4.5.37", features = ["derive"] } + +[dev-dependencies] +anyhow = { workspace = true } +arrow-ipc = { workspace = true } +clap = { workspace = true, features = ["derive"] } +tokio = { workspace = true, features = ["full"] } +vortex-btrblocks = { workspace = true } +vortex-dtype = { workspace = true, features = ["arrow"] } +vortex-file = { workspace = true, features = ["tokio"] } +vortex-scalar = { workspace = true } +geoarrow-geoparquet = { git = "https://github.com/geoarrow/geoarrow-rs.git", rev = "c241034f", features = [ + "compression", +] } + + +[features] +# layouts request both vortex-layout and vortex-expr for ExprEvaluator impls +layouts = ["dep:vortex-layout", "dep:vortex-expr"] + +[lints] +workspace = true diff --git a/extensions/geo/README.md b/extensions/geo/README.md new file mode 100644 index 0000000000..04bc4daa12 --- /dev/null +++ b/extensions/geo/README.md @@ -0,0 +1,9 @@ +# GeoVortex + +This crate provides geospatial extension types on top of Vortex. The current set of supported types include + +* Point for 2D or 3D point geometry +* LineString for 2D or 3D linestring geometry +* Polygon for 2D or 3D polygons +* WKB for opaque well-known binary encoded geometry + diff --git a/extensions/geo/examples/load_geoparquet.rs b/extensions/geo/examples/load_geoparquet.rs new file mode 100644 index 0000000000..fb4b57f64d --- /dev/null +++ b/extensions/geo/examples/load_geoparquet.rs @@ -0,0 +1,40 @@ +extern crate vortex_dtype; +extern crate vortex_geo; + +use std::fs::File; + +use vortex_array::variants::StructArrayTrait; +use vortex_array::{ArrayRef, ToCanonical, TryIntoArray}; +use vortex_btrblocks::BtrBlocksCompressor; +use vortex_geo::POLYGON_ID; + +#[allow(clippy::unwrap_used, clippy::expect_used)] +pub fn main() { + vortex_geo::arrow::register_extension_types(); + + let file = File::open("/Volumes/Code/Data/afghan.parquet").unwrap(); + let mut geo = geoarrow_geoparquet::GeoParquetRecordBatchReaderBuilder::try_new(file) + .unwrap() + .build() + .unwrap(); + + let arrow_batch = geo.next().unwrap().unwrap(); + + let vortex_batch: ArrayRef = arrow_batch + .try_into_array() + .expect("convert to vortex array"); + let geometry = vortex_batch + .to_struct() + .unwrap() + .maybe_null_field_by_name("geometry") + .unwrap(); + + assert_eq!(geometry.dtype().as_extension().unwrap().id(), &*POLYGON_ID); + + println!("uncompressed:\n{}", geometry.tree_display()); + + // Run through the compressor to see how it performs. It should just compress the individual storage + // arrays. + let compressed = BtrBlocksCompressor.compress(geometry.as_ref()).unwrap(); + println!("compressed:\n{}", compressed.tree_display()); +} diff --git a/extensions/geo/examples/ogc_server.rs b/extensions/geo/examples/ogc_server.rs new file mode 100644 index 0000000000..deca19615a --- /dev/null +++ b/extensions/geo/examples/ogc_server.rs @@ -0,0 +1,59 @@ +use std::fs::File; +use std::path::PathBuf; + +use anyhow::Context; +use arrow_array::RecordBatch; +use arrow_array::cast::AsArray; +use clap::Parser; +use vortex_array::arrow::IntoArrowArray; +use vortex_file::VortexOpenOptions; + +#[derive(Parser, Debug)] +pub struct Args { + /// Path to a Vortex file with geometry data + input: PathBuf, +} + +#[allow(clippy::use_debug)] +#[tokio::main] +pub async fn main() -> anyhow::Result<()> { + vortex_geo::arrow::register_extension_types(); + + let args = Args::parse(); + let reader = VortexOpenOptions::file() + .open(&args.input) + .await + .context("open input file")?; + + let arrow_schema = reader + .dtype() + .to_arrow_schema() + .context("get arrow schema")?; + + let out = File::create("ipc.arrow").context("open IPC file")?; + let mut writer = arrow_ipc::writer::StreamWriter::try_new(out, &arrow_schema)?; + + println!("arrow schema: {:?}", arrow_schema); + + // Stream batches to read the file as-is. + let batches = reader.scan().context("scan builder")?.into_array_iter()?; + + let mut written = 0; + for batch in batches { + let record_batch = batch?.into_arrow_preferred()?; + let batch = RecordBatch::from( + record_batch + .as_struct_opt() + .ok_or_else(|| anyhow::anyhow!("expected struct"))? + .clone(), + ); + writer.write(&batch)?; + written += batch.num_rows(); + } + + writer.finish()?; + + eprintln!("wrote {written} rows to ipc.arrow"); + + Ok(()) +} diff --git a/extensions/geo/src/array.rs b/extensions/geo/src/array.rs new file mode 100644 index 0000000000..ab973634fc --- /dev/null +++ b/extensions/geo/src/array.rs @@ -0,0 +1,161 @@ +//! `ExtensionArray` wrapper for arrays that hold geospatial data types. + +use std::sync::Arc; + +use vortex_array::ArrayRef; +use vortex_array::arrays::ExtensionArray; +use vortex_array::variants::ExtensionArrayTrait; +use vortex_dtype::{DType, PType, StructDType}; +use vortex_error::{VortexError, VortexResult, vortex_assert, vortex_bail}; + +use crate::{Dimension, GeoMetadata, GeometryType, OwnedGeoMetadata, OwnedGeometryType}; + +/// Holder for what is known to be one of the blessed extension array types. +pub enum GeometryArray<'a> { + Point(&'a ExtensionArray, GeoMetadata<'a>), + LineString(&'a ExtensionArray, GeoMetadata<'a>), + Polygon(&'a ExtensionArray, GeoMetadata<'a>), + #[allow(clippy::upper_case_acronyms)] + WKB(&'a ExtensionArray, GeoMetadata<'a>), +} + +impl<'a> TryFrom<&'a ExtensionArray> for GeometryArray<'a> { + type Error = VortexError; + + fn try_from(value: &'a ExtensionArray) -> VortexResult { + let geometry_type = GeometryType::try_from(value.ext_dtype().as_ref())?; + Ok(match geometry_type { + GeometryType::Point(meta) => Self::Point(value, meta), + GeometryType::Polygon(meta) => Self::Polygon(value, meta), + GeometryType::WKB(meta) => Self::WKB(value, meta), + GeometryType::LineString(meta) => Self::LineString(value, meta), + }) + } +} + +#[derive(Debug, Clone)] +pub struct PointArray { + inner: ExtensionArray, + metadata: OwnedGeoMetadata, +} + +impl PointArray { + /// Wrap an existing array as a `geovortex.point` extension array. + /// + /// ## Error checking + /// + /// The provided `points` storage array must be a struct-typed array with f64 columns named based + /// on their dimensions. + pub fn try_new(points: ArrayRef, metadata: OwnedGeoMetadata) -> VortexResult { + let DType::Struct(schema, _) = points.dtype() else { + vortex_bail!("points must be Struct typed, was {}", points.dtype()) + }; + + validate_coord_schema(schema, metadata.dimension)?; + let point_type = + OwnedGeometryType::Point(metadata.clone()).into_ext_dtype(points.dtype().nullability()); + let inner = ExtensionArray::new(Arc::new(point_type), points); + Ok(Self { inner, metadata }) + } + + /// Deconstruct the `PointsArray` wrapper into the storage array and the parsed extension metadata. + pub fn into_parts(self) -> (ExtensionArray, OwnedGeoMetadata) { + (self.inner, self.metadata) + } +} + +pub fn validate_coord_schema(schema: &StructDType, dimensions: Dimension) -> VortexResult<()> { + match dimensions { + Dimension::XY => { + vortex_assert!(schema.nfields() == 2); + vortex_assert!(schema.field_name(0)?.as_ref().eq("x")); + vortex_assert!(schema.field_name(1)?.as_ref().eq("y")); + schema + .fields() + .all(|field| field.eq_ignore_nullability(PType::F64.into())); + } + Dimension::XYZ => { + vortex_assert!(schema.nfields() == 3); + vortex_assert!(schema.field_name(0)?.as_ref().eq("x")); + vortex_assert!(schema.field_name(1)?.as_ref().eq("y")); + vortex_assert!(schema.field_name(2)?.as_ref().eq("z")); + schema + .fields() + .all(|field| field.eq_ignore_nullability(PType::F64.into())); + } + Dimension::XYM => { + vortex_assert!(schema.nfields() == 3); + vortex_assert!(schema.field_name(0)?.as_ref().eq("x")); + vortex_assert!(schema.field_name(1)?.as_ref().eq("y")); + vortex_assert!(schema.field_name(2)?.as_ref().eq("m")); + schema + .fields() + .all(|field| field.eq_ignore_nullability(PType::F64.into())); + } + Dimension::XYZM => { + vortex_assert!(schema.nfields() == 3); + vortex_assert!(schema.field_name(0)?.as_ref().eq("x")); + vortex_assert!(schema.field_name(1)?.as_ref().eq("y")); + vortex_assert!(schema.field_name(2)?.as_ref().eq("z")); + vortex_assert!(schema.field_name(3)?.as_ref().eq("m")); + schema + .fields() + .all(|field| field.eq_ignore_nullability(PType::F64.into())); + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use vortex_array::Array; + use vortex_array::arrays::{PrimitiveArray, StructArray}; + + use super::PointArray; + use crate::{OwnedGeoMetadata, POINT_ID}; + + #[test] + fn test_points() { + let values = StructArray::from_fields(&[ + ( + "x", + PrimitiveArray::from_iter([0f64, 0f64, 0f64]).into_array(), + ), + ( + "y", + PrimitiveArray::from_iter([1f64, 2f64, 3f64]).into_array(), + ), + ( + "z", + PrimitiveArray::from_iter([4f64, 5f64, 6f64]).into_array(), + ), + ]) + .unwrap() + .into_array(); + + let points_array = PointArray::try_new( + values, + OwnedGeoMetadata { + crs: None, + dimension: crate::Dimension::XYZ, + }, + ) + .unwrap(); + + let (ext, meta) = points_array.into_parts(); + assert_eq!( + meta, + OwnedGeoMetadata { + crs: None, + dimension: crate::Dimension::XYZ, + } + ); + assert_eq!(ext.id(), &*POINT_ID); + } + + #[test] + fn test_polygon() { + // Create two lists of points: one defining an interior and exterior. + } +} diff --git a/extensions/geo/src/arrow.rs b/extensions/geo/src/arrow.rs new file mode 100644 index 0000000000..088b376894 --- /dev/null +++ b/extensions/geo/src/arrow.rs @@ -0,0 +1,441 @@ +//! Conversions between GeoVortex and GeoArrow arrays and extension types, using +//! the `geoarrow` crate. + +use std::sync::Arc; + +use arcref::ArcRef; +use arrow_buffer::{OffsetBuffer, ScalarBuffer}; +use arrow_schema::extension::ExtensionType; +use arrow_schema::{DataType, Field}; +use geoarrow::ArrayBase; +use geoarrow::array::{ + CoordBuffer, LineStringArray, PointArray, PolygonArray, SeparatedCoordBuffer, +}; +use geoarrow_schema::{Crs, LineStringType, PointType, PolygonType, WkbType}; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrow::ArrowArray; +use vortex_array::arrow::compute::{ToArrowArgs, ToArrowKernelRef}; +use vortex_array::compute::{InvocationArgs, Kernel, Output, cast}; +use vortex_array::{Array, ToCanonical, register_kernel}; +use vortex_dtype::arrow::{ArrowTypeConversion, ArrowTypeConversionRef}; +use vortex_dtype::{DType, ExtDType, PType}; +use vortex_error::{VortexResult, vortex_bail}; + +use crate::array::GeometryArray; +use crate::{Dimension, GeoMetadata, GeometryType, OwnedGeoMetadata, OwnedGeometryType}; + +impl From for geoarrow_schema::Dimension { + fn from(value: Dimension) -> Self { + match value { + Dimension::XY => geoarrow_schema::Dimension::XY, + Dimension::XYZ => geoarrow_schema::Dimension::XYZ, + Dimension::XYM => geoarrow_schema::Dimension::XYM, + Dimension::XYZM => geoarrow_schema::Dimension::XYZM, + } + } +} + +/// Kernel to convert into GeoArrow memory format. +#[derive(Debug)] +pub struct ToGeoArrow; + +/// Extension type registration. +pub static REGISTER_EXTENSION_TYPES: fn() = || { + DType::register_extension_type(ArrowTypeConversionRef(ArcRef::new_ref(&GeoArrowConversion))); +}; + +impl Kernel for ToGeoArrow { + fn invoke(&self, args: &InvocationArgs) -> VortexResult> { + let ToArrowArgs { array, .. } = ToArrowArgs::try_from(args)?; + match array.as_any().downcast_ref::() { + None => Ok(None), + Some(ext_array) => { + if let Ok(geometry_array) = GeometryArray::try_from(ext_array) { + // based on the particular geometry, encode into GeoArrow type. + let array = match geometry_array { + GeometryArray::Point(point_array, metadata) => { + to_arrow_point(point_array, &metadata)? + } + GeometryArray::LineString(line_string, metadata) => { + to_arrow_linestring(line_string, &metadata)? + } + GeometryArray::Polygon(polygon, metadata) => { + to_arrow_polygon(polygon, &metadata)? + } + GeometryArray::WKB(..) => todo!(), + }; + + Ok(Some(Output::Array( + ArrowArray::new(array, ext_array.dtype().nullability()).into_array(), + ))) + } else { + Ok(None) + } + } + } + } +} +register_kernel!(ToArrowKernelRef(ArcRef::new_ref(&ToGeoArrow))); + +fn to_arrow_point( + point_array: &ExtensionArray, + metadata: &GeoMetadata, +) -> VortexResult { + let coords = coordinate_buffer(point_array.storage(), metadata)?; + let nulls = point_array.validity_mask()?.to_null_buffer(); + let crs = match metadata.crs { + None => Crs::default(), + Some(wkt) => Crs::from_wkt2_2019(wkt.to_string()), + }; + + Ok(PointArray::new( + coords, + nulls, + Arc::new(geoarrow_schema::Metadata::new(crs, None)), + ) + .into_array_ref()) +} + +fn to_arrow_linestring( + line_string: &ExtensionArray, + metadata: &GeoMetadata, +) -> VortexResult { + let list = line_string.storage().to_list()?; + let coords = coordinate_buffer(list.elements().as_ref(), metadata)?; + let offsets = cast(&list.offsets().to_primitive()?, &PType::I32.into())? + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + let offsets = OffsetBuffer::new(offsets); + let nulls = list.validity_mask()?.to_null_buffer(); + + let crs = match metadata.crs { + None => Crs::default(), + Some(wkt) => Crs::from_wkt2_2019(wkt.to_string()), + }; + + Ok(LineStringArray::new( + coords, + offsets, + nulls, + Arc::new(geoarrow_schema::Metadata::new(crs, None)), + ) + .into_array_ref()) +} + +fn to_arrow_polygon( + polygon_array: &ExtensionArray, + metadata: &GeoMetadata, +) -> VortexResult { + // Polygon is list of list of points. + let rings = polygon_array.storage().to_list()?; + let points = rings.elements().to_list()?; + let coords = coordinate_buffer(points.elements().as_ref(), metadata)?; + let ring_offsets = cast(&points.offsets().to_primitive()?, &PType::I32.into())? + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + let geom_offsets = cast(&rings.offsets().to_primitive()?, &PType::I32.into())? + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + + let crs = match metadata.crs { + None => Crs::default(), + Some(wkt) => Crs::from_wkt2_2019(wkt.to_string()), + }; + Ok(PolygonArray::new( + coords, + OffsetBuffer::new(geom_offsets), + OffsetBuffer::new(ring_offsets), + None, + Arc::new(geoarrow_schema::Metadata::new(crs, None)), + ) + .into_array_ref()) +} + +/// ZST that holds the VTable for building a `DType` from an Arrow [`Field`] type. +#[derive(Debug)] +pub struct GeoArrowConversion; + +// Conversion between Vortex geospatial extension types and GeoArrow extension types. +impl ArrowTypeConversion for GeoArrowConversion { + fn to_vortex(&self, field: &Field) -> VortexResult> { + // Validate that the field is one of the supported geospatial + // extension types. + let Some(ext_type) = field.extension_type_name() else { + return Ok(None); + }; + + if ext_type != PointType::NAME + && ext_type != LineStringType::NAME + && ext_type != PolygonType::NAME + && ext_type != WkbType::NAME + { + return Ok(None); + } + + macro_rules! dim_from_fields { + ($fields:expr) => {{ + match $fields.len() { + 2 => { + // XY is only supported 2-item format + Dimension::XY + } + 3 => { + // Can be either XYZ or XYM + if $fields.iter().any(|x| x.name().to_lowercase() == "m") { + Dimension::XYM + } else { + Dimension::XYZ + } + } + 4 => Dimension::XYZM, + _ => vortex_bail!("Unsupported field layout for Point geometry: {:?}", $fields), + } + }}; + } + + let ext_dtype: ExtDType = match ext_type { + x if x == PointType::NAME => { + let geoarrow_meta = geoarrow_schema::Metadata::try_from(field)?; + // We only accept Separated format. + let DataType::Struct(fields) = field.data_type() else { + vortex_bail!("Only Separated format is supported for Point geometry") + }; + + OwnedGeometryType::Point(OwnedGeoMetadata { + dimension: dim_from_fields!(fields), + crs: geoarrow_meta.crs().crs_value().map(|v| v.to_string()), + }) + .into_ext_dtype(field.is_nullable().into()) + } + x if x == LineStringType::NAME => { + let geoarrow_meta = geoarrow_schema::Metadata::try_from(field)?; + let DataType::List(coordinates_field) = field.data_type() else { + vortex_bail!( + "LineString geometry must be List, was {:?}", + field.data_type() + ) + }; + let DataType::Struct(fields) = coordinates_field.data_type() else { + vortex_bail!( + "LineString geometry must be List, was {:?}", + field.data_type() + ) + }; + OwnedGeometryType::LineString(OwnedGeoMetadata { + dimension: dim_from_fields!(fields), + crs: geoarrow_meta.crs().crs_value().map(|v| v.to_string()), + }) + .into_ext_dtype(field.is_nullable().into()) + } + x if x == PolygonType::NAME => { + let geoarrow_meta = geoarrow_schema::Metadata::try_from(field)?; + let DataType::List(ring_field) = field.data_type() else { + vortex_bail!( + "LineString geometry must be List, was {:?}", + field.data_type() + ) + }; + let DataType::List(coordinates_field) = ring_field.data_type() else { + vortex_bail!( + "Polygon geometry must be List>, was {:?}", + field.data_type() + ) + }; + let DataType::Struct(fields) = coordinates_field.data_type() else { + vortex_bail!( + "LineString geometry must be List, was {:?}", + field.data_type() + ) + }; + OwnedGeometryType::Polygon(OwnedGeoMetadata { + dimension: dim_from_fields!(fields), + crs: geoarrow_meta.crs().crs_value().map(|v| v.to_string()), + }) + .into_ext_dtype(field.is_nullable().into()) + } + x if x == WkbType::NAME => { + let geoarrow_meta = geoarrow_schema::Metadata::try_from(field)?; + if field.data_type() != &DataType::Binary { + vortex_bail!( + "WKB geometry Arrow type must be Binary, was {:?}", + field.data_type() + ) + } + OwnedGeometryType::WKB(OwnedGeoMetadata { + dimension: Dimension::default(), + crs: geoarrow_meta.crs().crs_value().map(|v| v.to_string()), + }) + .into_ext_dtype(field.is_nullable().into()) + } + _ => vortex_bail!("extension type {} not supported", ext_type), + }; + + Ok(Some(DType::Extension(Arc::new(ext_dtype)))) + } + + #[allow(clippy::disallowed_types)] + fn to_arrow(&self, vortex_extension_type: &ExtDType) -> VortexResult> { + if let Ok(geometry) = GeometryType::try_from(vortex_extension_type) { + Ok(Some(geometry.to_owned().into_arrow_field( + vortex_extension_type.storage_dtype().nullability(), + ))) + } else { + Ok(None) + } + } +} + +pub fn register_extension_types() { + DType::register_extension_type(ArrowTypeConversionRef(ArcRef::new_ref(&GeoArrowConversion))); +} + +/// Unpack the geoarrow CoordBuffer. Errors if the dimensions specified in the metadata do not +/// match the actual encoding. +fn coordinate_buffer(array: &dyn Array, meta: &GeoMetadata) -> VortexResult { + let children = array.children(); + match (children.len(), meta.dimension) { + (2, Dimension::XY) => { + let xs = children[0] + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + let ys = children[1] + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + Ok(CoordBuffer::Separated(SeparatedCoordBuffer::new( + [ + xs, + ys, + ScalarBuffer::from(Vec::::new()), + ScalarBuffer::from(Vec::::new()), + ], + geoarrow_schema::Dimension::XY, + ))) + } + (3, Dimension::XYZ) => { + let xs = children[0] + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + let ys = children[1] + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + let zs = children[2] + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + Ok(CoordBuffer::Separated(SeparatedCoordBuffer::new( + [xs, ys, zs, ScalarBuffer::from(Vec::::new())], + geoarrow_schema::Dimension::XYZ, + ))) + } + (3, Dimension::XYM) => { + let xs = children[0] + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + let ys = children[1] + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + let ms = children[1] + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + Ok(CoordBuffer::Separated(SeparatedCoordBuffer::new( + [xs, ys, ms, ScalarBuffer::from(Vec::::new())], + geoarrow_schema::Dimension::XYM, + ))) + } + (4, Dimension::XYZM) => { + let xs = children[0] + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + let ys = children[1] + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + let zs = children[2] + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + let ms = children[3] + .to_primitive()? + .into_buffer::() + .into_arrow_scalar_buffer(); + Ok(CoordBuffer::Separated(SeparatedCoordBuffer::new( + [xs, ys, zs, ms], + geoarrow_schema::Dimension::XYM, + ))) + } + _ => { + vortex_bail!( + "child count {} invalid for expected Dimension {:?}", + children.len(), + meta.dimension + ) + } + } +} + +#[cfg(test)] +mod tests { + //! Test that round trip through GeoArrow works as expected. + use std::sync::Arc; + + use arrow_array::{ArrayRef, StructArray}; + use arrow_schema::Fields; + use geoarrow::ArrayBase; + use geoarrow::array::PointBuilder; + use geoarrow_schema::{CoordType, Dimension}; + use vortex_array::arrow::{FromArrowArray, IntoArrowArray}; + use vortex_dtype::arrow::FromArrowType; + use vortex_dtype::{DType, ExtDType}; + + use crate::{OwnedGeoMetadata, OwnedGeometryType}; + + #[test] + fn test_geo_simple() { + // Make a square + let mut points = + PointBuilder::new_with_options(Dimension::XY, CoordType::Separated, Default::default()); + points.push_coord(Some(&(0.0f64, 0.0f64))); + let points = points.finish(); + let field_type = points.extension_field(); + let dtype = DType::from_arrow(field_type.as_ref()); + + let owned_type: ExtDType = OwnedGeometryType::Point(OwnedGeoMetadata { + dimension: crate::Dimension::XY, + crs: None, + }) + .into_ext_dtype(true.into()); + assert_eq!(dtype, DType::Extension(Arc::new(owned_type))); + + // round trip back to Arrow type. + assert_eq!(&dtype.to_arrow().unwrap(), field_type.data_type()); + } + + #[test] + fn test_arrow_extension_type() { + let mut points = + PointBuilder::new_with_options(Dimension::XY, CoordType::Separated, Default::default()); + points.push_coord(Some(&(0.0f64, 0.0f64))); + let points = points.finish(); + + let struct_array: ArrayRef = Arc::new(StructArray::new( + Fields::from(vec![points.extension_field()]), + vec![points.to_array_ref()], + None, + )); + + let imported = vortex_array::ArrayRef::from_arrow(struct_array.clone(), false); + let exported = imported.into_arrow_preferred().unwrap(); + assert_eq!(exported.data_type(), struct_array.data_type()); + } +} diff --git a/extensions/geo/src/layout/mod.rs b/extensions/geo/src/layout/mod.rs new file mode 100644 index 0000000000..d2ac2bf8e5 --- /dev/null +++ b/extensions/geo/src/layout/mod.rs @@ -0,0 +1,43 @@ +use std::collections::BTreeSet; +use std::sync::Arc; + +use vortex_array::ArrayContext; +use vortex_dtype::FieldMask; +use vortex_error::VortexResult; +use vortex_layout::segments::SegmentSource; +use vortex_layout::{Layout, LayoutId, LayoutReader, LayoutVTable}; + +use crate::layout::reader::BBoxReader; + +pub mod reader; +pub mod writer; + +pub const ID: LayoutId = LayoutId::new_ref("geovortex.bbox"); + +#[derive(Debug)] +pub struct BBoxLayout; + +impl LayoutVTable for BBoxLayout { + fn id(&self) -> LayoutId { + ID + } + + fn reader( + &self, + layout: Layout, + segment_source: &Arc, + ctx: &ArrayContext, + ) -> VortexResult> { + Ok(Arc::new(BBoxReader::try_new(layout)?)) + } + + fn register_splits( + &self, + layout: &Layout, + field_mask: &[FieldMask], + row_offset: u64, + splits: &mut BTreeSet, + ) -> VortexResult<()> { + todo!() + } +} diff --git a/extensions/geo/src/layout/reader.rs b/extensions/geo/src/layout/reader.rs new file mode 100644 index 0000000000..d8e9e58f37 --- /dev/null +++ b/extensions/geo/src/layout/reader.rs @@ -0,0 +1,71 @@ +use std::ops::Range; +use std::sync::Arc; + +use vortex_dtype::DType; +use vortex_error::{VortexResult, vortex_assert}; +use vortex_layout::{ + ArrayEvaluation, ExprEvaluator, Layout, LayoutReader, MaskEvaluation, NoOpPruningEvaluation, + PruningEvaluation, +}; + +/// Layout Reader +pub struct BBoxReader { + /// The layout definition provided from file or from client. + layout: Layout, +} + +impl BBoxReader { + pub fn try_new(layout: Layout) -> VortexResult { + vortex_assert!(layout.id() == crate::layout::ID, "Invalid layout ID"); + + Ok(Self { layout }) + } +} + +impl ExprEvaluator for BBoxReader { + fn pruning_evaluation( + &self, + row_range: &Range, + expr: &vortex_expr::ExprRef, + ) -> VortexResult> { + // TODO(aduffy): any expressions that touch the geometry. + // Could be function application, could be something else. + // Can we implement our own expressions externally that plugin? + Ok(Box::new(NoOpPruningEvaluation)) + } + + fn filter_evaluation( + &self, + row_range: &Range, + expr: &vortex_expr::ExprRef, + ) -> VortexResult> { + todo!() + } + + fn projection_evaluation( + &self, + row_range: &Range, + expr: &vortex_expr::ExprRef, + ) -> VortexResult> { + todo!() + } +} + +impl LayoutReader for BBoxReader { + fn layout(&self) -> &Layout { + todo!() + } + + fn row_count(&self) -> u64 { + todo!() + } + + fn dtype(&self) -> &DType { + todo!() + } + + fn children(&self) -> VortexResult>> { + // Children will be whichever children were inherited from the layouts. + todo!() + } +} diff --git a/extensions/geo/src/layout/writer.rs b/extensions/geo/src/layout/writer.rs new file mode 100644 index 0000000000..e69de29bb2 diff --git a/extensions/geo/src/layouts.rs b/extensions/geo/src/layouts.rs new file mode 100644 index 0000000000..367f902659 --- /dev/null +++ b/extensions/geo/src/layouts.rs @@ -0,0 +1,42 @@ +use std::collections::BTreeSet; +use std::sync::Arc; + +use vortex_array::ArrayContext; +use vortex_dtype::{DType, FieldMask}; +use vortex_error::{VortexResult, vortex_assert}; +use vortex_layout::segments::SegmentSource; +use vortex_layout::{ + Layout, LayoutId, LayoutReader, LayoutRegistry, LayoutVTable, LayoutVTableRef, +}; + +/// Special layout that stores a bounding box for every geometry in a +/// file to make filtering easier. + +impl LayoutVTable for BBoxLayout { + fn id(&self) -> LayoutId { + LayoutId::new_ref("geovortex.flat") + } + + fn reader( + &self, + layout: Layout, + segment_source: &Arc, + ctx: &ArrayContext, + ) -> VortexResult> { + vortex_assert!( + layout.vtable().id() == self.id(), + "Invalid layout ID for bbox {}", + layout.vtable().id() + ); + } + + fn register_splits( + &self, + layout: &Layout, + field_mask: &[FieldMask], + row_offset: u64, + splits: &mut BTreeSet, + ) -> VortexResult<()> { + todo!() + } +} diff --git a/extensions/geo/src/lib.rs b/extensions/geo/src/lib.rs new file mode 100644 index 0000000000..be02807e58 --- /dev/null +++ b/extensions/geo/src/lib.rs @@ -0,0 +1,30 @@ +use std::sync::LazyLock; + +use vortex_dtype::ExtID; + +mod array; +pub mod arrow; +mod types; + +#[cfg(feature = "layouts")] +pub mod layouts; + +pub use array::*; +pub use types::*; + +/// An extension type for arrays containing [Well-known Binary](https://libgeos.org/specifications/wkb/). +/// +/// This is a fallback and will generally be less performant than one of the native encodings. +/// +/// See [`POINT_ID`], [`POLYGON_ID`] for examples of native encodings. +pub static WKB_ID: LazyLock = LazyLock::new(|| ExtID::from("geovortex.wkb")); + +/// Point is an N-dimensional point. It is stored using one value per variant here. +/// The actual point is based on something like a Struct of the components. +pub static POINT_ID: LazyLock = LazyLock::new(|| ExtID::from("geovortex.point")); + +pub static LINESTRING_ID: LazyLock = LazyLock::new(|| ExtID::from("geovortex.linestring")); + +/// Polygon is represented as a list of "rings". The first ring defines the Points that make up the exterior +/// of the shape. Subsequent rings are interior "holes" in the shape. +pub static POLYGON_ID: LazyLock = LazyLock::new(|| ExtID::from("geovortex.polygon")); diff --git a/extensions/geo/src/types.rs b/extensions/geo/src/types.rs new file mode 100644 index 0000000000..12644bf639 --- /dev/null +++ b/extensions/geo/src/types.rs @@ -0,0 +1,274 @@ +use std::convert::Into; +use std::sync::Arc; + +use arrow_schema::Field; +use geoarrow_schema::{CoordType, Crs, LineStringType, PointType, PolygonType, WkbType}; +use vortex_dtype::{DType, ExtDType, ExtMetadata, Nullability, PType, StructDType}; +use vortex_error::{VortexError, VortexResult, vortex_bail, vortex_err}; + +use crate::{LINESTRING_ID, POINT_ID, POLYGON_ID, WKB_ID}; + +/// Dimensions in the coordinate buffers for data. +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, Hash)] +#[repr(u8)] +pub enum Dimension { + /// Two dimensional coordinates. This is the default if there is no metadata provided for + /// the geometry. + #[default] + XY = 1, + /// Three-dimensional geometry. Commonly the Z coordinate will be height above the ellipsoid, + /// or height above sea level, determined by the CRS. + XYZ = 2, + /// Two-dimensional with an additional non-spatial measure (e.g. time). + XYM = 3, + /// Three-dimensional with an additional non-spatial measure (e.g. time). + XYZM = 4, +} + +/// Zero-allocation container for geometry metadata +pub struct GeoMetadata<'a> { + pub dimension: Dimension, + pub crs: Option<&'a str>, +} + +/// Owned version of a [`GeoMetadata`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct OwnedGeoMetadata { + pub dimension: Dimension, + pub crs: Option, +} + +impl From for geoarrow_schema::Metadata { + fn from(value: OwnedGeoMetadata) -> Self { + let crs = value + .crs + .as_ref() + .map(|crs| match serde_json::from_str(crs) { + Ok(value) => Crs::from_projjson(value), + Err(_) => Crs::from_unknown_crs_type(crs.to_string()), + }) + .unwrap_or_default(); + + Self::new(crs, None) + } +} + +// TODO(aduffy): add more geometry types like MultiPolygon. +/// Zero-copy view of an `ExtDType` as one of the GeoVortex builtin geometry types. +/// +/// The [owned][ToOwned] version of this is an [`OwnedGeometryType`]. +pub enum GeometryType<'a> { + Point(GeoMetadata<'a>), + LineString(GeoMetadata<'a>), + Polygon(GeoMetadata<'a>), + WKB(GeoMetadata<'a>), +} + +impl GeometryType<'_> { + pub fn to_owned(&self) -> OwnedGeometryType { + macro_rules! owned_meta { + ($meta:expr) => { + OwnedGeoMetadata { + dimension: $meta.dimension, + crs: $meta.crs.map(String::from), + } + }; + } + + match self { + GeometryType::Point(meta) => OwnedGeometryType::Point(owned_meta!(meta)), + GeometryType::LineString(meta) => OwnedGeometryType::LineString(owned_meta!(meta)), + GeometryType::Polygon(meta) => OwnedGeometryType::Polygon(owned_meta!(meta)), + GeometryType::WKB(meta) => OwnedGeometryType::WKB(owned_meta!(meta)), + } + } +} + +/// An owned geometry type. +pub enum OwnedGeometryType { + Point(OwnedGeoMetadata), + LineString(OwnedGeoMetadata), + Polygon(OwnedGeoMetadata), + WKB(OwnedGeoMetadata), +} + +impl OwnedGeometryType { + /// Serialize the metadata the way it will be stored in [`ExtMetadata`]. + pub fn metadata(&self) -> Vec { + match self { + OwnedGeometryType::Point(OwnedGeoMetadata { dimension, crs }) + | OwnedGeometryType::LineString(OwnedGeoMetadata { dimension, crs }) + | OwnedGeometryType::Polygon(OwnedGeoMetadata { dimension, crs }) + | OwnedGeometryType::WKB(OwnedGeoMetadata { dimension, crs }) => { + let mut bytes = vec![*dimension as u8]; + if let Some(crs) = crs { + bytes.extend(crs.as_bytes()); + } + bytes + } + } + } +} + +impl OwnedGeometryType { + pub fn into_ext_dtype(self, nullability: Nullability) -> ExtDType { + let ext_meta = self.metadata(); + + match self { + OwnedGeometryType::Point(OwnedGeoMetadata { dimension, .. }) => ExtDType::new( + POINT_ID.clone(), + Arc::new(DType::Struct(Arc::new(point_dtype(dimension)), nullability)), + Some(ExtMetadata::new(ext_meta.into())), + ), + OwnedGeometryType::LineString(OwnedGeoMetadata { dimension, .. }) => { + let storage_dtype = DType::List( + Arc::new(DType::Struct( + Arc::new(point_dtype(dimension)), + false.into(), + )), + nullability, + ); + ExtDType::new( + LINESTRING_ID.clone(), + Arc::new(storage_dtype), + Some(ExtMetadata::new(ext_meta.into())), + ) + } + OwnedGeometryType::Polygon(OwnedGeoMetadata { dimension, .. }) => { + let storage_dtype = DType::List( + Arc::new(DType::List( + Arc::new(DType::Struct( + Arc::new(point_dtype(dimension)), + false.into(), + )), + false.into(), + )), + nullability, + ); + ExtDType::new( + POLYGON_ID.clone(), + Arc::new(storage_dtype), + Some(ExtMetadata::new(ext_meta.into())), + ) + } + OwnedGeometryType::WKB(..) => ExtDType::new( + WKB_ID.clone(), + Arc::new(DType::Binary(nullability)), + Some(ExtMetadata::new(ext_meta.into())), + ), + } + } + + pub fn into_arrow_field(self, nullability: Nullability) -> Field { + match self { + OwnedGeometryType::Point(meta) => PointType::new( + CoordType::Separated, + meta.dimension.into(), + Arc::new(meta.into()), + ) + .to_field("point_type", nullability.into()), + OwnedGeometryType::LineString(meta) => LineStringType::new( + CoordType::Separated, + meta.dimension.into(), + Arc::new(meta.into()), + ) + .to_field("line_string_type", nullability.into()), + OwnedGeometryType::Polygon(meta) => PolygonType::new( + CoordType::Separated, + meta.dimension.into(), + Arc::new(meta.into()), + ) + .to_field("polygon_type", nullability.into()), + OwnedGeometryType::WKB(meta) => { + WkbType::new(Arc::new(meta.into())).to_field("wkb_type", nullability.into(), false) + } + } + } +} + +fn point_dtype(dimension: Dimension) -> StructDType { + match dimension { + Dimension::XY => StructDType::from_iter([ + ("x", DType::Primitive(PType::F64, Nullability::NonNullable)), + ("y", DType::Primitive(PType::F64, Nullability::NonNullable)), + ]), + Dimension::XYZ => StructDType::from_iter([ + ("x", DType::Primitive(PType::F64, Nullability::NonNullable)), + ("y", DType::Primitive(PType::F64, Nullability::NonNullable)), + ("z", DType::Primitive(PType::F64, Nullability::NonNullable)), + ]), + Dimension::XYM => StructDType::from_iter([ + ("x", DType::Primitive(PType::F64, Nullability::NonNullable)), + ("y", DType::Primitive(PType::F64, Nullability::NonNullable)), + ("m", DType::Primitive(PType::F64, Nullability::NonNullable)), + ]), + Dimension::XYZM => StructDType::from_iter([ + ("x", DType::Primitive(PType::F64, Nullability::NonNullable)), + ("y", DType::Primitive(PType::F64, Nullability::NonNullable)), + ("z", DType::Primitive(PType::F64, Nullability::NonNullable)), + ("m", DType::Primitive(PType::F64, Nullability::NonNullable)), + ]), + } +} + +impl<'a> TryFrom<&'a ExtMetadata> for GeoMetadata<'a> { + type Error = VortexError; + + fn try_from(metadata: &'a ExtMetadata) -> VortexResult { + let bytes = metadata.as_ref(); + if bytes.is_empty() { + vortex_bail!("If metadata is provided must not be empty"); + } else { + let dimension = match bytes[0] { + x if x == Dimension::XY as u8 => Dimension::XY, + x if x == Dimension::XYZ as u8 => Dimension::XYZ, + x if x == Dimension::XYM as u8 => Dimension::XYM, + x if x == Dimension::XYZM as u8 => Dimension::XYZM, + _ => vortex_bail!("Invalid dimension: {:?}", bytes), + }; + let crs = match bytes.len() { + 2.. => Some(validate_crs(&bytes[1..])?), + _ => None, + }; + + Ok(Self { dimension, crs }) + } + } +} + +// Validate and reinterpret-cast an `ExtDType` to a `GeometryType` in-place. +impl<'a> TryFrom<&'a ExtDType> for GeometryType<'a> { + type Error = VortexError; + + fn try_from(ext_dtype: &'a ExtDType) -> VortexResult { + // Metadata must be provided. + let Some(metadata) = ext_dtype.metadata() else { + vortex_bail!("Metadata must be provided for geometry types"); + }; + match ext_dtype.id().as_ref() { + x if x == POINT_ID.as_ref() => { + Ok(GeometryType::Point(GeoMetadata::try_from(metadata)?)) + } + x if x == LINESTRING_ID.as_ref() => { + Ok(GeometryType::LineString(GeoMetadata::try_from(metadata)?)) + } + x if x == POLYGON_ID.as_ref() => { + Ok(GeometryType::Polygon(GeoMetadata::try_from(metadata)?)) + } + x if x == WKB_ID.as_ref() => Ok(GeometryType::WKB(GeoMetadata::try_from(metadata)?)), + _ => Err(vortex_err!("Unsupported geometry type {}", ext_dtype.id())), + } + } +} + +fn validate_crs(bytes: &[u8]) -> VortexResult<&str> { + if bytes.is_empty() { + vortex_bail!("WKT CRS must not be empty"); + } + + match std::str::from_utf8(bytes) { + // TODO(aduffy): validate that the UTF-8 string is also valid WKT + Ok(s) => Ok(s), + Err(_) => vortex_bail!("Invalid CRS string: {:?}", bytes), + } +} diff --git a/pyvortex/src/arrays/mod.rs b/pyvortex/src/arrays/mod.rs index af0ff0df6c..ef3ac42545 100644 --- a/pyvortex/src/arrays/mod.rs +++ b/pyvortex/src/arrays/mod.rs @@ -220,7 +220,7 @@ impl PyArray { if let Some(chunked_array) = array.as_opt::() { // We figure out a single Arrow Data Type to convert all chunks into, otherwise // the preferred type of each chunk may be different. - let arrow_dtype = chunked_array.dtype().to_arrow_dtype()?; + let arrow_dtype = chunked_array.dtype().to_arrow()?; let chunks = chunked_array .chunks() diff --git a/pyvortex/src/dtype/mod.rs b/pyvortex/src/dtype/mod.rs index 2b7f7c73cd..51035dc0e9 100644 --- a/pyvortex/src/dtype/mod.rs +++ b/pyvortex/src/dtype/mod.rs @@ -137,7 +137,7 @@ impl PyDType { #[pymethods] impl PyDType { fn to_arrow_type(&self, py: Python) -> PyResult { - self.0.to_arrow_dtype()?.into_pyarrow(py) + self.0.to_arrow()?.into_pyarrow(py) } fn to_arrow_schema(&self, py: Python) -> PyResult { diff --git a/vortex-array/src/arrays/struct_/mod.rs b/vortex-array/src/arrays/struct_/mod.rs index b58f4984da..4bda632df7 100644 --- a/vortex-array/src/arrays/struct_/mod.rs +++ b/vortex-array/src/arrays/struct_/mod.rs @@ -103,7 +103,7 @@ impl StructArray { if &struct_dt != field.dtype() { vortex_bail!( - "Expected all struct fields to have dtype {}, found {}", + "Expected struct field to have type {}, found {}", struct_dt, field.dtype() ); diff --git a/vortex-array/src/arrow/compute/to_arrow/canonical.rs b/vortex-array/src/arrow/compute/to_arrow/canonical.rs index 5303959d0c..9c633f342c 100644 --- a/vortex-array/src/arrow/compute/to_arrow/canonical.rs +++ b/vortex-array/src/arrow/compute/to_arrow/canonical.rs @@ -47,9 +47,9 @@ impl Kernel for ToArrowCanonical { let arrow_type = arrow_type .cloned() .map(Ok) - .unwrap_or_else(|| array.dtype().to_arrow_dtype())?; + .unwrap_or_else(|| array.dtype().to_arrow())?; - let arrow_array = match (array.to_canonical()?, &arrow_type) { + let arrow_array = match (array.to_canonical()?, arrow_type) { (Canonical::Null(array), DataType::Null) => to_arrow_null(array), (Canonical::Bool(array), DataType::Boolean) => to_arrow_bool(array), (Canonical::Primitive(array), DataType::Int8) if matches!(array.ptype(), PType::I8) => { @@ -110,9 +110,9 @@ impl Kernel for ToArrowCanonical { (Canonical::Struct(array), DataType::Struct(fields)) => { to_arrow_struct(array, fields.as_ref()) } - (Canonical::List(array), DataType::List(field)) => to_arrow_list::(array, field), + (Canonical::List(array), DataType::List(field)) => to_arrow_list::(array, &field), (Canonical::List(array), DataType::LargeList(field)) => { - to_arrow_list::(array, field) + to_arrow_list::(array, &field) } (Canonical::VarBinView(array), DataType::BinaryView) if array.dtype().is_binary() => { to_arrow_varbinview::(array) @@ -141,14 +141,14 @@ impl Kernel for ToArrowCanonical { >(array)?) } (Canonical::Extension(_), _) => { - // Datetime and interval types are handled by a different kernel. + // Extension types must implement their own kernel for support return Ok(None); } - _ => vortex_bail!( + (_, arrow_type) => vortex_bail!( "Cannot convert canonical array {} with dtype {} to: {:?}", array.encoding(), array.dtype(), - &arrow_type + arrow_type ), }?; @@ -221,6 +221,7 @@ fn to_arrow_decimal256(array: DecimalArray) -> VortexResult { )) } +#[allow(clippy::use_debug)] fn to_arrow_struct(array: StructArray, fields: &[FieldRef]) -> VortexResult { let field_arrays = fields .iter() @@ -263,6 +264,7 @@ fn to_arrow_struct(array: StructArray, fields: &[FieldRef]) -> VortexResult(); @@ -352,7 +354,8 @@ mod tests { DecimalDType::new(19, 2), Validity::NonNullable, ); - let arrow = to_arrow(&decimal_vortex, &DataType::Decimal128(19, 2)).unwrap(); + let arrow_field = Field::new("_default", DataType::Decimal128(19, 2), false); + let arrow = to_arrow(&decimal_vortex, arrow_field.data_type()).unwrap(); assert_eq!(arrow.data_type(), &DataType::Decimal128(19, 2)); let decimal_array = arrow.as_any().downcast_ref::().unwrap(); assert_eq!( diff --git a/vortex-array/src/arrow/compute/to_arrow/mod.rs b/vortex-array/src/arrow/compute/to_arrow/mod.rs index cc9020bf50..f41dffeb67 100644 --- a/vortex-array/src/arrow/compute/to_arrow/mod.rs +++ b/vortex-array/src/arrow/compute/to_arrow/mod.rs @@ -118,6 +118,7 @@ impl ComputeFnVTable for ToArrow { let ToArrowArgs { array, arrow_type } = ToArrowArgs::try_from(args)?; Ok(arrow_type .map(|arrow_type| DType::from_arrow((arrow_type, array.dtype().nullability()))) + // .ok_or_else(|| vortex_err!("inconvertible DType")) .unwrap_or_else(|| array.dtype().clone())) } @@ -145,8 +146,8 @@ pub static TO_ARROW_FN: LazyLock = LazyLock::new(|| { }); pub struct ToArrowArgs<'a> { - array: &'a dyn Array, - arrow_type: Option<&'a DataType>, + pub array: &'a dyn Array, + pub arrow_type: Option<&'a DataType>, } impl<'a> TryFrom<&InvocationArgs<'a>> for ToArrowArgs<'a> { @@ -259,7 +260,7 @@ mod tests { ); assert_eq!( - &to_arrow(&array, &array.dtype().to_arrow_dtype().unwrap()).unwrap(), + &to_arrow(&array, &array.dtype().to_arrow().unwrap()).unwrap(), &arrow_array ); } diff --git a/vortex-array/src/arrow/compute/to_arrow/temporal.rs b/vortex-array/src/arrow/compute/to_arrow/temporal.rs index 626aa428b5..fa145db571 100644 --- a/vortex-array/src/arrow/compute/to_arrow/temporal.rs +++ b/vortex-array/src/arrow/compute/to_arrow/temporal.rs @@ -43,7 +43,7 @@ impl Kernel for ToArrowTemporal { let arrow_type = arrow_type .cloned() .map(Ok) - .unwrap_or_else(|| array.dtype().to_arrow_dtype())?; + .unwrap_or_else(|| array.dtype().to_arrow())?; let arrow_array: ArrowArrayRef = match (array.temporal_metadata(), &arrow_type) { (TemporalMetadata::Date(TimeUnit::D), DataType::Date32) => { diff --git a/vortex-array/src/arrow/convert.rs b/vortex-array/src/arrow/convert.rs index 3d0a637c7a..bb10878193 100644 --- a/vortex-array/src/arrow/convert.rs +++ b/vortex-array/src/arrow/convert.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow_array::array::{ Array as ArrowArray, ArrayRef as ArrowArrayRef, ArrowPrimitiveType, BooleanArray as ArrowBooleanArray, GenericByteArray, NullArray as ArrowNullArray, @@ -16,14 +18,15 @@ use arrow_buffer::buffer::{NullBuffer, OffsetBuffer}; use arrow_buffer::{ArrowNativeType, BooleanBuffer, Buffer as ArrowBuffer, ScalarBuffer}; use arrow_schema::{DataType, TimeUnit as ArrowTimeUnit}; use vortex_buffer::{Alignment, Buffer, ByteBuffer}; +use vortex_dtype::arrow::arrow_field_to_dtype; use vortex_dtype::datetime::TimeUnit; -use vortex_dtype::{DType, DecimalDType, NativePType, PType}; +use vortex_dtype::{DType, DecimalDType, FieldNames, NativePType, PType}; use vortex_error::{VortexExpect as _, vortex_panic}; use vortex_scalar::i256; use crate::arrays::{ - BoolArray, DecimalArray, ListArray, NullArray, PrimitiveArray, StructArray, TemporalArray, - VarBinArray, VarBinViewArray, + BoolArray, DecimalArray, ExtensionArray, ListArray, NullArray, PrimitiveArray, StructArray, + TemporalArray, VarBinArray, VarBinViewArray, }; use crate::arrow::FromArrowArray; use crate::validity::Validity; @@ -231,14 +234,29 @@ impl FromArrowArray<&ArrowBooleanArray> for ArrayRef { impl FromArrowArray<&ArrowStructArray> for ArrayRef { fn from_arrow(value: &ArrowStructArray, nullable: bool) -> Self { + let mut field_names: Vec> = Vec::with_capacity(value.fields().len()); + let mut field_arrays = Vec::with_capacity(value.fields().len()); + + for (field, column) in value.fields().iter().zip(value.columns()) { + let array = match field.extension_type_name() { + Some(_) => { + match arrow_field_to_dtype(field).vortex_expect("arrow_field_to_dtype") { + Some(DType::Extension(ext_dtype)) => { + let storage = Self::from_arrow(column.clone(), field.is_nullable()); + ExtensionArray::new(ext_dtype, storage).into_array() + } + _ => Self::from_arrow(column.clone(), field.is_nullable()), + } + } + None => Self::from_arrow(column.clone(), field.is_nullable()), + }; + field_names.push(field.name().to_string().into()); + field_arrays.push(array); + } + StructArray::try_new( - value.column_names().iter().map(|s| (*s).into()).collect(), - value - .columns() - .iter() - .zip(value.fields()) - .map(|(c, field)| Self::from_arrow(c.clone(), field.is_nullable())) - .collect(), + FieldNames::from_iter(field_names), + field_arrays, value.len(), nulls(value.nulls(), nullable), ) diff --git a/vortex-array/src/arrow/mod.rs b/vortex-array/src/arrow/mod.rs index 7590e48efa..d9da10255f 100644 --- a/vortex-array/src/arrow/mod.rs +++ b/vortex-array/src/arrow/mod.rs @@ -10,6 +10,7 @@ mod convert; mod datum; mod record_batch; +pub use array::ArrowArray; pub use datum::*; use crate::arrow::compute::ToArrowOptions; diff --git a/vortex-array/src/arrow/record_batch.rs b/vortex-array/src/arrow/record_batch.rs index 12d62ef705..53e9a1c62c 100644 --- a/vortex-array/src/arrow/record_batch.rs +++ b/vortex-array/src/arrow/record_batch.rs @@ -1,31 +1,20 @@ -use arrow_array::RecordBatch; use arrow_array::cast::AsArray; +use arrow_array::{Array as _, RecordBatch}; use arrow_schema::{DataType, Schema}; use vortex_error::{VortexError, VortexResult, vortex_err}; use crate::arrays::StructArray; use crate::arrow::FromArrowArray; use crate::arrow::compute::{to_arrow, to_arrow_preferred}; -use crate::validity::Validity; use crate::{Array, ArrayRef, ToCanonical, TryIntoArray}; impl TryIntoArray for RecordBatch { fn try_into_array(self) -> VortexResult { - Ok(StructArray::try_new( - self.schema() - .fields() - .iter() - .map(|f| f.name().as_str().into()) - .collect(), - self.columns() - .iter() - .zip(self.schema().fields()) - .map(|(array, field)| ArrayRef::from_arrow(array.clone(), field.is_nullable())) - .collect(), - self.num_rows(), - Validity::NonNullable, // Must match FromArrowType for DType - )? - .into_array()) + let struct_array = arrow_array::StructArray::from(self); + Ok(ArrayRef::from_arrow( + &struct_array, + struct_array.is_nullable(), + )) } } diff --git a/vortex-array/src/builders/mod.rs b/vortex-array/src/builders/mod.rs index 7526f5bf7e..7545f3cbd0 100644 --- a/vortex-array/src/builders/mod.rs +++ b/vortex-array/src/builders/mod.rs @@ -5,7 +5,7 @@ //! //! ## Example: //! -//! ``` +//! ```rust //! use vortex_array::builders::{builder_with_capacity, ArrayBuilderExt}; //! use vortex_dtype::{DType, Nullability}; //! diff --git a/vortex-array/src/canonical.rs b/vortex-array/src/canonical.rs index 06d8e58ead..3dab7bc26c 100644 --- a/vortex-array/src/canonical.rs +++ b/vortex-array/src/canonical.rs @@ -96,7 +96,7 @@ impl Canonical { pub fn into_list(self) -> VortexResult { match self { Canonical::List(a) => Ok(a), - _ => vortex_bail!("Cannot unwrap StructArray from {:?}", &self), + _ => vortex_bail!("Cannot unwrap ListArray from {:?}", &self), } } diff --git a/vortex-btrblocks/src/float/stats.rs b/vortex-btrblocks/src/float/stats.rs index 76ca95f901..36f0b5f4c4 100644 --- a/vortex-btrblocks/src/float/stats.rs +++ b/vortex-btrblocks/src/float/stats.rs @@ -135,6 +135,7 @@ where .first() .vortex_expect("All null masks have been handled before"); let buff = array.buffer::(); + let _slice = buff.as_slice(); let mut prev = buff[head_idx]; let first_valid_buff = buff.slice(head_idx..array.len()); diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 34ad07687b..d3036353b9 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -336,7 +336,8 @@ impl BtrBlocksCompressor { } } - // Compress the underlying storage array. + // Otherwise: compress the underlying storage array. + // TODO(aduffy): figure out how to let extension types let compressed_storage = self.compress(ext_array.storage())?; Ok( diff --git a/vortex-dtype/Cargo.toml b/vortex-dtype/Cargo.toml index ae6d4c4b83..64814ed486 100644 --- a/vortex-dtype/Cargo.toml +++ b/vortex-dtype/Cargo.toml @@ -15,11 +15,14 @@ version = { workspace = true } [dependencies] arbitrary = { workspace = true, optional = true } +arcref = { workspace = true } arrow-schema = { workspace = true, optional = true } flatbuffers = { workspace = true } half = { workspace = true, features = ["num-traits"] } +inventory = { workspace = true, optional = true } itertools = { workspace = true } jiff = { workspace = true } +log = { workspace = true } num-traits = { workspace = true } num_enum = { workspace = true } prost = { workspace = true } @@ -38,5 +41,5 @@ workspace = true [features] default = ["arrow"] -arrow = ["dep:arrow-schema"] +arrow = ["dep:arrow-schema", "dep:inventory"] serde = ["dep:serde"] diff --git a/vortex-dtype/src/arrow.rs b/vortex-dtype/src/arrow.rs index 4129f59509..8fa0dba00b 100644 --- a/vortex-dtype/src/arrow.rs +++ b/vortex-dtype/src/arrow.rs @@ -1,25 +1,151 @@ -//! Convert between Vortex [`crate::DType`] and Apache Arrow [`arrow_schema::DataType`]. +//! Convert between Vortex [`DType`] and Apache Arrow [`DataType`]. //! -//! Apache Arrow's type system includes physical information, which could lead to ambiguities as -//! Vortex treats encodings as separate from logical types. +//! ## Arrow -> Vortex //! -//! [`DType::to_arrow_schema`] and its sibling [`DType::to_arrow_dtype`] use a simple algorithm, -//! where every logical type is encoded in its simplest corresponding Arrow type. This reflects the -//! reality that most compute engines don't make use of the entire type range arrow-rs supports. +//! Each Arrow `DataType` has a defined mapping onto its nearest Vortex data type, via +//! implementation of the `FromArrowType` trait. //! -//! For this reason, it's recommended to do as much computation as possible within Vortex, and then -//! materialize an Arrow ArrayRef at the very end of the processing chain. +//! All of the Arrow primitive types map onto the equivalent Vortex primitives: +//! +//! ```rust +//! use arrow_schema::DataType; +//! use vortex_dtype::{DType, Nullability, PType, arrow::FromArrowType}; +//! +//! let arrow_i32 = DataType::Int32; +//! let vortex_i32 = DType::from_arrow((&arrow_i32, false.into())); +//! assert_eq!(vortex_i32, DType::Primitive(PType::I32, false.into())); +//! ``` +//! +//! However, some types in Arrow do not map 1:1 onto Vortex. This is because +//! Arrow uses _physical_ type information, whereas Vortex is a pure logical +//! type system.For example, Arrow distinguishes between `String` and `LargeString` +//! layouts based on the size of the offset elements, whereas in Vortex they are +//! both just `Utf8`. +//! +//! +//! ```rust +//! use arrow_schema::DataType; +//! use vortex_dtype::{DType, Nullability, PType, arrow::FromArrowType}; +//! +//! // This type has no exact representation in Vortex +//! let arrow_large_string = DataType::LargeUtf8; +//! let vortex_string = DType::from_arrow((&arrow_large_string, false.into())); +//! // The "Large" is lost in the conversion. +//! assert_eq!(vortex_string, DType::Utf8(false.into())); +//! ``` +//! +//! There are many such cases where extra Arrow type information is lost. Users that +//! want to be able to round-trip back to Arrow later should save the original +//! `DataType`. +//! +//! ## Vortex -> Arrow +//! +//! `DType` has defined conversions into both [`Schema`] and [`DataType`], where the former +//! is only supported for struct types. +//! +//! [`DType::to_arrow_schema`] and its sibling [`DType::to_arrow`] follow a simple algorithm +//! for selecting the nearest Arrow type: +//! +//! * Vortex `Utf8` maps to Arrow `Utf8View` and Vortex `Binary` maps to `BinaryView` +//! * The non-`Large` variant of a type is always selected by default +//! * Extension types provide a mapping to +//! +//! ```rust +//! use std::sync::Arc; +//! use arrow_schema::DataType; +//! use vortex_dtype::{DType, Nullability, PType}; +//! +//! // Utf8 will be selected over LargeUtf8 +//! assert_eq!( +//! DType::Utf8(false.into()).to_arrow().unwrap(), +//! DataType::Utf8View, +//! ); +//! +//! // List will be selected over LargeList +//! assert_eq!( +//! DType::List(Arc::new(PType::I32.into()), false.into()).to_arrow().unwrap(), +//! DataType::new_list(DataType::Int32, false), +//! ); +//! ``` +//! +//! ## Extension type support +//! +//! Arrow provides an extension type mechanism, effectively a type alias to another `DataType` with +//! some additional key-value string metadata that is carried on the field schema. +//! +//! Vortex supports canonicalizing its types into Arrow extension types, via the implementation +//! of the [`ArrowTypeConversion`] trait, and the [`register_extension_type!`][register_extension_type] macro. +//! +//! ```rust +//! use std::collections::HashMap; +//! use std::sync::Arc; +//! +//! use arcref::ArcRef; +//! use arrow_schema::extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY}; +//! use vortex_dtype::{DType, ExtDType, ExtID, PType, StructDType}; +//! use vortex_dtype::arrow::{ArrowTypeConversion, ArrowTypeConversionRef}; +//! use vortex_error::VortexResult; +//! +//! #[derive(Debug)] +//! pub struct TemperatureConversion; +//! +//! impl ArrowTypeConversion for TemperatureConversion { +//! fn arrow_metadata(&self, dtype: &ExtDType) -> VortexResult>> { +//! if dtype.id().as_ref() != "vortex.temperature" { +//! return Ok(None); +//! } +//! +//! let mut metadata = HashMap::new(); +//! metadata.insert(EXTENSION_TYPE_NAME_KEY.to_string(), "vortex.temperature".to_string()); +//! metadata.insert(EXTENSION_TYPE_METADATA_KEY.to_string(), r#"{"unit": "F"}"#.to_string()); +//! Ok(Some(metadata)) +//! } +//! } +//! +//! // Register the extension type so `DType` methods are aware of it at runtime. +//! DType::register_extension_type(ArrowTypeConversionRef(ArcRef::new_ref(&TemperatureConversion))); +//! +//! // Attempt to use the extension type +//! let ext_type = ExtDType::new( +//! ExtID::new("vortex.temperature".into()), +//! Arc::new(DType::Primitive(PType::F32, false.into())), +//! None, +//! ); +//! +//! // Extension types only carry their metadata in a schema, so we +//! // use a struct type +//! let ext_type = DType::Extension(Arc::new(ext_type)); +//! let schema = StructDType::from_fields( +//! ["values".into()].into(), +//! vec![ext_type.into()], +//! ); +//! +//! let schema_type = DType::Struct(Arc::new(schema), false.into()); +//! +//! let arrow_schema = schema_type.to_arrow_schema().unwrap(); +//! let ext_field = arrow_schema.field(0); +//! assert_eq!( +//! ext_field.extension_type_name(), +//! Some("vortex.temperature"), +//! ); +//! +//! assert_eq!( +//! ext_field.extension_type_metadata(), +//! Some(r#"{"unit": "F"}"#), +//! ); +//! ``` -use std::sync::Arc; +use std::fmt::Debug; +use std::ops::Deref; +use std::sync::{Arc, LazyLock, RwLock}; -use arrow_schema::{ - DECIMAL128_MAX_SCALE, DataType, Field, FieldRef, Fields, Schema, SchemaBuilder, SchemaRef, -}; -use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err}; +use arcref::ArcRef; +use arrow_schema::{DECIMAL128_MAX_SCALE, DataType, Field, FieldRef, Fields, Schema, SchemaRef}; +use vortex_error::{VortexExpect, VortexResult, vortex_assert, vortex_bail, vortex_err}; use crate::datetime::arrow::{make_arrow_temporal_dtype, make_temporal_ext_dtype}; use crate::datetime::is_temporal_ext_type; -use crate::{DType, DecimalDType, FieldName, Nullability, PType, StructDType}; +use crate::{DType, DecimalDType, ExtDType, FieldName, Nullability, PType, StructDType}; /// Trait for converting Arrow types to Vortex types. pub trait FromArrowType: Sized { @@ -131,35 +257,84 @@ impl FromArrowType<(&DataType, Nullability)> for DType { impl FromArrowType<&Field> for DType { fn from_arrow(field: &Field) -> Self { + if let Some(ext_type) = field.extension_type_name() { + // Check the registry for any Vortex extension types that represent the Arrow + // extension type named here. + for converter in EXTENSION_TYPES.read().vortex_expect("poisoned").iter() { + if let Some(converted) = converter + .to_vortex(field) + .vortex_expect("Conversion from GeoArrow type to GeoVortex") + { + return converted; + } + } + + log::debug!( + "failed to resolve ArrowTypeConversion for unrecognized extension type \"{}\"", + ext_type + ); + } + Self::from_arrow((field.data_type(), field.is_nullable().into())) } } +/// Registry of extension types that can be updated at runtime to add new types to it. +static EXTENSION_TYPES: LazyLock>> = + LazyLock::new(|| RwLock::new(Vec::new())); + impl DType { - /// Convert a Vortex [`DType`] into an Arrow [`Schema`]. - pub fn to_arrow_schema(&self) -> VortexResult { - let DType::Struct(struct_dtype, nullable) = self else { - vortex_bail!("only DType::Struct can be converted to arrow schema"); - }; + /// Register a new Arrow extension type conversion function dynamically. + /// + /// This associated function should be called early in the program before any data is processed, to ensure that + /// subsequent calls to [`DType::to_arrow`] or [`DType::from_arrow`] are aware of it. + pub fn register_extension_type(extension: ArrowTypeConversionRef) { + EXTENSION_TYPES + .write() + .vortex_expect("poisoned") + .push(extension); + } +} - if *nullable != Nullability::NonNullable { - vortex_bail!("top-level struct in Schema must be NonNullable"); +/// If a suitable [conversion is registered][DType::register_extension_type], returns the Arrow `Field` +/// representing that this extension type converts into. Otherwise, `None` is returned. +/// +/// Any errors from the conversion function are propagated here via +/// the outer `VortexResult` wrapper. +pub fn try_extension_type_to_arrow(ext_type: &ExtDType) -> VortexResult> { + for kernel in EXTENSION_TYPES.read().vortex_expect("poisoned").iter() { + if let Some(metadata) = kernel.0.to_arrow(ext_type)? { + return Ok(Some(metadata)); } + } - let mut builder = SchemaBuilder::with_capacity(struct_dtype.names().len()); - for (field_name, field_dtype) in struct_dtype.names().iter().zip(struct_dtype.fields()) { - builder.push(FieldRef::from(Field::new( - field_name.to_string(), - field_dtype.to_arrow_dtype()?, - field_dtype.is_nullable(), - ))); - } + Ok(None) +} - Ok(builder.finish()) +impl DType { + /// Convert a Vortex [`DType`] into an Arrow [`Schema`]. + pub fn to_arrow_schema(&self) -> VortexResult { + vortex_assert!( + self.is_struct(), + "to_arrow_schema expected struct type, was {}", + self, + ); + vortex_assert!( + !self.is_nullable(), + "nullable Struct cannot be converted to Arrow Schema" + ); + + let DataType::Struct(fields) = self.to_arrow()? else { + vortex_bail!( + "Cannot convert non-struct dtype to Arrow schema: {:?}", + self + ) + }; + Ok(Schema::new(fields)) } - /// Returns the Arrow [`DataType`] that best corresponds to this Vortex [`DType`]. - pub fn to_arrow_dtype(&self) -> VortexResult { + /// Returns the Arrow [`Field`] that best represents the Vortex type. + pub fn to_arrow(&self) -> VortexResult { Ok(match self { DType::Null => DataType::Null, DType::Bool(_) => DataType::Boolean, @@ -189,11 +364,27 @@ impl DType { let mut fields = Vec::with_capacity(struct_dtype.names().len()); for (field_name, field_dt) in struct_dtype.names().iter().zip(struct_dtype.fields()) { - fields.push(FieldRef::from(Field::new( - field_name.to_string(), - field_dt.to_arrow_dtype()?, - field_dt.is_nullable(), - ))); + // Special handling for extension types, which may go through special conversion kernels + // that are registered in `DType::register_extension_type`. + let field = if let DType::Extension(ext_type) = &field_dt { + if let Some(f) = try_extension_type_to_arrow(ext_type.as_ref())? { + f.with_name(field_name.to_string()) + } else { + Field::new( + field_name.to_string(), + field_dt.to_arrow()?, + field_dt.is_nullable(), + ) + } + } else { + Field::new( + field_name.to_string(), + field_dt.to_arrow()?, + field_dt.is_nullable(), + ) + }; + + fields.push(field); } DataType::Struct(Fields::from(fields)) @@ -201,22 +392,78 @@ impl DType { // There are four kinds of lists: List (32-bit offsets), Large List (64-bit), List View // (32-bit), Large List View (64-bit). We cannot both guarantee zero-copy and commit to an // Arrow dtype because we do not how large our offsets are. - DType::List(l, _) => DataType::List(FieldRef::new(Field::new_list_field( - l.to_arrow_dtype()?, - l.nullability().into(), + DType::List(elem_type, _) => DataType::List(FieldRef::new(Field::new_list_field( + elem_type.to_arrow()?, + elem_type.nullability().into(), ))), DType::Extension(ext_dtype) => { // Try and match against the known extension DTypes. if is_temporal_ext_type(ext_dtype.id()) { make_arrow_temporal_dtype(ext_dtype) } else { - vortex_bail!("Unsupported extension type \"{}\"", ext_dtype.id()) + ext_dtype.storage_dtype().to_arrow()? } } }) } } +/// Convert an Arrow [`Field`] to a `DType` using a [registered extension][ArrowTypeConversion]. +/// +/// If no suitable conversion has been registered, `Ok(None)` is returned. +pub fn arrow_field_to_dtype(field: impl AsRef) -> VortexResult> { + for converter in EXTENSION_TYPES.read().vortex_expect("poisoned").iter() { + if let Some(converted) = converter.to_vortex(field.as_ref())? { + return Ok(Some(converted)); + } + } + + Ok(None) +} + +/// Conversions between Arrow [`Field`] type and Vortex logical `DType`. +/// +/// This crate provides infra for registering and discovering extension types. +/// Once you implement this trait, you need to register it using [`register_extension_type`] +/// to have it get discovered. +/// +/// See also: [`register_extension_type`] +pub trait ArrowTypeConversion: 'static + Send + Sync + Debug { + /// Convert the given Arrow [`Field`] to a Vortex [`DType`]. + fn to_vortex(&self, _field: &Field) -> VortexResult> { + Ok(None) + } + + /// Map a type into an Arrow [`Field`] type. + /// + /// This method should be implemented if you want to add support for a Vortex + /// [extension type][ExtDType] that also has a corresponding canonical Arrow + /// extension type, to preserve round-trip between the two. + #[allow(clippy::disallowed_types)] + fn to_arrow(&self, _dtype: &ExtDType) -> VortexResult> { + Ok(None) + } +} + +/// Conversion token +#[derive(Debug)] +pub struct ArrowTypeConversionRef(pub ArcRef); + +impl Deref for ArrowTypeConversionRef { + type Target = dyn ArrowTypeConversion; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +impl ArrowTypeConversionRef { + /// Create a new `TypeConversionRef` from a pointer to an implementation. + pub const fn new(conversion: ArcRef) -> Self { + Self(conversion) + } +} + #[cfg(test)] mod test { use arrow_schema::{DataType, Field, FieldRef, Fields, Schema}; @@ -226,33 +473,27 @@ mod test { #[test] fn test_dtype_conversion_success() { - assert_eq!(DType::Null.to_arrow_dtype().unwrap(), DataType::Null); + assert_eq!(DType::Null.to_arrow().unwrap(), DataType::Null); assert_eq!( - DType::Bool(Nullability::NonNullable) - .to_arrow_dtype() - .unwrap(), + DType::Bool(Nullability::NonNullable).to_arrow().unwrap(), DataType::Boolean ); assert_eq!( DType::Primitive(PType::U64, Nullability::NonNullable) - .to_arrow_dtype() + .to_arrow() .unwrap(), DataType::UInt64 ); assert_eq!( - DType::Utf8(Nullability::NonNullable) - .to_arrow_dtype() - .unwrap(), + DType::Utf8(Nullability::NonNullable).to_arrow().unwrap(), DataType::Utf8View ); assert_eq!( - DType::Binary(Nullability::NonNullable) - .to_arrow_dtype() - .unwrap(), + DType::Binary(Nullability::NonNullable).to_arrow().unwrap(), DataType::BinaryView ); @@ -264,7 +505,7 @@ mod test { ])), Nullability::NonNullable, ) - .to_arrow_dtype() + .to_arrow() .unwrap(), DataType::Struct(Fields::from(vec![ FieldRef::from(Field::new("field_a", DataType::Boolean, false)), @@ -280,15 +521,14 @@ mod test { Nullability::Nullable, ); - let arrow_list_non_nullable = list_non_nullable.to_arrow_dtype().unwrap(); + let arrow_list_non_nullable = list_non_nullable.to_arrow().unwrap(); let list_nullable = DType::List( Arc::new(DType::Primitive(PType::I64, Nullability::Nullable)), Nullability::Nullable, ); - let arrow_list_nullable = list_nullable.to_arrow_dtype().unwrap(); + let arrow_list_nullable = list_nullable.to_arrow().unwrap(); - assert_ne!(arrow_list_non_nullable, arrow_list_nullable); assert_eq!( arrow_list_nullable, DataType::new_list(DataType::Int64, true) @@ -300,15 +540,15 @@ mod test { } #[test] - #[should_panic] - fn test_dtype_conversion_panics() { - let _ = DType::Extension(Arc::new(ExtDType::new( + fn test_unregistered_extension_conversion() { + let arrow = DType::Extension(Arc::new(ExtDType::new( ExtID::from("my-fake-ext-dtype"), Arc::new(DType::Utf8(Nullability::NonNullable)), None, ))) - .to_arrow_dtype() + .to_arrow() .unwrap(); + assert_eq!(arrow, DataType::Utf8View); } #[test] diff --git a/vortex-dtype/src/dtype.rs b/vortex-dtype/src/dtype.rs index cf32dd6234..8750cd2e60 100644 --- a/vortex-dtype/src/dtype.rs +++ b/vortex-dtype/src/dtype.rs @@ -196,6 +196,14 @@ impl DType { _ => None, } } + + /// Get the `ExtDType` if `self` is `Extension`, otherwise `None` + pub fn as_extension(&self) -> Option<&ExtDType> { + match self { + Extension(ext_dtype) => Some(ext_dtype.as_ref()), + _ => None, + } + } } impl Display for DType { diff --git a/vortex-dtype/src/lib.rs b/vortex-dtype/src/lib.rs index 608592d8e2..9912d4a4dd 100644 --- a/vortex-dtype/src/lib.rs +++ b/vortex-dtype/src/lib.rs @@ -12,6 +12,9 @@ pub use extension::*; pub use field::*; pub use field_mask::*; pub use half; +#[cfg(feature = "arrow")] +#[doc = "Re-export of [`inventory`](https://docs.rs/inventory/latest/inventory/)"] +pub use inventory; pub use nullability::*; pub use ptype::*; pub use struct_::*; diff --git a/vortex-error/src/lib.rs b/vortex-error/src/lib.rs index 4da2247600..a6515106e8 100644 --- a/vortex-error/src/lib.rs +++ b/vortex-error/src/lib.rs @@ -418,6 +418,21 @@ macro_rules! vortex_bail { }; } +/// Convenience macro that bails if an assertion fails. +#[macro_export] +macro_rules! vortex_assert { + ($condition:expr) => {{ + if !$condition { + $crate::vortex_bail!("Assertion failed: {}", stringify!{ $condition }) + } + }}; + ($condition:expr, $($tt:tt)+) => {{ + if !$condition { + $crate::vortex_bail!($($tt)*) + } + }}; +} + /// A convenient macro for panicking with a VortexError in the presence of a programmer error /// (e.g., an invariant has been violated). #[macro_export] diff --git a/vortex-jni/src/array.rs b/vortex-jni/src/array.rs index 9d668dea4f..9049a20917 100644 --- a/vortex-jni/src/array.rs +++ b/vortex-jni/src/array.rs @@ -77,7 +77,7 @@ pub extern "system" fn Java_dev_vortex_jni_NativeArrayMethods_exportToArrow<'loc let array_ref = unsafe { NativeArray::from_ptr(array_ptr) }; try_or_throw(&mut env, |env| { - let preferred_arrow_type = array_ref.inner.dtype().to_arrow_dtype()?; + let preferred_arrow_type = array_ref.inner.dtype().to_arrow()?; let viewless_arrow_type = data_type_no_views(preferred_arrow_type); let arrow_array = array_ref.inner.clone().into_arrow(&viewless_arrow_type)?; diff --git a/vortex-layout/src/reader.rs b/vortex-layout/src/reader.rs index b178417328..85693f5edc 100644 --- a/vortex-layout/src/reader.rs +++ b/vortex-layout/src/reader.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use async_trait::async_trait; use futures::FutureExt; use futures::future::{BoxFuture, Shared}; +use futures::stream::BoxStream; use vortex_array::ArrayRef; use vortex_dtype::DType; use vortex_error::{SharedVortexResult, VortexError, VortexResult}; @@ -34,6 +35,21 @@ pub trait LayoutReader: 'static + ExprEvaluator { fn children(&self) -> VortexResult>>; } +/// Layouts are how arrays can be read from stable storage, such as local NVMe or object stores. +/// +/// Layouts define an interface to read batches of data back out asynchronously. By implementing +/// using the async approach, we can make everything work in the synchronous model simply by +/// blocking execution on the current thread. +#[async_trait] +pub trait Layout2 { + /// Create a read for the given row range, and a given pruning expression. + async fn read_range( + &self, + expr: Option, + row_range: Range, + ) -> VortexResult>; +} + pub trait LayoutReaderExt: LayoutReader { /// Box the layout scan. fn into_arc(self) -> Arc diff --git a/vortex-scalar/src/list.rs b/vortex-scalar/src/list.rs index 33ffd62ea2..1a3420664a 100644 --- a/vortex-scalar/src/list.rs +++ b/vortex-scalar/src/list.rs @@ -138,10 +138,11 @@ impl<'a> ListScalar<'a> { impl Scalar { pub fn list( - element_dtype: Arc, + element_dtype: impl Into>, children: Vec, nullability: Nullability, ) -> Self { + let element_dtype = element_dtype.into(); for child in &children { if child.dtype() != &*element_dtype { vortex_panic!( diff --git a/vortex-tui/Cargo.toml b/vortex-tui/Cargo.toml index 4ac0b9e1ff..8ce9f1264d 100644 --- a/vortex-tui/Cargo.toml +++ b/vortex-tui/Cargo.toml @@ -26,6 +26,8 @@ taffy = { workspace = true } tokio = { workspace = true, features = ["rt-multi-thread"] } vortex = { workspace = true, features = ["tokio", "parquet"] } vortex-layout = { workspace = true } +vortex-geo = { workspace = true } +# Load the extensions [lints] workspace = true diff --git a/vortex-tui/src/convert.rs b/vortex-tui/src/convert.rs index 246bcb0ea2..00369d32ec 100644 --- a/vortex-tui/src/convert.rs +++ b/vortex-tui/src/convert.rs @@ -20,6 +20,9 @@ const BATCH_SIZE: usize = 8192; /// Convert Parquet files to Vortex. pub async fn exec_convert(input_path: impl AsRef, flags: Flags) -> VortexResult<()> { + // Register any relevant extension types here. + vortex_geo::arrow::REGISTER_EXTENSION_TYPES(); + if !flags.quiet { eprintln!( "Converting input Parquet file: {}",