From eeee89b57bcc091f7131e855b5d21337ea69eef3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 9 Apr 2025 15:24:24 -0500 Subject: [PATCH 1/3] hacky update to Arrow 55 --- benchmarks/src/cancellation.rs | 2 +- datafusion/common/Cargo.toml | 2 +- datafusion/common/src/config.rs | 2 +- datafusion/core/Cargo.toml | 2 +- datafusion/core/benches/parquet_query_sql.rs | 6 +-- .../core/src/datasource/file_format/arrow.rs | 4 +- .../core/src/datasource/file_format/csv.rs | 12 ++--- .../src/datasource/file_format/parquet.rs | 4 +- datafusion/core/src/datasource/mod.rs | 2 +- .../datasource/physical_plan/arrow_file.rs | 14 +++--- .../src/datasource/physical_plan/parquet.rs | 8 ++-- datafusion/core/src/test/object_store.rs | 4 +- datafusion/core/src/test_util/parquet.rs | 2 +- .../datasource-parquet/src/file_format.rs | 26 +++++------ datafusion/datasource-parquet/src/mod.rs | 6 +-- datafusion/datasource-parquet/src/opener.rs | 2 +- datafusion/datasource-parquet/src/reader.rs | 28 ++++++------ .../src/row_group_filter.rs | 4 +- datafusion/datasource-parquet/src/source.rs | 4 +- datafusion/datasource/src/file_groups.rs | 10 +++-- datafusion/datasource/src/file_meta.rs | 2 +- datafusion/datasource/src/mod.rs | 44 +++++++++++-------- datafusion/datasource/src/write/demux.rs | 8 ++-- .../functions-aggregate/benches/array_agg.rs | 8 ++-- datafusion/functions/benches/find_in_set.rs | 2 +- datafusion/functions/benches/helper.rs | 2 +- datafusion/functions/benches/ltrim.rs | 2 +- datafusion/functions/benches/pad.rs | 4 +- datafusion/functions/benches/regx.rs | 4 +- datafusion/functions/benches/strpos.rs | 2 +- datafusion/functions/benches/substr_index.rs | 4 +- datafusion/functions/benches/to_char.rs | 2 +- datafusion/proto-common/src/from_proto/mod.rs | 2 +- .../proto/src/logical_plan/file_formats.rs | 2 +- .../proto/src/physical_plan/from_proto.rs | 2 +- test-utils/src/array_gen/primitive.rs | 2 +- test-utils/src/array_gen/random_data.rs | 4 +- test-utils/src/array_gen/string.rs | 2 +- 38 files changed, 126 insertions(+), 116 deletions(-) diff --git a/benchmarks/src/cancellation.rs b/benchmarks/src/cancellation.rs index f5740bdc96e0..eb78d11700f2 100644 --- a/benchmarks/src/cancellation.rs +++ b/benchmarks/src/cancellation.rs @@ -38,7 +38,7 @@ use futures::TryStreamExt; use object_store::ObjectStore; use parquet::arrow::async_writer::ParquetObjectWriter; use parquet::arrow::AsyncArrowWriter; -use rand::distributions::Alphanumeric; +use rand::distr::Alphanumeric; use rand::rngs::ThreadRng; use rand::Rng; use structopt::StructOpt; diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 39b47a96bccf..bb1fb0c4f610 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -63,7 +63,7 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.23.5", optional = true } +pyo3 = { version = "0.24.1", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true } tokio = { workspace = true } diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index b0f17630c910..02283cf5e089 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -428,7 +428,7 @@ config_namespace! { /// bytes of the parquet file optimistically. If not specified, two reads are required: /// One read to fetch the 8-byte parquet footer and /// another to fetch the metadata length encoded in the footer - pub metadata_size_hint: Option, default = None + pub metadata_size_hint: Option, default = None /// (reading) If true, filter expressions are be applied during the parquet decoding operation to /// reduce the number of rows decoded. This optimization is sometimes called "late materialization". diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 56698e4d7e25..b25ca7ead715 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -156,7 +156,7 @@ env_logger = { workspace = true } insta = { workspace = true } paste = "^1.0" rand = { workspace = true, features = ["small_rng"] } -rand_distr = "0.4.3" +rand_distr = "0.5.1" regex = { workspace = true } rstest = { workspace = true } serde_json = { workspace = true } diff --git a/datafusion/core/benches/parquet_query_sql.rs b/datafusion/core/benches/parquet_query_sql.rs index f82a126c5652..9683ada7ff2b 100644 --- a/datafusion/core/benches/parquet_query_sql.rs +++ b/datafusion/core/benches/parquet_query_sql.rs @@ -29,9 +29,9 @@ use datafusion_common::instant::Instant; use futures::stream::StreamExt; use parquet::arrow::ArrowWriter; use parquet::file::properties::{WriterProperties, WriterVersion}; -use rand::distributions::uniform::SampleUniform; -use rand::distributions::Alphanumeric; -use rand::prelude::*; +use rand::distr::uniform::SampleUniform; +use rand::distr::Alphanumeric; +use rand::{prelude::*, thread_rng}; use std::fs::File; use std::io::Read; use std::ops::Range; diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs index 6c7c9463cf3b..ea027ae91a06 100644 --- a/datafusion/core/src/datasource/file_format/arrow.rs +++ b/datafusion/core/src/datasource/file_format/arrow.rs @@ -442,7 +442,7 @@ mod tests { let object_meta = ObjectMeta { location, last_modified: DateTime::default(), - size: usize::MAX, + size: u64::MAX, e_tag: None, version: None, }; @@ -485,7 +485,7 @@ mod tests { let object_meta = ObjectMeta { location, last_modified: DateTime::default(), - size: usize::MAX, + size: u64::MAX, e_tag: None, version: None, }; diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index 309458975ab6..2a6f98c6a291 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -103,7 +103,7 @@ mod tests { async fn get(&self, location: &Path) -> object_store::Result { let bytes = self.bytes_to_repeat.clone(); - let range = 0..bytes.len() * self.max_iterations; + let range = 0..bytes.len() as u64 * self.max_iterations as u64; let arc = self.iterations_detected.clone(); let stream = futures::stream::repeat_with(move || { let arc_inner = arc.clone(); @@ -138,7 +138,7 @@ mod tests { async fn get_ranges( &self, _location: &Path, - _ranges: &[Range], + _ranges: &[Range], ) -> object_store::Result> { unimplemented!() } @@ -153,8 +153,8 @@ mod tests { fn list( &self, - _prefix: Option<&Path>, - ) -> BoxStream<'_, object_store::Result> { + _: Option<&Path>, + ) -> BoxStream<'static, object_store::Result> { unimplemented!() } @@ -371,7 +371,7 @@ mod tests { let object_meta = ObjectMeta { location: Path::parse("/")?, last_modified: DateTime::default(), - size: usize::MAX, + size: u64::MAX, e_tag: None, version: None, }; @@ -429,7 +429,7 @@ mod tests { let object_meta = ObjectMeta { location: Path::parse("/")?, last_modified: DateTime::default(), - size: usize::MAX, + size: u64::MAX, e_tag: None, version: None, }; diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 67a7ba8dc776..180dcca0ae0b 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -330,8 +330,8 @@ mod tests { fn list( &self, - _prefix: Option<&Path>, - ) -> BoxStream<'_, object_store::Result> { + _: Option<&Path>, + ) -> BoxStream<'static, object_store::Result> { Box::pin(futures::stream::once(async { Err(object_store::Error::NotImplemented) })) diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index a15b2b6ffe13..25a89644cd2a 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -106,7 +106,7 @@ mod tests { let meta = ObjectMeta { location, last_modified: metadata.modified().map(chrono::DateTime::from).unwrap(), - size: metadata.len() as usize, + size: metadata.len(), e_tag: None, version: None, }; diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 5dcf4df73f57..638f0dad617d 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -305,7 +305,7 @@ impl FileOpener for ArrowOpener { )?; // read footer according to footer_len let get_option = GetOptions { - range: Some(GetRange::Suffix(10 + footer_len)), + range: Some(GetRange::Suffix(10 + footer_len as u64)), ..Default::default() }; let get_result = object_store @@ -332,9 +332,9 @@ impl FileOpener for ArrowOpener { .iter() .flatten() .map(|block| { - let block_len = block.bodyLength() as usize - + block.metaDataLength() as usize; - let block_offset = block.offset() as usize; + let block_len = block.bodyLength() as u64 + + block.metaDataLength() as u64; + let block_offset = block.offset() as u64; block_offset..block_offset + block_len }) .collect_vec(); @@ -364,9 +364,9 @@ impl FileOpener for ArrowOpener { let recordbatch_ranges = recordbatches .iter() .map(|block| { - let block_len = block.bodyLength() as usize - + block.metaDataLength() as usize; - let block_offset = block.offset() as usize; + let block_len = block.bodyLength() as u64 + + block.metaDataLength() as u64; + let block_offset = block.offset() as u64; block_offset..block_offset + block_len }) .collect_vec(); diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 5c06c3902c1c..3462fc7170ae 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -1806,7 +1806,7 @@ mod tests { #[derive(Debug, Clone)] struct TrackingParquetFileReaderFactory { inner: Arc, - metadata_size_hint_calls: Arc>>>, + metadata_size_hint_calls: Arc>>>, } impl TrackingParquetFileReaderFactory { @@ -1823,7 +1823,7 @@ mod tests { &self, partition_index: usize, file_meta: FileMeta, - metadata_size_hint: Option, + metadata_size_hint: Option, metrics: &ExecutionPlanMetricsSet, ) -> Result> { @@ -1856,8 +1856,8 @@ mod tests { let schema = batch.schema(); let name_1 = "test1.parquet"; let name_2 = "test2.parquet"; - let total_size_1 = write_batch(name_1, store.clone(), batch.clone()).await; - let total_size_2 = write_batch(name_2, store.clone(), batch.clone()).await; + let total_size_1 = write_batch(name_1, store.clone(), batch.clone()).await as u64; + let total_size_2 = write_batch(name_2, store.clone(), batch.clone()).await as u64; let reader_factory = Arc::new(TrackingParquetFileReaderFactory::new(store.clone())); diff --git a/datafusion/core/src/test/object_store.rs b/datafusion/core/src/test/object_store.rs index e1328770cabd..8b19658bb147 100644 --- a/datafusion/core/src/test/object_store.rs +++ b/datafusion/core/src/test/object_store.rs @@ -66,7 +66,7 @@ pub fn local_unpartitioned_file(path: impl AsRef) -> ObjectMeta ObjectMeta { location, last_modified: metadata.modified().map(chrono::DateTime::from).unwrap(), - size: metadata.len() as usize, + size: metadata.len(), e_tag: None, version: None, } @@ -166,7 +166,7 @@ impl ObjectStore for BlockingObjectStore { fn list( &self, prefix: Option<&Path>, - ) -> BoxStream<'_, object_store::Result> { + ) -> BoxStream<'static, object_store::Result> { self.inner.list(prefix) } diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index 084554eecbdb..f5753af64d93 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -102,7 +102,7 @@ impl TestParquetFile { println!("Generated test dataset with {num_rows} rows"); - let size = std::fs::metadata(&path)?.len() as usize; + let size = std::fs::metadata(&path)?.len(); let mut canonical_path = path.canonicalize()?; diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 1d9a67fd2eb6..38880eba576a 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -193,13 +193,13 @@ impl ParquetFormat { /// another read to fetch the metadata length encoded in the footer. /// /// - If `None`, defaults to value on `config_options` - pub fn with_metadata_size_hint(mut self, size_hint: Option) -> Self { + pub fn with_metadata_size_hint(mut self, size_hint: Option) -> Self { self.options.global.metadata_size_hint = size_hint; self } /// Return the metadata size hint if set - pub fn metadata_size_hint(&self) -> Option { + pub fn metadata_size_hint(&self) -> Option { self.options.global.metadata_size_hint } @@ -290,7 +290,7 @@ fn clear_metadata( async fn fetch_schema_with_location( store: &dyn ObjectStore, file: &ObjectMeta, - metadata_size_hint: Option, + metadata_size_hint: Option, ) -> Result<(Path, Schema)> { let loc_path = file.location.clone(); let schema = fetch_schema(store, file, metadata_size_hint).await?; @@ -735,15 +735,13 @@ impl<'a> ObjectStoreFetch<'a> { } impl MetadataFetch for ObjectStoreFetch<'_> { - fn fetch( - &mut self, - range: Range, - ) -> BoxFuture<'_, Result> { - async { + fn fetch(&mut self, range: Range) -> BoxFuture<'_, Result> { + async move { + let range_usize: Range = range.start..range.end; self.store - .get_range(&self.meta.location, range) + .get_range(&self.meta.location, range_usize) .await - .map_err(ParquetError::from) + .map_err(|e| ParquetError::External(Box::new(e))) } .boxed() } @@ -758,13 +756,13 @@ impl MetadataFetch for ObjectStoreFetch<'_> { pub async fn fetch_parquet_metadata( store: &dyn ObjectStore, meta: &ObjectMeta, - size_hint: Option, + size_hint: Option, ) -> Result { let file_size = meta.size; let fetch = ObjectStoreFetch::new(store, meta); ParquetMetaDataReader::new() - .with_prefetch_hint(size_hint) + .with_prefetch_hint(size_hint.map(|n| n.try_into().unwrap())) .load_and_finish(fetch, file_size) .await .map_err(DataFusionError::from) @@ -774,7 +772,7 @@ pub async fn fetch_parquet_metadata( async fn fetch_schema( store: &dyn ObjectStore, file: &ObjectMeta, - metadata_size_hint: Option, + metadata_size_hint: Option, ) -> Result { let metadata = fetch_parquet_metadata(store, file, metadata_size_hint).await?; let file_metadata = metadata.file_metadata(); @@ -792,7 +790,7 @@ pub async fn fetch_statistics( store: &dyn ObjectStore, table_schema: SchemaRef, file: &ObjectMeta, - metadata_size_hint: Option, + metadata_size_hint: Option, ) -> Result { let metadata = fetch_parquet_metadata(store, file, metadata_size_hint).await?; statistics_from_parquet_meta_calc(&metadata, table_schema) diff --git a/datafusion/datasource-parquet/src/mod.rs b/datafusion/datasource-parquet/src/mod.rs index 516b13792189..471ed61d666b 100644 --- a/datafusion/datasource-parquet/src/mod.rs +++ b/datafusion/datasource-parquet/src/mod.rs @@ -104,7 +104,7 @@ impl From for ParquetExecBuilder { pub struct ParquetExecBuilder { file_scan_config: FileScanConfig, predicate: Option>, - metadata_size_hint: Option, + metadata_size_hint: Option, table_parquet_options: TableParquetOptions, parquet_file_reader_factory: Option>, schema_adapter_factory: Option>, @@ -154,7 +154,7 @@ impl ParquetExecBuilder { /// [`ParquetFileReaderFactory`] will request in the initial IO. If this is /// too small, the ParquetExec will need to make additional IO requests to /// read the footer. - pub fn with_metadata_size_hint(mut self, metadata_size_hint: usize) -> Self { + pub fn with_metadata_size_hint(mut self, metadata_size_hint: u64) -> Self { self.metadata_size_hint = Some(metadata_size_hint); self } @@ -267,7 +267,7 @@ impl ParquetExec { builder = builder.with_predicate(predicate); } if let Some(metadata_size_hint) = metadata_size_hint { - builder = builder.with_metadata_size_hint(metadata_size_hint); + builder = builder.with_metadata_size_hint(metadata_size_hint.try_into().unwrap()); } builder.build() } diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 708a8035a4f7..06a89fa500fd 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -59,7 +59,7 @@ pub(super) struct ParquetOpener { pub table_schema: SchemaRef, /// Optional hint for how large the initial request to read parquet metadata /// should be - pub metadata_size_hint: Option, + pub metadata_size_hint: Option, /// Metrics for reporting pub metrics: ExecutionPlanMetricsSet, /// Factory for instantiating parquet reader diff --git a/datafusion/datasource-parquet/src/reader.rs b/datafusion/datasource-parquet/src/reader.rs index 5924a5b5038f..14e937ca687e 100644 --- a/datafusion/datasource-parquet/src/reader.rs +++ b/datafusion/datasource-parquet/src/reader.rs @@ -23,6 +23,7 @@ use datafusion_datasource::file_meta::FileMeta; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use futures::future::BoxFuture; use object_store::ObjectStore; +use parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::arrow::async_reader::{AsyncFileReader, ParquetObjectReader}; use parquet::file::metadata::ParquetMetaData; use std::fmt::Debug; @@ -57,7 +58,7 @@ pub trait ParquetFileReaderFactory: Debug + Send + Sync + 'static { &self, partition_index: usize, file_meta: FileMeta, - metadata_size_hint: Option, + metadata_size_hint: Option, metrics: &ExecutionPlanMetricsSet, ) -> datafusion_common::Result>; } @@ -96,28 +97,29 @@ pub(crate) struct ParquetFileReader { impl AsyncFileReader for ParquetFileReader { fn get_bytes( &mut self, - range: Range, + range: Range, ) -> BoxFuture<'_, parquet::errors::Result> { - self.file_metrics.bytes_scanned.add(range.end - range.start); + let size: usize = (range.end - range.start).try_into().unwrap(); + self.file_metrics.bytes_scanned.add(size); self.inner.get_bytes(range) } fn get_byte_ranges( &mut self, - ranges: Vec>, + ranges: Vec>, ) -> BoxFuture<'_, parquet::errors::Result>> where Self: Send, { - let total = ranges.iter().map(|r| r.end - r.start).sum(); - self.file_metrics.bytes_scanned.add(total); + let total: u64 = ranges.iter().map(|r| r.end - r.start).sum(); + self.file_metrics.bytes_scanned.add(total.try_into().unwrap()); self.inner.get_byte_ranges(ranges) } - fn get_metadata( - &mut self, - ) -> BoxFuture<'_, parquet::errors::Result>> { - self.inner.get_metadata() + fn get_metadata<'a>( + &'a mut self, options: Option<&'a ArrowReaderOptions> + ) -> BoxFuture<'a, parquet::errors::Result>> { + self.inner.get_metadata(options) } } @@ -126,7 +128,7 @@ impl ParquetFileReaderFactory for DefaultParquetFileReaderFactory { &self, partition_index: usize, file_meta: FileMeta, - metadata_size_hint: Option, + metadata_size_hint: Option, metrics: &ExecutionPlanMetricsSet, ) -> datafusion_common::Result> { let file_metrics = ParquetFileMetrics::new( @@ -135,10 +137,10 @@ impl ParquetFileReaderFactory for DefaultParquetFileReaderFactory { metrics, ); let store = Arc::clone(&self.store); - let mut inner = ParquetObjectReader::new(store, file_meta.object_meta); + let mut inner = ParquetObjectReader::new(store, file_meta.object_meta.location); if let Some(hint) = metadata_size_hint { - inner = inner.with_footer_size_hint(hint) + inner = inner.with_footer_size_hint(hint as usize) }; Ok(Box::new(ParquetFileReader { diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 9d5f9fa16b6e..b75e88a40ac5 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -1513,7 +1513,7 @@ mod tests { let object_meta = ObjectMeta { location: object_store::path::Path::parse(file_name).expect("creating path"), last_modified: chrono::DateTime::from(std::time::SystemTime::now()), - size: data.len(), + size: data.len() as u64, e_tag: None, version: None, }; @@ -1527,7 +1527,7 @@ mod tests { let file_metrics = ParquetFileMetrics::new(0, object_meta.location.as_ref(), &metrics); let reader = ParquetFileReader { - inner: ParquetObjectReader::new(Arc::new(in_memory), object_meta), + inner: ParquetObjectReader::new(Arc::new(in_memory), object_meta.location), file_metrics: file_metrics.clone(), }; let mut builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap(); diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index a5629e43636a..45db88780210 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -272,7 +272,7 @@ pub struct ParquetSource { /// Batch size configuration pub(crate) batch_size: Option, /// Optional hint for the size of the parquet metadata - pub(crate) metadata_size_hint: Option, + pub(crate) metadata_size_hint: Option, pub(crate) projected_statistics: Option, } @@ -293,7 +293,7 @@ impl ParquetSource { /// [`ParquetFileReaderFactory`] will request in the initial IO. If this is /// too small, the ParquetSource will need to make additional IO requests to /// read the footer. - pub fn with_metadata_size_hint(mut self, metadata_size_hint: usize) -> Self { + pub fn with_metadata_size_hint(mut self, metadata_size_hint: u64) -> Self { self.metadata_size_hint = Some(metadata_size_hint); self } diff --git a/datafusion/datasource/src/file_groups.rs b/datafusion/datasource/src/file_groups.rs index a1f966c22f35..cb4156e79d35 100644 --- a/datafusion/datasource/src/file_groups.rs +++ b/datafusion/datasource/src/file_groups.rs @@ -239,7 +239,7 @@ impl FileGroupPartitioner { let mut range_start = 0; while range_start < source_file.object_meta.size { let range_end = min( - range_start + (target_partition_size - state.1), + range_start + (target_partition_size as u64 - state.1 as u64), source_file.object_meta.size, ); @@ -250,11 +250,13 @@ impl FileGroupPartitioner { }); produced_files.push((state.0, produced_file)); - if state.1 + (range_end - range_start) >= target_partition_size { + if state.1 as u64 + (range_end - range_start) + >= target_partition_size as u64 + { state.0 += 1; state.1 = 0; } else { - state.1 += range_end - range_start; + state.1 += range_end as usize - range_start as usize; } range_start = range_end; } @@ -296,7 +298,7 @@ impl FileGroupPartitioner { if group.len() == 1 { Some(ToRepartition { source_index: group_index, - file_size: group[0].object_meta.size, + file_size: group[0].object_meta.size as usize, new_groups: vec![group_index], }) } else { diff --git a/datafusion/datasource/src/file_meta.rs b/datafusion/datasource/src/file_meta.rs index 098a15eeb38a..fcc222e8130a 100644 --- a/datafusion/datasource/src/file_meta.rs +++ b/datafusion/datasource/src/file_meta.rs @@ -30,7 +30,7 @@ pub struct FileMeta { /// An optional field for user defined per object metadata pub extensions: Option>, /// Size hint for the metadata of this file - pub metadata_size_hint: Option, + pub metadata_size_hint: Option, } impl FileMeta { diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs index c02f84c74d64..8e0f9a4336f4 100644 --- a/datafusion/datasource/src/mod.rs +++ b/datafusion/datasource/src/mod.rs @@ -113,7 +113,7 @@ pub struct PartitionedFile { /// An optional field for user defined per object metadata pub extensions: Option>, /// The estimated size of the parquet metadata, in bytes - pub metadata_size_hint: Option, + pub metadata_size_hint: Option, } impl PartitionedFile { @@ -123,7 +123,7 @@ impl PartitionedFile { object_meta: ObjectMeta { location: Path::from(path.into()), last_modified: chrono::Utc.timestamp_nanos(0), - size: size as usize, + size, e_tag: None, version: None, }, @@ -141,7 +141,7 @@ impl PartitionedFile { object_meta: ObjectMeta { location: Path::from(path), last_modified: chrono::Utc.timestamp_nanos(0), - size: size as usize, + size, e_tag: None, version: None, }, @@ -157,7 +157,7 @@ impl PartitionedFile { /// Provide a hint to the size of the file metadata. If a hint is provided /// the reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. /// Without an appropriate hint, two read may be required to fetch the metadata. - pub fn with_metadata_size_hint(mut self, metadata_size_hint: usize) -> Self { + pub fn with_metadata_size_hint(mut self, metadata_size_hint: u64) -> Self { self.metadata_size_hint = Some(metadata_size_hint); self } @@ -224,7 +224,7 @@ impl From for PartitionedFile { /// Indicates that the range calculation determined no further action is /// necessary, possibly because the calculated range is empty or invalid. pub enum RangeCalculation { - Range(Option>), + Range(Option>), TerminateEarly, } @@ -250,21 +250,29 @@ pub async fn calculate_range( match file_meta.range { None => Ok(RangeCalculation::Range(None)), Some(FileRange { start, end }) => { - let (start, end) = (start as usize, end as usize); - - let start_delta = if start != 0 { - find_first_newline(store, location, start - 1, file_size, newline).await? + let ustart = start as u64; + let uend = end as u64; + let start_delta = if ustart != 0 { + find_first_newline(store, location, ustart - 1, file_size, newline) + .await? } else { 0 }; - let end_delta = if end != file_size { - find_first_newline(store, location, end - 1, file_size, newline).await? + let end_delta = if uend != file_size { + find_first_newline( + store, + location, + (end - 1).try_into().unwrap(), + file_size, + newline, + ) + .await? } else { 0 }; - let range = start + start_delta..end + end_delta; + let range = start as u64 + start_delta..uend + end_delta; if range.start == range.end { return Ok(RangeCalculation::TerminateEarly); @@ -289,10 +297,10 @@ pub async fn calculate_range( async fn find_first_newline( object_store: &Arc, location: &Path, - start: usize, - end: usize, + start: u64, + end: u64, newline: u8, -) -> Result { +) -> Result { let options = GetOptions { range: Some(GetRange::Bounded(start..end)), ..Default::default() @@ -301,14 +309,14 @@ async fn find_first_newline( let result = object_store.get_opts(location, options).await?; let mut result_stream = result.into_stream(); - let mut index = 0; + let mut index: u64 = 0; while let Some(chunk) = result_stream.next().await.transpose()? { if let Some(position) = chunk.iter().position(|&byte| byte == newline) { - return Ok(index + position); + return Ok(index + position as u64); } - index += chunk.len(); + index += chunk.len() as u64; } Ok(index) diff --git a/datafusion/datasource/src/write/demux.rs b/datafusion/datasource/src/write/demux.rs index fc2e5daf92b6..ac050a35567c 100644 --- a/datafusion/datasource/src/write/demux.rs +++ b/datafusion/datasource/src/write/demux.rs @@ -45,7 +45,7 @@ use datafusion_execution::TaskContext; use chrono::NaiveDate; use futures::StreamExt; use object_store::path::Path; -use rand::distributions::DistString; +use rand::distr::SampleString; use tokio::sync::mpsc::{self, Receiver, Sender, UnboundedReceiver, UnboundedSender}; type RecordBatchReceiver = Receiver; @@ -151,8 +151,7 @@ async fn row_count_demuxer( let max_buffered_batches = exec_options.max_buffered_batches_per_output_file; let minimum_parallel_files = exec_options.minimum_parallel_output_files; let mut part_idx = 0; - let write_id = - rand::distributions::Alphanumeric.sample_string(&mut rand::thread_rng(), 16); + let write_id = rand::distr::Alphanumeric.sample_string(&mut rand::thread_rng(), 16); let mut open_file_streams = Vec::with_capacity(minimum_parallel_files); @@ -267,8 +266,7 @@ async fn hive_style_partitions_demuxer( file_extension: String, keep_partition_by_columns: bool, ) -> Result<()> { - let write_id = - rand::distributions::Alphanumeric.sample_string(&mut rand::thread_rng(), 16); + let write_id = rand::distr::Alphanumeric.sample_string(&mut rand::thread_rng(), 16); let exec_options = &context.session_config().options().execution; let max_buffered_recordbatches = exec_options.max_buffered_batches_per_output_file; diff --git a/datafusion/functions-aggregate/benches/array_agg.rs b/datafusion/functions-aggregate/benches/array_agg.rs index fb605e87ed0c..948be9eefa4e 100644 --- a/datafusion/functions-aggregate/benches/array_agg.rs +++ b/datafusion/functions-aggregate/benches/array_agg.rs @@ -16,6 +16,9 @@ // under the License. use std::sync::Arc; +use rand::distr::Distribution; +use rand::distr::StandardUniform; +use rand::Rng; use arrow::array::{ Array, ArrayRef, ArrowPrimitiveType, AsArray, ListArray, NullBufferBuilder, @@ -28,8 +31,7 @@ use datafusion_functions_aggregate::array_agg::ArrayAggAccumulator; use arrow::buffer::OffsetBuffer; use arrow::util::test_util::seedable_rng; -use rand::distributions::{Distribution, Standard}; -use rand::Rng; + fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) { let list_item_data_type = values.as_list::().values().data_type().clone(); @@ -55,7 +57,7 @@ pub fn create_list_array( ) -> ListArray where T: ArrowPrimitiveType, - Standard: Distribution, + StandardUniform: Distribution, { let mut nulls_builder = NullBufferBuilder::new(size); let mut rng = seedable_rng(); diff --git a/datafusion/functions/benches/find_in_set.rs b/datafusion/functions/benches/find_in_set.rs index 9307525482c2..87ca6cba842d 100644 --- a/datafusion/functions/benches/find_in_set.rs +++ b/datafusion/functions/benches/find_in_set.rs @@ -25,7 +25,7 @@ use arrow::util::bench_util::{ use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode}; use datafusion_common::ScalarValue; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; -use rand::distributions::Alphanumeric; +use rand::distr::Alphanumeric; use rand::prelude::StdRng; use rand::{Rng, SeedableRng}; use std::sync::Arc; diff --git a/datafusion/functions/benches/helper.rs b/datafusion/functions/benches/helper.rs index 0dbb4b0027d4..91d42df94aa3 100644 --- a/datafusion/functions/benches/helper.rs +++ b/datafusion/functions/benches/helper.rs @@ -17,7 +17,7 @@ use arrow::array::{StringArray, StringViewArray}; use datafusion_expr::ColumnarValue; -use rand::distributions::Alphanumeric; +use rand::distr::Alphanumeric; use rand::{rngs::StdRng, Rng, SeedableRng}; use std::sync::Arc; diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs index 457fb499f5a1..1c8a4f2faea3 100644 --- a/datafusion/functions/benches/ltrim.rs +++ b/datafusion/functions/benches/ltrim.rs @@ -26,7 +26,7 @@ use criterion::{ use datafusion_common::ScalarValue; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDF}; use datafusion_functions::string; -use rand::{distributions::Alphanumeric, rngs::StdRng, Rng, SeedableRng}; +use rand::{distr::Alphanumeric, rngs::StdRng, Rng, SeedableRng}; use std::{fmt, sync::Arc}; pub fn seedable_rng() -> StdRng { diff --git a/datafusion/functions/benches/pad.rs b/datafusion/functions/benches/pad.rs index f78a53fbee19..d199998bc529 100644 --- a/datafusion/functions/benches/pad.rs +++ b/datafusion/functions/benches/pad.rs @@ -23,7 +23,7 @@ use arrow::util::bench_util::{ use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_functions::unicode::{lpad, rpad}; -use rand::distributions::{Distribution, Uniform}; +use rand::distr::{Distribution, Uniform}; use rand::Rng; use std::sync::Arc; @@ -49,7 +49,7 @@ where T: ArrowPrimitiveType, { let dist = Filter { - dist: Uniform::new_inclusive::(0, len as i64), + dist: Uniform::new_inclusive::(0, len as i64).unwrap(), }; let mut rng = rand::thread_rng(); diff --git a/datafusion/functions/benches/regx.rs b/datafusion/functions/benches/regx.rs index 3a1a6a71173e..5922b8de98f6 100644 --- a/datafusion/functions/benches/regx.rs +++ b/datafusion/functions/benches/regx.rs @@ -26,9 +26,9 @@ use datafusion_functions::regex::regexpcount::regexp_count_func; use datafusion_functions::regex::regexplike::regexp_like; use datafusion_functions::regex::regexpmatch::regexp_match; use datafusion_functions::regex::regexpreplace::regexp_replace; -use rand::distributions::Alphanumeric; +use rand::distr::Alphanumeric; use rand::rngs::ThreadRng; -use rand::seq::SliceRandom; +use rand::seq::IndexedRandom; use rand::Rng; use std::iter; use std::sync::Arc; diff --git a/datafusion/functions/benches/strpos.rs b/datafusion/functions/benches/strpos.rs index df57c229e0ad..3b1ff4b4e203 100644 --- a/datafusion/functions/benches/strpos.rs +++ b/datafusion/functions/benches/strpos.rs @@ -21,7 +21,7 @@ use arrow::array::{StringArray, StringViewArray}; use arrow::datatypes::DataType; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; -use rand::distributions::Alphanumeric; +use rand::distr::Alphanumeric; use rand::prelude::StdRng; use rand::{Rng, SeedableRng}; use std::str::Chars; diff --git a/datafusion/functions/benches/substr_index.rs b/datafusion/functions/benches/substr_index.rs index b1c1c3c34a95..79705c680498 100644 --- a/datafusion/functions/benches/substr_index.rs +++ b/datafusion/functions/benches/substr_index.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use arrow::array::{ArrayRef, Int64Array, StringArray}; use arrow::datatypes::DataType; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use rand::distributions::{Alphanumeric, Uniform}; +use rand::distr::{Alphanumeric, Uniform}; use rand::prelude::Distribution; use rand::Rng; @@ -51,7 +51,7 @@ where fn data() -> (StringArray, StringArray, Int64Array) { let dist = Filter { - dist: Uniform::new(-4, 5), + dist: Uniform::new(-4, 5).unwrap(), test: |x: &i64| x != &0, }; let mut rng = rand::thread_rng(); diff --git a/datafusion/functions/benches/to_char.rs b/datafusion/functions/benches/to_char.rs index 6f20a20dc219..cf295955e5e6 100644 --- a/datafusion/functions/benches/to_char.rs +++ b/datafusion/functions/benches/to_char.rs @@ -25,7 +25,7 @@ use chrono::prelude::*; use chrono::TimeDelta; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use rand::rngs::ThreadRng; -use rand::seq::SliceRandom; +use rand::seq::IndexedRandom; use rand::Rng; use datafusion_common::ScalarValue; diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index da43a9789956..4099b89d597d 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -918,7 +918,7 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { metadata_size_hint: value .metadata_size_hint_opt .map(|opt| match opt { - protobuf::parquet_options::MetadataSizeHintOpt::MetadataSizeHint(v) => Some(v as usize), + protobuf::parquet_options::MetadataSizeHintOpt::MetadataSizeHint(v) => Some(v), }) .unwrap_or(None), pushdown_filters: value.pushdown_filters, diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index e22738973284..2d675c629da6 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -465,7 +465,7 @@ impl From<&ParquetOptionsProto> for ParquetOptions { pruning: proto.pruning, skip_metadata: proto.skip_metadata, metadata_size_hint: proto.metadata_size_hint_opt.as_ref().map(|opt| match opt { - parquet_options::MetadataSizeHintOpt::MetadataSizeHint(size) => *size as usize, + parquet_options::MetadataSizeHintOpt::MetadataSizeHint(size) => *size, }), pushdown_filters: proto.pushdown_filters, reorder_filters: proto.reorder_filters, diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index cb4017afaeac..a886fc242545 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -555,7 +555,7 @@ impl TryFrom<&protobuf::PartitionedFile> for PartitionedFile { object_meta: ObjectMeta { location: Path::from(val.path.as_str()), last_modified: Utc.timestamp_nanos(val.last_modified_ns as i64), - size: val.size as usize, + size: val.size, e_tag: None, version: None, }, diff --git a/test-utils/src/array_gen/primitive.rs b/test-utils/src/array_gen/primitive.rs index 58d39c14e65d..88dd0b4db023 100644 --- a/test-utils/src/array_gen/primitive.rs +++ b/test-utils/src/array_gen/primitive.rs @@ -18,7 +18,7 @@ use arrow::array::{ArrayRef, ArrowPrimitiveType, PrimitiveArray, UInt32Array}; use arrow::datatypes::DataType; use chrono_tz::{Tz, TZ_VARIANTS}; -use rand::{rngs::StdRng, seq::SliceRandom, thread_rng, Rng}; +use rand::{rngs::StdRng, seq::IndexedRandom, thread_rng, Rng}; use std::sync::Arc; use super::random_data::RandomNativeData; diff --git a/test-utils/src/array_gen/random_data.rs b/test-utils/src/array_gen/random_data.rs index a7297d45fdf0..028db9f85672 100644 --- a/test-utils/src/array_gen/random_data.rs +++ b/test-utils/src/array_gen/random_data.rs @@ -25,7 +25,7 @@ use arrow::datatypes::{ TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; -use rand::distributions::Standard; +use rand::distr::StandardUniform; use rand::prelude::Distribution; use rand::rngs::StdRng; use rand::Rng; @@ -40,7 +40,7 @@ macro_rules! basic_random_data { ($ARROW_TYPE: ty) => { impl RandomNativeData for $ARROW_TYPE where - Standard: Distribution, + StandardUniform: Distribution, { #[inline] fn generate_random_native_data(rng: &mut StdRng) -> Self::Native { diff --git a/test-utils/src/array_gen/string.rs b/test-utils/src/array_gen/string.rs index ac659ae67bc0..42a9bf6a94e7 100644 --- a/test-utils/src/array_gen/string.rs +++ b/test-utils/src/array_gen/string.rs @@ -95,7 +95,7 @@ fn random_string(rng: &mut StdRng, max_len: usize) -> String { 1 => String::from(rng.gen::()), _ => { let len = rng.gen_range(1..=max_len); - rng.sample_iter::(rand::distributions::Standard) + rng.sample_iter::(rand::distr::StandardUniform) .take(len) .collect() } From ee5dea226ad7bcce86f53e4feccc70d2fa521d8e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 9 Apr 2025 15:25:30 -0500 Subject: [PATCH 2/3] format arrow 55 update --- .../core/src/datasource/physical_plan/arrow_file.rs | 8 ++++---- datafusion/datasource-parquet/src/mod.rs | 3 ++- datafusion/datasource-parquet/src/reader.rs | 7 +++++-- datafusion/functions-aggregate/benches/array_agg.rs | 3 +-- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 638f0dad617d..1d32377547ad 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -332,8 +332,8 @@ impl FileOpener for ArrowOpener { .iter() .flatten() .map(|block| { - let block_len = block.bodyLength() as u64 - + block.metaDataLength() as u64; + let block_len = + block.bodyLength() as u64 + block.metaDataLength() as u64; let block_offset = block.offset() as u64; block_offset..block_offset + block_len }) @@ -364,8 +364,8 @@ impl FileOpener for ArrowOpener { let recordbatch_ranges = recordbatches .iter() .map(|block| { - let block_len = block.bodyLength() as u64 - + block.metaDataLength() as u64; + let block_len = + block.bodyLength() as u64 + block.metaDataLength() as u64; let block_offset = block.offset() as u64; block_offset..block_offset + block_len }) diff --git a/datafusion/datasource-parquet/src/mod.rs b/datafusion/datasource-parquet/src/mod.rs index 471ed61d666b..9fd444b0e619 100644 --- a/datafusion/datasource-parquet/src/mod.rs +++ b/datafusion/datasource-parquet/src/mod.rs @@ -267,7 +267,8 @@ impl ParquetExec { builder = builder.with_predicate(predicate); } if let Some(metadata_size_hint) = metadata_size_hint { - builder = builder.with_metadata_size_hint(metadata_size_hint.try_into().unwrap()); + builder = + builder.with_metadata_size_hint(metadata_size_hint.try_into().unwrap()); } builder.build() } diff --git a/datafusion/datasource-parquet/src/reader.rs b/datafusion/datasource-parquet/src/reader.rs index 14e937ca687e..6f4fb424d457 100644 --- a/datafusion/datasource-parquet/src/reader.rs +++ b/datafusion/datasource-parquet/src/reader.rs @@ -112,12 +112,15 @@ impl AsyncFileReader for ParquetFileReader { Self: Send, { let total: u64 = ranges.iter().map(|r| r.end - r.start).sum(); - self.file_metrics.bytes_scanned.add(total.try_into().unwrap()); + self.file_metrics + .bytes_scanned + .add(total.try_into().unwrap()); self.inner.get_byte_ranges(ranges) } fn get_metadata<'a>( - &'a mut self, options: Option<&'a ArrowReaderOptions> + &'a mut self, + options: Option<&'a ArrowReaderOptions>, ) -> BoxFuture<'a, parquet::errors::Result>> { self.inner.get_metadata(options) } diff --git a/datafusion/functions-aggregate/benches/array_agg.rs b/datafusion/functions-aggregate/benches/array_agg.rs index 948be9eefa4e..a90a093e94e4 100644 --- a/datafusion/functions-aggregate/benches/array_agg.rs +++ b/datafusion/functions-aggregate/benches/array_agg.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -use std::sync::Arc; use rand::distr::Distribution; use rand::distr::StandardUniform; use rand::Rng; +use std::sync::Arc; use arrow::array::{ Array, ArrayRef, ArrowPrimitiveType, AsArray, ListArray, NullBufferBuilder, @@ -32,7 +32,6 @@ use datafusion_functions_aggregate::array_agg::ArrayAggAccumulator; use arrow::buffer::OffsetBuffer; use arrow::util::test_util::seedable_rng; - fn merge_batch_bench(c: &mut Criterion, name: &str, values: ArrayRef) { let list_item_data_type = values.as_list::().values().data_type().clone(); c.bench_function(name, |b| { From e37ef60d68d74bf62f251d3f2f11021b83a4effb Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 9 Apr 2025 15:26:16 -0500 Subject: [PATCH 3/3] Add Extension to type enums --- Cargo.lock | 350 ++++++++++-------- Cargo.toml | 18 +- datafusion/common/src/scalar/mod.rs | 3 +- datafusion/common/src/types/native.rs | 1 + .../src/avro_to_arrow/schema.rs | 1 + datafusion/expr/src/utils.rs | 3 +- datafusion/proto-common/src/to_proto/mod.rs | 2 +- datafusion/sql/src/unparser/expr.rs | 3 + 8 files changed, 222 insertions(+), 159 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0bf2432de0b9..635a3b4332b4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -246,9 +246,8 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5ec52ba94edeed950e4a41f75d35376df196e8cb04437f7280a5aa49f20f796" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "arrow-arith", "arrow-array", @@ -265,14 +264,13 @@ dependencies = [ "arrow-string", "half", "pyo3", - "rand 0.8.5", + "rand 0.9.0", ] [[package]] name = "arrow-arith" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc766fdacaf804cb10c7c70580254fcdb5d55cdfda2bc57b02baf5223a3af9e" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "arrow-array", "arrow-buffer", @@ -284,9 +282,8 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a12fcdb3f1d03f69d3ec26ac67645a8fe3f878d77b5ebb0b15d64a116c212985" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "ahash 0.8.11", "arrow-buffer", @@ -301,9 +298,8 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "263f4801ff1839ef53ebd06f99a56cecd1dbaf314ec893d93168e2e860e0291c" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "bytes", "half", @@ -312,9 +308,8 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede6175fbc039dfc946a61c1b6d42fd682fcecf5ab5d148fbe7667705798cac9" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "arrow-array", "arrow-buffer", @@ -333,9 +328,8 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1644877d8bc9a0ef022d9153dc29375c2bda244c39aec05a91d0e87ccf77995f" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "arrow-array", "arrow-cast", @@ -349,9 +343,8 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61cfdd7d99b4ff618f167e548b2411e5dd2c98c0ddebedd7df433d34c20a4429" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "arrow-buffer", "arrow-schema", @@ -361,9 +354,8 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a194f47959a4e111463cb6d02c8576fe084b3d7a3c092314baf3b9629b62595b" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "arrow-arith", "arrow-array", @@ -388,9 +380,8 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62ff528658b521e33905334723b795ee56b393dbe9cf76c8b1f64b648c65a60c" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "arrow-array", "arrow-buffer", @@ -402,9 +393,8 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ee5b4ca98a7fb2efb9ab3309a5d1c88b5116997ff93f3147efdc1062a6158e9" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "arrow-array", "arrow-buffer", @@ -424,9 +414,8 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0a3334a743bd2a1479dbc635540617a3923b4b2f6870f37357339e6b5363c21" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "arrow-array", "arrow-buffer", @@ -437,9 +426,8 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d1d7a7291d2c5107e92140f75257a99343956871f3d3ab33a7b41532f79cb68" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "arrow-array", "arrow-buffer", @@ -450,9 +438,8 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39cfaf5e440be44db5413b75b72c2a87c1f8f0627117d110264048f2969b99e9" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "bitflags 2.8.0", "serde", @@ -460,9 +447,8 @@ dependencies = [ [[package]] name = "arrow-select" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69efcd706420e52cd44f5c4358d279801993846d1c2a8e52111853d61d55a619" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -474,9 +460,8 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a21546b337ab304a32cfc0770f671db7411787586b45b78b4593ae78e64e2b03" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "arrow-array", "arrow-buffer", @@ -652,9 +637,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.12.6" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dabb68eb3a7aa08b46fddfd59a3d55c978243557a90ab804769f7e20e67d2b01" +checksum = "19b756939cb2f8dc900aa6dcd505e6e2428e9cae7ff7b028c49e3946efa70878" dependencies = [ "aws-lc-sys", "zeroize", @@ -662,9 +647,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.27.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bbe221bbf523b625a4dd8585c7f38166e31167ec2ca98051dbcb4c3b6e825d2" +checksum = "b9f7720b74ed28ca77f90769a71fd8c637a0137f6fae4ae947e1050229cff57f" dependencies = [ "bindgen", "cc", @@ -700,9 +685,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.63.0" +version = "1.64.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1cb45b83b53b5cd55ee33fd9fd8a70750255a3f286e4dca20e882052f2b256f" +checksum = "02d4bdb0e5f80f0689e61c77ab678b2b9304af329616af38aef5b6b967b8e736" dependencies = [ "aws-credential-types", "aws-runtime", @@ -723,9 +708,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.64.0" +version = "1.65.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d4d9bc075ea6238778ed3951b65d3cde8c3864282d64fdcd19f2a90c0609f1" +checksum = "acbbb3ce8da257aedbccdcb1aadafbbb6a5fe9adf445db0e1ea897bdc7e22d08" dependencies = [ "aws-credential-types", "aws-runtime", @@ -746,9 +731,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.64.0" +version = "1.65.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "819ccba087f403890fee4825eeab460e64c59345667d2b83a12cf544b581e3a7" +checksum = "96a78a8f50a1630db757b60f679c8226a8a70ee2ab5f5e6e51dc67f6c61c7cfd" dependencies = [ "aws-credential-types", "aws-runtime", @@ -825,9 +810,9 @@ dependencies = [ [[package]] name = "aws-smithy-http-client" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0497ef5d53065b7cd6a35e9c1654bd1fefeae5c52900d91d1b188b0af0f29324" +checksum = "8aff1159006441d02e57204bf57a1b890ba68bedb6904ffd2873c1c4c11c546b" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -1860,7 +1845,7 @@ dependencies = [ "parking_lot", "parquet", "paste", - "rand 0.8.5", + "rand 0.9.0", "rand_distr", "regex", "rstest", @@ -1891,7 +1876,7 @@ dependencies = [ "mimalloc", "object_store", "parquet", - "rand 0.8.5", + "rand 0.9.0", "serde", "serde_json", "snmalloc-rs", @@ -1996,7 +1981,7 @@ dependencies = [ "parquet", "paste", "pyo3", - "rand 0.8.5", + "rand 0.9.0", "recursive", "sqlparser", "tokio", @@ -2038,7 +2023,7 @@ dependencies = [ "log", "object_store", "parquet", - "rand 0.8.5", + "rand 0.9.0", "tempfile", "tokio", "tokio-util", @@ -2144,7 +2129,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand 0.8.5", + "rand 0.9.0", "tokio", ] @@ -2194,7 +2179,7 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand 0.8.5", + "rand 0.9.0", "tempfile", "url", ] @@ -2270,7 +2255,7 @@ dependencies = [ "itertools 0.14.0", "log", "md-5", - "rand 0.8.5", + "rand 0.9.0", "regex", "sha2", "tokio", @@ -2296,7 +2281,7 @@ dependencies = [ "half", "log", "paste", - "rand 0.8.5", + "rand 0.9.0", ] [[package]] @@ -2309,7 +2294,7 @@ dependencies = [ "datafusion-common", "datafusion-expr-common", "datafusion-physical-expr-common", - "rand 0.8.5", + "rand 0.9.0", ] [[package]] @@ -2330,7 +2315,7 @@ dependencies = [ "itertools 0.14.0", "log", "paste", - "rand 0.8.5", + "rand 0.9.0", ] [[package]] @@ -2425,7 +2410,7 @@ dependencies = [ "log", "paste", "petgraph", - "rand 0.8.5", + "rand 0.9.0", "rstest", ] @@ -2489,7 +2474,7 @@ dependencies = [ "log", "parking_lot", "pin-project-lite", - "rand 0.8.5", + "rand 0.9.0", "rstest", "rstest_reuse", "tempfile", @@ -2865,13 +2850,13 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "fd-lock" -version = "4.0.2" +version = "4.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e5768da2206272c81ef0b5e951a41862938a6070da63bcea197899942d3b947" +checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" dependencies = [ "cfg-if", - "rustix 0.38.44", - "windows-sys 0.52.0", + "rustix 1.0.2", + "windows-sys 0.59.0", ] [[package]] @@ -2924,11 +2909,11 @@ checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "24.12.23" +version = "25.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" +checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.8.0", "rustc_version", ] @@ -2939,6 +2924,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc" dependencies = [ "crc32fast", + "libz-rs-sys", "miniz_oxide", ] @@ -3143,8 +3129,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi 0.13.3+wasi-0.2.2", + "wasm-bindgen", "windows-targets 0.52.6", ] @@ -3960,6 +3948,15 @@ dependencies = [ "escape8259", ] +[[package]] +name = "libz-rs-sys" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "902bc563b5d65ad9bba616b490842ef0651066a1a1dc3ce1087113ffcb873c8d" +dependencies = [ + "zlib-rs", +] + [[package]] name = "linked-hash-map" version = "0.5.6" @@ -4006,7 +4003,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" dependencies = [ - "twox-hash", + "twox-hash 1.6.3", ] [[package]] @@ -4262,18 +4259,21 @@ dependencies = [ [[package]] name = "object_store" -version = "0.11.2" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf" +checksum = "e9ce831b09395f933addbc56d894d889e4b226eba304d4e7adbab591e26daf1e" dependencies = [ "async-trait", "base64 0.22.1", "bytes", "chrono", + "form_urlencoded", "futures", + "http 1.2.0", + "http-body-util", "humantime", "hyper", - "itertools 0.13.0", + "itertools 0.14.0", "md-5", "parking_lot", "percent-encoding", @@ -4284,7 +4284,8 @@ dependencies = [ "rustls-pemfile", "serde", "serde_json", - "snafu", + "serde_urlencoded", + "thiserror 2.0.12", "tokio", "tracing", "url", @@ -4367,9 +4368,8 @@ dependencies = [ [[package]] name = "parquet" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb15796ac6f56b429fd99e33ba133783ad75b27c36b4b5ce06f1f82cc97754e" +version = "55.0.0" +source = "git+https://github.com/paleolimbot/arrow-rs.git?branch=type-extension-maybe#7ddf616657fafa32c64b8cd0d66b9a56181d9f71" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -4397,7 +4397,7 @@ dependencies = [ "snap", "thrift", "tokio", - "twox-hash", + "twox-hash 2.1.0", "zstd", ] @@ -4845,9 +4845,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.23.5" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" +checksum = "17da310086b068fbdcefbba30aeb3721d5bb9af8db4987d6735b2183ca567229" dependencies = [ "cfg-if", "indoc", @@ -4863,9 +4863,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.23.5" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" +checksum = "e27165889bd793000a098bb966adc4300c312497ea25cf7a690a9f0ac5aa5fc1" dependencies = [ "once_cell", "target-lexicon", @@ -4873,9 +4873,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.23.5" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" +checksum = "05280526e1dbf6b420062f3ef228b78c0c54ba94e157f5cb724a609d0f2faabc" dependencies = [ "libc", "pyo3-build-config", @@ -4883,9 +4883,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.23.5" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" +checksum = "5c3ce5686aa4d3f63359a5100c62a127c9f15e8398e5fdeb5deef1fed5cd5f44" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -4895,9 +4895,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.23.5" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" +checksum = "f4cf6faa0cbfb0ed08e89beb8103ae9724eb4750e3a78084ba4017cbe94f3855" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -4914,9 +4914,9 @@ checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" -version = "0.37.2" +version = "0.37.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003" +checksum = "a4ce8c88de324ff838700f36fb6ab86c96df0e3c4ab6ef3a9b2044465cce1369" dependencies = [ "memchr", "serde", @@ -4924,11 +4924,12 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.6" +version = "0.11.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef" +checksum = "c3bd15a6f2967aef83887dcb9fec0014580467e33720d073560cf015a5683012" dependencies = [ "bytes", + "cfg_aliases", "pin-project-lite", "quinn-proto", "quinn-udp", @@ -4938,17 +4939,18 @@ dependencies = [ "thiserror 2.0.12", "tokio", "tracing", + "web-time", ] [[package]] name = "quinn-proto" -version = "0.11.9" +version = "0.11.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" +checksum = "b820744eb4dc9b57a3398183639c511b5a26d2ed702cedd3febaa1393caa22cc" dependencies = [ "bytes", - "getrandom 0.2.15", - "rand 0.8.5", + "getrandom 0.3.1", + "rand 0.9.0", "ring", "rustc-hash 2.1.1", "rustls", @@ -4962,9 +4964,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.10" +version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e46f3055866785f6b92bc6164b76be02ca8f2eb4b002c0354b28cf4c119e5944" +checksum = "541d0f57c6ec747a90738a52741d3221f7960e8ac2f0ff4b1a63680e033b4ab5" dependencies = [ "cfg_aliases", "libc", @@ -5062,12 +5064,12 @@ dependencies = [ [[package]] name = "rand_distr" -version = "0.4.3" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" dependencies = [ "num-traits", - "rand 0.8.5", + "rand 0.9.0", ] [[package]] @@ -5210,9 +5212,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.12" +version = "0.12.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" +checksum = "d19c46a6fdd48bc4dab94b6103fccc55d34c67cc0ad04653aad4ea2a07cd7bbb" dependencies = [ "base64 0.22.1", "bytes", @@ -5795,27 +5797,6 @@ version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" -[[package]] -name = "snafu" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019" -dependencies = [ - "snafu-derive", -] - -[[package]] -name = "snafu-derive" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" -dependencies = [ - "heck 0.5.0", - "proc-macro2", - "quote", - "syn 2.0.100", -] - [[package]] name = "snap" version = "1.1.1" @@ -6133,9 +6114,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "target-lexicon" -version = "0.12.16" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" [[package]] name = "tempfile" @@ -6164,7 +6145,7 @@ dependencies = [ "chrono-tz", "datafusion-common", "env_logger", - "rand 0.8.5", + "rand 0.9.0", ] [[package]] @@ -6636,6 +6617,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "twox-hash" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908" + [[package]] name = "typed-arena" version = "2.0.2" @@ -7136,13 +7123,13 @@ checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" [[package]] name = "windows-registry" -version = "0.2.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +checksum = "4286ad90ddb45071efd1a66dfa43eb02dd0dfbae1545ad6cc3c51cf34d7e8ba3" dependencies = [ - "windows-result 0.2.0", + "windows-result 0.3.2", "windows-strings", - "windows-targets 0.52.6", + "windows-targets 0.53.0", ] [[package]] @@ -7156,21 +7143,20 @@ dependencies = [ [[package]] name = "windows-result" -version = "0.2.0" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +checksum = "c64fd11a4fd95df68efcfee5f44a294fe71b8bc6a91993e2791938abcc712252" dependencies = [ - "windows-targets 0.52.6", + "windows-link", ] [[package]] name = "windows-strings" -version = "0.1.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +checksum = "87fa48cc5d406560701792be122a10132491cff9d0aeb23583cc2dcafc847319" dependencies = [ - "windows-result 0.2.0", - "windows-targets 0.52.6", + "windows-link", ] [[package]] @@ -7224,13 +7210,29 @@ dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm", + "windows_i686_gnullvm 0.52.6", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows-targets" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" @@ -7243,6 +7245,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -7255,6 +7263,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -7267,12 +7281,24 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -7285,6 +7311,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" @@ -7297,6 +7329,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" @@ -7309,6 +7347,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" @@ -7321,6 +7365,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "winnow" version = "0.7.2" @@ -7500,6 +7550,12 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "zlib-rs" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b20717f0917c908dc63de2e44e97f1e6b126ca58d0e391cee86d504eb8fbd05" + [[package]] name = "zstd" version = "0.13.3" diff --git a/Cargo.toml b/Cargo.toml index 920629a23d1c..4c88c1d8811e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -87,19 +87,19 @@ ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro = { version = "0.17", default-features = false } -arrow = { version = "54.3.1", features = [ +arrow = { git = "https://github.com/paleolimbot/arrow-rs.git", branch = "type-extension-maybe", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "54.3.0", default-features = false } -arrow-flight = { version = "54.3.1", features = [ +arrow-buffer = { git = "https://github.com/paleolimbot/arrow-rs.git", branch = "type-extension-maybe", default-features = false } +arrow-flight = { git = "https://github.com/paleolimbot/arrow-rs.git", branch = "type-extension-maybe", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "54.3.0", default-features = false, features = [ +arrow-ipc = { git = "https://github.com/paleolimbot/arrow-rs.git", branch = "type-extension-maybe", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "54.3.0", default-features = false } -arrow-schema = { version = "54.3.0", default-features = false } +arrow-ord = { git = "https://github.com/paleolimbot/arrow-rs.git", branch = "type-extension-maybe", default-features = false } +arrow-schema = { git = "https://github.com/paleolimbot/arrow-rs.git", branch = "type-extension-maybe", default-features = false } async-trait = "0.1.88" bigdecimal = "0.4.8" bytes = "1.10" @@ -147,9 +147,9 @@ hashbrown = { version = "0.14.5", features = ["raw"] } indexmap = "2.8.0" itertools = "0.14" log = "^0.4" -object_store = { version = "0.11.0", default-features = false } +object_store = { version = "0.12.0", default-features = false, features = ["fs"] } parking_lot = "0.12" -parquet = { version = "54.3.1", default-features = false, features = [ +parquet = { git = "https://github.com/paleolimbot/arrow-rs.git", branch = "type-extension-maybe", default-features = false, features = [ "arrow", "async", "object_store", @@ -159,7 +159,7 @@ pbjson-types = "0.7" # Should match arrow-flight's version of prost. insta = { version = "1.41.1", features = ["glob", "filters"] } prost = "0.13.1" -rand = "0.8.5" +rand = { version = "0.9", features = ["std_rng"] } recursive = "0.1.1" regex = "1.8" rstest = "0.24.0" diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index b8d9aea810f0..4827d724803f 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -2103,7 +2103,8 @@ impl ScalarValue { | DataType::Time64(TimeUnit::Millisecond) | DataType::RunEndEncoded(_, _) | DataType::ListView(_) - | DataType::LargeListView(_) => { + | DataType::LargeListView(_) + | DataType::Extension(_) => { return _not_impl_err!( "Unsupported creation of {:?} array from ScalarValue {:?}", data_type, diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 39c79b4b9974..336d8ffdde4d 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -411,6 +411,7 @@ impl From for NativeType { DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())), DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(), DataType::RunEndEncoded(_, field) => field.data_type().clone().into(), + DataType::Extension(extension) => extension.storage_type().clone().into(), } } } diff --git a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs index 276056c24c01..ee72fe1709d8 100644 --- a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs +++ b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs @@ -239,6 +239,7 @@ fn default_field_name(dt: &DataType) -> &str { } DataType::Decimal128(_, _) => "decimal", DataType::Decimal256(_, _) => "decimal", + DataType::Extension(_) => unimplemented!("Extension support not implemented"), } } diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 552ce1502d46..b1629f7a2f12 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -841,7 +841,8 @@ pub fn can_hash(data_type: &DataType) -> bool { DataType::ListView(_) | DataType::LargeListView(_) | DataType::Union(_, _) - | DataType::RunEndEncoded(_, _) => false, + | DataType::RunEndEncoded(_, _) + | DataType::Extension(_) => false, } } diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index decd0cf63038..1b52310c840b 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -211,7 +211,7 @@ impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum { "Proto serialization error: The RunEndEncoded data type is not yet supported".to_owned() )) } - DataType::ListView(_) | DataType::LargeListView(_) => { + DataType::ListView(_) | DataType::LargeListView(_) | DataType::Extension(_) => { return Err(Error::General(format!("Proto serialization error: {val} not yet supported"))) } }; diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 064adde55bdf..b38b1cb3d4bb 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -1680,6 +1680,9 @@ impl Unparser<'_> { DataType::RunEndEncoded(_, _) => { not_impl_err!("Unsupported DataType: conversion: {data_type:?}") } + DataType::Extension(_) => { + not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + } } } }