Skip to content

Commit dca498a

Browse files
committed
Workaround missing Utf8View statistics support in arrow
1 parent 4e43a8e commit dca498a

File tree

1 file changed

+46
-1
lines changed
  • datafusion/core/src/datasource/physical_plan/parquet

1 file changed

+46
-1
lines changed

datafusion/core/src/datasource/physical_plan/parquet/statistics.rs

+46-1
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121

2222
use arrow::array::{
2323
BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder,
24+
StringViewBuilder,
2425
};
2526
use arrow::datatypes::i256;
2627
use arrow::{array::ArrayRef, datatypes::DataType};
@@ -438,6 +439,25 @@ macro_rules! get_statistics {
438439
}
439440
Ok(Arc::new(builder.finish()))
440441
},
442+
DataType::Utf8View => {
443+
let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator);
444+
let mut builder = StringViewBuilder::new();
445+
for x in iterator {
446+
let Some(x) = x else {
447+
builder.append_null(); // no statistics value
448+
continue;
449+
};
450+
451+
let Ok(x) = std::str::from_utf8(x) else {
452+
log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it.");
453+
builder.append_null();
454+
continue;
455+
};
456+
457+
builder.append_value(x);
458+
}
459+
Ok(Arc::new(builder.finish()))
460+
},
441461
DataType::FixedSizeBinary(size) => {
442462
let iterator = [<$stat_type_prefix FixedLenByteArrayStatsIterator>]::new($iterator);
443463
let mut builder = FixedSizeBinaryBuilder::new(*size);
@@ -482,8 +502,8 @@ macro_rules! get_statistics {
482502
DataType::Duration(_) |
483503
DataType::Interval(_) |
484504
DataType::Null |
505+
// TODO binary view
485506
DataType::BinaryView |
486-
DataType::Utf8View |
487507
DataType::List(_) |
488508
DataType::ListView(_) |
489509
DataType::FixedSizeList(_, _) |
@@ -901,6 +921,29 @@ macro_rules! get_data_page_statistics {
901921
}
902922
Ok(Arc::new(builder.finish()))
903923
},
924+
// TODO file upstream in arrow-rs --
925+
// support Utf8View and BinaryView in statistics
926+
Some(DataType::Utf8View) => {
927+
let mut builder = StringViewBuilder::new();
928+
let iterator = [<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator);
929+
for x in iterator {
930+
for x in x.into_iter() {
931+
let Some(x) = x else {
932+
builder.append_null(); // no statistics value
933+
continue;
934+
};
935+
936+
let Ok(x) = std::str::from_utf8(x.data()) else {
937+
log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it.");
938+
builder.append_null();
939+
continue;
940+
};
941+
942+
builder.append_value(x);
943+
}
944+
}
945+
Ok(Arc::new(builder.finish()))
946+
},
904947
Some(DataType::Dictionary(_, value_type)) => {
905948
[<$stat_type_prefix:lower _ page_statistics>](Some(value_type), $iterator)
906949
},
@@ -983,6 +1026,7 @@ macro_rules! get_data_page_statistics {
9831026
}
9841027
Ok(Arc::new(builder.finish()))
9851028
},
1029+
// TODO file upstream in arrow-rs -- return not implemented for unsupported types rather than panic
9861030
_ => unimplemented!()
9871031
}
9881032
}
@@ -1104,6 +1148,7 @@ where
11041148
.iter()
11051149
.map(|x| x.null_count.map(|x| x as u64))
11061150
.collect::<Vec<_>>(),
1151+
// TODO file upstream in arrow-rs -- return a not-implemented error rather than panic
11071152
_ => unimplemented!(),
11081153
});
11091154

0 commit comments

Comments
 (0)