Skip to content

Commit 7f1bae2

Browse files
authored
Make it clear that StatisticsConverter can not panic (#6187)
1 parent 12ff1ea commit 7f1bae2

File tree

1 file changed, with 45 additions and 57 deletions.

parquet/src/arrow/arrow_reader/statistics.rs

+45 -57
Original file line number | Diff line number | Diff line change
@@ -758,7 +758,7 @@ macro_rules! get_data_page_statistics {
758758
($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
759759
paste! {
760760
match $data_type {
761-
Some(DataType::Boolean) => {
761+
DataType::Boolean => {
762762
let iterator = [<$stat_type_prefix BooleanDataPageStatsIterator>]::new($iterator);
763763
let mut builder = BooleanBuilder::new();
764764
for x in iterator {
@@ -772,7 +772,7 @@ macro_rules! get_data_page_statistics {
772772
}
773773
Ok(Arc::new(builder.finish()))
774774
},
775-
Some(DataType::UInt8) => Ok(Arc::new(
775+
DataType::UInt8 => Ok(Arc::new(
776776
UInt8Array::from_iter(
777777
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
778778
.map(|x| {
@@ -783,7 +783,7 @@ macro_rules! get_data_page_statistics {
783783
.flatten()
784784
)
785785
)),
786-
Some(DataType::UInt16) => Ok(Arc::new(
786+
DataType::UInt16 => Ok(Arc::new(
787787
UInt16Array::from_iter(
788788
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
789789
.map(|x| {
@@ -794,7 +794,7 @@ macro_rules! get_data_page_statistics {
794794
.flatten()
795795
)
796796
)),
797-
Some(DataType::UInt32) => Ok(Arc::new(
797+
DataType::UInt32 => Ok(Arc::new(
798798
UInt32Array::from_iter(
799799
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
800800
.map(|x| {
@@ -804,7 +804,7 @@ macro_rules! get_data_page_statistics {
804804
})
805805
.flatten()
806806
))),
807-
Some(DataType::UInt64) => Ok(Arc::new(
807+
DataType::UInt64 => Ok(Arc::new(
808808
UInt64Array::from_iter(
809809
[<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator)
810810
.map(|x| {
@@ -814,7 +814,7 @@ macro_rules! get_data_page_statistics {
814814
})
815815
.flatten()
816816
))),
817-
Some(DataType::Int8) => Ok(Arc::new(
817+
DataType::Int8 => Ok(Arc::new(
818818
Int8Array::from_iter(
819819
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
820820
.map(|x| {
@@ -825,7 +825,7 @@ macro_rules! get_data_page_statistics {
825825
.flatten()
826826
)
827827
)),
828-
Some(DataType::Int16) => Ok(Arc::new(
828+
DataType::Int16 => Ok(Arc::new(
829829
Int16Array::from_iter(
830830
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
831831
.map(|x| {
@@ -836,9 +836,9 @@ macro_rules! get_data_page_statistics {
836836
.flatten()
837837
)
838838
)),
839-
Some(DataType::Int32) => Ok(Arc::new(Int32Array::from_iter([<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator).flatten()))),
840-
Some(DataType::Int64) => Ok(Arc::new(Int64Array::from_iter([<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten()))),
841-
Some(DataType::Float16) => Ok(Arc::new(
839+
DataType::Int32 => Ok(Arc::new(Int32Array::from_iter([<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator).flatten()))),
840+
DataType::Int64 => Ok(Arc::new(Int64Array::from_iter([<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten()))),
841+
DataType::Float16 => Ok(Arc::new(
842842
Float16Array::from_iter(
843843
[<$stat_type_prefix Float16DataPageStatsIterator>]::new($iterator)
844844
.map(|x| {
@@ -849,11 +849,11 @@ macro_rules! get_data_page_statistics {
849849
.flatten()
850850
)
851851
)),
852-
Some(DataType::Float32) => Ok(Arc::new(Float32Array::from_iter([<$stat_type_prefix Float32DataPageStatsIterator>]::new($iterator).flatten()))),
853-
Some(DataType::Float64) => Ok(Arc::new(Float64Array::from_iter([<$stat_type_prefix Float64DataPageStatsIterator>]::new($iterator).flatten()))),
854-
Some(DataType::Binary) => Ok(Arc::new(BinaryArray::from_iter([<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator).flatten()))),
855-
Some(DataType::LargeBinary) => Ok(Arc::new(LargeBinaryArray::from_iter([<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator).flatten()))),
856-
Some(DataType::Utf8) => {
852+
DataType::Float32 => Ok(Arc::new(Float32Array::from_iter([<$stat_type_prefix Float32DataPageStatsIterator>]::new($iterator).flatten()))),
853+
DataType::Float64 => Ok(Arc::new(Float64Array::from_iter([<$stat_type_prefix Float64DataPageStatsIterator>]::new($iterator).flatten()))),
854+
DataType::Binary => Ok(Arc::new(BinaryArray::from_iter([<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator).flatten()))),
855+
DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter([<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator).flatten()))),
856+
DataType::Utf8 => {
857857
let mut builder = StringBuilder::new();
858858
let iterator = [<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator);
859859
for x in iterator {
@@ -873,7 +873,7 @@ macro_rules! get_data_page_statistics {
873873
}
874874
Ok(Arc::new(builder.finish()))
875875
},
876-
Some(DataType::LargeUtf8) => {
876+
DataType::LargeUtf8 => {
877877
let mut builder = LargeStringBuilder::new();
878878
let iterator = [<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator);
879879
for x in iterator {
@@ -893,10 +893,10 @@ macro_rules! get_data_page_statistics {
893893
}
894894
Ok(Arc::new(builder.finish()))
895895
},
896-
Some(DataType::Dictionary(_, value_type)) => {
897-
[<$stat_type_prefix:lower _ page_statistics>](Some(value_type), $iterator)
896+
DataType::Dictionary(_, value_type) => {
897+
[<$stat_type_prefix:lower _ page_statistics>](value_type, $iterator)
898898
},
899-
Some(DataType::Timestamp(unit, timezone)) => {
899+
DataType::Timestamp(unit, timezone) => {
900900
let iter = [<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten();
901901
Ok(match unit {
902902
TimeUnit::Second => Arc::new(TimestampSecondArray::from_iter(iter).with_timezone_opt(timezone.clone())),
@@ -905,8 +905,8 @@ macro_rules! get_data_page_statistics {
905905
TimeUnit::Nanosecond => Arc::new(TimestampNanosecondArray::from_iter(iter).with_timezone_opt(timezone.clone())),
906906
})
907907
},
908-
Some(DataType::Date32) => Ok(Arc::new(Date32Array::from_iter([<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator).flatten()))),
909-
Some(DataType::Date64) => Ok(
908+
DataType::Date32 => Ok(Arc::new(Date32Array::from_iter([<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator).flatten()))),
909+
DataType::Date64 => Ok(
910910
Arc::new(
911911
Date64Array::from_iter([<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
912912
.map(|x| {
@@ -919,11 +919,11 @@ macro_rules! get_data_page_statistics {
919919
)
920920
)
921921
),
922-
Some(DataType::Decimal128(precision, scale)) => Ok(Arc::new(
922+
DataType::Decimal128(precision, scale) => Ok(Arc::new(
923923
Decimal128Array::from_iter([<$stat_type_prefix Decimal128DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision, *scale)?)),
924-
Some(DataType::Decimal256(precision, scale)) => Ok(Arc::new(
924+
DataType::Decimal256(precision, scale) => Ok(Arc::new(
925925
Decimal256Array::from_iter([<$stat_type_prefix Decimal256DataPageStatsIterator>]::new($iterator).flatten()).with_precision_and_scale(*precision, *scale)?)),
926-
Some(DataType::Time32(unit)) => {
926+
DataType::Time32(unit) => {
927927
Ok(match unit {
928928
TimeUnit::Second => Arc::new(Time32SecondArray::from_iter(
929929
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator).flatten(),
@@ -937,7 +937,7 @@ macro_rules! get_data_page_statistics {
937937
}
938938
})
939939
}
940-
Some(DataType::Time64(unit)) => {
940+
DataType::Time64(unit) => {
941941
Ok(match unit {
942942
TimeUnit::Microsecond => Arc::new(Time64MicrosecondArray::from_iter(
943943
[<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten(),
@@ -951,7 +951,7 @@ macro_rules! get_data_page_statistics {
951951
}
952952
})
953953
},
954-
Some(DataType::FixedSizeBinary(size)) => {
954+
DataType::FixedSizeBinary(size) => {
955955
let mut builder = FixedSizeBinaryBuilder::new(*size);
956956
let iterator = [<$stat_type_prefix FixedLenByteArrayDataPageStatsIterator>]::new($iterator);
957957
for x in iterator {
@@ -964,18 +964,13 @@ macro_rules! get_data_page_statistics {
964964
if x.len() == *size as usize {
965965
let _ = builder.append_value(x.data());
966966
} else {
967-
// log::debug!(
968-
// "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
969-
// size,
970-
// x.len(),
971-
// );
972967
builder.append_null();
973968
}
974969
}
975970
}
976971
Ok(Arc::new(builder.finish()))
977972
},
978-
Some(DataType::Utf8View) => {
973+
DataType::Utf8View => {
979974
let mut builder = StringViewBuilder::new();
980975
let iterator = [<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator);
981976
for x in iterator {
@@ -995,7 +990,7 @@ macro_rules! get_data_page_statistics {
995990
}
996991
Ok(Arc::new(builder.finish()))
997992
},
998-
Some(DataType::BinaryView) => {
993+
DataType::BinaryView => {
999994
let mut builder = BinaryViewBuilder::new();
1000995
let iterator = [<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator);
1001996
for x in iterator {
@@ -1010,23 +1005,22 @@ macro_rules! get_data_page_statistics {
10101005
}
10111006
Ok(Arc::new(builder.finish()))
10121007
},
1013-
Some(DataType::Null) |
1014-
Some(DataType::Duration(_)) |
1015-
Some(DataType::Interval(_)) |
1016-
Some(DataType::List(_)) |
1017-
Some(DataType::ListView(_)) |
1018-
Some(DataType::FixedSizeList(_, _)) |
1019-
Some(DataType::LargeList(_)) |
1020-
Some(DataType::LargeListView(_)) |
1021-
Some(DataType::Struct(_)) |
1022-
Some(DataType::Union(_, _)) |
1023-
Some(DataType::Map(_, _)) |
1024-
Some(DataType::RunEndEncoded(_, _)) => {
1008+
DataType::Null |
1009+
DataType::Duration(_) |
1010+
DataType::Interval(_) |
1011+
DataType::List(_) |
1012+
DataType::ListView(_) |
1013+
DataType::FixedSizeList(_, _) |
1014+
DataType::LargeList(_) |
1015+
DataType::LargeListView(_) |
1016+
DataType::Struct(_) |
1017+
DataType::Union(_, _) |
1018+
DataType::Map(_, _) |
1019+
DataType::RunEndEncoded(_, _) => {
10251020
let len = $iterator.count();
10261021
// don't know how to extract statistics, so return a null array
1027-
Ok(new_null_array($data_type.unwrap(), len))
1022+
Ok(new_null_array($data_type, len))
10281023
},
1029-
None => unimplemented!() // not sure how to handle this
10301024
}
10311025
}
10321026
}
@@ -1054,10 +1048,7 @@ fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>(
10541048

10551049
/// Extracts the min statistics from an iterator
10561050
/// of parquet page [`Index`]'es to an [`ArrayRef`]
1057-
pub(crate) fn min_page_statistics<'a, I>(
1058-
data_type: Option<&DataType>,
1059-
iterator: I,
1060-
) -> Result<ArrayRef>
1051+
pub(crate) fn min_page_statistics<'a, I>(data_type: &DataType, iterator: I) -> Result<ArrayRef>
10611052
where
10621053
I: Iterator<Item = (usize, &'a Index)>,
10631054
{
@@ -1066,10 +1057,7 @@ where
10661057

10671058
/// Extracts the max statistics from an iterator
10681059
/// of parquet page [`Index`]'es to an [`ArrayRef`]
1069-
pub(crate) fn max_page_statistics<'a, I>(
1070-
data_type: Option<&DataType>,
1071-
iterator: I,
1072-
) -> Result<ArrayRef>
1060+
pub(crate) fn max_page_statistics<'a, I>(data_type: &DataType, iterator: I) -> Result<ArrayRef>
10731061
where
10741062
I: Iterator<Item = (usize, &'a Index)>,
10751063
{
@@ -1439,7 +1427,7 @@ impl<'a> StatisticsConverter<'a> {
14391427
(*num_data_pages, column_page_index_per_row_group_per_column)
14401428
});
14411429

1442-
min_page_statistics(Some(data_type), iter)
1430+
min_page_statistics(data_type, iter)
14431431
}
14441432

14451433
/// Extract the maximum values from Data Page statistics.
@@ -1470,7 +1458,7 @@ impl<'a> StatisticsConverter<'a> {
14701458
(*num_data_pages, column_page_index_per_row_group_per_column)
14711459
});
14721460

1473-
max_page_statistics(Some(data_type), iter)
1461+
max_page_statistics(data_type, iter)
14741462
}
14751463

14761464
/// Returns a [`UInt64Array`] with null counts for each data page.

0 commit comments

Comments
 (0)