From 6d8c7ad9361021c66330c95e8f137e661efa18b7 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Thu, 13 Mar 2025 17:53:25 +0800 Subject: [PATCH 1/2] feat: support ApproxDistinct with utf8view --- .../src/approx_distinct.rs | 40 ++++++++++++++++++- .../test_files/aggregate_skip_partial.slt | 21 ++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs b/datafusion/functions-aggregate/src/approx_distinct.rs index 1d378fff176f..82eeefba19a8 100644 --- a/datafusion/functions-aggregate/src/approx_distinct.rs +++ b/datafusion/functions-aggregate/src/approx_distinct.rs @@ -18,7 +18,7 @@ //! Defines physical expressions that can evaluated at runtime during query execution use crate::hyperloglog::HyperLogLog; -use arrow::array::BinaryArray; +use arrow::array::{BinaryArray, StringViewArray}; use arrow::array::{ GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, }; @@ -126,6 +126,28 @@ where } } +#[derive(Debug)] +struct StringViewHLLAccumulator +where + T: OffsetSizeTrait, +{ + hll: HyperLogLog, + phantom_data: PhantomData, +} + +impl StringViewHLLAccumulator +where + T: OffsetSizeTrait, +{ + /// new approx_distinct accumulator + pub fn new() -> Self { + Self { + hll: HyperLogLog::new(), + phantom_data: PhantomData, + } + } +} + #[derive(Debug)] struct BinaryHLLAccumulator where @@ -197,6 +219,21 @@ where default_accumulator_impl!(); } +impl Accumulator for StringViewHLLAccumulator +where + T: OffsetSizeTrait, +{ + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + let array: &StringViewArray = downcast_value!(values[0], StringViewArray); + // flatten because we would skip nulls + self.hll + .extend(array.iter().flatten().map(|s| s.to_string())); + Ok(()) + } + + default_accumulator_impl!(); +} + impl Accumulator for StringHLLAccumulator where T: OffsetSizeTrait, @@ -311,6 +348,7 @@ impl AggregateUDFImpl for ApproxDistinct { DataType::Int64 => Box::new(NumericHLLAccumulator::::new()), DataType::Utf8 => Box::new(StringHLLAccumulator::::new()), DataType::LargeUtf8 => Box::new(StringHLLAccumulator::::new()), + DataType::Utf8View => Box::new(StringViewHLLAccumulator::::new()), DataType::Binary => Box::new(BinaryHLLAccumulator::::new()), DataType::LargeBinary => Box::new(BinaryHLLAccumulator::::new()), other => { diff --git a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt index 3a4d641abf68..163e27f435cb 100644 --- a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt +++ b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt @@ -298,6 +298,27 @@ SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100 GROU 4 5 23 5 5 14 +# Test approx_distinct for varchar(with Utf8View) / int +statement ok +CREATE TABLE aggregate_test_100_utf8view AS SELECT + arrow_cast(c1, 'Utf8View') as c1, + c2, + c5 +FROM aggregate_test_100; + +# Test approx_distinct for varchar / int +query III +SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100_utf8view GROUP BY c2 ORDER BY c2; +---- +1 5 22 +2 5 22 +3 5 19 +4 5 23 +5 5 14 + +statement ok +DROP TABLE aggregate_test_100_utf8view; + # Test count with nullable fields query III SELECT c2, count(c3), count(c11) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2; From 473151cf512047914802968cc4f919d7bf0b65f5 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Thu, 13 Mar 2025 22:58:50 +0800 Subject: [PATCH 2/2] Address comment --- datafusion/functions-aggregate/src/approx_distinct.rs | 1 - datafusion/sqllogictest/test_files/aggregate_skip_partial.slt | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs b/datafusion/functions-aggregate/src/approx_distinct.rs index 82eeefba19a8..c97dba1925ca 100644 --- a/datafusion/functions-aggregate/src/approx_distinct.rs +++ b/datafusion/functions-aggregate/src/approx_distinct.rs @@ -139,7 +139,6 @@ impl StringViewHLLAccumulator where T: OffsetSizeTrait, { - /// new approx_distinct accumulator pub fn new() -> Self { Self { hll: HyperLogLog::new(), diff --git a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt index 163e27f435cb..8755918cd16c 100644 --- a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt +++ b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt @@ -306,7 +306,7 @@ CREATE TABLE aggregate_test_100_utf8view AS SELECT c5 FROM aggregate_test_100; -# Test approx_distinct for varchar / int +# Test approx_distinct for varchar(with Utf8View) / int query III SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100_utf8view GROUP BY c2 ORDER BY c2; ----