diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs b/datafusion/functions-aggregate/src/approx_distinct.rs index 1d378fff176f..c97dba1925ca 100644 --- a/datafusion/functions-aggregate/src/approx_distinct.rs +++ b/datafusion/functions-aggregate/src/approx_distinct.rs @@ -18,7 +18,7 @@ //! Defines physical expressions that can evaluated at runtime during query execution use crate::hyperloglog::HyperLogLog; -use arrow::array::BinaryArray; +use arrow::array::{BinaryArray, StringViewArray}; use arrow::array::{ GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, }; @@ -126,6 +126,27 @@ where } } +#[derive(Debug)] +struct StringViewHLLAccumulator +where + T: OffsetSizeTrait, +{ + hll: HyperLogLog, + phantom_data: PhantomData, +} + +impl StringViewHLLAccumulator +where + T: OffsetSizeTrait, +{ + pub fn new() -> Self { + Self { + hll: HyperLogLog::new(), + phantom_data: PhantomData, + } + } +} + #[derive(Debug)] struct BinaryHLLAccumulator where @@ -197,6 +218,21 @@ where default_accumulator_impl!(); } +impl Accumulator for StringViewHLLAccumulator +where + T: OffsetSizeTrait, +{ + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + let array: &StringViewArray = downcast_value!(values[0], StringViewArray); + // flatten because we would skip nulls + self.hll + .extend(array.iter().flatten().map(|s| s.to_string())); + Ok(()) + } + + default_accumulator_impl!(); +} + impl Accumulator for StringHLLAccumulator where T: OffsetSizeTrait, @@ -311,6 +347,7 @@ impl AggregateUDFImpl for ApproxDistinct { DataType::Int64 => Box::new(NumericHLLAccumulator::::new()), DataType::Utf8 => Box::new(StringHLLAccumulator::::new()), DataType::LargeUtf8 => Box::new(StringHLLAccumulator::::new()), + DataType::Utf8View => Box::new(StringViewHLLAccumulator::::new()), DataType::Binary => Box::new(BinaryHLLAccumulator::::new()), DataType::LargeBinary => Box::new(BinaryHLLAccumulator::::new()), other => { diff --git a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt index 3a4d641abf68..8755918cd16c 100644 --- a/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt +++ b/datafusion/sqllogictest/test_files/aggregate_skip_partial.slt @@ -298,6 +298,27 @@ SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100 GROU 4 5 23 5 5 14 +# Test approx_distinct for varchar(with Utf8View) / int +statement ok +CREATE TABLE aggregate_test_100_utf8view AS SELECT + arrow_cast(c1, 'Utf8View') as c1, + c2, + c5 +FROM aggregate_test_100; + +# Test approx_distinct for varchar(with Utf8View) / int +query III +SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100_utf8view GROUP BY c2 ORDER BY c2; +---- +1 5 22 +2 5 22 +3 5 19 +4 5 23 +5 5 14 + +statement ok +DROP TABLE aggregate_test_100_utf8view; + # Test count with nullable fields query III SELECT c2, count(c3), count(c11) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2;