Merge pull request JanKaul#265 from JanKaul/improve-cardinality-estimate

JanKaul · web-flow · commit 07d3b8cb49e5 · 2025-10-05T18:20:50.000+02:00
Improve cardinality estimate
diff --git a/datafusion_iceberg/src/statistics.rs b/datafusion_iceberg/src/statistics.rs
@@ -4,6 +4,7 @@ use datafusion::{
     scalar::ScalarValue,
 };
 use iceberg_rust::error::Error;
+use iceberg_rust::file_format::parquet::estimate_distinct_count;
 use iceberg_rust::spec::{
     manifest::{ManifestEntry, Status},
     schema::Schema,
@@ -45,12 +46,16 @@ pub(crate) fn statistics_from_datafiles(
                         .column_statistics
                         .into_iter()
                         .zip(column_stats)
-                        .map(|(acc, x)| ColumnStatistics {
-                            null_count: acc.null_count.add(&x.null_count),
-                            max_value: acc.max_value.max(&x.max_value),
-                            min_value: acc.min_value.min(&x.min_value),
-                            distinct_count: acc.distinct_count.add(&x.distinct_count),
-                            sum_value: acc.sum_value.add(&x.sum_value),
+                        .map(|(acc, x)| {
+                            let new_distinct_count = new_distinct_count(&acc, &x);
+
+                            ColumnStatistics {
+                                null_count: acc.null_count.add(&x.null_count),
+                                max_value: acc.max_value.max(&x.max_value),
+                                min_value: acc.min_value.min(&x.min_value),
+                                distinct_count: new_distinct_count,
+                                sum_value: acc.sum_value.add(&x.sum_value),
+                            }
                         })
                         .collect(),
                 }
@@ -134,3 +139,49 @@ fn convert_value_to_scalar_value(value: Value) -> Result<ScalarValue, Error> {
         )),
     }
 }
+
+fn new_distinct_count(acc: &ColumnStatistics, x: &ColumnStatistics) -> Precision<usize> {
+    match (
+        &acc.distinct_count,
+        &x.distinct_count,
+        &acc.min_value,
+        &acc.max_value,
+        &x.min_value,
+        &x.max_value,
+    ) {
+        (
+            Precision::Exact(old_count),
+            Precision::Exact(new_count),
+            Precision::Exact(ScalarValue::Int32(Some(old_min))),
+            Precision::Exact(ScalarValue::Int32(Some(old_max))),
+            Precision::Exact(ScalarValue::Int32(Some(new_min))),
+            Precision::Exact(ScalarValue::Int32(Some(new_max))),
+        ) => {
+            let estimated = estimate_distinct_count(
+                &[old_min, old_max],
+                &[new_min, new_max],
+                *old_count as i64,
+                *new_count as i64,
+            );
+            Precision::Inexact(*old_count + estimated as usize)
+        }
+        (
+            Precision::Exact(old_count),
+            Precision::Exact(new_count),
+            Precision::Exact(ScalarValue::Int64(Some(old_min))),
+            Precision::Exact(ScalarValue::Int64(Some(old_max))),
+            Precision::Exact(ScalarValue::Int64(Some(new_min))),
+            Precision::Exact(ScalarValue::Int64(Some(new_max))),
+        ) => {
+            let estimated = estimate_distinct_count(
+                &[old_min, old_max],
+                &[new_min, new_max],
+                *old_count as i64,
+                *new_count as i64,
+            );
+            Precision::Inexact(*old_count + estimated as usize)
+        }
+        (Precision::Absent, Precision::Exact(_), _, _, _, _) => x.distinct_count,
+        _ => acc.distinct_count.add(&x.distinct_count),
+    }
+}
diff --git a/iceberg-rust/src/file_format/parquet.rs b/iceberg-rust/src/file_format/parquet.rs
@@ -4,6 +4,7 @@
 
 use std::{
     collections::{hash_map::Entry, HashMap},
+    ops::Sub,
     sync::Arc,
 };
 
@@ -89,18 +90,66 @@ pub fn parquet_to_datafile(
                         .and_modify(|x| *x += null_count as i64)
                         .or_insert(null_count as i64);
                 }
-                if let Some(distinct_count) = statistics.distinct_count_opt() {
-                    distinct_counts
-                        .entry(id)
-                        .and_modify(|x| *x += distinct_count as i64)
-                        .or_insert(distinct_count as i64);
-                }
+
                 let data_type = &schema
                     .fields()
                     .get(id as usize)
                     .ok_or_else(|| Error::Schema(column_name.to_string(), "".to_string()))?
                     .field_type;
 
+                if let (Some(distinct_count), Some(min_bytes), Some(max_bytes)) = (
+                    statistics.distinct_count_opt(),
+                    statistics.min_bytes_opt(),
+                    statistics.max_bytes_opt(),
+                ) {
+                    let min = Value::try_from_bytes(min_bytes, data_type)?;
+                    let max = Value::try_from_bytes(max_bytes, data_type)?;
+                    let current_min = lower_bounds.get(&id);
+                    let current_max = upper_bounds.get(&id);
+                    match (min, max, current_min, current_max) {
+                        (
+                            Value::Int(min),
+                            Value::Int(max),
+                            Some(Value::Int(current_min)),
+                            Some(Value::Int(current_max)),
+                        ) => {
+                            distinct_counts
+                                .entry(id)
+                                .and_modify(|x| {
+                                    *x += estimate_distinct_count(
+                                        &[current_min, current_max],
+                                        &[&min, &max],
+                                        *x,
+                                        distinct_count as i64,
+                                    );
+                                })
+                                .or_insert(distinct_count as i64);
+                        }
+                        (
+                            Value::LongInt(min),
+                            Value::LongInt(max),
+                            Some(Value::LongInt(current_min)),
+                            Some(Value::LongInt(current_max)),
+                        ) => {
+                            distinct_counts
+                                .entry(id)
+                                .and_modify(|x| {
+                                    *x += estimate_distinct_count(
+                                        &[current_min, current_max],
+                                        &[&min, &max],
+                                        *x,
+                                        distinct_count as i64,
+                                    );
+                                })
+                                .or_insert(distinct_count as i64);
+                        }
+                        (_, _, None, None) => {
+                            distinct_counts.entry(id).or_insert(distinct_count as i64);
+                        }
+                        _ => (),
+                    }
+                }
+
                 if let Some(min_bytes) = statistics.min_bytes_opt() {
                     if let Type::Primitive(_) = &data_type {
                         let new = Value::try_from_bytes(min_bytes, data_type)?;
@@ -275,3 +324,113 @@ pub fn thrift_size<T: TSerializable>(metadata: &T) -> Result<usize, Error> {
     metadata.write_to_out_protocol(&mut protocol)?;
     Ok(buffer.bytes_written())
 }
+
+fn range_overlap<T: Ord + Sub + Copy>(
+    old_range: &[&T; 2],
+    new_range: &[&T; 2],
+) -> <T as Sub>::Output {
+    let overlap_start = (*old_range[0]).max(*new_range[0]);
+    let overlap_end = (*old_range[1]).min(*new_range[1]);
+    overlap_end - overlap_start
+}
+
+/// Helper trait to convert numeric types to f64 for statistical calculations.
+///
+/// This trait provides a uniform interface for converting integer types to f64,
+/// which is necessary for the statistical estimation algorithms. The conversion
+/// may be lossy for very large i64 values (beyond 2^53), but this is acceptable
+/// for statistical approximations.
+pub trait ToF64 {
+    /// Converts the value to f64.
+    ///
+    /// # Note
+    ///
+    /// For i64 values larger than 2^53, precision may be lost in the conversion.
+    /// This is acceptable for statistical calculations where exact precision is
+    /// not required.
+    fn to_f64(self) -> f64;
+}
+
+impl ToF64 for i32 {
+    fn to_f64(self) -> f64 {
+        self as f64
+    }
+}
+
+impl ToF64 for i64 {
+    fn to_f64(self) -> f64 {
+        self as f64
+    }
+}
+
+/// Estimates the number of new distinct values when merging two sets of statistics.
+///
+/// This function assumes uniform distribution of distinct values within their respective ranges
+/// and uses an independence approximation to estimate overlap probability.
+///
+/// # Algorithm
+///
+/// The estimation is split into two parts:
+/// 1. **Non-overlapping region**: All values in the new range that fall outside the old range
+///    are guaranteed to be new.
+/// 2. **Overlapping region**: Uses the independence approximation:
+///    - P(specific value not covered) = ((R-1)/R)^k
+///    - where R is the overlap size and k is the expected number of old values in the overlap
+///    - Expected new values = n2_overlap × P(not covered)
+///
+/// # Parameters
+///
+/// * `old_range` - [min, max] of the existing value range
+/// * `new_range` - [min, max] of the new value range
+/// * `old_distinct_count` - Number of distinct values in the old range
+/// * `new_distinct_count` - Number of distinct values in the new range
+///
+/// # Returns
+///
+/// Estimated number of new distinct values to add to the running total
+///
+/// # Example
+///
+/// ```ignore
+/// // Old range [0, 1000] with 100 distinct values
+/// // New range [500, 1500] with 50 distinct values
+/// let new_count = estimate_distinct_count(&[&0, &1000], &[&500, &1500], 100, 50);
+/// ```
+pub fn estimate_distinct_count<T>(
+    old_range: &[&T; 2],
+    new_range: &[&T; 2],
+    old_distinct_count: i64,
+    new_distinct_count: i64,
+) -> i64
+where
+    T: Ord + Sub<Output = T> + Copy + Default + ToF64,
+{
+    let new_range_size = (*new_range[1] - *new_range[0]).to_f64();
+    let current_range_size = (*old_range[1] - *old_range[0]).to_f64();
+    let overlap = range_overlap(old_range, new_range);
+    let overlap_size: f64 = if overlap >= T::default() {
+        overlap.to_f64()
+    } else {
+        0.0
+    };
+    let n2 = new_distinct_count as f64;
+    let n1 = old_distinct_count as f64;
+
+    // Values outside overlap are definitely new
+    let outside_overlap = ((new_range_size - overlap_size) / new_range_size * n2).max(0.0);
+
+    // For overlap region: estimate how many new values exist
+    // using independence approximation: P(value not covered) = ((R-1)/R)^k
+    // Expected new values in overlap = n2_overlap * ((R-1)/R)^(n1_overlap)
+    let n2_overlap = (overlap_size / new_range_size * n2).max(0.0);
+    let expected_n1_in_overlap = (overlap_size / current_range_size * n1).max(0.0);
+
+    let new_in_overlap = if overlap_size > 0.0 {
+        let prob_not_covered = ((overlap_size - 1.0) / overlap_size).powf(expected_n1_in_overlap);
+        n2_overlap * prob_not_covered
+    } else {
+        0.0
+    };
+
+    (outside_overlap + new_in_overlap).round() as i64
+}