|
4 | 4 |
|
5 | 5 | use std::{ |
6 | 6 | collections::{hash_map::Entry, HashMap}, |
| 7 | + ops::Sub, |
7 | 8 | sync::Arc, |
8 | 9 | }; |
9 | 10 |
|
@@ -89,18 +90,66 @@ pub fn parquet_to_datafile( |
89 | 90 | .and_modify(|x| *x += null_count as i64) |
90 | 91 | .or_insert(null_count as i64); |
91 | 92 | } |
92 | | - if let Some(distinct_count) = statistics.distinct_count_opt() { |
93 | | - distinct_counts |
94 | | - .entry(id) |
95 | | - .and_modify(|x| *x += distinct_count as i64) |
96 | | - .or_insert(distinct_count as i64); |
97 | | - } |
| 93 | + |
98 | 94 | let data_type = &schema |
99 | 95 | .fields() |
100 | 96 | .get(id as usize) |
101 | 97 | .ok_or_else(|| Error::Schema(column_name.to_string(), "".to_string()))? |
102 | 98 | .field_type; |
103 | 99 |
|
| 100 | + if let (Some(distinct_count), Some(min_bytes), Some(max_bytes)) = ( |
| 101 | + statistics.distinct_count_opt(), |
| 102 | + statistics.min_bytes_opt(), |
| 103 | + statistics.max_bytes_opt(), |
| 104 | + ) { |
| 105 | + let min = Value::try_from_bytes(min_bytes, data_type)?; |
| 106 | + let max = Value::try_from_bytes(max_bytes, data_type)?; |
| 107 | + let current_min = lower_bounds.get(&id); |
| 108 | + let current_max = upper_bounds.get(&id); |
| 109 | + match (min, max, current_min, current_max) { |
| 110 | + ( |
| 111 | + Value::Int(min), |
| 112 | + Value::Int(max), |
| 113 | + Some(Value::Int(current_min)), |
| 114 | + Some(Value::Int(current_max)), |
| 115 | + ) => { |
| 116 | + distinct_counts |
| 117 | + .entry(id) |
| 118 | + .and_modify(|x| { |
| 119 | + *x += estimate_distinct_count( |
| 120 | + &[current_min, current_max], |
| 121 | + &[&min, &max], |
| 122 | + *x, |
| 123 | + distinct_count as i64, |
| 124 | + ); |
| 125 | + }) |
| 126 | + .or_insert(distinct_count as i64); |
| 127 | + } |
| 128 | + ( |
| 129 | + Value::LongInt(min), |
| 130 | + Value::LongInt(max), |
| 131 | + Some(Value::LongInt(current_min)), |
| 132 | + Some(Value::LongInt(current_max)), |
| 133 | + ) => { |
| 134 | + distinct_counts |
| 135 | + .entry(id) |
| 136 | + .and_modify(|x| { |
| 137 | + *x += estimate_distinct_count( |
| 138 | + &[current_min, current_max], |
| 139 | + &[&min, &max], |
| 140 | + *x, |
| 141 | + distinct_count as i64, |
| 142 | + ); |
| 143 | + }) |
| 144 | + .or_insert(distinct_count as i64); |
| 145 | + } |
| 146 | + (_, _, None, None) => { |
| 147 | + distinct_counts.entry(id).or_insert(distinct_count as i64); |
| 148 | + } |
| 149 | + _ => (), |
| 150 | + } |
| 151 | + } |
| 152 | + |
104 | 153 | if let Some(min_bytes) = statistics.min_bytes_opt() { |
105 | 154 | if let Type::Primitive(_) = &data_type { |
106 | 155 | let new = Value::try_from_bytes(min_bytes, data_type)?; |
@@ -275,3 +324,113 @@ pub fn thrift_size<T: TSerializable>(metadata: &T) -> Result<usize, Error> { |
275 | 324 | metadata.write_to_out_protocol(&mut protocol)?; |
276 | 325 | Ok(buffer.bytes_written()) |
277 | 326 | } |
| 327 | + |
| 328 | +fn range_overlap<T: Ord + Sub + Copy>( |
| 329 | + old_range: &[&T; 2], |
| 330 | + new_range: &[&T; 2], |
| 331 | +) -> <T as Sub>::Output { |
| 332 | + let overlap_start = (*old_range[0]).max(*new_range[0]); |
| 333 | + let overlap_end = (*old_range[1]).min(*new_range[1]); |
| 334 | + overlap_end - overlap_start |
| 335 | +} |
| 336 | + |
| 337 | +/// Helper trait to convert numeric types to f64 for statistical calculations. |
| 338 | +/// |
| 339 | +/// This trait provides a uniform interface for converting integer types to f64, |
| 340 | +/// which is necessary for the statistical estimation algorithms. The conversion |
| 341 | +/// may be lossy for very large i64 values (beyond 2^53), but this is acceptable |
| 342 | +/// for statistical approximations. |
| 343 | +pub trait ToF64 { |
| 344 | + /// Converts the value to f64. |
| 345 | + /// |
| 346 | + /// # Note |
| 347 | + /// |
| 348 | + /// For i64 values larger than 2^53, precision may be lost in the conversion. |
| 349 | + /// This is acceptable for statistical calculations where exact precision is |
| 350 | + /// not required. |
| 351 | + fn to_f64(self) -> f64; |
| 352 | +} |
| 353 | + |
| 354 | +impl ToF64 for i32 { |
| 355 | + fn to_f64(self) -> f64 { |
| 356 | + self as f64 |
| 357 | + } |
| 358 | +} |
| 359 | + |
| 360 | +impl ToF64 for i64 { |
| 361 | + fn to_f64(self) -> f64 { |
| 362 | + self as f64 |
| 363 | + } |
| 364 | +} |
| 365 | + |
| 366 | +/// Estimates the number of new distinct values when merging two sets of statistics. |
| 367 | +/// |
| 368 | +/// This function assumes uniform distribution of distinct values within their respective ranges |
| 369 | +/// and uses an independence approximation to estimate overlap probability. |
| 370 | +/// |
| 371 | +/// # Algorithm |
| 372 | +/// |
| 373 | +/// The estimation is split into two parts: |
| 374 | +/// 1. **Non-overlapping region**: All values in the new range that fall outside the old range |
| 375 | +/// are guaranteed to be new. |
| 376 | +/// 2. **Overlapping region**: Uses the independence approximation: |
| 377 | +/// - P(specific value not covered) = ((R-1)/R)^k |
| 378 | +/// - where R is the overlap size and k is the expected number of old values in the overlap |
| 379 | +/// - Expected new values = n2_overlap × P(not covered) |
| 380 | +/// |
| 381 | +/// # Parameters |
| 382 | +/// |
| 383 | +/// * `old_range` - [min, max] of the existing value range |
| 384 | +/// * `new_range` - [min, max] of the new value range |
| 385 | +/// * `old_distinct_count` - Number of distinct values in the old range |
| 386 | +/// * `new_distinct_count` - Number of distinct values in the new range |
| 387 | +/// |
| 388 | +/// # Returns |
| 389 | +/// |
| 390 | +/// Estimated number of new distinct values to add to the running total |
| 391 | +/// |
| 392 | +/// # Example |
| 393 | +/// |
| 394 | +/// ```ignore |
| 395 | +/// // Old range [0, 1000] with 100 distinct values |
| 396 | +/// // New range [500, 1500] with 50 distinct values |
| 397 | +/// let new_count = estimate_distinct_count(&[&0, &1000], &[&500, &1500], 100, 50); |
| 398 | +/// ``` |
| 399 | +pub fn estimate_distinct_count<T>( |
| 400 | + old_range: &[&T; 2], |
| 401 | + new_range: &[&T; 2], |
| 402 | + old_distinct_count: i64, |
| 403 | + new_distinct_count: i64, |
| 404 | +) -> i64 |
| 405 | +where |
| 406 | + T: Ord + Sub<Output = T> + Copy + Default + ToF64, |
| 407 | +{ |
| 408 | + let new_range_size = (*new_range[1] - *new_range[0]).to_f64(); |
| 409 | + let current_range_size = (*old_range[1] - *old_range[0]).to_f64(); |
| 410 | + let overlap = range_overlap(old_range, new_range); |
| 411 | + let overlap_size: f64 = if overlap >= T::default() { |
| 412 | + overlap.to_f64() |
| 413 | + } else { |
| 414 | + 0.0 |
| 415 | + }; |
| 416 | + let n2 = new_distinct_count as f64; |
| 417 | + let n1 = old_distinct_count as f64; |
| 418 | + |
| 419 | + // Values outside overlap are definitely new |
| 420 | + let outside_overlap = ((new_range_size - overlap_size) / new_range_size * n2).max(0.0); |
| 421 | + |
| 422 | + // For overlap region: estimate how many new values exist |
| 423 | + // using independence approximation: P(value not covered) = ((R-1)/R)^k |
| 424 | + // Expected new values in overlap = n2_overlap * ((R-1)/R)^(n1_overlap) |
| 425 | + let n2_overlap = (overlap_size / new_range_size * n2).max(0.0); |
| 426 | + let expected_n1_in_overlap = (overlap_size / current_range_size * n1).max(0.0); |
| 427 | + |
| 428 | + let new_in_overlap = if overlap_size > 0.0 { |
| 429 | + let prob_not_covered = ((overlap_size - 1.0) / overlap_size).powf(expected_n1_in_overlap); |
| 430 | + n2_overlap * prob_not_covered |
| 431 | + } else { |
| 432 | + 0.0 |
| 433 | + }; |
| 434 | + |
| 435 | + (outside_overlap + new_in_overlap).round() as i64 |
| 436 | +} |
0 commit comments