Skip to content

Commit dd9f55f

Browse files
committed
Implement conversion from ColumnStatistics to NullableInterval
This change addresses part of apache#10456.
1 parent a2eca29 commit dd9f55f

File tree

1 file changed

+129
-13
lines changed

1 file changed

+129
-13
lines changed

datafusion/expr/src/interval_arithmetic.rs

+129-13
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@ use arrow::compute::{cast_with_options, CastOptions};
2828
use arrow::datatypes::DataType;
2929
use arrow::datatypes::{IntervalUnit, TimeUnit};
3030
use datafusion_common::rounding::{alter_fp_rounding_mode, next_down, next_up};
31-
use datafusion_common::{internal_err, Result, ScalarValue};
31+
use datafusion_common::stats::Precision;
32+
use datafusion_common::{
33+
internal_err, ColumnStatistics, DataFusionError, Result, ScalarValue,
34+
};
3235

3336
macro_rules! get_extreme_value {
3437
($extreme:ident, $value:expr) => {
@@ -1469,6 +1472,8 @@ pub enum NullableInterval {
14691472
MaybeNull { values: Interval },
14701473
/// The value is definitely not null, and is within the specified range.
14711474
NotNull { values: Interval },
1475+
/// Nothing is known about the value's nullability or range, e.g. because
/// the statistics it would be derived from are absent.
1476+
Unknown,
14721477
}
14731478

14741479
impl Display for NullableInterval {
@@ -1479,6 +1484,7 @@ impl Display for NullableInterval {
14791484
write!(f, "NullableInterval: {} U {{NULL}}", values)
14801485
}
14811486
Self::NotNull { values } => write!(f, "NullableInterval: {}", values),
1487+
Self::Unknown => write!(f, "NullableInterval: Unknown"),
14821488
}
14831489
}
14841490
}
@@ -1501,36 +1507,82 @@ impl From<ScalarValue> for NullableInterval {
15011507
}
15021508
}
15031509

1510+
/// Converts column-level statistics into a [`NullableInterval`].
///
/// Exact and inexact statistics are treated alike. If any of `null_count`,
/// `distinct_count`, `min_value` or `max_value` is `Precision::Absent`, the
/// conversion yields `NullableInterval::Unknown`.
impl From<ColumnStatistics> for NullableInterval {
1511+
/// Returns `NotNull` over `[min_value, max_value]` when `null_count` is zero,
/// `Null` (typed after `min_value`) when `null_count == distinct_count`, and
/// `MaybeNull` over `[min_value, max_value]` otherwise.
fn from(stats: ColumnStatistics) -> Self {
1512+
let null_count = match stats.null_count {
1513+
Precision::Exact(value) | Precision::Inexact(value) => value,
1514+
Precision::Absent => return NullableInterval::Unknown, // Insufficient data
1515+
};
1516+
1517+
let distinct_count = match stats.distinct_count {
1518+
Precision::Exact(value) | Precision::Inexact(value) => value,
1519+
Precision::Absent => return NullableInterval::Unknown, // Insufficient data
1520+
};
1521+
1522+
let lower_value = match stats.min_value {
1523+
// NOTE(review): `stats` is taken by value, so the scalar could be moved
// out here instead of being borrowed with `ref` and cloned.
Precision::Exact(ref value) | Precision::Inexact(ref value) => value.clone(),
1524+
Precision::Absent => return NullableInterval::Unknown, // Insufficient data
1525+
};
1526+
1527+
let upper_value = match stats.max_value {
1528+
Precision::Exact(ref value) | Precision::Inexact(ref value) => value.clone(),
1529+
Precision::Absent => return NullableInterval::Unknown,
1530+
};
1531+
1532+
// The data type is taken from the lower bound only; the upper bound is
// assumed to have the same type — TODO confirm.
let datatype = lower_value.data_type();
1533+
1534+
if null_count == 0 {
1535+
NullableInterval::NotNull {
1536+
// NOTE(review): the interval is built directly from its fields, so
// nothing here checks that the bounds are ordered or share a type.
values: Interval {
1537+
lower: lower_value,
1538+
upper: upper_value,
1539+
},
1540+
}
1541+
// NOTE(review): this treats "as many nulls as distinct values" as "every
// value is null". A distinct count is not a row count, so confirm that
// this is the intended all-null test.
} else if null_count == distinct_count {
1542+
NullableInterval::Null { datatype }
1543+
} else {
1544+
NullableInterval::MaybeNull {
1545+
values: Interval {
1546+
lower: lower_value,
1547+
upper: upper_value,
1548+
},
1549+
}
1550+
}
1551+
}
1552+
}
1553+
15041554
impl NullableInterval {
15051555
/// Get the values interval, or None if this interval is definitely null or
/// is `Unknown` (neither of which carries a values interval).
15061556
pub fn values(&self) -> Option<&Interval> {
15071557
match self {
1508-
Self::Null { .. } => None,
1558+
Self::Null { .. } | Self::Unknown => None,
15091559
Self::MaybeNull { values } | Self::NotNull { values } => Some(values),
15101560
}
15111561
}
15121562

15131563
/// Get the data type, or `None` for `Unknown`, which carries no data type.
///
/// `Null` reports its stored `datatype`; `MaybeNull` and `NotNull` report the
/// data type of their values interval.
1514-
pub fn data_type(&self) -> DataType {
1564+
pub fn data_type(&self) -> Option<DataType> {
15151565
match self {
1516-
Self::Null { datatype } => datatype.clone(),
1517-
Self::MaybeNull { values } | Self::NotNull { values } => values.data_type(),
1566+
Self::Null { datatype } => Some(datatype.clone()),
1567+
Self::MaybeNull { values } | Self::NotNull { values } => {
1568+
Some(values.data_type())
1569+
}
1570+
Self::Unknown => None,
15181571
}
15191572
}
15201573

15211574
/// Return true if the value is definitely true (and not null).
///
/// Only a `NotNull` interval equal to `Interval::CERTAINLY_TRUE` qualifies;
/// `Null`, `MaybeNull` and `Unknown` always return false.
15221575
pub fn is_certainly_true(&self) -> bool {
15231576
match self {
1524-
Self::Null { .. } | Self::MaybeNull { .. } => false,
1577+
Self::Null { .. } | Self::MaybeNull { .. } | Self::Unknown => false,
15251578
Self::NotNull { values } => values == &Interval::CERTAINLY_TRUE,
15261579
}
15271580
}
15281581

15291582
/// Return true if the value is definitely false (and not null).
///
/// Only a `NotNull` interval equal to `Interval::CERTAINLY_FALSE` qualifies;
/// `Null`, `MaybeNull` and `Unknown` always return false.
15301583
pub fn is_certainly_false(&self) -> bool {
15311584
match self {
1532-
Self::Null { .. } => false,
1533-
Self::MaybeNull { .. } => false,
1585+
Self::Null { .. } | Self::MaybeNull { .. } | Self::Unknown => false,
15341586
Self::NotNull { values } => values == &Interval::CERTAINLY_FALSE,
15351587
}
15361588
}
@@ -1547,6 +1599,7 @@ impl NullableInterval {
15471599
Self::NotNull { values } => Ok(Self::NotNull {
15481600
values: values.not()?,
15491601
}),
1602+
Self::Unknown => Ok(Self::Unknown),
15501603
}
15511604
}
15521605

@@ -1640,9 +1693,12 @@ impl NullableInterval {
16401693
datatype: DataType::Boolean,
16411694
})
16421695
} else {
1643-
Ok(Self::Null {
1644-
datatype: self.data_type(),
1645-
})
1696+
match self.data_type() {
1697+
Some(datatype) => Ok(Self::Null { datatype }),
1698+
None => Err(DataFusionError::NotImplemented(
1699+
"Cannot determine data type for operation".to_string(),
1700+
)),
1701+
}
16461702
}
16471703
}
16481704
}
@@ -1714,10 +1770,13 @@ impl NullableInterval {
17141770

17151771
#[cfg(test)]
17161772
mod tests {
1717-
use crate::interval_arithmetic::{next_value, prev_value, satisfy_greater, Interval};
1773+
use crate::interval_arithmetic::{
1774+
next_value, prev_value, satisfy_greater, Interval, NullableInterval,
1775+
};
17181776

17191777
use arrow::datatypes::DataType;
1720-
use datafusion_common::{Result, ScalarValue};
1778+
use datafusion_common::stats::Precision;
1779+
use datafusion_common::{ColumnStatistics, Result, ScalarValue};
17211780

17221781
#[test]
17231782
fn test_next_prev_value() -> Result<()> {
@@ -3212,7 +3271,64 @@ mod tests {
32123271

32133272
Ok(())
32143273
}
3274+
// Exercises each outcome of `NullableInterval::from(ColumnStatistics)`:
// `Null`, `NotNull`, `MaybeNull` and `Unknown`.
#[test]
3275+
fn test_interval_from_column_statistics() {
3276+
// null_count equals distinct_count: expected to be definitely null, typed
// Int32 after min_value.
// NOTE(review): non-null min/max bounds (1..100) alongside an all-null
// expectation look inconsistent — confirm the fixture.
let stats_null = ColumnStatistics {
3277+
null_count: Precision::Exact(10),
3278+
max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
3279+
min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
3280+
distinct_count: Precision::Exact(10),
3281+
};
3282+
assert_eq!(
3283+
NullableInterval::from(stats_null),
3284+
NullableInterval::Null {
3285+
datatype: DataType::Int32
3286+
}
3287+
);
3288+
3289+
// Zero nulls: expected to be NotNull over [min_value, max_value].
let stats_not_null = ColumnStatistics {
3290+
null_count: Precision::Exact(0),
3291+
max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
3292+
min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
3293+
distinct_count: Precision::Exact(20),
3294+
};
3295+
assert_eq!(
3296+
NullableInterval::from(stats_not_null),
3297+
NullableInterval::NotNull {
3298+
values: Interval {
3299+
lower: ScalarValue::Int32(Some(1)),
3300+
upper: ScalarValue::Int32(Some(100)),
3301+
}
3302+
}
3303+
);
32153304

3305+
// Some nulls, and null_count differs from distinct_count: expected to be
// MaybeNull over [min_value, max_value].
let stats_maybe_null = ColumnStatistics {
3306+
null_count: Precision::Exact(5),
3307+
max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
3308+
min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
3309+
distinct_count: Precision::Exact(20),
3310+
};
3311+
assert_eq!(
3312+
NullableInterval::from(stats_maybe_null),
3313+
NullableInterval::MaybeNull {
3314+
values: Interval {
3315+
lower: ScalarValue::Int32(Some(1)),
3316+
upper: ScalarValue::Int32(Some(100)),
3317+
}
3318+
}
3319+
);
3320+
3321+
// An absent null_count alone yields Unknown, even though the remaining
// statistics are exact.
let stats_unknown = ColumnStatistics {
3322+
null_count: Precision::Absent,
3323+
max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
3324+
min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
3325+
distinct_count: Precision::Exact(20),
3326+
};
3327+
assert_eq!(
3328+
NullableInterval::from(stats_unknown),
3329+
NullableInterval::Unknown
3330+
);
3331+
}
32163332
#[test]
32173333
fn test_interval_display() {
32183334
let interval = Interval::make(Some(0.25_f32), Some(0.50_f32)).unwrap();

0 commit comments

Comments
 (0)