This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new f8063e840e Add `ColumnStatistics::Sum` (#14074)
f8063e840e is described below
commit f8063e840ecd80c36a0d9d8f32f3d1e38e79de1f
Author: Nicholas Gates <[email protected]>
AuthorDate: Tue Jan 28 20:19:44 2025 +0000
Add `ColumnStatistics::Sum` (#14074)
* Add sum statistic
* Add sum statistic
* Add sum statistic
* Add sum statistic
* Add sum statistic
* Add sum statistic
* Add tests and Cargo fmt
* fix up
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/common/src/stats.rs | 173 +++++++++++++++++++--
datafusion/core/src/datasource/statistics.rs | 6 +-
.../core/tests/custom_sources_cases/statistics.rs | 2 +
datafusion/physical-plan/src/common.rs | 3 +
datafusion/physical-plan/src/filter.rs | 2 +
datafusion/physical-plan/src/joins/cross_join.rs | 52 ++++++-
datafusion/physical-plan/src/joins/utils.rs | 1 +
datafusion/physical-plan/src/memory.rs | 1 +
datafusion/physical-plan/src/projection.rs | 7 +
datafusion/physical-plan/src/union.rs | 10 ++
datafusion/physical-plan/src/values.rs | 1 +
.../proto-common/proto/datafusion_common.proto | 1 +
datafusion/proto-common/src/from_proto/mod.rs | 5 +
datafusion/proto-common/src/generated/pbjson.rs | 18 +++
datafusion/proto-common/src/generated/prost.rs | 2 +
datafusion/proto-common/src/to_proto/mod.rs | 1 +
.../proto/src/generated/datafusion_proto_common.rs | 2 +
17 files changed, 269 insertions(+), 18 deletions(-)
diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs
index d2ce965c5c..dd8848d249 100644
--- a/datafusion/common/src/stats.rs
+++ b/datafusion/common/src/stats.rs
@@ -21,7 +21,7 @@ use std::fmt::{self, Debug, Display};
use crate::{Result, ScalarValue};
-use arrow_schema::{Schema, SchemaRef};
+use arrow_schema::{DataType, Schema, SchemaRef};
/// Represents a value with a degree of certainty. `Precision` is used to
/// propagate information the precision of statistical values.
@@ -170,24 +170,63 @@ impl Precision<ScalarValue> {
pub fn add(&self, other: &Precision<ScalarValue>) ->
Precision<ScalarValue> {
match (self, other) {
(Precision::Exact(a), Precision::Exact(b)) => {
- if let Ok(result) = a.add(b) {
- Precision::Exact(result)
- } else {
- Precision::Absent
- }
+ a.add(b).map(Precision::Exact).unwrap_or(Precision::Absent)
}
(Precision::Inexact(a), Precision::Exact(b))
| (Precision::Exact(a), Precision::Inexact(b))
- | (Precision::Inexact(a), Precision::Inexact(b)) => {
- if let Ok(result) = a.add(b) {
- Precision::Inexact(result)
- } else {
- Precision::Absent
- }
+ | (Precision::Inexact(a), Precision::Inexact(b)) => a
+ .add(b)
+ .map(Precision::Inexact)
+ .unwrap_or(Precision::Absent),
+ (_, _) => Precision::Absent,
+ }
+ }
+
+ /// Calculates the difference of two (possibly inexact) [`ScalarValue`] values,
+ /// conservatively propagating exactness information. If one of the input
+ /// values is [`Precision::Absent`], the result is `Absent` too.
+ pub fn sub(&self, other: &Precision<ScalarValue>) ->
Precision<ScalarValue> {
+ match (self, other) {
+ (Precision::Exact(a), Precision::Exact(b)) => {
+ a.sub(b).map(Precision::Exact).unwrap_or(Precision::Absent)
}
+ (Precision::Inexact(a), Precision::Exact(b))
+ | (Precision::Exact(a), Precision::Inexact(b))
+ | (Precision::Inexact(a), Precision::Inexact(b)) => a
+ .sub(b)
+ .map(Precision::Inexact)
+ .unwrap_or(Precision::Absent),
+ (_, _) => Precision::Absent,
+ }
+ }
+
+ /// Calculates the multiplication of two (possibly inexact) [`ScalarValue`] values,
+ /// conservatively propagating exactness information. If one of the input
+ /// values is [`Precision::Absent`], the result is `Absent` too.
+ pub fn multiply(&self, other: &Precision<ScalarValue>) ->
Precision<ScalarValue> {
+ match (self, other) {
+ (Precision::Exact(a), Precision::Exact(b)) => a
+ .mul_checked(b)
+ .map(Precision::Exact)
+ .unwrap_or(Precision::Absent),
+ (Precision::Inexact(a), Precision::Exact(b))
+ | (Precision::Exact(a), Precision::Inexact(b))
+ | (Precision::Inexact(a), Precision::Inexact(b)) => a
+ .mul_checked(b)
+ .map(Precision::Inexact)
+ .unwrap_or(Precision::Absent),
(_, _) => Precision::Absent,
}
}
+
+ /// Casts the value to the given data type, propagating exactness information.
+ pub fn cast_to(&self, data_type: &DataType) ->
Result<Precision<ScalarValue>> {
+ match self {
+ Precision::Exact(value) =>
value.cast_to(data_type).map(Precision::Exact),
+ Precision::Inexact(value) =>
value.cast_to(data_type).map(Precision::Inexact),
+ Precision::Absent => Ok(Precision::Absent),
+ }
+ }
}
impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Debug for Precision<T> {
@@ -210,6 +249,18 @@ impl<T: Debug + Clone + PartialEq + Eq + PartialOrd>
Display for Precision<T> {
}
}
+impl From<Precision<usize>> for Precision<ScalarValue> {
+ fn from(value: Precision<usize>) -> Self {
+ match value {
+ Precision::Exact(v) => Precision::Exact(ScalarValue::UInt64(Some(v
as u64))),
+ Precision::Inexact(v) => {
+ Precision::Inexact(ScalarValue::UInt64(Some(v as u64)))
+ }
+ Precision::Absent => Precision::Absent,
+ }
+ }
+}
+
/// Statistics for a relation
/// Fields are optional and can be inexact because the sources
/// sometimes provide approximate estimates for performance reasons
@@ -401,6 +452,11 @@ impl Display for Statistics {
} else {
s
};
+ let s = if cs.sum_value != Precision::Absent {
+ format!("{} Sum={}", s, cs.sum_value)
+ } else {
+ s
+ };
let s = if cs.null_count != Precision::Absent {
format!("{} Null={}", s, cs.null_count)
} else {
@@ -436,6 +492,8 @@ pub struct ColumnStatistics {
pub max_value: Precision<ScalarValue>,
/// Minimum value of column
pub min_value: Precision<ScalarValue>,
+ /// Sum value of a column
+ pub sum_value: Precision<ScalarValue>,
/// Number of distinct values
pub distinct_count: Precision<usize>,
}
@@ -458,6 +516,7 @@ impl ColumnStatistics {
null_count: Precision::Absent,
max_value: Precision::Absent,
min_value: Precision::Absent,
+ sum_value: Precision::Absent,
distinct_count: Precision::Absent,
}
}
@@ -469,6 +528,7 @@ impl ColumnStatistics {
self.null_count = self.null_count.to_inexact();
self.max_value = self.max_value.to_inexact();
self.min_value = self.min_value.to_inexact();
+ self.sum_value = self.sum_value.to_inexact();
self.distinct_count = self.distinct_count.to_inexact();
self
}
@@ -563,6 +623,26 @@ mod tests {
assert_eq!(precision1.add(&absent_precision), Precision::Absent);
}
+ #[test]
+ fn test_add_scalar() {
+ let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
+
+ assert_eq!(
+ precision.add(&Precision::Exact(ScalarValue::Int32(Some(23)))),
+ Precision::Exact(ScalarValue::Int32(Some(65))),
+ );
+ assert_eq!(
+ precision.add(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
+ Precision::Inexact(ScalarValue::Int32(Some(65))),
+ );
+ assert_eq!(
+ precision.add(&Precision::Exact(ScalarValue::Int32(None))),
+ // As per behavior of ScalarValue::add
+ Precision::Exact(ScalarValue::Int32(None)),
+ );
+ assert_eq!(precision.add(&Precision::Absent), Precision::Absent);
+ }
+
#[test]
fn test_sub() {
let precision1 = Precision::Exact(42);
@@ -575,6 +655,26 @@ mod tests {
assert_eq!(precision1.sub(&absent_precision), Precision::Absent);
}
+ #[test]
+ fn test_sub_scalar() {
+ let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
+
+ assert_eq!(
+ precision.sub(&Precision::Exact(ScalarValue::Int32(Some(23)))),
+ Precision::Exact(ScalarValue::Int32(Some(19))),
+ );
+ assert_eq!(
+ precision.sub(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
+ Precision::Inexact(ScalarValue::Int32(Some(19))),
+ );
+ assert_eq!(
+ precision.sub(&Precision::Exact(ScalarValue::Int32(None))),
+ // As per behavior of ScalarValue::sub
+ Precision::Exact(ScalarValue::Int32(None)),
+ );
+ assert_eq!(precision.sub(&Precision::Absent), Precision::Absent);
+ }
+
#[test]
fn test_multiply() {
let precision1 = Precision::Exact(6);
@@ -588,6 +688,54 @@ mod tests {
assert_eq!(precision1.multiply(&absent_precision), Precision::Absent);
}
+ #[test]
+ fn test_multiply_scalar() {
+ let precision = Precision::Exact(ScalarValue::Int32(Some(6)));
+
+ assert_eq!(
+ precision.multiply(&Precision::Exact(ScalarValue::Int32(Some(5)))),
+ Precision::Exact(ScalarValue::Int32(Some(30))),
+ );
+ assert_eq!(
+
precision.multiply(&Precision::Inexact(ScalarValue::Int32(Some(5)))),
+ Precision::Inexact(ScalarValue::Int32(Some(30))),
+ );
+ assert_eq!(
+ precision.multiply(&Precision::Exact(ScalarValue::Int32(None))),
+ // As per behavior of ScalarValue::mul_checked
+ Precision::Exact(ScalarValue::Int32(None)),
+ );
+ assert_eq!(precision.multiply(&Precision::Absent), Precision::Absent);
+ }
+
+ #[test]
+ fn test_cast_to() {
+ // Valid
+ assert_eq!(
+ Precision::Exact(ScalarValue::Int32(Some(42)))
+ .cast_to(&DataType::Int64)
+ .unwrap(),
+ Precision::Exact(ScalarValue::Int64(Some(42))),
+ );
+ assert_eq!(
+ Precision::Inexact(ScalarValue::Int32(Some(42)))
+ .cast_to(&DataType::Int64)
+ .unwrap(),
+ Precision::Inexact(ScalarValue::Int64(Some(42))),
+ );
+ // Null
+ assert_eq!(
+ Precision::Exact(ScalarValue::Int32(None))
+ .cast_to(&DataType::Int64)
+ .unwrap(),
+ Precision::Exact(ScalarValue::Int64(None)),
+ );
+ // Overflow returns error
+ assert!(Precision::Exact(ScalarValue::Int32(Some(256)))
+ .cast_to(&DataType::Int8)
+ .is_err());
+ }
+
#[test]
fn test_precision_cloning() {
// Precision<usize> is copy
@@ -646,6 +794,7 @@ mod tests {
null_count: Precision::Exact(null_count),
max_value: Precision::Exact(ScalarValue::Int64(Some(42))),
min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(4600))),
distinct_count: Precision::Exact(100),
}
}
diff --git a/datafusion/core/src/datasource/statistics.rs
b/datafusion/core/src/datasource/statistics.rs
index 201bbfd5c0..f02927619a 100644
--- a/datafusion/core/src/datasource/statistics.rs
+++ b/datafusion/core/src/datasource/statistics.rs
@@ -76,6 +76,7 @@ pub async fn get_statistics_with_limit(
col_stats_set[index].null_count = file_column.null_count;
col_stats_set[index].max_value = file_column.max_value;
col_stats_set[index].min_value = file_column.min_value;
+ col_stats_set[index].sum_value = file_column.sum_value;
}
// If the number of rows exceeds the limit, we can stop processing
@@ -113,12 +114,14 @@ pub async fn get_statistics_with_limit(
null_count: file_nc,
max_value: file_max,
min_value: file_min,
+ sum_value: file_sum,
distinct_count: _,
} = file_col_stats;
col_stats.null_count = add_row_stats(*file_nc,
col_stats.null_count);
set_max_if_greater(file_max, &mut col_stats.max_value);
- set_min_if_lesser(file_min, &mut col_stats.min_value)
+ set_min_if_lesser(file_min, &mut col_stats.min_value);
+ col_stats.sum_value = file_sum.add(&col_stats.sum_value);
}
// If the number of rows exceeds the limit, we can stop processing
@@ -204,6 +207,7 @@ pub(crate) fn get_col_stats(
null_count: null_counts[i],
max_value:
max_value.map(Precision::Exact).unwrap_or(Precision::Absent),
min_value:
min_value.map(Precision::Exact).unwrap_or(Precision::Absent),
+ sum_value: Precision::Absent,
distinct_count: Precision::Absent,
}
})
diff --git a/datafusion/core/tests/custom_sources_cases/statistics.rs
b/datafusion/core/tests/custom_sources_cases/statistics.rs
index 9d3bd594a9..1fd6dfec79 100644
--- a/datafusion/core/tests/custom_sources_cases/statistics.rs
+++ b/datafusion/core/tests/custom_sources_cases/statistics.rs
@@ -200,12 +200,14 @@ fn fully_defined() -> (Statistics, Schema) {
distinct_count: Precision::Exact(2),
max_value:
Precision::Exact(ScalarValue::Int32(Some(1023))),
min_value: Precision::Exact(ScalarValue::Int32(Some(-24))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
null_count: Precision::Exact(0),
},
ColumnStatistics {
distinct_count: Precision::Exact(13),
max_value:
Precision::Exact(ScalarValue::Int64(Some(5486))),
min_value:
Precision::Exact(ScalarValue::Int64(Some(-6783))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
null_count: Precision::Exact(5),
},
],
diff --git a/datafusion/physical-plan/src/common.rs
b/datafusion/physical-plan/src/common.rs
index aefb90d1d1..20a4e89dba 100644
--- a/datafusion/physical-plan/src/common.rs
+++ b/datafusion/physical-plan/src/common.rs
@@ -333,12 +333,14 @@ mod tests {
distinct_count: Precision::Absent,
max_value: Precision::Absent,
min_value: Precision::Absent,
+ sum_value: Precision::Absent,
null_count: Precision::Exact(0),
},
ColumnStatistics {
distinct_count: Precision::Absent,
max_value: Precision::Absent,
min_value: Precision::Absent,
+ sum_value: Precision::Absent,
null_count: Precision::Exact(0),
},
],
@@ -371,6 +373,7 @@ mod tests {
distinct_count: Precision::Absent,
max_value: Precision::Absent,
min_value: Precision::Absent,
+ sum_value: Precision::Absent,
null_count: Precision::Exact(3),
}],
};
diff --git a/datafusion/physical-plan/src/filter.rs
b/datafusion/physical-plan/src/filter.rs
index ec860b3a9f..39f022b58c 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -476,6 +476,7 @@ fn collect_new_statistics(
null_count:
input_column_stats[idx].null_count.to_inexact(),
max_value,
min_value,
+ sum_value: Precision::Absent,
distinct_count: distinct_count.to_inexact(),
}
},
@@ -1196,6 +1197,7 @@ mod tests {
null_count: Precision::Absent,
min_value: Precision::Inexact(ScalarValue::Int32(Some(5))),
max_value: Precision::Inexact(ScalarValue::Int32(Some(10))),
+ sum_value: Precision::Absent,
distinct_count: Precision::Absent,
}],
};
diff --git a/datafusion/physical-plan/src/joins/cross_join.rs
b/datafusion/physical-plan/src/joins/cross_join.rs
index 87fd0f9658..ab94c132a2 100644
--- a/datafusion/physical-plan/src/joins/cross_join.rs
+++ b/datafusion/physical-plan/src/joins/cross_join.rs
@@ -411,12 +411,36 @@ fn stats_cartesian_product(
distinct_count: s.distinct_count,
min_value: s.min_value,
max_value: s.max_value,
+ sum_value: s
+ .sum_value
+ .get_value()
+ // Cast the row count into the same type as any existing sum value
+ .and_then(|v| {
+ Precision::<ScalarValue>::from(right_row_count)
+ .cast_to(&v.data_type())
+ .ok()
+ })
+ .map(|row_count| s.sum_value.multiply(&row_count))
+ .unwrap_or(Precision::Absent),
})
- .chain(right_col_stats.into_iter().map(|s| ColumnStatistics {
- null_count: s.null_count.multiply(&left_row_count),
- distinct_count: s.distinct_count,
- min_value: s.min_value,
- max_value: s.max_value,
+ .chain(right_col_stats.into_iter().map(|s| {
+ ColumnStatistics {
+ null_count: s.null_count.multiply(&left_row_count),
+ distinct_count: s.distinct_count,
+ min_value: s.min_value,
+ max_value: s.max_value,
+ sum_value: s
+ .sum_value
+ .get_value()
+ // Cast the row count into the same type as any existing sum value
+ .and_then(|v| {
+ Precision::<ScalarValue>::from(left_row_count)
+ .cast_to(&v.data_type())
+ .ok()
+ })
+ .map(|row_count| s.sum_value.multiply(&row_count))
+ .unwrap_or(Precision::Absent),
+ }
}))
.collect();
@@ -650,12 +674,14 @@ mod tests {
distinct_count: Precision::Exact(5),
max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
null_count: Precision::Exact(0),
},
ColumnStatistics {
distinct_count: Precision::Exact(1),
max_value: Precision::Exact(ScalarValue::from("x")),
min_value: Precision::Exact(ScalarValue::from("a")),
+ sum_value: Precision::Absent,
null_count: Precision::Exact(3),
},
],
@@ -668,6 +694,7 @@ mod tests {
distinct_count: Precision::Exact(3),
max_value: Precision::Exact(ScalarValue::Int64(Some(12))),
min_value: Precision::Exact(ScalarValue::Int64(Some(0))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(20))),
null_count: Precision::Exact(2),
}],
};
@@ -682,18 +709,25 @@ mod tests {
distinct_count: Precision::Exact(5),
max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(
+ 42 * right_row_count as i64,
+ ))),
null_count: Precision::Exact(0),
},
ColumnStatistics {
distinct_count: Precision::Exact(1),
max_value: Precision::Exact(ScalarValue::from("x")),
min_value: Precision::Exact(ScalarValue::from("a")),
+ sum_value: Precision::Absent,
null_count: Precision::Exact(3 * right_row_count),
},
ColumnStatistics {
distinct_count: Precision::Exact(3),
max_value: Precision::Exact(ScalarValue::Int64(Some(12))),
min_value: Precision::Exact(ScalarValue::Int64(Some(0))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(
+ 20 * left_row_count as i64,
+ ))),
null_count: Precision::Exact(2 * left_row_count),
},
],
@@ -714,12 +748,14 @@ mod tests {
distinct_count: Precision::Exact(5),
max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
null_count: Precision::Exact(0),
},
ColumnStatistics {
distinct_count: Precision::Exact(1),
max_value: Precision::Exact(ScalarValue::from("x")),
min_value: Precision::Exact(ScalarValue::from("a")),
+ sum_value: Precision::Absent,
null_count: Precision::Exact(3),
},
],
@@ -732,6 +768,7 @@ mod tests {
distinct_count: Precision::Exact(3),
max_value: Precision::Exact(ScalarValue::Int64(Some(12))),
min_value: Precision::Exact(ScalarValue::Int64(Some(0))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(20))),
null_count: Precision::Exact(2),
}],
};
@@ -746,18 +783,23 @@ mod tests {
distinct_count: Precision::Exact(5),
max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+ sum_value: Precision::Absent, // we don't know the row count on the right
null_count: Precision::Absent, // we don't know the row count on the right
},
ColumnStatistics {
distinct_count: Precision::Exact(1),
max_value: Precision::Exact(ScalarValue::from("x")),
min_value: Precision::Exact(ScalarValue::from("a")),
+ sum_value: Precision::Absent,
null_count: Precision::Absent, // we don't know the row count on the right
},
ColumnStatistics {
distinct_count: Precision::Exact(3),
max_value: Precision::Exact(ScalarValue::Int64(Some(12))),
min_value: Precision::Exact(ScalarValue::Int64(Some(0))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(
+ 20 * left_row_count as i64,
+ ))),
null_count: Precision::Exact(2 * left_row_count),
},
],
diff --git a/datafusion/physical-plan/src/joins/utils.rs
b/datafusion/physical-plan/src/joins/utils.rs
index dea4305fa6..5327793d01 100644
--- a/datafusion/physical-plan/src/joins/utils.rs
+++ b/datafusion/physical-plan/src/joins/utils.rs
@@ -2026,6 +2026,7 @@ mod tests {
distinct_count,
min_value: min.map(ScalarValue::from),
max_value: max.map(ScalarValue::from),
+ sum_value: Absent,
null_count,
}
}
diff --git a/datafusion/physical-plan/src/memory.rs
b/datafusion/physical-plan/src/memory.rs
index 5ad3c4881b..198b8ccd69 100644
--- a/datafusion/physical-plan/src/memory.rs
+++ b/datafusion/physical-plan/src/memory.rs
@@ -937,6 +937,7 @@ mod tests {
distinct_count: Precision::Absent,
max_value: Precision::Absent,
min_value: Precision::Absent,
+ sum_value: Precision::Absent,
},],
}
);
diff --git a/datafusion/physical-plan/src/projection.rs
b/datafusion/physical-plan/src/projection.rs
index b364d4a870..3ebfd8f8ca 100644
--- a/datafusion/physical-plan/src/projection.rs
+++ b/datafusion/physical-plan/src/projection.rs
@@ -1106,18 +1106,21 @@ mod tests {
distinct_count: Precision::Exact(5),
max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
null_count: Precision::Exact(0),
},
ColumnStatistics {
distinct_count: Precision::Exact(1),
max_value: Precision::Exact(ScalarValue::from("x")),
min_value: Precision::Exact(ScalarValue::from("a")),
+ sum_value: Precision::Absent,
null_count: Precision::Exact(3),
},
ColumnStatistics {
distinct_count: Precision::Absent,
max_value:
Precision::Exact(ScalarValue::Float32(Some(1.1))),
min_value:
Precision::Exact(ScalarValue::Float32(Some(0.1))),
+ sum_value:
Precision::Exact(ScalarValue::Float32(Some(5.5))),
null_count: Precision::Absent,
},
],
@@ -1150,12 +1153,14 @@ mod tests {
distinct_count: Precision::Exact(1),
max_value: Precision::Exact(ScalarValue::from("x")),
min_value: Precision::Exact(ScalarValue::from("a")),
+ sum_value: Precision::Absent,
null_count: Precision::Exact(3),
},
ColumnStatistics {
distinct_count: Precision::Exact(5),
max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
null_count: Precision::Exact(0),
},
],
@@ -1184,12 +1189,14 @@ mod tests {
distinct_count: Precision::Absent,
max_value:
Precision::Exact(ScalarValue::Float32(Some(1.1))),
min_value:
Precision::Exact(ScalarValue::Float32(Some(0.1))),
+ sum_value:
Precision::Exact(ScalarValue::Float32(Some(5.5))),
null_count: Precision::Absent,
},
ColumnStatistics {
distinct_count: Precision::Exact(5),
max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
null_count: Precision::Exact(0),
},
],
diff --git a/datafusion/physical-plan/src/union.rs
b/datafusion/physical-plan/src/union.rs
index bacd02398e..a41336ea6e 100644
--- a/datafusion/physical-plan/src/union.rs
+++ b/datafusion/physical-plan/src/union.rs
@@ -610,6 +610,7 @@ fn col_stats_union(
left.distinct_count = Precision::Absent;
left.min_value = left.min_value.min(&right.min_value);
left.max_value = left.max_value.max(&right.max_value);
+ left.sum_value = left.sum_value.add(&right.sum_value);
left.null_count = left.null_count.add(&right.null_count);
left
@@ -702,18 +703,21 @@ mod tests {
distinct_count: Precision::Exact(5),
max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
null_count: Precision::Exact(0),
},
ColumnStatistics {
distinct_count: Precision::Exact(1),
max_value: Precision::Exact(ScalarValue::from("x")),
min_value: Precision::Exact(ScalarValue::from("a")),
+ sum_value: Precision::Absent,
null_count: Precision::Exact(3),
},
ColumnStatistics {
distinct_count: Precision::Absent,
max_value:
Precision::Exact(ScalarValue::Float32(Some(1.1))),
min_value:
Precision::Exact(ScalarValue::Float32(Some(0.1))),
+ sum_value:
Precision::Exact(ScalarValue::Float32(Some(42.0))),
null_count: Precision::Absent,
},
],
@@ -727,18 +731,21 @@ mod tests {
distinct_count: Precision::Exact(3),
max_value: Precision::Exact(ScalarValue::Int64(Some(34))),
min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
null_count: Precision::Exact(1),
},
ColumnStatistics {
distinct_count: Precision::Absent,
max_value: Precision::Exact(ScalarValue::from("c")),
min_value: Precision::Exact(ScalarValue::from("b")),
+ sum_value: Precision::Absent,
null_count: Precision::Absent,
},
ColumnStatistics {
distinct_count: Precision::Absent,
max_value: Precision::Absent,
min_value: Precision::Absent,
+ sum_value: Precision::Absent,
null_count: Precision::Absent,
},
],
@@ -753,18 +760,21 @@ mod tests {
distinct_count: Precision::Absent,
max_value: Precision::Exact(ScalarValue::Int64(Some(34))),
min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+ sum_value: Precision::Exact(ScalarValue::Int64(Some(84))),
null_count: Precision::Exact(1),
},
ColumnStatistics {
distinct_count: Precision::Absent,
max_value: Precision::Exact(ScalarValue::from("x")),
min_value: Precision::Exact(ScalarValue::from("a")),
+ sum_value: Precision::Absent,
null_count: Precision::Absent,
},
ColumnStatistics {
distinct_count: Precision::Absent,
max_value: Precision::Absent,
min_value: Precision::Absent,
+ sum_value: Precision::Absent,
null_count: Precision::Absent,
},
],
diff --git a/datafusion/physical-plan/src/values.rs
b/datafusion/physical-plan/src/values.rs
index b000f9335a..960e3f544e 100644
--- a/datafusion/physical-plan/src/values.rs
+++ b/datafusion/physical-plan/src/values.rs
@@ -321,6 +321,7 @@ mod tests {
distinct_count: Precision::Absent,
max_value: Precision::Absent,
min_value: Precision::Absent,
+ sum_value: Precision::Absent,
},],
}
);
diff --git a/datafusion/proto-common/proto/datafusion_common.proto
b/datafusion/proto-common/proto/datafusion_common.proto
index 6a7dc1604b..1c2807f390 100644
--- a/datafusion/proto-common/proto/datafusion_common.proto
+++ b/datafusion/proto-common/proto/datafusion_common.proto
@@ -570,6 +570,7 @@ message Statistics {
message ColumnStats {
Precision min_value = 1;
Precision max_value = 2;
+ Precision sum_value = 5;
Precision null_count = 3;
Precision distinct_count = 4;
}
diff --git a/datafusion/proto-common/src/from_proto/mod.rs
b/datafusion/proto-common/src/from_proto/mod.rs
index d88186fbf3..b022e52b6a 100644
--- a/datafusion/proto-common/src/from_proto/mod.rs
+++ b/datafusion/proto-common/src/from_proto/mod.rs
@@ -701,6 +701,11 @@ impl From<&protobuf::ColumnStats> for ColumnStatistics {
} else {
Precision::Absent
},
+ sum_value: if let Some(sum) = &cs.sum_value {
+ sum.clone().into()
+ } else {
+ Precision::Absent
+ },
distinct_count: if let Some(dc) = &cs.distinct_count {
dc.clone().into()
} else {
diff --git a/datafusion/proto-common/src/generated/pbjson.rs
b/datafusion/proto-common/src/generated/pbjson.rs
index e9f9de09d4..40687de098 100644
--- a/datafusion/proto-common/src/generated/pbjson.rs
+++ b/datafusion/proto-common/src/generated/pbjson.rs
@@ -985,6 +985,9 @@ impl serde::Serialize for ColumnStats {
if self.max_value.is_some() {
len += 1;
}
+ if self.sum_value.is_some() {
+ len += 1;
+ }
if self.null_count.is_some() {
len += 1;
}
@@ -998,6 +1001,9 @@ impl serde::Serialize for ColumnStats {
if let Some(v) = self.max_value.as_ref() {
struct_ser.serialize_field("maxValue", v)?;
}
+ if let Some(v) = self.sum_value.as_ref() {
+ struct_ser.serialize_field("sumValue", v)?;
+ }
if let Some(v) = self.null_count.as_ref() {
struct_ser.serialize_field("nullCount", v)?;
}
@@ -1018,6 +1024,8 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
"minValue",
"max_value",
"maxValue",
+ "sum_value",
+ "sumValue",
"null_count",
"nullCount",
"distinct_count",
@@ -1028,6 +1036,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
enum GeneratedField {
MinValue,
MaxValue,
+ SumValue,
NullCount,
DistinctCount,
}
@@ -1053,6 +1062,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
match value {
"minValue" | "min_value" =>
Ok(GeneratedField::MinValue),
"maxValue" | "max_value" =>
Ok(GeneratedField::MaxValue),
+ "sumValue" | "sum_value" =>
Ok(GeneratedField::SumValue),
"nullCount" | "null_count" =>
Ok(GeneratedField::NullCount),
"distinctCount" | "distinct_count" =>
Ok(GeneratedField::DistinctCount),
_ => Err(serde::de::Error::unknown_field(value,
FIELDS)),
@@ -1076,6 +1086,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
{
let mut min_value__ = None;
let mut max_value__ = None;
+ let mut sum_value__ = None;
let mut null_count__ = None;
let mut distinct_count__ = None;
while let Some(k) = map_.next_key()? {
@@ -1092,6 +1103,12 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
}
max_value__ = map_.next_value()?;
}
+ GeneratedField::SumValue => {
+ if sum_value__.is_some() {
+ return
Err(serde::de::Error::duplicate_field("sumValue"));
+ }
+ sum_value__ = map_.next_value()?;
+ }
GeneratedField::NullCount => {
if null_count__.is_some() {
return
Err(serde::de::Error::duplicate_field("nullCount"));
@@ -1109,6 +1126,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
Ok(ColumnStats {
min_value: min_value__,
max_value: max_value__,
+ sum_value: sum_value__,
null_count: null_count__,
distinct_count: distinct_count__,
})
diff --git a/datafusion/proto-common/src/generated/prost.rs
b/datafusion/proto-common/src/generated/prost.rs
index 3263c1c755..9e4a1ecb6b 100644
--- a/datafusion/proto-common/src/generated/prost.rs
+++ b/datafusion/proto-common/src/generated/prost.rs
@@ -873,6 +873,8 @@ pub struct ColumnStats {
pub min_value: ::core::option::Option<Precision>,
#[prost(message, optional, tag = "2")]
pub max_value: ::core::option::Option<Precision>,
+ #[prost(message, optional, tag = "5")]
+ pub sum_value: ::core::option::Option<Precision>,
#[prost(message, optional, tag = "3")]
pub null_count: ::core::option::Option<Precision>,
#[prost(message, optional, tag = "4")]
diff --git a/datafusion/proto-common/src/to_proto/mod.rs
b/datafusion/proto-common/src/to_proto/mod.rs
index 88bbbfd604..ced1865795 100644
--- a/datafusion/proto-common/src/to_proto/mod.rs
+++ b/datafusion/proto-common/src/to_proto/mod.rs
@@ -750,6 +750,7 @@ impl From<&ColumnStatistics> for protobuf::ColumnStats {
protobuf::ColumnStats {
min_value: Some(protobuf::Precision::from(&s.min_value)),
max_value: Some(protobuf::Precision::from(&s.max_value)),
+ sum_value: Some(protobuf::Precision::from(&s.sum_value)),
null_count: Some(protobuf::Precision::from(&s.null_count)),
distinct_count: Some(protobuf::Precision::from(&s.distinct_count)),
}
diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs
b/datafusion/proto/src/generated/datafusion_proto_common.rs
index 3263c1c755..9e4a1ecb6b 100644
--- a/datafusion/proto/src/generated/datafusion_proto_common.rs
+++ b/datafusion/proto/src/generated/datafusion_proto_common.rs
@@ -873,6 +873,8 @@ pub struct ColumnStats {
pub min_value: ::core::option::Option<Precision>,
#[prost(message, optional, tag = "2")]
pub max_value: ::core::option::Option<Precision>,
+ #[prost(message, optional, tag = "5")]
+ pub sum_value: ::core::option::Option<Precision>,
#[prost(message, optional, tag = "3")]
pub null_count: ::core::option::Option<Precision>,
#[prost(message, optional, tag = "4")]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]