This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new f8063e840e Add `ColumnStatistics::Sum` (#14074)
f8063e840e is described below

commit f8063e840ecd80c36a0d9d8f32f3d1e38e79de1f
Author: Nicholas Gates <[email protected]>
AuthorDate: Tue Jan 28 20:19:44 2025 +0000

    Add `ColumnStatistics::Sum` (#14074)
    
    * Add sum statistic
    
    * Add sum statistic
    
    * Add sum statistic
    
    * Add sum statistic
    
    * Add sum statistic
    
    * Add sum statistic
    
    * Add tests and Cargo fmt
    
    * fix up
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/common/src/stats.rs                     | 173 +++++++++++++++++++--
 datafusion/core/src/datasource/statistics.rs       |   6 +-
 .../core/tests/custom_sources_cases/statistics.rs  |   2 +
 datafusion/physical-plan/src/common.rs             |   3 +
 datafusion/physical-plan/src/filter.rs             |   2 +
 datafusion/physical-plan/src/joins/cross_join.rs   |  52 ++++++-
 datafusion/physical-plan/src/joins/utils.rs        |   1 +
 datafusion/physical-plan/src/memory.rs             |   1 +
 datafusion/physical-plan/src/projection.rs         |   7 +
 datafusion/physical-plan/src/union.rs              |  10 ++
 datafusion/physical-plan/src/values.rs             |   1 +
 .../proto-common/proto/datafusion_common.proto     |   1 +
 datafusion/proto-common/src/from_proto/mod.rs      |   5 +
 datafusion/proto-common/src/generated/pbjson.rs    |  18 +++
 datafusion/proto-common/src/generated/prost.rs     |   2 +
 datafusion/proto-common/src/to_proto/mod.rs        |   1 +
 .../proto/src/generated/datafusion_proto_common.rs |   2 +
 17 files changed, 269 insertions(+), 18 deletions(-)

diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs
index d2ce965c5c..dd8848d249 100644
--- a/datafusion/common/src/stats.rs
+++ b/datafusion/common/src/stats.rs
@@ -21,7 +21,7 @@ use std::fmt::{self, Debug, Display};
 
 use crate::{Result, ScalarValue};
 
-use arrow_schema::{Schema, SchemaRef};
+use arrow_schema::{DataType, Schema, SchemaRef};
 
 /// Represents a value with a degree of certainty. `Precision` is used to
 /// propagate information the precision of statistical values.
@@ -170,24 +170,63 @@ impl Precision<ScalarValue> {
     pub fn add(&self, other: &Precision<ScalarValue>) -> 
Precision<ScalarValue> {
         match (self, other) {
             (Precision::Exact(a), Precision::Exact(b)) => {
-                if let Ok(result) = a.add(b) {
-                    Precision::Exact(result)
-                } else {
-                    Precision::Absent
-                }
+                a.add(b).map(Precision::Exact).unwrap_or(Precision::Absent)
             }
             (Precision::Inexact(a), Precision::Exact(b))
             | (Precision::Exact(a), Precision::Inexact(b))
-            | (Precision::Inexact(a), Precision::Inexact(b)) => {
-                if let Ok(result) = a.add(b) {
-                    Precision::Inexact(result)
-                } else {
-                    Precision::Absent
-                }
+            | (Precision::Inexact(a), Precision::Inexact(b)) => a
+                .add(b)
+                .map(Precision::Inexact)
+                .unwrap_or(Precision::Absent),
+            (_, _) => Precision::Absent,
+        }
+    }
+
+    /// Calculates the difference of two (possibly inexact) [`ScalarValue`] 
values,
+    /// conservatively propagating exactness information. If one of the input
+    /// values is [`Precision::Absent`], the result is `Absent` too.
+    pub fn sub(&self, other: &Precision<ScalarValue>) -> 
Precision<ScalarValue> {
+        match (self, other) {
+            (Precision::Exact(a), Precision::Exact(b)) => {
+                a.sub(b).map(Precision::Exact).unwrap_or(Precision::Absent)
             }
+            (Precision::Inexact(a), Precision::Exact(b))
+            | (Precision::Exact(a), Precision::Inexact(b))
+            | (Precision::Inexact(a), Precision::Inexact(b)) => a
+                .sub(b)
+                .map(Precision::Inexact)
+                .unwrap_or(Precision::Absent),
+            (_, _) => Precision::Absent,
+        }
+    }
+
+    /// Calculates the multiplication of two (possibly inexact) 
[`ScalarValue`] values,
+    /// conservatively propagating exactness information. If one of the input
+    /// values is [`Precision::Absent`], the result is `Absent` too.
+    pub fn multiply(&self, other: &Precision<ScalarValue>) -> 
Precision<ScalarValue> {
+        match (self, other) {
+            (Precision::Exact(a), Precision::Exact(b)) => a
+                .mul_checked(b)
+                .map(Precision::Exact)
+                .unwrap_or(Precision::Absent),
+            (Precision::Inexact(a), Precision::Exact(b))
+            | (Precision::Exact(a), Precision::Inexact(b))
+            | (Precision::Inexact(a), Precision::Inexact(b)) => a
+                .mul_checked(b)
+                .map(Precision::Inexact)
+                .unwrap_or(Precision::Absent),
             (_, _) => Precision::Absent,
         }
     }
+
+    /// Casts the value to the given data type, propagating exactness 
information.
+    pub fn cast_to(&self, data_type: &DataType) -> 
Result<Precision<ScalarValue>> {
+        match self {
+            Precision::Exact(value) => 
value.cast_to(data_type).map(Precision::Exact),
+            Precision::Inexact(value) => 
value.cast_to(data_type).map(Precision::Inexact),
+            Precision::Absent => Ok(Precision::Absent),
+        }
+    }
 }
 
 impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Debug for Precision<T> {
@@ -210,6 +249,18 @@ impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> 
Display for Precision<T> {
     }
 }
 
+impl From<Precision<usize>> for Precision<ScalarValue> {
+    fn from(value: Precision<usize>) -> Self {
+        match value {
+            Precision::Exact(v) => Precision::Exact(ScalarValue::UInt64(Some(v 
as u64))),
+            Precision::Inexact(v) => {
+                Precision::Inexact(ScalarValue::UInt64(Some(v as u64)))
+            }
+            Precision::Absent => Precision::Absent,
+        }
+    }
+}
+
 /// Statistics for a relation
 /// Fields are optional and can be inexact because the sources
 /// sometimes provide approximate estimates for performance reasons
@@ -401,6 +452,11 @@ impl Display for Statistics {
                 } else {
                     s
                 };
+                let s = if cs.sum_value != Precision::Absent {
+                    format!("{} Sum={}", s, cs.sum_value)
+                } else {
+                    s
+                };
                 let s = if cs.null_count != Precision::Absent {
                     format!("{} Null={}", s, cs.null_count)
                 } else {
@@ -436,6 +492,8 @@ pub struct ColumnStatistics {
     pub max_value: Precision<ScalarValue>,
     /// Minimum value of column
     pub min_value: Precision<ScalarValue>,
+    /// Sum value of a column
+    pub sum_value: Precision<ScalarValue>,
     /// Number of distinct values
     pub distinct_count: Precision<usize>,
 }
@@ -458,6 +516,7 @@ impl ColumnStatistics {
             null_count: Precision::Absent,
             max_value: Precision::Absent,
             min_value: Precision::Absent,
+            sum_value: Precision::Absent,
             distinct_count: Precision::Absent,
         }
     }
@@ -469,6 +528,7 @@ impl ColumnStatistics {
         self.null_count = self.null_count.to_inexact();
         self.max_value = self.max_value.to_inexact();
         self.min_value = self.min_value.to_inexact();
+        self.sum_value = self.sum_value.to_inexact();
         self.distinct_count = self.distinct_count.to_inexact();
         self
     }
@@ -563,6 +623,26 @@ mod tests {
         assert_eq!(precision1.add(&absent_precision), Precision::Absent);
     }
 
+    #[test]
+    fn test_add_scalar() {
+        let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
+
+        assert_eq!(
+            precision.add(&Precision::Exact(ScalarValue::Int32(Some(23)))),
+            Precision::Exact(ScalarValue::Int32(Some(65))),
+        );
+        assert_eq!(
+            precision.add(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
+            Precision::Inexact(ScalarValue::Int32(Some(65))),
+        );
+        assert_eq!(
+            precision.add(&Precision::Exact(ScalarValue::Int32(None))),
+            // As per behavior of ScalarValue::add
+            Precision::Exact(ScalarValue::Int32(None)),
+        );
+        assert_eq!(precision.add(&Precision::Absent), Precision::Absent);
+    }
+
     #[test]
     fn test_sub() {
         let precision1 = Precision::Exact(42);
@@ -575,6 +655,26 @@ mod tests {
         assert_eq!(precision1.sub(&absent_precision), Precision::Absent);
     }
 
+    #[test]
+    fn test_sub_scalar() {
+        let precision = Precision::Exact(ScalarValue::Int32(Some(42)));
+
+        assert_eq!(
+            precision.sub(&Precision::Exact(ScalarValue::Int32(Some(23)))),
+            Precision::Exact(ScalarValue::Int32(Some(19))),
+        );
+        assert_eq!(
+            precision.sub(&Precision::Inexact(ScalarValue::Int32(Some(23)))),
+            Precision::Inexact(ScalarValue::Int32(Some(19))),
+        );
+        assert_eq!(
+            precision.sub(&Precision::Exact(ScalarValue::Int32(None))),
+            // As per behavior of ScalarValue::sub
+            Precision::Exact(ScalarValue::Int32(None)),
+        );
+        assert_eq!(precision.sub(&Precision::Absent), Precision::Absent);
+    }
+
     #[test]
     fn test_multiply() {
         let precision1 = Precision::Exact(6);
@@ -588,6 +688,54 @@ mod tests {
         assert_eq!(precision1.multiply(&absent_precision), Precision::Absent);
     }
 
+    #[test]
+    fn test_multiply_scalar() {
+        let precision = Precision::Exact(ScalarValue::Int32(Some(6)));
+
+        assert_eq!(
+            precision.multiply(&Precision::Exact(ScalarValue::Int32(Some(5)))),
+            Precision::Exact(ScalarValue::Int32(Some(30))),
+        );
+        assert_eq!(
+            
precision.multiply(&Precision::Inexact(ScalarValue::Int32(Some(5)))),
+            Precision::Inexact(ScalarValue::Int32(Some(30))),
+        );
+        assert_eq!(
+            precision.multiply(&Precision::Exact(ScalarValue::Int32(None))),
+            // As per behavior of ScalarValue::mul_checked
+            Precision::Exact(ScalarValue::Int32(None)),
+        );
+        assert_eq!(precision.multiply(&Precision::Absent), Precision::Absent);
+    }
+
+    #[test]
+    fn test_cast_to() {
+        // Valid
+        assert_eq!(
+            Precision::Exact(ScalarValue::Int32(Some(42)))
+                .cast_to(&DataType::Int64)
+                .unwrap(),
+            Precision::Exact(ScalarValue::Int64(Some(42))),
+        );
+        assert_eq!(
+            Precision::Inexact(ScalarValue::Int32(Some(42)))
+                .cast_to(&DataType::Int64)
+                .unwrap(),
+            Precision::Inexact(ScalarValue::Int64(Some(42))),
+        );
+        // Null
+        assert_eq!(
+            Precision::Exact(ScalarValue::Int32(None))
+                .cast_to(&DataType::Int64)
+                .unwrap(),
+            Precision::Exact(ScalarValue::Int64(None)),
+        );
+        // Overflow returns error
+        assert!(Precision::Exact(ScalarValue::Int32(Some(256)))
+            .cast_to(&DataType::Int8)
+            .is_err());
+    }
+
     #[test]
     fn test_precision_cloning() {
         // Precision<usize> is copy
@@ -646,6 +794,7 @@ mod tests {
             null_count: Precision::Exact(null_count),
             max_value: Precision::Exact(ScalarValue::Int64(Some(42))),
             min_value: Precision::Exact(ScalarValue::Int64(Some(64))),
+            sum_value: Precision::Exact(ScalarValue::Int64(Some(4600))),
             distinct_count: Precision::Exact(100),
         }
     }
diff --git a/datafusion/core/src/datasource/statistics.rs 
b/datafusion/core/src/datasource/statistics.rs
index 201bbfd5c0..f02927619a 100644
--- a/datafusion/core/src/datasource/statistics.rs
+++ b/datafusion/core/src/datasource/statistics.rs
@@ -76,6 +76,7 @@ pub async fn get_statistics_with_limit(
             col_stats_set[index].null_count = file_column.null_count;
             col_stats_set[index].max_value = file_column.max_value;
             col_stats_set[index].min_value = file_column.min_value;
+            col_stats_set[index].sum_value = file_column.sum_value;
         }
 
         // If the number of rows exceeds the limit, we can stop processing
@@ -113,12 +114,14 @@ pub async fn get_statistics_with_limit(
                         null_count: file_nc,
                         max_value: file_max,
                         min_value: file_min,
+                        sum_value: file_sum,
                         distinct_count: _,
                     } = file_col_stats;
 
                     col_stats.null_count = add_row_stats(*file_nc, 
col_stats.null_count);
                     set_max_if_greater(file_max, &mut col_stats.max_value);
-                    set_min_if_lesser(file_min, &mut col_stats.min_value)
+                    set_min_if_lesser(file_min, &mut col_stats.min_value);
+                    col_stats.sum_value = file_sum.add(&col_stats.sum_value);
                 }
 
                 // If the number of rows exceeds the limit, we can stop 
processing
@@ -204,6 +207,7 @@ pub(crate) fn get_col_stats(
                 null_count: null_counts[i],
                 max_value: 
max_value.map(Precision::Exact).unwrap_or(Precision::Absent),
                 min_value: 
min_value.map(Precision::Exact).unwrap_or(Precision::Absent),
+                sum_value: Precision::Absent,
                 distinct_count: Precision::Absent,
             }
         })
diff --git a/datafusion/core/tests/custom_sources_cases/statistics.rs 
b/datafusion/core/tests/custom_sources_cases/statistics.rs
index 9d3bd594a9..1fd6dfec79 100644
--- a/datafusion/core/tests/custom_sources_cases/statistics.rs
+++ b/datafusion/core/tests/custom_sources_cases/statistics.rs
@@ -200,12 +200,14 @@ fn fully_defined() -> (Statistics, Schema) {
                     distinct_count: Precision::Exact(2),
                     max_value: 
Precision::Exact(ScalarValue::Int32(Some(1023))),
                     min_value: Precision::Exact(ScalarValue::Int32(Some(-24))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
                     null_count: Precision::Exact(0),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(13),
                     max_value: 
Precision::Exact(ScalarValue::Int64(Some(5486))),
                     min_value: 
Precision::Exact(ScalarValue::Int64(Some(-6783))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(10))),
                     null_count: Precision::Exact(5),
                 },
             ],
diff --git a/datafusion/physical-plan/src/common.rs 
b/datafusion/physical-plan/src/common.rs
index aefb90d1d1..20a4e89dba 100644
--- a/datafusion/physical-plan/src/common.rs
+++ b/datafusion/physical-plan/src/common.rs
@@ -333,12 +333,14 @@ mod tests {
                     distinct_count: Precision::Absent,
                     max_value: Precision::Absent,
                     min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
                     null_count: Precision::Exact(0),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
                     max_value: Precision::Absent,
                     min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
                     null_count: Precision::Exact(0),
                 },
             ],
@@ -371,6 +373,7 @@ mod tests {
                 distinct_count: Precision::Absent,
                 max_value: Precision::Absent,
                 min_value: Precision::Absent,
+                sum_value: Precision::Absent,
                 null_count: Precision::Exact(3),
             }],
         };
diff --git a/datafusion/physical-plan/src/filter.rs 
b/datafusion/physical-plan/src/filter.rs
index ec860b3a9f..39f022b58c 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -476,6 +476,7 @@ fn collect_new_statistics(
                     null_count: 
input_column_stats[idx].null_count.to_inexact(),
                     max_value,
                     min_value,
+                    sum_value: Precision::Absent,
                     distinct_count: distinct_count.to_inexact(),
                 }
             },
@@ -1196,6 +1197,7 @@ mod tests {
                 null_count: Precision::Absent,
                 min_value: Precision::Inexact(ScalarValue::Int32(Some(5))),
                 max_value: Precision::Inexact(ScalarValue::Int32(Some(10))),
+                sum_value: Precision::Absent,
                 distinct_count: Precision::Absent,
             }],
         };
diff --git a/datafusion/physical-plan/src/joins/cross_join.rs 
b/datafusion/physical-plan/src/joins/cross_join.rs
index 87fd0f9658..ab94c132a2 100644
--- a/datafusion/physical-plan/src/joins/cross_join.rs
+++ b/datafusion/physical-plan/src/joins/cross_join.rs
@@ -411,12 +411,36 @@ fn stats_cartesian_product(
             distinct_count: s.distinct_count,
             min_value: s.min_value,
             max_value: s.max_value,
+            sum_value: s
+                .sum_value
+                .get_value()
+                // Cast the row count into the same type as any existing sum 
value
+                .and_then(|v| {
+                    Precision::<ScalarValue>::from(right_row_count)
+                        .cast_to(&v.data_type())
+                        .ok()
+                })
+                .map(|row_count| s.sum_value.multiply(&row_count))
+                .unwrap_or(Precision::Absent),
         })
-        .chain(right_col_stats.into_iter().map(|s| ColumnStatistics {
-            null_count: s.null_count.multiply(&left_row_count),
-            distinct_count: s.distinct_count,
-            min_value: s.min_value,
-            max_value: s.max_value,
+        .chain(right_col_stats.into_iter().map(|s| {
+            ColumnStatistics {
+                null_count: s.null_count.multiply(&left_row_count),
+                distinct_count: s.distinct_count,
+                min_value: s.min_value,
+                max_value: s.max_value,
+                sum_value: s
+                    .sum_value
+                    .get_value()
+                    // Cast the row count into the same type as any existing 
sum value
+                    .and_then(|v| {
+                        Precision::<ScalarValue>::from(left_row_count)
+                            .cast_to(&v.data_type())
+                            .ok()
+                    })
+                    .map(|row_count| s.sum_value.multiply(&row_count))
+                    .unwrap_or(Precision::Absent),
+            }
         }))
         .collect();
 
@@ -650,12 +674,14 @@ mod tests {
                     distinct_count: Precision::Exact(5),
                     max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
                     null_count: Precision::Exact(0),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(1),
                     max_value: Precision::Exact(ScalarValue::from("x")),
                     min_value: Precision::Exact(ScalarValue::from("a")),
+                    sum_value: Precision::Absent,
                     null_count: Precision::Exact(3),
                 },
             ],
@@ -668,6 +694,7 @@ mod tests {
                 distinct_count: Precision::Exact(3),
                 max_value: Precision::Exact(ScalarValue::Int64(Some(12))),
                 min_value: Precision::Exact(ScalarValue::Int64(Some(0))),
+                sum_value: Precision::Exact(ScalarValue::Int64(Some(20))),
                 null_count: Precision::Exact(2),
             }],
         };
@@ -682,18 +709,25 @@ mod tests {
                     distinct_count: Precision::Exact(5),
                     max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(
+                        42 * right_row_count as i64,
+                    ))),
                     null_count: Precision::Exact(0),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(1),
                     max_value: Precision::Exact(ScalarValue::from("x")),
                     min_value: Precision::Exact(ScalarValue::from("a")),
+                    sum_value: Precision::Absent,
                     null_count: Precision::Exact(3 * right_row_count),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(3),
                     max_value: Precision::Exact(ScalarValue::Int64(Some(12))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(0))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(
+                        20 * left_row_count as i64,
+                    ))),
                     null_count: Precision::Exact(2 * left_row_count),
                 },
             ],
@@ -714,12 +748,14 @@ mod tests {
                     distinct_count: Precision::Exact(5),
                     max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
                     null_count: Precision::Exact(0),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(1),
                     max_value: Precision::Exact(ScalarValue::from("x")),
                     min_value: Precision::Exact(ScalarValue::from("a")),
+                    sum_value: Precision::Absent,
                     null_count: Precision::Exact(3),
                 },
             ],
@@ -732,6 +768,7 @@ mod tests {
                 distinct_count: Precision::Exact(3),
                 max_value: Precision::Exact(ScalarValue::Int64(Some(12))),
                 min_value: Precision::Exact(ScalarValue::Int64(Some(0))),
+                sum_value: Precision::Exact(ScalarValue::Int64(Some(20))),
                 null_count: Precision::Exact(2),
             }],
         };
@@ -746,18 +783,23 @@ mod tests {
                     distinct_count: Precision::Exact(5),
                     max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+                    sum_value: Precision::Absent, // we don't know the row 
count on the right
                     null_count: Precision::Absent, // we don't know the row 
count on the right
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(1),
                     max_value: Precision::Exact(ScalarValue::from("x")),
                     min_value: Precision::Exact(ScalarValue::from("a")),
+                    sum_value: Precision::Absent,
                     null_count: Precision::Absent, // we don't know the row 
count on the right
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(3),
                     max_value: Precision::Exact(ScalarValue::Int64(Some(12))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(0))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(
+                        20 * left_row_count as i64,
+                    ))),
                     null_count: Precision::Exact(2 * left_row_count),
                 },
             ],
diff --git a/datafusion/physical-plan/src/joins/utils.rs 
b/datafusion/physical-plan/src/joins/utils.rs
index dea4305fa6..5327793d01 100644
--- a/datafusion/physical-plan/src/joins/utils.rs
+++ b/datafusion/physical-plan/src/joins/utils.rs
@@ -2026,6 +2026,7 @@ mod tests {
             distinct_count,
             min_value: min.map(ScalarValue::from),
             max_value: max.map(ScalarValue::from),
+            sum_value: Absent,
             null_count,
         }
     }
diff --git a/datafusion/physical-plan/src/memory.rs 
b/datafusion/physical-plan/src/memory.rs
index 5ad3c4881b..198b8ccd69 100644
--- a/datafusion/physical-plan/src/memory.rs
+++ b/datafusion/physical-plan/src/memory.rs
@@ -937,6 +937,7 @@ mod tests {
                     distinct_count: Precision::Absent,
                     max_value: Precision::Absent,
                     min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
                 },],
             }
         );
diff --git a/datafusion/physical-plan/src/projection.rs 
b/datafusion/physical-plan/src/projection.rs
index b364d4a870..3ebfd8f8ca 100644
--- a/datafusion/physical-plan/src/projection.rs
+++ b/datafusion/physical-plan/src/projection.rs
@@ -1106,18 +1106,21 @@ mod tests {
                     distinct_count: Precision::Exact(5),
                     max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
                     null_count: Precision::Exact(0),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(1),
                     max_value: Precision::Exact(ScalarValue::from("x")),
                     min_value: Precision::Exact(ScalarValue::from("a")),
+                    sum_value: Precision::Absent,
                     null_count: Precision::Exact(3),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
                     max_value: 
Precision::Exact(ScalarValue::Float32(Some(1.1))),
                     min_value: 
Precision::Exact(ScalarValue::Float32(Some(0.1))),
+                    sum_value: 
Precision::Exact(ScalarValue::Float32(Some(5.5))),
                     null_count: Precision::Absent,
                 },
             ],
@@ -1150,12 +1153,14 @@ mod tests {
                     distinct_count: Precision::Exact(1),
                     max_value: Precision::Exact(ScalarValue::from("x")),
                     min_value: Precision::Exact(ScalarValue::from("a")),
+                    sum_value: Precision::Absent,
                     null_count: Precision::Exact(3),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(5),
                     max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
                     null_count: Precision::Exact(0),
                 },
             ],
@@ -1184,12 +1189,14 @@ mod tests {
                     distinct_count: Precision::Absent,
                     max_value: 
Precision::Exact(ScalarValue::Float32(Some(1.1))),
                     min_value: 
Precision::Exact(ScalarValue::Float32(Some(0.1))),
+                    sum_value: 
Precision::Exact(ScalarValue::Float32(Some(5.5))),
                     null_count: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(5),
                     max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
                     null_count: Precision::Exact(0),
                 },
             ],
diff --git a/datafusion/physical-plan/src/union.rs 
b/datafusion/physical-plan/src/union.rs
index bacd02398e..a41336ea6e 100644
--- a/datafusion/physical-plan/src/union.rs
+++ b/datafusion/physical-plan/src/union.rs
@@ -610,6 +610,7 @@ fn col_stats_union(
     left.distinct_count = Precision::Absent;
     left.min_value = left.min_value.min(&right.min_value);
     left.max_value = left.max_value.max(&right.max_value);
+    left.sum_value = left.sum_value.add(&right.sum_value);
     left.null_count = left.null_count.add(&right.null_count);
 
     left
@@ -702,18 +703,21 @@ mod tests {
                     distinct_count: Precision::Exact(5),
                     max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
                     null_count: Precision::Exact(0),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Exact(1),
                     max_value: Precision::Exact(ScalarValue::from("x")),
                     min_value: Precision::Exact(ScalarValue::from("a")),
+                    sum_value: Precision::Absent,
                     null_count: Precision::Exact(3),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
                     max_value: 
Precision::Exact(ScalarValue::Float32(Some(1.1))),
                     min_value: 
Precision::Exact(ScalarValue::Float32(Some(0.1))),
+                    sum_value: 
Precision::Exact(ScalarValue::Float32(Some(42.0))),
                     null_count: Precision::Absent,
                 },
             ],
@@ -727,18 +731,21 @@ mod tests {
                     distinct_count: Precision::Exact(3),
                     max_value: Precision::Exact(ScalarValue::Int64(Some(34))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(42))),
                     null_count: Precision::Exact(1),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
                     max_value: Precision::Exact(ScalarValue::from("c")),
                     min_value: Precision::Exact(ScalarValue::from("b")),
+                    sum_value: Precision::Absent,
                     null_count: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
                     max_value: Precision::Absent,
                     min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
                     null_count: Precision::Absent,
                 },
             ],
@@ -753,18 +760,21 @@ mod tests {
                     distinct_count: Precision::Absent,
                     max_value: Precision::Exact(ScalarValue::Int64(Some(34))),
                     min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
+                    sum_value: Precision::Exact(ScalarValue::Int64(Some(84))),
                     null_count: Precision::Exact(1),
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
                     max_value: Precision::Exact(ScalarValue::from("x")),
                     min_value: Precision::Exact(ScalarValue::from("a")),
+                    sum_value: Precision::Absent,
                     null_count: Precision::Absent,
                 },
                 ColumnStatistics {
                     distinct_count: Precision::Absent,
                     max_value: Precision::Absent,
                     min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
                     null_count: Precision::Absent,
                 },
             ],
diff --git a/datafusion/physical-plan/src/values.rs 
b/datafusion/physical-plan/src/values.rs
index b000f9335a..960e3f544e 100644
--- a/datafusion/physical-plan/src/values.rs
+++ b/datafusion/physical-plan/src/values.rs
@@ -321,6 +321,7 @@ mod tests {
                     distinct_count: Precision::Absent,
                     max_value: Precision::Absent,
                     min_value: Precision::Absent,
+                    sum_value: Precision::Absent,
                 },],
             }
         );
diff --git a/datafusion/proto-common/proto/datafusion_common.proto 
b/datafusion/proto-common/proto/datafusion_common.proto
index 6a7dc1604b..1c2807f390 100644
--- a/datafusion/proto-common/proto/datafusion_common.proto
+++ b/datafusion/proto-common/proto/datafusion_common.proto
@@ -570,6 +570,7 @@ message Statistics {
 message ColumnStats {
   Precision min_value = 1;
   Precision max_value = 2;
+  Precision sum_value = 5;
   Precision null_count = 3;
   Precision distinct_count = 4;
 }
diff --git a/datafusion/proto-common/src/from_proto/mod.rs 
b/datafusion/proto-common/src/from_proto/mod.rs
index d88186fbf3..b022e52b6a 100644
--- a/datafusion/proto-common/src/from_proto/mod.rs
+++ b/datafusion/proto-common/src/from_proto/mod.rs
@@ -701,6 +701,11 @@ impl From<&protobuf::ColumnStats> for ColumnStatistics {
             } else {
                 Precision::Absent
             },
+            sum_value: if let Some(sum) = &cs.sum_value {
+                sum.clone().into()
+            } else {
+                Precision::Absent
+            },
             distinct_count: if let Some(dc) = &cs.distinct_count {
                 dc.clone().into()
             } else {
diff --git a/datafusion/proto-common/src/generated/pbjson.rs 
b/datafusion/proto-common/src/generated/pbjson.rs
index e9f9de09d4..40687de098 100644
--- a/datafusion/proto-common/src/generated/pbjson.rs
+++ b/datafusion/proto-common/src/generated/pbjson.rs
@@ -985,6 +985,9 @@ impl serde::Serialize for ColumnStats {
         if self.max_value.is_some() {
             len += 1;
         }
+        if self.sum_value.is_some() {
+            len += 1;
+        }
         if self.null_count.is_some() {
             len += 1;
         }
@@ -998,6 +1001,9 @@ impl serde::Serialize for ColumnStats {
         if let Some(v) = self.max_value.as_ref() {
             struct_ser.serialize_field("maxValue", v)?;
         }
+        if let Some(v) = self.sum_value.as_ref() {
+            struct_ser.serialize_field("sumValue", v)?;
+        }
         if let Some(v) = self.null_count.as_ref() {
             struct_ser.serialize_field("nullCount", v)?;
         }
@@ -1018,6 +1024,8 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
             "minValue",
             "max_value",
             "maxValue",
+            "sum_value",
+            "sumValue",
             "null_count",
             "nullCount",
             "distinct_count",
@@ -1028,6 +1036,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
         enum GeneratedField {
             MinValue,
             MaxValue,
+            SumValue,
             NullCount,
             DistinctCount,
         }
@@ -1053,6 +1062,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
                         match value {
                             "minValue" | "min_value" => 
Ok(GeneratedField::MinValue),
                             "maxValue" | "max_value" => 
Ok(GeneratedField::MaxValue),
+                            "sumValue" | "sum_value" => 
Ok(GeneratedField::SumValue),
                             "nullCount" | "null_count" => 
Ok(GeneratedField::NullCount),
                             "distinctCount" | "distinct_count" => 
Ok(GeneratedField::DistinctCount),
                             _ => Err(serde::de::Error::unknown_field(value, 
FIELDS)),
@@ -1076,6 +1086,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
             {
                 let mut min_value__ = None;
                 let mut max_value__ = None;
+                let mut sum_value__ = None;
                 let mut null_count__ = None;
                 let mut distinct_count__ = None;
                 while let Some(k) = map_.next_key()? {
@@ -1092,6 +1103,12 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
                             }
                             max_value__ = map_.next_value()?;
                         }
+                        GeneratedField::SumValue => {
+                            if sum_value__.is_some() {
+                                return 
Err(serde::de::Error::duplicate_field("sumValue"));
+                            }
+                            sum_value__ = map_.next_value()?;
+                        }
                         GeneratedField::NullCount => {
                             if null_count__.is_some() {
                                 return 
Err(serde::de::Error::duplicate_field("nullCount"));
@@ -1109,6 +1126,7 @@ impl<'de> serde::Deserialize<'de> for ColumnStats {
                 Ok(ColumnStats {
                     min_value: min_value__,
                     max_value: max_value__,
+                    sum_value: sum_value__,
                     null_count: null_count__,
                     distinct_count: distinct_count__,
                 })
diff --git a/datafusion/proto-common/src/generated/prost.rs 
b/datafusion/proto-common/src/generated/prost.rs
index 3263c1c755..9e4a1ecb6b 100644
--- a/datafusion/proto-common/src/generated/prost.rs
+++ b/datafusion/proto-common/src/generated/prost.rs
@@ -873,6 +873,8 @@ pub struct ColumnStats {
     pub min_value: ::core::option::Option<Precision>,
     #[prost(message, optional, tag = "2")]
     pub max_value: ::core::option::Option<Precision>,
+    #[prost(message, optional, tag = "5")]
+    pub sum_value: ::core::option::Option<Precision>,
     #[prost(message, optional, tag = "3")]
     pub null_count: ::core::option::Option<Precision>,
     #[prost(message, optional, tag = "4")]
diff --git a/datafusion/proto-common/src/to_proto/mod.rs 
b/datafusion/proto-common/src/to_proto/mod.rs
index 88bbbfd604..ced1865795 100644
--- a/datafusion/proto-common/src/to_proto/mod.rs
+++ b/datafusion/proto-common/src/to_proto/mod.rs
@@ -750,6 +750,7 @@ impl From<&ColumnStatistics> for protobuf::ColumnStats {
         protobuf::ColumnStats {
             min_value: Some(protobuf::Precision::from(&s.min_value)),
             max_value: Some(protobuf::Precision::from(&s.max_value)),
+            sum_value: Some(protobuf::Precision::from(&s.sum_value)),
             null_count: Some(protobuf::Precision::from(&s.null_count)),
             distinct_count: Some(protobuf::Precision::from(&s.distinct_count)),
         }
diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs 
b/datafusion/proto/src/generated/datafusion_proto_common.rs
index 3263c1c755..9e4a1ecb6b 100644
--- a/datafusion/proto/src/generated/datafusion_proto_common.rs
+++ b/datafusion/proto/src/generated/datafusion_proto_common.rs
@@ -873,6 +873,8 @@ pub struct ColumnStats {
     pub min_value: ::core::option::Option<Precision>,
     #[prost(message, optional, tag = "2")]
     pub max_value: ::core::option::Option<Precision>,
+    #[prost(message, optional, tag = "5")]
+    pub sum_value: ::core::option::Option<Precision>,
     #[prost(message, optional, tag = "3")]
     pub null_count: ::core::option::Option<Precision>,
     #[prost(message, optional, tag = "4")]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to