This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new d10b1a479e Fix incorrect statistics read for binary columns in parquet (#10645)
d10b1a479e is described below

commit d10b1a479eeb062baeb8988290a7d6084275a404
Author: Xin Li <[email protected]>
AuthorDate: Sat May 25 18:11:15 2024 +0800

    Fix incorrect statistics read for binary columns in parquet (#10645)
---
 .../datasource/physical_plan/parquet/statistics.rs |  7 +++---
 datafusion/core/tests/parquet/arrow_statistics.rs  | 25 +++++++++++-----------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 8d17354839..bbdd46af5d 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -117,6 +117,9 @@ macro_rules! get_statistic {
                             *scale,
                         ))
                     }
+                    Some(DataType::Binary) => {
+                        Some(ScalarValue::Binary(Some(s.$bytes_func().to_vec())))
+                    }
                     _ => {
                         let s = std::str::from_utf8(s.$bytes_func())
                             .map(|s| s.to_string())
@@ -644,10 +647,6 @@ mod test {
     }
 
     #[test]
-    #[should_panic(
-        expected = "Inconsistent types in ScalarValue::iter_to_array. Expected Utf8, got Binary(NULL)"
-    )]
-    // Due to https://github.com/apache/datafusion/issues/8295
     fn roundtrip_binary() {
         Test {
             input: Arc::new(BinaryArray::from_opt_vec(vec![
diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs
index db687a3777..e5aadf2131 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -24,9 +24,9 @@ use std::sync::Arc;
 use arrow::compute::kernels::cast_utils::Parser;
 use arrow::datatypes::{Date32Type, Date64Type};
 use arrow_array::{
-    make_array, Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Decimal128Array,
-    FixedSizeBinaryArray, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
-    RecordBatch, StringArray, UInt64Array,
+    make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
+    Decimal128Array, FixedSizeBinaryArray, Float64Array, Int16Array, Int32Array,
+    Int64Array, Int8Array, RecordBatch, StringArray, UInt64Array,
 };
 use arrow_schema::{DataType, Field, Schema};
 use datafusion::datasource::physical_plan::parquet::{
@@ -905,18 +905,17 @@ async fn test_byte() {
     .run();
 
     // column "service_binary"
+
+    let expected_service_binary_min_values: Vec<&[u8]> =
+        vec![b"frontend five", b"backend one", b"backend eight"];
+
+    let expected_service_binary_max_values: Vec<&[u8]> =
+        vec![b"frontend two", b"frontend six", b"backend six"];
+
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(StringArray::from(vec![
-            "frontend five",
-            "backend one",
-            "backend eight",
-        ])), // Shuld be BinaryArray
-        expected_max: Arc::new(StringArray::from(vec![
-            "frontend two",
-            "frontend six",
-            "backend six",
-        ])), // Shuld be BinaryArray
+        expected_min: Arc::new(BinaryArray::from(expected_service_binary_min_values)), // Shuld be BinaryArray
+        expected_max: Arc::new(BinaryArray::from(expected_service_binary_max_values)), // Shuld be BinaryArray
         expected_null_counts: UInt64Array::from(vec![0, 0, 0]),
         expected_row_counts: UInt64Array::from(vec![5, 5, 5]),
         column_name: "service_binary",


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to