This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 35f481dbcc Refactor min/max value update in Parquet statistics (#9120)
35f481dbcc is described below
commit 35f481dbcc8147e0c4a397de80eb8ac74ca4ce9a
Author: Alex Huang <[email protected]>
AuthorDate: Mon Feb 5 20:28:31 2024 +0800
Refactor min/max value update in Parquet statistics (#9120)
* Refactor min/max value update in Parquet statistics
* omit has_min_max_set
---
.../core/src/datasource/file_format/parquet.rs | 194 ++++++---------------
1 file changed, 55 insertions(+), 139 deletions(-)
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs
index 408233469e..89ec81630c 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -303,155 +303,71 @@ fn summarize_min_max(
i: usize,
stat: &ParquetStatistics,
) {
+ if !stat.has_min_max_set() {
+ max_values[i] = None;
+ min_values[i] = None;
+ return;
+ }
match stat {
- ParquetStatistics::Boolean(s) => {
- if let DataType::Boolean = fields[i].data_type() {
- if s.has_min_max_set() {
- if let Some(max_value) = &mut max_values[i] {
- match
max_value.update_batch(&[Arc::new(BooleanArray::from(
- vec![Some(*s.max())],
- ))]) {
- Ok(_) => {}
- Err(_) => {
- max_values[i] = None;
- }
- }
- }
- if let Some(min_value) = &mut min_values[i] {
- match
min_value.update_batch(&[Arc::new(BooleanArray::from(
- vec![Some(*s.min())],
- ))]) {
- Ok(_) => {}
- Err(_) => {
- min_values[i] = None;
- }
- }
- }
- return;
- }
+ ParquetStatistics::Boolean(s) if DataType::Boolean ==
*fields[i].data_type() => {
+ if let Some(max_value) = &mut max_values[i] {
+ max_value
+
.update_batch(&[Arc::new(BooleanArray::from(vec![*s.max()]))])
+ .unwrap_or_else(|_| max_values[i] = None);
+ }
+ if let Some(min_value) = &mut min_values[i] {
+ min_value
+
.update_batch(&[Arc::new(BooleanArray::from(vec![*s.min()]))])
+ .unwrap_or_else(|_| min_values[i] = None);
}
- max_values[i] = None;
- min_values[i] = None;
}
- ParquetStatistics::Int32(s) => {
- if let DataType::Int32 = fields[i].data_type() {
- if s.has_min_max_set() {
- if let Some(max_value) = &mut max_values[i] {
- match
max_value.update_batch(&[Arc::new(Int32Array::from_value(
- *s.max(),
- 1,
- ))]) {
- Ok(_) => {}
- Err(_) => {
- max_values[i] = None;
- }
- }
- }
- if let Some(min_value) = &mut min_values[i] {
- match
min_value.update_batch(&[Arc::new(Int32Array::from_value(
- *s.min(),
- 1,
- ))]) {
- Ok(_) => {}
- Err(_) => {
- min_values[i] = None;
- }
- }
- }
- return;
- }
+ ParquetStatistics::Int32(s) if DataType::Int32 ==
*fields[i].data_type() => {
+ if let Some(max_value) = &mut max_values[i] {
+ max_value
+ .update_batch(&[Arc::new(Int32Array::from_value(*s.max(),
1))])
+ .unwrap_or_else(|_| max_values[i] = None);
+ }
+ if let Some(min_value) = &mut min_values[i] {
+ min_value
+ .update_batch(&[Arc::new(Int32Array::from_value(*s.min(),
1))])
+ .unwrap_or_else(|_| min_values[i] = None);
}
- max_values[i] = None;
- min_values[i] = None;
}
- ParquetStatistics::Int64(s) => {
- if let DataType::Int64 = fields[i].data_type() {
- if s.has_min_max_set() {
- if let Some(max_value) = &mut max_values[i] {
- match
max_value.update_batch(&[Arc::new(Int64Array::from_value(
- *s.max(),
- 1,
- ))]) {
- Ok(_) => {}
- Err(_) => {
- max_values[i] = None;
- }
- }
- }
- if let Some(min_value) = &mut min_values[i] {
- match
min_value.update_batch(&[Arc::new(Int64Array::from_value(
- *s.min(),
- 1,
- ))]) {
- Ok(_) => {}
- Err(_) => {
- min_values[i] = None;
- }
- }
- }
- return;
- }
+ ParquetStatistics::Int64(s) if DataType::Int64 ==
*fields[i].data_type() => {
+ if let Some(max_value) = &mut max_values[i] {
+ max_value
+ .update_batch(&[Arc::new(Int64Array::from_value(*s.max(),
1))])
+ .unwrap_or_else(|_| max_values[i] = None);
+ }
+ if let Some(min_value) = &mut min_values[i] {
+ min_value
+ .update_batch(&[Arc::new(Int64Array::from_value(*s.min(),
1))])
+ .unwrap_or_else(|_| min_values[i] = None);
}
- max_values[i] = None;
- min_values[i] = None;
}
- ParquetStatistics::Float(s) => {
- if let DataType::Float32 = fields[i].data_type() {
- if s.has_min_max_set() {
- if let Some(max_value) = &mut max_values[i] {
- match
max_value.update_batch(&[Arc::new(Float32Array::from(
- vec![Some(*s.max())],
- ))]) {
- Ok(_) => {}
- Err(_) => {
- max_values[i] = None;
- }
- }
- }
- if let Some(min_value) = &mut min_values[i] {
- match
min_value.update_batch(&[Arc::new(Float32Array::from(
- vec![Some(*s.min())],
- ))]) {
- Ok(_) => {}
- Err(_) => {
- min_values[i] = None;
- }
- }
- }
- return;
- }
+ ParquetStatistics::Float(s) if DataType::Float32 ==
*fields[i].data_type() => {
+ if let Some(max_value) = &mut max_values[i] {
+ max_value
+
.update_batch(&[Arc::new(Float32Array::from(vec![*s.max()]))])
+ .unwrap_or_else(|_| max_values[i] = None);
+ }
+ if let Some(min_value) = &mut min_values[i] {
+ min_value
+
.update_batch(&[Arc::new(Float32Array::from(vec![*s.min()]))])
+ .unwrap_or_else(|_| min_values[i] = None);
}
- max_values[i] = None;
- min_values[i] = None;
}
- ParquetStatistics::Double(s) => {
- if let DataType::Float64 = fields[i].data_type() {
- if s.has_min_max_set() {
- if let Some(max_value) = &mut max_values[i] {
- match
max_value.update_batch(&[Arc::new(Float64Array::from(
- vec![Some(*s.max())],
- ))]) {
- Ok(_) => {}
- Err(_) => {
- max_values[i] = None;
- }
- }
- }
- if let Some(min_value) = &mut min_values[i] {
- match
min_value.update_batch(&[Arc::new(Float64Array::from(
- vec![Some(*s.min())],
- ))]) {
- Ok(_) => {}
- Err(_) => {
- min_values[i] = None;
- }
- }
- }
- return;
- }
+ ParquetStatistics::Double(s) if DataType::Float64 ==
*fields[i].data_type() => {
+ if let Some(max_value) = &mut max_values[i] {
+ max_value
+
.update_batch(&[Arc::new(Float64Array::from(vec![*s.max()]))])
+ .unwrap_or_else(|_| max_values[i] = None);
+ }
+ if let Some(min_value) = &mut min_values[i] {
+ min_value
+
.update_batch(&[Arc::new(Float64Array::from(vec![*s.min()]))])
+ .unwrap_or_else(|_| min_values[i] = None);
}
- max_values[i] = None;
- min_values[i] = None;
}
_ => {
max_values[i] = None;