alamb commented on a change in pull request #426: URL: https://github.com/apache/arrow-datafusion/pull/426#discussion_r638990402
########## File path: datafusion/src/physical_plan/parquet.rs ########## @@ -457,11 +461,102 @@ fn send_result( Ok(()) } +/// Wraps parquet statistics in a way +/// that implements [`PruningStatistics`] +struct RowGroupPruningStatistics<'a> { + row_group_metadata: &'a [RowGroupMetaData], + parquet_schema: &'a Schema, +} + +/// Extract the min/max statistics from a `ParquetStatistics` object +macro_rules! get_statistic { + ($column_statistics:expr, $func:ident, $bytes_func:ident) => {{ + if !$column_statistics.has_min_max_set() { + return None; + } + match $column_statistics { + ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))), + ParquetStatistics::Int32(s) => Some(ScalarValue::Int32(Some(*s.$func()))), + ParquetStatistics::Int64(s) => Some(ScalarValue::Int64(Some(*s.$func()))), + // 96 bit ints not supported + ParquetStatistics::Int96(_) => None, + ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))), + ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))), + ParquetStatistics::ByteArray(s) => { + let s = std::str::from_utf8(s.$bytes_func()) + .map(|s| s.to_string()) + .ok(); + Some(ScalarValue::Utf8(s)) + } + // type not supported yet + ParquetStatistics::FixedLenByteArray(_) => None, + } + }}; +} + +// Extract the min or max value calling `func` or `bytes_func` on the ParquetStatistics as appropriate +macro_rules! 
get_min_max_values { + ($self:expr, $column:expr, $func:ident, $bytes_func:ident) => {{ + let (column_index, field) = if let Some((v, f)) = $self.parquet_schema.column_with_name($column) { + (v, f) + } else { + // Named column was not present + return None + }; + + let data_type = field.data_type(); + let null_scalar: ScalarValue = if let Ok(v) = data_type.try_into() { + v + } else { + // DataFusion doesn't have support for ScalarValues of the column type + return None + }; + + let scalar_values : Vec<ScalarValue> = $self.row_group_metadata + .iter() + .flat_map(|meta| { + meta.column(column_index).statistics() + }) + .map(|stats| { + get_statistic!(stats, $func, $bytes_func) + }) + .map(|maybe_scalar| { + // column either didn't have statistics at all or didn't have min/max values + maybe_scalar.unwrap_or_else(|| null_scalar.clone()) + }) + .collect(); Review comment: I tried to avoid the collect() but I couldn't get Rust to stop complaining about returning a reference to a local value :( -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org