alamb commented on a change in pull request #1525: URL: https://github.com/apache/arrow-datafusion/pull/1525#discussion_r780660105
########## File path: datafusion/src/physical_plan/expressions/variance.rs ########## @@ -0,0 +1,376 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines physical expressions that can evaluated at runtime during query execution + +use std::any::Any; +use std::sync::Arc; + +use crate::error::{DataFusionError, Result}; +use crate::physical_plan::{Accumulator, AggregateExpr, PhysicalExpr}; +use crate::scalar::ScalarValue; +use arrow::datatypes::DataType; +use arrow::datatypes::Field; + +use super::format_state_name; + +/// VARIANCE aggregate expression +#[derive(Debug)] +pub struct Variance { + name: String, + expr: Arc<dyn PhysicalExpr>, +} + +/// function return type of variance +pub fn variance_return_type(arg_type: &DataType) -> Result<DataType> { + match arg_type { + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Float32 + | DataType::Float64 => Ok(DataType::Float64), + other => Err(DataFusionError::Plan(format!( + "VARIANCE does not support {:?}", + other + ))), + } +} + +pub(crate) fn is_variance_support_arg_type(arg_type: &DataType) -> bool { + matches!( + arg_type, + 
DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + ) +} + +impl Variance { + /// Create a new VARIANCE aggregate function + pub fn new( + expr: Arc<dyn PhysicalExpr>, + name: impl Into<String>, + data_type: DataType, + ) -> Self { + // the result of variance just support FLOAT64 data type. + assert!(matches!(data_type, DataType::Float64)); + Self { + name: name.into(), + expr, + } + } +} + +impl AggregateExpr for Variance { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn field(&self) -> Result<Field> { + Ok(Field::new(&self.name, DataType::Float64, true)) + } + + fn create_accumulator(&self) -> Result<Box<dyn Accumulator>> { + Ok(Box::new(VarianceAccumulator::try_new()?)) + } + + fn state_fields(&self) -> Result<Vec<Field>> { + Ok(vec![ + Field::new( + &format_state_name(&self.name, "count"), + DataType::UInt64, + true, + ), + Field::new( + &format_state_name(&self.name, "mean"), + DataType::Float64, + true, + ), + Field::new( + &format_state_name(&self.name, "m2"), + DataType::Float64, + true, + ), + ]) + } + + fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> { + vec![self.expr.clone()] + } + + fn name(&self) -> &str { + &self.name + } +} + +/// An accumulator to compute variance +#[derive(Debug)] +pub struct VarianceAccumulator { + m2: ScalarValue, + mean: ScalarValue, + count: u64, +} + +impl VarianceAccumulator { + /// Creates a new `VarianceAccumulator` + pub fn try_new() -> Result<Self> { + Ok(Self { + m2: ScalarValue::from(0 as f64), + mean: ScalarValue::from(0 as f64), + count: 0, + }) + } + + pub fn get_count(&self) -> u64 { + self.count + } + + pub fn get_mean(&self) -> ScalarValue { + self.mean.clone() + } + + pub fn get_m2(&self) -> ScalarValue { + self.m2.clone() + } +} + +impl Accumulator for VarianceAccumulator { + fn state(&self) 
-> Result<Vec<ScalarValue>> { + Ok(vec![ + ScalarValue::from(self.count), + self.mean.clone(), + self.m2.clone(), + ]) + } + + fn update(&mut self, values: &[ScalarValue]) -> Result<()> { + let values = &values[0]; + let is_empty = values.is_null(); + + if !is_empty { + let new_count = self.count + 1; + let delta1 = ScalarValue::add(values, &self.mean.arithmetic_negate())?; Review comment: I don't fully understand the discussion here about online and parallelism with respect to `update` and `update_batch`. `update_batch` is actually what the HashAggregator calls during execution. The default implementation of `update_batch` simply iterates over each row of the input array, converting it to a `ScalarValue`, and calls `update`: https://github.com/apache/arrow-datafusion/blob/415c5e124af18a05500514f78604366d860dcf5a/datafusion/src/physical_plan/mod.rs#L572-L583 My mental picture of aggregation is like this: ``` ┌───────────────────────┐ ┌──────────────────────┐ │ │ │ │ │ Input RecordBatch 0 │ update_batch│ Accumulator │ │ │────────────▶│ │───────┐ │ │ │ │ │ └───────────────────────┘ └──────────────────────┘ │ │ │ merge │ ┌───────────────────────┐ ┌──────────────────────┐ │ ┌──────────────────────┐ │ │ │ │ │ │ │ │ Input RecordBatch 1 │ update_batch│ Accumulator │ │ │ Accumulator │ │ │────────────▶│ │───────┼──────▶│ │ │ │ │ │ │ │ │ └───────────────────────┘ └──────────────────────┘ │ └──────────────────────┘ │ │ │ │ ... ... │ │ │ │ ┌───────────────────────┐ ┌──────────────────────┐ │ │ │ │ │ │ │ Input RecordBatch N │ update_batch│ Accumulator │ │ │ │────────────▶│ │───────┘ │ │ │ │ └───────────────────────┘ └──────────────────────┘ ``` ########## File path: datafusion/src/scalar.rs ########## @@ -526,6 +526,282 @@ macro_rules! 
eq_array_primitive { } impl ScalarValue { + /// Return true if the value is numeric + pub fn is_numeric(&self) -> bool { + matches!(self, + ScalarValue::Float32(_) + | ScalarValue::Float64(_) + | ScalarValue::Decimal128(_, _, _) + | ScalarValue::Int8(_) + | ScalarValue::Int16(_) + | ScalarValue::Int32(_) + | ScalarValue::Int64(_) + | ScalarValue::UInt8(_) + | ScalarValue::UInt16(_) + | ScalarValue::UInt32(_) + | ScalarValue::UInt64(_) + ) + } + + /// Add two numeric ScalarValues + pub fn add(lhs: &ScalarValue, rhs: &ScalarValue) -> Result<ScalarValue> { + if !lhs.is_numeric() || !rhs.is_numeric() { + return Err(DataFusionError::Internal(format!( + "Addition only supports numeric types, \ + here has {:?} and {:?}", + lhs.get_datatype(), + rhs.get_datatype() + ))); + } + + // TODO: Finding a good way to support operation between different types without Review comment: > I think ScalarValue arithmetic is important to many future operators, I think that we should be trying to avoid `ScalarValue` arithmetic if at all possible, and instead use "vectorized" calculations (aka computations using arrow arrays). While this coding style is more cumbersome it is one of the key advantages for datafusion (and columnar execution engines in general) The reason that `ScalarValue` arithmetic is so slow is that for each row, there is overhead to switch based on the type (aka the large `match` statements required in this PR). Using `Array`s does this type switch once per array so you only pay the cost once for 1000s of rows. I think having a separate discussion / PR is a good idea. That being said, I agree that `ScalarValue::sum()` makes more sense than a free `sum` function (as it is easier to find) -- so perhaps as a follow-on to this PR we can combine the two implementations -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
