Jefffrey commented on code in PR #9409:
URL: https://github.com/apache/arrow-rs/pull/9409#discussion_r2804592839


##########
arrow-arith/src/aggregate.rs:
##########
@@ -603,10 +611,137 @@ where
 
             Ok(Some(sum))
         }
+        DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() {
+            DataType::Int16 => ree::sum_checked::<types::Int16Type, T>(&array),
+            DataType::Int32 => ree::sum_checked::<types::Int32Type, T>(&array),
+            DataType::Int64 => ree::sum_checked::<types::Int64Type, T>(&array),
+            _ => Ok(None),
+        },
         _ => sum_checked::<T>(as_primitive_array(&array)),
     }
 }
 
+// Logic for summing run-end-encoded arrays.
+mod ree {
+    use std::convert::Infallible;
+
+    use arrow_array::cast::AsArray;
+    use arrow_array::types::RunEndIndexType;
+    use arrow_array::{Array, ArrowNativeTypeOp, ArrowNumericType, 
PrimitiveArray, TypedRunArray};
+    use arrow_buffer::ArrowNativeType;
+    use arrow_schema::ArrowError;
+
+    /// Downcasts an array to a TypedRunArray.
+    // Once specialization gets stabilized, this method can be templated over 
the source
+    // array type and can directly pick up the type of the child values array.
+    fn downcast<'a, I: RunEndIndexType, V: ArrowNumericType>(
+        array: &'a dyn Array,
+    ) -> Option<TypedRunArray<'a, I, PrimitiveArray<V>>> {
+        let array = array.as_run_opt::<I>()?;
+        // This fails if the values child array is not the PrimitiveArray<T>. 
That is okay as:
+        // * BooleanArray & StringArray are not primitive, but their Item is 
not
+        //   ArrowNumericType, so they are not in scope for this function.
+        // * Having the values child array be either dict-encoded, or 
run-end-encoded is unlikely.
+        // Note however that the Arrow specification does not forbid using an 
exotic type as the
+        // values child array.
+        array.downcast::<PrimitiveArray<V>>()
+    }
+
+    /// Computes the sum (wrapping) of the array values.
+    pub(super) fn sum_wrapping<I: RunEndIndexType, V: ArrowNumericType>(
+        array: &dyn Array,
+    ) -> Option<V::Native> {
+        if array.null_count() == array.len() {

Review Comment:
   Run arrays don't have a null buffer so this check is essentially a noop each 
time



##########
arrow-arith/src/aggregate.rs:
##########
@@ -639,6 +774,48 @@ where
 {
     match array.data_type() {
         DataType::Dictionary(_, _) => min_max_helper::<T::Native, _, _>(array, 
cmp),
+        DataType::RunEndEncoded(run_ends, _) => {
+            // We can directly perform min/max on the values child array, as 
any
+            // run must have non-zero length. We just need take care of the 
logical offset & len.

Review Comment:
   We can use `RunArray::values_slice` here:
   
   
https://github.com/apache/arrow-rs/blob/70089ac5c1e8de99cd9af780bb3ccb4564ae8ef7/arrow-array/src/array/run_array.rs#L139-L150
   
   - Introduced by https://github.com/apache/arrow-rs/pull/9036



##########
arrow-arith/src/aggregate.rs:
##########
@@ -639,6 +774,48 @@ where
 {
     match array.data_type() {
         DataType::Dictionary(_, _) => min_max_helper::<T::Native, _, _>(array, 
cmp),
+        DataType::RunEndEncoded(run_ends, _) => {
+            // We can directly perform min/max on the values child array, as 
any
+            // run must have non-zero length. We just need take care of the 
logical offset & len.
+            fn values_and_boundaries<I: types::RunEndIndexType>(
+                array: &dyn Array,
+            ) -> Option<(&ArrayRef, Option<std::ops::Range<usize>>)> {
+                let array = array.as_run_opt::<I>()?;
+                // If the array is empty, start & end physical indices will be 
0. Which is
+                // incorrect: array[0] does not exist.
+                let range = if array.len() == 0 {
+                    None
+                } else {
+                    
Some(array.get_start_physical_index()..array.get_end_physical_index() + 1)
+                };
+                Some((array.values(), range))
+            }
+            let (values, range) = match run_ends.data_type() {
+                DataType::Int16 => 
values_and_boundaries::<types::Int16Type>(&array)?,
+                DataType::Int32 => 
values_and_boundaries::<types::Int32Type>(&array)?,
+                DataType::Int64 => 
values_and_boundaries::<types::Int64Type>(&array)?,
+                _ => return None,
+            };
+
+            // We will fail here if the values child array is not the 
PrimitiveArray<T>. That
+            // is okay as:
+            // * BooleanArray & StringArray are not primitive, but their Item 
is not
+            //   ArrowNumericType, so they are not in scope for this function.
+            // * Having the values child array be either dict-encoded, or 
run-end-encoded does not
+            //   make sense. Nor does using a custom array type.
+            // Note however that the Apache specification does not forbid 
using an exotic type as
+            // the values child array.
+            // The type parameter `A` is a TypedRunArray<'_, RunEndIndexType, 
ValuesArrayType>.
+            // Once specialization gets stabilized, this implementation can be 
changed to
+            // directly pick up `ValuesArrayType`.

Review Comment:
   While this is quite a comprehensive explanation, I feel we can simply leave 
it as "we expect child array to be primitive" 🤔
   
   - It doesn't feel necessary to include boolean/string in the explanation 
given we're only in the context of numeric types
   - It's not really necessary to try explain away custom array implementations 
as those would be extremely rare



##########
arrow-arith/src/aggregate.rs:
##########
@@ -603,10 +611,137 @@ where
 
             Ok(Some(sum))
         }
+        DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() {
+            DataType::Int16 => ree::sum_checked::<types::Int16Type, T>(&array),
+            DataType::Int32 => ree::sum_checked::<types::Int32Type, T>(&array),
+            DataType::Int64 => ree::sum_checked::<types::Int64Type, T>(&array),
+            _ => Ok(None),
+        },
         _ => sum_checked::<T>(as_primitive_array(&array)),
     }
 }
 
+// Logic for summing run-end-encoded arrays.
+mod ree {
+    use std::convert::Infallible;
+
+    use arrow_array::cast::AsArray;
+    use arrow_array::types::RunEndIndexType;
+    use arrow_array::{Array, ArrowNativeTypeOp, ArrowNumericType, 
PrimitiveArray, TypedRunArray};
+    use arrow_buffer::ArrowNativeType;
+    use arrow_schema::ArrowError;
+
+    /// Downcasts an array to a TypedRunArray.
+    // Once specialization gets stabilized, this method can be templated over 
the source
+    // array type and can directly pick up the type of the child values array.
+    fn downcast<'a, I: RunEndIndexType, V: ArrowNumericType>(
+        array: &'a dyn Array,
+    ) -> Option<TypedRunArray<'a, I, PrimitiveArray<V>>> {
+        let array = array.as_run_opt::<I>()?;
+        // This fails if the values child array is not the PrimitiveArray<T>. 
That is okay as:

Review Comment:
   I feel sufficient justification is just "we support only runarrays wrapping 
primitive types"; no need to go into this much detail (especially as we're only 
dealing with numeric types so mentioning boolean/string for example seems 
unnecessary)



##########
arrow-arith/src/aggregate.rs:
##########
@@ -603,10 +611,137 @@ where
 
             Ok(Some(sum))
         }
+        DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() {
+            DataType::Int16 => ree::sum_checked::<types::Int16Type, T>(&array),
+            DataType::Int32 => ree::sum_checked::<types::Int32Type, T>(&array),
+            DataType::Int64 => ree::sum_checked::<types::Int64Type, T>(&array),
+            _ => Ok(None),
+        },
         _ => sum_checked::<T>(as_primitive_array(&array)),
     }
 }
 
+// Logic for summing run-end-encoded arrays.
+mod ree {
+    use std::convert::Infallible;
+
+    use arrow_array::cast::AsArray;
+    use arrow_array::types::RunEndIndexType;
+    use arrow_array::{Array, ArrowNativeTypeOp, ArrowNumericType, 
PrimitiveArray, TypedRunArray};
+    use arrow_buffer::ArrowNativeType;
+    use arrow_schema::ArrowError;
+
+    /// Downcasts an array to a TypedRunArray.
+    // Once specialization gets stabilized, this method can be templated over 
the source
+    // array type and can directly pick up the type of the child values array.
+    fn downcast<'a, I: RunEndIndexType, V: ArrowNumericType>(
+        array: &'a dyn Array,
+    ) -> Option<TypedRunArray<'a, I, PrimitiveArray<V>>> {
+        let array = array.as_run_opt::<I>()?;
+        // This fails if the values child array is not the PrimitiveArray<T>. 
That is okay as:
+        // * BooleanArray & StringArray are not primitive, but their Item is 
not
+        //   ArrowNumericType, so they are not in scope for this function.
+        // * Having the values child array be either dict-encoded, or 
run-end-encoded is unlikely.
+        // Note however that the Arrow specification does not forbid using an 
exotic type as the
+        // values child array.
+        array.downcast::<PrimitiveArray<V>>()
+    }
+
+    /// Computes the sum (wrapping) of the array values.
+    pub(super) fn sum_wrapping<I: RunEndIndexType, V: ArrowNumericType>(
+        array: &dyn Array,
+    ) -> Option<V::Native> {
+        if array.null_count() == array.len() {
+            return None;
+        }
+
+        let ree = downcast::<I, V>(array)?;
+
+        let sum = fold(ree, |acc, val, len| -> Result<V::Native, Infallible> {
+            Ok(acc.add_wrapping(val.mul_wrapping(V::Native::usize_as(len))))
+        })
+        // Safety: error type is Infallible.
+        .unwrap();

Review Comment:
   ```suggestion
           let Ok(sum) = fold(ree, |acc, val, len| -> Result<V::Native, 
Infallible> {
               Ok(acc.add_wrapping(val.mul_wrapping(V::Native::usize_as(len))))
           });
   ```
   
   If using `Infallible` then we can destructure like so without unwrap



##########
arrow-arith/src/aggregate.rs:
##########
@@ -603,10 +611,137 @@ where
 
             Ok(Some(sum))
         }
+        DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() {
+            DataType::Int16 => ree::sum_checked::<types::Int16Type, T>(&array),
+            DataType::Int32 => ree::sum_checked::<types::Int32Type, T>(&array),
+            DataType::Int64 => ree::sum_checked::<types::Int64Type, T>(&array),
+            _ => Ok(None),
+        },
         _ => sum_checked::<T>(as_primitive_array(&array)),
     }
 }
 
+// Logic for summing run-end-encoded arrays.
+mod ree {
+    use std::convert::Infallible;
+
+    use arrow_array::cast::AsArray;
+    use arrow_array::types::RunEndIndexType;
+    use arrow_array::{Array, ArrowNativeTypeOp, ArrowNumericType, 
PrimitiveArray, TypedRunArray};
+    use arrow_buffer::ArrowNativeType;
+    use arrow_schema::ArrowError;
+
+    /// Downcasts an array to a TypedRunArray.
+    // Once specialization gets stabilized, this method can be templated over 
the source
+    // array type and can directly pick up the type of the child values array.
+    fn downcast<'a, I: RunEndIndexType, V: ArrowNumericType>(
+        array: &'a dyn Array,
+    ) -> Option<TypedRunArray<'a, I, PrimitiveArray<V>>> {
+        let array = array.as_run_opt::<I>()?;
+        // This fails if the values child array is not the PrimitiveArray<T>. 
That is okay as:
+        // * BooleanArray & StringArray are not primitive, but their Item is 
not
+        //   ArrowNumericType, so they are not in scope for this function.
+        // * Having the values child array be either dict-encoded, or 
run-end-encoded is unlikely.
+        // Note however that the Arrow specification does not forbid using an 
exotic type as the
+        // values child array.
+        array.downcast::<PrimitiveArray<V>>()
+    }
+
+    /// Computes the sum (wrapping) of the array values.
+    pub(super) fn sum_wrapping<I: RunEndIndexType, V: ArrowNumericType>(
+        array: &dyn Array,
+    ) -> Option<V::Native> {
+        if array.null_count() == array.len() {
+            return None;
+        }
+
+        let ree = downcast::<I, V>(array)?;
+
+        let sum = fold(ree, |acc, val, len| -> Result<V::Native, Infallible> {
+            Ok(acc.add_wrapping(val.mul_wrapping(V::Native::usize_as(len))))
+        })
+        // Safety: error type is Infallible.
+        .unwrap();
+
+        Some(sum)
+    }
+
+    /// Computes the sum (erroring on overflow) of the array values.
+    pub(super) fn sum_checked<I: RunEndIndexType, V: ArrowNumericType>(
+        array: &dyn Array,
+    ) -> Result<Option<V::Native>, ArrowError> {
+        if array.null_count() == array.len() {
+            return Ok(None);
+        }
+
+        let Some(ree) = downcast::<I, V>(array) else {
+            return Err(ArrowError::InvalidArgumentError(
+                "Input array is not a TypedRunArray<'_, _, 
PrimitiveArray<T>".to_string(),
+            ));
+        };
+
+        let sum = fold(ree, |acc, val, len| -> Result<V::Native, ArrowError> {
+            let Some(len) = V::Native::from_usize(len) else {
+                return Err(ArrowError::ArithmeticOverflow(format!(
+                    "Cannot convert a run-end index ({:?}) to the value type 
({})",
+                    len,
+                    std::any::type_name::<V::Native>()
+                )));
+            };
+            acc.add_checked(val.mul_checked(len)?)
+        });
+
+        sum.map(Some)
+    }
+
+    /// Folds over the values in a run-end-encoded array.
+    fn fold<'a, I: RunEndIndexType, V: ArrowNumericType, F, E>(
+        array: TypedRunArray<'a, I, PrimitiveArray<V>>,
+        mut f: F,
+    ) -> Result<V::Native, E>
+    where
+        F: FnMut(V::Native, V::Native, usize) -> Result<V::Native, E>,
+    {
+        let run_ends = array.run_ends();
+
+        let logical_start = run_ends.offset();
+        let logical_end = run_ends.offset() + run_ends.len();
+

Review Comment:
   I believe we can use `RunArray::values_slice`
   
   
https://github.com/apache/arrow-rs/blob/d8946ca0775ab7fe0eef2fdea4b8bb3d55ec6664/arrow-array/src/array/run_array.rs#L139-L150
   
   And `RunEndBuffer::sliced_values` (accessed via `RunArray::run_ends`)
   
   
https://github.com/apache/arrow-rs/blob/d8946ca0775ab7fe0eef2fdea4b8bb3d55ec6664/arrow-buffer/src/buffer/run.rs#L192-L215



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to