Jefffrey commented on code in PR #9409:
URL: https://github.com/apache/arrow-rs/pull/9409#discussion_r2804592839
##########
arrow-arith/src/aggregate.rs:
##########
@@ -603,10 +611,137 @@ where
Ok(Some(sum))
}
+ DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() {
+ DataType::Int16 => ree::sum_checked::<types::Int16Type, T>(&array),
+ DataType::Int32 => ree::sum_checked::<types::Int32Type, T>(&array),
+ DataType::Int64 => ree::sum_checked::<types::Int64Type, T>(&array),
+ _ => Ok(None),
+ },
_ => sum_checked::<T>(as_primitive_array(&array)),
}
}
+// Logic for summing run-end-encoded arrays.
+mod ree {
+ use std::convert::Infallible;
+
+ use arrow_array::cast::AsArray;
+ use arrow_array::types::RunEndIndexType;
+ use arrow_array::{Array, ArrowNativeTypeOp, ArrowNumericType,
PrimitiveArray, TypedRunArray};
+ use arrow_buffer::ArrowNativeType;
+ use arrow_schema::ArrowError;
+
+ /// Downcasts an array to a TypedRunArray.
+ // Once specialization gets stabilized, this method can be templated over
the source
+ // array type and can directly pick up the type of the child values array.
+ fn downcast<'a, I: RunEndIndexType, V: ArrowNumericType>(
+ array: &'a dyn Array,
+ ) -> Option<TypedRunArray<'a, I, PrimitiveArray<V>>> {
+ let array = array.as_run_opt::<I>()?;
+ // This fails if the values child array is not the PrimitiveArray<T>.
That is okay as:
+ // * BooleanArray & StringArray are not primitive, but their Item is
not
+ // ArrowNumericType, so they are not in scope for this function.
+ // * Having the values child array be either dict-encoded, or
run-end-encoded is unlikely.
+ // Note however that the Arrow specification does not forbid using an
exotic type as the
+ // values child array.
+ array.downcast::<PrimitiveArray<V>>()
+ }
+
+ /// Computes the sum (wrapping) of the array values.
+ pub(super) fn sum_wrapping<I: RunEndIndexType, V: ArrowNumericType>(
+ array: &dyn Array,
+ ) -> Option<V::Native> {
+ if array.null_count() == array.len() {
Review Comment:
Run arrays don't have a null buffer so this check is essentially a noop each
time
##########
arrow-arith/src/aggregate.rs:
##########
@@ -639,6 +774,48 @@ where
{
match array.data_type() {
DataType::Dictionary(_, _) => min_max_helper::<T::Native, _, _>(array,
cmp),
+ DataType::RunEndEncoded(run_ends, _) => {
+ // We can directly perform min/max on the values child array, as
any
+ // run must have non-zero length. We just need take care of the
logical offset & len.
Review Comment:
We can use `RunArray::values_slice` here:
https://github.com/apache/arrow-rs/blob/70089ac5c1e8de99cd9af780bb3ccb4564ae8ef7/arrow-array/src/array/run_array.rs#L139-L150
- Introduced by https://github.com/apache/arrow-rs/pull/9036
##########
arrow-arith/src/aggregate.rs:
##########
@@ -639,6 +774,48 @@ where
{
match array.data_type() {
DataType::Dictionary(_, _) => min_max_helper::<T::Native, _, _>(array,
cmp),
+ DataType::RunEndEncoded(run_ends, _) => {
+ // We can directly perform min/max on the values child array, as
any
+ // run must have non-zero length. We just need take care of the
logical offset & len.
+ fn values_and_boundaries<I: types::RunEndIndexType>(
+ array: &dyn Array,
+ ) -> Option<(&ArrayRef, Option<std::ops::Range<usize>>)> {
+ let array = array.as_run_opt::<I>()?;
+ // If the array is empty, start & end physical indices will be
0. Which is
+ // incorrect: array[0] does not exist.
+ let range = if array.len() == 0 {
+ None
+ } else {
+
Some(array.get_start_physical_index()..array.get_end_physical_index() + 1)
+ };
+ Some((array.values(), range))
+ }
+ let (values, range) = match run_ends.data_type() {
+ DataType::Int16 =>
values_and_boundaries::<types::Int16Type>(&array)?,
+ DataType::Int32 =>
values_and_boundaries::<types::Int32Type>(&array)?,
+ DataType::Int64 =>
values_and_boundaries::<types::Int64Type>(&array)?,
+ _ => return None,
+ };
+
+ // We will fail here if the values child array is not the
PrimitiveArray<T>. That
+ // is okay as:
+ // * BooleanArray & StringArray are not primitive, but their Item
is not
+ // ArrowNumericType, so they are not in scope for this function.
+ // * Having the values child array be either dict-encoded, or
run-end-encoded does not
+ // make sense. Nor does using a custom array type.
+ // Note however that the Apache specification does not forbid
using an exotic type as
+ // the values child array.
+ // The type parameter `A` is a TypedRunArray<'_, RunEndIndexType,
ValuesArrayType>.
+ // Once specialization gets stabilized, this implementation can be
changed to
+ // directly pick up `ValuesArrayType`.
Review Comment:
While this is quite a comprehensive explanation, I feel we can simply leave
it as "we expect child array to be primitive" 🤔
- It doesn't feel necessary to include boolean/string in the explanation
given we're only in the context of numeric types
- It's not really necessary to try explain away custom array implementations
as those would be extremely rare
##########
arrow-arith/src/aggregate.rs:
##########
@@ -603,10 +611,137 @@ where
Ok(Some(sum))
}
+ DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() {
+ DataType::Int16 => ree::sum_checked::<types::Int16Type, T>(&array),
+ DataType::Int32 => ree::sum_checked::<types::Int32Type, T>(&array),
+ DataType::Int64 => ree::sum_checked::<types::Int64Type, T>(&array),
+ _ => Ok(None),
+ },
_ => sum_checked::<T>(as_primitive_array(&array)),
}
}
+// Logic for summing run-end-encoded arrays.
+mod ree {
+ use std::convert::Infallible;
+
+ use arrow_array::cast::AsArray;
+ use arrow_array::types::RunEndIndexType;
+ use arrow_array::{Array, ArrowNativeTypeOp, ArrowNumericType,
PrimitiveArray, TypedRunArray};
+ use arrow_buffer::ArrowNativeType;
+ use arrow_schema::ArrowError;
+
+ /// Downcasts an array to a TypedRunArray.
+ // Once specialization gets stabilized, this method can be templated over
the source
+ // array type and can directly pick up the type of the child values array.
+ fn downcast<'a, I: RunEndIndexType, V: ArrowNumericType>(
+ array: &'a dyn Array,
+ ) -> Option<TypedRunArray<'a, I, PrimitiveArray<V>>> {
+ let array = array.as_run_opt::<I>()?;
+ // This fails if the values child array is not the PrimitiveArray<T>.
That is okay as:
Review Comment:
I feel sufficient justification is just "we support only runarrays wrapping
primitive types"; no need to go into this much detail (especially as we're only
dealing with numeric types so mentioning boolean/string for example seems
unnecessary)
##########
arrow-arith/src/aggregate.rs:
##########
@@ -603,10 +611,137 @@ where
Ok(Some(sum))
}
+ DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() {
+ DataType::Int16 => ree::sum_checked::<types::Int16Type, T>(&array),
+ DataType::Int32 => ree::sum_checked::<types::Int32Type, T>(&array),
+ DataType::Int64 => ree::sum_checked::<types::Int64Type, T>(&array),
+ _ => Ok(None),
+ },
_ => sum_checked::<T>(as_primitive_array(&array)),
}
}
+// Logic for summing run-end-encoded arrays.
+mod ree {
+ use std::convert::Infallible;
+
+ use arrow_array::cast::AsArray;
+ use arrow_array::types::RunEndIndexType;
+ use arrow_array::{Array, ArrowNativeTypeOp, ArrowNumericType,
PrimitiveArray, TypedRunArray};
+ use arrow_buffer::ArrowNativeType;
+ use arrow_schema::ArrowError;
+
+ /// Downcasts an array to a TypedRunArray.
+ // Once specialization gets stabilized, this method can be templated over
the source
+ // array type and can directly pick up the type of the child values array.
+ fn downcast<'a, I: RunEndIndexType, V: ArrowNumericType>(
+ array: &'a dyn Array,
+ ) -> Option<TypedRunArray<'a, I, PrimitiveArray<V>>> {
+ let array = array.as_run_opt::<I>()?;
+ // This fails if the values child array is not the PrimitiveArray<T>.
That is okay as:
+ // * BooleanArray & StringArray are not primitive, but their Item is
not
+ // ArrowNumericType, so they are not in scope for this function.
+ // * Having the values child array be either dict-encoded, or
run-end-encoded is unlikely.
+ // Note however that the Arrow specification does not forbid using an
exotic type as the
+ // values child array.
+ array.downcast::<PrimitiveArray<V>>()
+ }
+
+ /// Computes the sum (wrapping) of the array values.
+ pub(super) fn sum_wrapping<I: RunEndIndexType, V: ArrowNumericType>(
+ array: &dyn Array,
+ ) -> Option<V::Native> {
+ if array.null_count() == array.len() {
+ return None;
+ }
+
+ let ree = downcast::<I, V>(array)?;
+
+ let sum = fold(ree, |acc, val, len| -> Result<V::Native, Infallible> {
+ Ok(acc.add_wrapping(val.mul_wrapping(V::Native::usize_as(len))))
+ })
+ // Safety: error type is Infallible.
+ .unwrap();
Review Comment:
```suggestion
let Ok(sum) = fold(ree, |acc, val, len| -> Result<V::Native,
Infallible> {
Ok(acc.add_wrapping(val.mul_wrapping(V::Native::usize_as(len))))
});
```
If using `Infallible` then we can destructure like so without unwrap
##########
arrow-arith/src/aggregate.rs:
##########
@@ -603,10 +611,137 @@ where
Ok(Some(sum))
}
+ DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() {
+ DataType::Int16 => ree::sum_checked::<types::Int16Type, T>(&array),
+ DataType::Int32 => ree::sum_checked::<types::Int32Type, T>(&array),
+ DataType::Int64 => ree::sum_checked::<types::Int64Type, T>(&array),
+ _ => Ok(None),
+ },
_ => sum_checked::<T>(as_primitive_array(&array)),
}
}
+// Logic for summing run-end-encoded arrays.
+mod ree {
+ use std::convert::Infallible;
+
+ use arrow_array::cast::AsArray;
+ use arrow_array::types::RunEndIndexType;
+ use arrow_array::{Array, ArrowNativeTypeOp, ArrowNumericType,
PrimitiveArray, TypedRunArray};
+ use arrow_buffer::ArrowNativeType;
+ use arrow_schema::ArrowError;
+
+ /// Downcasts an array to a TypedRunArray.
+ // Once specialization gets stabilized, this method can be templated over
the source
+ // array type and can directly pick up the type of the child values array.
+ fn downcast<'a, I: RunEndIndexType, V: ArrowNumericType>(
+ array: &'a dyn Array,
+ ) -> Option<TypedRunArray<'a, I, PrimitiveArray<V>>> {
+ let array = array.as_run_opt::<I>()?;
+ // This fails if the values child array is not the PrimitiveArray<T>.
That is okay as:
+ // * BooleanArray & StringArray are not primitive, but their Item is
not
+ // ArrowNumericType, so they are not in scope for this function.
+ // * Having the values child array be either dict-encoded, or
run-end-encoded is unlikely.
+ // Note however that the Arrow specification does not forbid using an
exotic type as the
+ // values child array.
+ array.downcast::<PrimitiveArray<V>>()
+ }
+
+ /// Computes the sum (wrapping) of the array values.
+ pub(super) fn sum_wrapping<I: RunEndIndexType, V: ArrowNumericType>(
+ array: &dyn Array,
+ ) -> Option<V::Native> {
+ if array.null_count() == array.len() {
+ return None;
+ }
+
+ let ree = downcast::<I, V>(array)?;
+
+ let sum = fold(ree, |acc, val, len| -> Result<V::Native, Infallible> {
+ Ok(acc.add_wrapping(val.mul_wrapping(V::Native::usize_as(len))))
+ })
+ // Safety: error type is Infallible.
+ .unwrap();
+
+ Some(sum)
+ }
+
+ /// Computes the sum (erroring on overflow) of the array values.
+ pub(super) fn sum_checked<I: RunEndIndexType, V: ArrowNumericType>(
+ array: &dyn Array,
+ ) -> Result<Option<V::Native>, ArrowError> {
+ if array.null_count() == array.len() {
+ return Ok(None);
+ }
+
+ let Some(ree) = downcast::<I, V>(array) else {
+ return Err(ArrowError::InvalidArgumentError(
+ "Input array is not a TypedRunArray<'_, _,
PrimitiveArray<T>".to_string(),
+ ));
+ };
+
+ let sum = fold(ree, |acc, val, len| -> Result<V::Native, ArrowError> {
+ let Some(len) = V::Native::from_usize(len) else {
+ return Err(ArrowError::ArithmeticOverflow(format!(
+ "Cannot convert a run-end index ({:?}) to the value type
({})",
+ len,
+ std::any::type_name::<V::Native>()
+ )));
+ };
+ acc.add_checked(val.mul_checked(len)?)
+ });
+
+ sum.map(Some)
+ }
+
+ /// Folds over the values in a run-end-encoded array.
+ fn fold<'a, I: RunEndIndexType, V: ArrowNumericType, F, E>(
+ array: TypedRunArray<'a, I, PrimitiveArray<V>>,
+ mut f: F,
+ ) -> Result<V::Native, E>
+ where
+ F: FnMut(V::Native, V::Native, usize) -> Result<V::Native, E>,
+ {
+ let run_ends = array.run_ends();
+
+ let logical_start = run_ends.offset();
+ let logical_end = run_ends.offset() + run_ends.len();
+
Review Comment:
I believe we can use `RunArray::values_slice`
https://github.com/apache/arrow-rs/blob/d8946ca0775ab7fe0eef2fdea4b8bb3d55ec6664/arrow-array/src/array/run_array.rs#L139-L150
And `RunEndBuffer::sliced_values` (accessed via `RunArray::run_ends`)
https://github.com/apache/arrow-rs/blob/d8946ca0775ab7fe0eef2fdea4b8bb3d55ec6664/arrow-buffer/src/buffer/run.rs#L192-L215
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]