alamb commented on code in PR #6936:
URL: https://github.com/apache/arrow-datafusion/pull/6936#discussion_r1279718073
##########
datafusion/physical-expr/src/array_expressions.rs:
##########
@@ -363,6 +363,177 @@ pub fn make_array(arrays: &[ArrayRef]) ->
Result<ArrayRef> {
}
}
+fn return_empty(return_null: bool, data_type: DataType) -> Arc<dyn Array> {
+ if return_null {
+ new_null_array(&data_type, 1)
+ } else {
+ new_empty_array(&data_type)
+ }
+}
+
+macro_rules! list_slice {
+ ($ARRAY:expr, $I:expr, $J:expr, $RETURN_ELEMENT:expr, $ARRAY_TYPE:ident)
=> {{
+ let array = $ARRAY.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
+ if $I == 0 && $J == 0 || $ARRAY.is_empty() {
+ return return_empty($RETURN_ELEMENT, $ARRAY.data_type().clone());
+ }
+
+ let i = if $I < 0 {
+ if $I.abs() as usize > array.len() {
+ return return_empty(true, $ARRAY.data_type().clone());
+ }
+
+ (array.len() as i64 + $I + 1) as usize
+ } else {
+ if $I == 0 {
+ 1
+ } else {
+ $I as usize
+ }
+ };
+ let j = if $J < 0 {
+ if $J.abs() as usize > array.len() {
+ return return_empty(true, $ARRAY.data_type().clone());
+ }
+
+ if $RETURN_ELEMENT {
+ (array.len() as i64 + $J + 1) as usize
+ } else {
+ (array.len() as i64 + $J) as usize
+ }
+ } else {
+ if $J == 0 {
+ 1
+ } else {
+ if $J as usize > array.len() {
+ array.len()
+ } else {
+ $J as usize
+ }
+ }
+ };
+
+ if i > j || i as usize > $ARRAY.len() {
+ return_empty($RETURN_ELEMENT, $ARRAY.data_type().clone())
+ } else {
+ Arc::new(array.slice((i - 1), (j + 1 - i)))
+ }
+ }};
+}
+
+macro_rules! slice {
Review Comment:
I didn't fully understand this implementation -- it seem like picking a
single value from a list array can be implemented by calculating the
appropriate offsets and then calling
[take](https://docs.rs/arrow/latest/arrow/compute/kernels/take/fn.take.html)
For example, given A ListArray like this
```
┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
┌ ─ ─ ─ ─ ─ ─ ┐ │
┌─────────────┐ ┌───────┐ │ ┌───┐ ┌───┐ ┌───┐ ┌───┐
│ [A,B,C] │ │ (0,3) │ │ 1 │ │ 0 │ │ │ 1 │ │ A │ │ 0 │
├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤
│ [] (empty) │ │ (3,3) │ │ 1 │ │ 3 │ │ │ 1 │ │ B │ │ 1 │
├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤
│ NULL │ │ (3,4) │ │ 0 │ │ 4 │ │ │ 1 │ │ C │ │ 2 │
├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤
│ [D] │ │ 4,5 │ │ 1 │ │ 5 │ │ │ 1 │ │ ? │ │ 3 │
├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤
│ [NULL, F] │ │ 5,7 │ │ 1 │ │ 5 │ │ │ 0 │ │ D │ │ 4 │
└─────────────┘ └───────┘ │ └───┘ ├───┤ ├───┤ ├───┤
│ 7 │ │ │ 1 │ │ ? │ │ 5 │
│ Validity └───┘ ├───┤ ├───┤
Logical Logical (nulls) Offsets │ │ 1 │ │ F │ │ 6 │
Values Offsets │ └───┘ └───┘
│ Values │ │
(offsets[i], │ ListArray (Array)
offsets[i+1]) └ ─ ─ ─ ─ ─ ─ ┘ │
└ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
```
Computing `col[1]` (aka take the second element) could be computed by adding
1 to the first logical offset, checking if that is still in the array
So in this example, the take indicies would be
```
[
Some(1), (add 1 to start of (0, 3) is 1
None , // (adding 1 start of (3,3) is 4 which is outside,
None, // input was null,
None, // adding 1 start of (4,5 ) is 5 which is outside
Some(6), // adding 1 to start of (5,7) is 6
]
```
This calculation does not depend on datatype of the elements and would be
fully general
A similar calculation could be done for slice though in that case you would
have to build up the offsets of the new list array as well as the indices of
the take of the values.
In general I think many of these array calculations and manipulations can be
done in terms of the take kernel
##########
datafusion/expr/src/field_util.rs:
##########
@@ -18,28 +18,31 @@
//! Utility functions for complex field access
use arrow::datatypes::{DataType, Field};
-use datafusion_common::{DataFusionError, Result, ScalarValue};
+use datafusion_common::{DataFusionError, Result};
/// Returns the field access indexed by `key` from a [`DataType::List`] or
[`DataType::Struct`]
/// # Error
/// Errors if
-/// * the `data_type` is not a Struct or,
+/// * the `data_type` is not a Struct or a List,
Review Comment:
I found the new documentation in
https://docs.rs/datafusion/latest/datafusion/logical_expr/struct.GetIndexedField.html
to be sufficient. Thank you
##########
datafusion/physical-expr/src/array_expressions.rs:
##########
@@ -2550,69 +3060,6 @@ mod tests {
assert_eq!("1-*-3-*-*-6-7-*", result.value(0));
}
- #[test]
Review Comment:
why was this test removed?
##########
datafusion/core/tests/user_defined/user_defined_aggregates.rs:
##########
@@ -152,21 +152,6 @@ async fn test_udaf_returning_struct() {
assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap());
}
-/// Demonstrate extracting the fields from a structure using a subquery
Review Comment:
Why was this test removed?
(this is an important usecase for us in IOx where we have a UDF that returns
a struct that is accessed by name.
##########
datafusion/physical-expr/src/array_expressions.rs:
##########
@@ -363,6 +363,177 @@ pub fn make_array(arrays: &[ArrayRef]) ->
Result<ArrayRef> {
}
}
+fn return_empty(return_null: bool, data_type: DataType) -> Arc<dyn Array> {
+ if return_null {
+ new_null_array(&data_type, 1)
+ } else {
+ new_empty_array(&data_type)
+ }
+}
+
+macro_rules! list_slice {
+ ($ARRAY:expr, $I:expr, $J:expr, $RETURN_ELEMENT:expr, $ARRAY_TYPE:ident)
=> {{
+ let array = $ARRAY.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
+ if $I == 0 && $J == 0 || $ARRAY.is_empty() {
+ return return_empty($RETURN_ELEMENT, $ARRAY.data_type().clone());
+ }
+
+ let i = if $I < 0 {
+ if $I.abs() as usize > array.len() {
+ return return_empty(true, $ARRAY.data_type().clone());
+ }
+
+ (array.len() as i64 + $I + 1) as usize
+ } else {
+ if $I == 0 {
+ 1
+ } else {
+ $I as usize
+ }
+ };
+ let j = if $J < 0 {
+ if $J.abs() as usize > array.len() {
+ return return_empty(true, $ARRAY.data_type().clone());
+ }
+
+ if $RETURN_ELEMENT {
+ (array.len() as i64 + $J + 1) as usize
+ } else {
+ (array.len() as i64 + $J) as usize
+ }
+ } else {
+ if $J == 0 {
+ 1
+ } else {
+ if $J as usize > array.len() {
+ array.len()
+ } else {
+ $J as usize
+ }
+ }
+ };
+
+ if i > j || i as usize > $ARRAY.len() {
+ return_empty($RETURN_ELEMENT, $ARRAY.data_type().clone())
+ } else {
+ Arc::new(array.slice((i - 1), (j + 1 - i)))
+ }
+ }};
+}
+
+macro_rules! slice {
Review Comment:
I didn't fully understand this implementation -- it seem like picking a
single value from a list array can be implemented by calculating the
appropriate offsets and then calling
[take](https://docs.rs/arrow/latest/arrow/compute/kernels/take/fn.take.html)
For example, given A ListArray like this
```
┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
┌ ─ ─ ─ ─ ─ ─ ┐ │
┌─────────────┐ ┌───────┐ │ ┌───┐ ┌───┐ ┌───┐ ┌───┐
│ [A,B,C] │ │ (0,3) │ │ 1 │ │ 0 │ │ │ 1 │ │ A │ │ 0 │
├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤
│ [] (empty) │ │ (3,3) │ │ 1 │ │ 3 │ │ │ 1 │ │ B │ │ 1 │
├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤
│ NULL │ │ (3,4) │ │ 0 │ │ 4 │ │ │ 1 │ │ C │ │ 2 │
├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤
│ [D] │ │ 4,5 │ │ 1 │ │ 5 │ │ │ 1 │ │ ? │ │ 3 │
├─────────────┤ ├───────┤ │ ├───┤ ├───┤ ├───┤ ├───┤
│ [NULL, F] │ │ 5,7 │ │ 1 │ │ 5 │ │ │ 0 │ │ D │ │ 4 │
└─────────────┘ └───────┘ │ └───┘ ├───┤ ├───┤ ├───┤
│ 7 │ │ │ 1 │ │ ? │ │ 5 │
│ Validity └───┘ ├───┤ ├───┤
Logical Logical (nulls) Offsets │ │ 1 │ │ F │ │ 6 │
Values Offsets │ └───┘ └───┘
│ Values │ │
(offsets[i], │ ListArray (Array)
offsets[i+1]) └ ─ ─ ─ ─ ─ ─ ┘ │
└ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
```
Computing `col[1]` (aka take the second element) could be computed by adding
1 to the first logical offset, checking if that is still in the array
So in this example, the take indicies would be
```
[
Some(1), (add 1 to start of (0, 3) is 1
None , // (adding 1 start of (3,3) is 4 which is outside,
None, // input was null,
None, // adding 1 start of (4,5 ) is 5 which is outside
Some(6), // adding 1 to start of (5,7) is 6
]
```
This calculation does not depend on datatype of the elements and would be
fully general
A similar calculation could be done for slice though in that case you would
have to build up the offsets of the new list array as well as the indices of
the take of the values.
In general I think many of these array calculations and manipulations can be
done in terms of the take kernel
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]