This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new aa879bf045 Support custom struct field names with new scalar function 
named_struct (#9743)
aa879bf045 is described below

commit aa879bf045965d20792e9cd6ac08c550b3615280
Author: gstvg <[email protected]>
AuthorDate: Sat Mar 30 17:11:55 2024 -0300

    Support custom struct field names with new scalar function named_struct 
(#9743)
    
    * Support custom struct field names with new scalar function named_struct
    
    * add tests and corretly handle mixed arrray and scalar values
    
    * fix slt
    
    * fmt
    
    * port test to slt
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/functions/src/core/mod.rs           |   3 +
 datafusion/functions/src/core/named_struct.rs  | 148 +++++++++++++++++++++++++
 datafusion/sql/src/expr/mod.rs                 |  45 ++++++--
 datafusion/sqllogictest/test_files/explain.slt |   4 +-
 datafusion/sqllogictest/test_files/struct.slt  | 112 ++++++++++++++++++-
 docs/source/user-guide/sql/scalar_functions.md |  58 ++++++++--
 6 files changed, 343 insertions(+), 27 deletions(-)

diff --git a/datafusion/functions/src/core/mod.rs 
b/datafusion/functions/src/core/mod.rs
index 5a0bd2c77f..85d2410251 100644
--- a/datafusion/functions/src/core/mod.rs
+++ b/datafusion/functions/src/core/mod.rs
@@ -20,6 +20,7 @@
 mod arrow_cast;
 mod arrowtypeof;
 mod getfield;
+mod named_struct;
 mod nullif;
 mod nvl;
 mod nvl2;
@@ -32,6 +33,7 @@ make_udf_function!(nvl::NVLFunc, NVL, nvl);
 make_udf_function!(nvl2::NVL2Func, NVL2, nvl2);
 make_udf_function!(arrowtypeof::ArrowTypeOfFunc, ARROWTYPEOF, arrow_typeof);
 make_udf_function!(r#struct::StructFunc, STRUCT, r#struct);
+make_udf_function!(named_struct::NamedStructFunc, NAMED_STRUCT, named_struct);
 make_udf_function!(getfield::GetFieldFunc, GET_FIELD, get_field);
 
 // Export the functions out of this package, both as expr_fn as well as a list 
of functions
@@ -42,5 +44,6 @@ export_functions!(
     (nvl2, arg_1 arg_2 arg_3, "Returns value2 if value1 is not NULL; 
otherwise, it returns value3."),
     (arrow_typeof, arg_1, "Returns the Arrow type of the input expression."),
     (r#struct, args, "Returns a struct with the given arguments"),
+    (named_struct, args, "Returns a struct with the given names and arguments 
pairs"),
     (get_field, arg_1 arg_2, "Returns the value of the field with the given 
name from the struct")
 );
diff --git a/datafusion/functions/src/core/named_struct.rs 
b/datafusion/functions/src/core/named_struct.rs
new file mode 100644
index 0000000000..327a41baa7
--- /dev/null
+++ b/datafusion/functions/src/core/named_struct.rs
@@ -0,0 +1,148 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::StructArray;
+use arrow::datatypes::{DataType, Field, Fields};
+use datafusion_common::{exec_err, internal_err, Result, ScalarValue};
+use datafusion_expr::{ColumnarValue, Expr, ExprSchemable};
+use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use std::any::Any;
+use std::sync::Arc;
+
+/// put values in a struct array.
+fn named_struct_expr(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    // do not accept 0 arguments.
+    if args.is_empty() {
+        return exec_err!(
+            "named_struct requires at least one pair of arguments, got 0 
instead"
+        );
+    }
+
+    if args.len() % 2 != 0 {
+        return exec_err!(
+            "named_struct requires an even number of arguments, got {} 
instead",
+            args.len()
+        );
+    }
+
+    let (names, values): (Vec<_>, Vec<_>) = args
+        .chunks_exact(2)
+        .enumerate()
+        .map(|(i, chunk)| {
+
+            let name_column = &chunk[0];
+
+            let name = match name_column {
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(name_scalar))) => 
name_scalar,
+                _ => return exec_err!("named_struct even arguments must be 
string literals, got {name_column:?} instead at position {}", i * 2)
+            };
+
+            Ok((name, chunk[1].clone()))
+        })
+        .collect::<Result<Vec<_>>>()?
+        .into_iter()
+        .unzip();
+
+    let arrays = ColumnarValue::values_to_arrays(&values)?;
+
+    let fields = names
+        .into_iter()
+        .zip(arrays)
+        .map(|(name, value)| {
+            (
+                Arc::new(Field::new(name, value.data_type().clone(), true)),
+                value,
+            )
+        })
+        .collect::<Vec<_>>();
+
+    Ok(ColumnarValue::Array(Arc::new(StructArray::from(fields))))
+}
+
+#[derive(Debug)]
+pub(super) struct NamedStructFunc {
+    signature: Signature,
+}
+
+impl NamedStructFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::variadic_any(Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for NamedStructFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "named_struct"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        internal_err!(
+            "named_struct: return_type called instead of 
return_type_from_exprs"
+        )
+    }
+
+    fn return_type_from_exprs(
+        &self,
+        args: &[datafusion_expr::Expr],
+        schema: &dyn datafusion_common::ExprSchema,
+        _arg_types: &[DataType],
+    ) -> Result<DataType> {
+        // do not accept 0 arguments.
+        if args.is_empty() {
+            return exec_err!(
+                "named_struct requires at least one pair of arguments, got 0 
instead"
+            );
+        }
+
+        if args.len() % 2 != 0 {
+            return exec_err!(
+                "named_struct requires an even number of arguments, got {} 
instead",
+                args.len()
+            );
+        }
+
+        let return_fields = args
+            .chunks_exact(2)
+            .enumerate()
+            .map(|(i, chunk)| {
+                let name = &chunk[0];
+                let value = &chunk[1];
+
+                if let Expr::Literal(ScalarValue::Utf8(Some(name))) = name {
+                    Ok(Field::new(name, value.get_type(schema)?, true))
+                } else {
+                    exec_err!("named_struct even arguments must be string 
literals, got {name} instead at position {}", i * 2)
+                }
+            })
+            .collect::<Result<Vec<Field>>>()?;
+        Ok(DataType::Struct(Fields::from(return_fields)))
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        named_struct_expr(args)
+    }
+}
diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs
index 43bf2d8715..064578ad51 100644
--- a/datafusion/sql/src/expr/mod.rs
+++ b/datafusion/sql/src/expr/mod.rs
@@ -29,7 +29,8 @@ use datafusion_expr::expr::InList;
 use datafusion_expr::expr::ScalarFunction;
 use datafusion_expr::{
     col, expr, lit, AggregateFunction, Between, BinaryExpr, 
BuiltinScalarFunction, Cast,
-    Expr, ExprSchemable, GetFieldAccess, GetIndexedField, Like, Operator, 
TryCast,
+    Expr, ExprSchemable, GetFieldAccess, GetIndexedField, Like, Literal, 
Operator,
+    TryCast,
 };
 
 use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
@@ -604,18 +605,44 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
         }
         let args = values
             .into_iter()
-            .map(|value| {
-                self.sql_expr_to_logical_expr(value, input_schema, 
planner_context)
+            .enumerate()
+            .map(|(i, value)| {
+                let args = if let SQLExpr::Named { expr, name } = value {
+                    [
+                        name.value.lit(),
+                        self.sql_expr_to_logical_expr(
+                            *expr,
+                            input_schema,
+                            planner_context,
+                        )?,
+                    ]
+                } else {
+                    [
+                        format!("c{i}").lit(),
+                        self.sql_expr_to_logical_expr(
+                            value,
+                            input_schema,
+                            planner_context,
+                        )?,
+                    ]
+                };
+
+                Ok(args)
             })
-            .collect::<Result<Vec<_>>>()?;
-        let struct_func = self
+            .collect::<Result<Vec<_>>>()?
+            .into_iter()
+            .flatten()
+            .collect();
+
+        let named_struct_func = self
             .context_provider
-            .get_function_meta("struct")
+            .get_function_meta("named_struct")
             .ok_or_else(|| {
-                internal_datafusion_err!("Unable to find expected 'struct' 
function")
-            })?;
+            internal_datafusion_err!("Unable to find expected 'named_struct' 
function")
+        })?;
+
         Ok(Expr::ScalarFunction(ScalarFunction::new_udf(
-            struct_func,
+            named_struct_func,
             args,
         )))
     }
diff --git a/datafusion/sqllogictest/test_files/explain.slt 
b/datafusion/sqllogictest/test_files/explain.slt
index b7ad36dace..4653250cf9 100644
--- a/datafusion/sqllogictest/test_files/explain.slt
+++ b/datafusion/sqllogictest/test_files/explain.slt
@@ -390,8 +390,8 @@ query TT
 explain select struct(1, 2.3, 'abc');
 ----
 logical_plan
-Projection: Struct({c0:1,c1:2.3,c2:abc}) AS 
struct(Int64(1),Float64(2.3),Utf8("abc"))
+Projection: Struct({c0:1,c1:2.3,c2:abc}) AS 
named_struct(Utf8("c0"),Int64(1),Utf8("c1"),Float64(2.3),Utf8("c2"),Utf8("abc"))
 --EmptyRelation
 physical_plan
-ProjectionExec: expr=[{c0:1,c1:2.3,c2:abc} as 
struct(Int64(1),Float64(2.3),Utf8("abc"))]
+ProjectionExec: expr=[{c0:1,c1:2.3,c2:abc} as 
named_struct(Utf8("c0"),Int64(1),Utf8("c1"),Float64(2.3),Utf8("c2"),Utf8("abc"))]
 --PlaceholderRowExec
diff --git a/datafusion/sqllogictest/test_files/struct.slt 
b/datafusion/sqllogictest/test_files/struct.slt
index 1ab6f3908b..2e0b699f6d 100644
--- a/datafusion/sqllogictest/test_files/struct.slt
+++ b/datafusion/sqllogictest/test_files/struct.slt
@@ -23,11 +23,12 @@ statement ok
 CREATE TABLE values(
     a INT,
     b FLOAT,
-    c VARCHAR
+    c VARCHAR,
+    n VARCHAR,
 ) AS VALUES
-  (1, 1.1, 'a'),
-  (2, 2.2, 'b'),
-  (3, 3.3, 'c')
+  (1, 1.1, 'a', NULL),
+  (2, 2.2, 'b', NULL),
+  (3, 3.3, 'c', NULL)
 ;
 
 # struct[i]
@@ -50,6 +51,18 @@ select struct(1, 3.14, 'e');
 ----
 {c0: 1, c1: 3.14, c2: e}
 
+# struct scalar function with named values
+query ?
+select struct(1 as "name0", 3.14 as name1, 'e', true as 'name3');
+----
+{name0: 1, name1: 3.14, c2: e, name3: true}
+
+# struct scalar function with mixed named and unnamed values
+query ?
+select struct(1, 3.14 as name1, 'e', true);
+----
+{c0: 1, name1: 3.14, c2: e, c3: true}
+
 # struct scalar function with columns #1
 query ?
 select struct(a, b, c) from values;
@@ -72,11 +85,98 @@ query TT
 explain select struct(a, b, c) from values;
 ----
 logical_plan
-Projection: struct(values.a, values.b, values.c)
+Projection: named_struct(Utf8("c0"), values.a, Utf8("c1"), values.b, 
Utf8("c2"), values.c)
 --TableScan: values projection=[a, b, c]
 physical_plan
-ProjectionExec: expr=[struct(a@0, b@1, c@2) as 
struct(values.a,values.b,values.c)]
+ProjectionExec: expr=[named_struct(c0, a@0, c1, b@1, c2, c@2) as 
named_struct(Utf8("c0"),values.a,Utf8("c1"),values.b,Utf8("c2"),values.c)]
 --MemoryExec: partitions=1, partition_sizes=[1]
 
+# error on 0 arguments
+query error DataFusion error: Error during planning: No function matches the 
given name and argument types 'named_struct\(\)'. You might need to add 
explicit type casts.
+select named_struct();
+
+# error on odd number of arguments #1
+query error DataFusion error: Execution error: named_struct requires an even 
number of arguments, got 1 instead
+select named_struct('a');
+
+# error on odd number of arguments #2
+query error DataFusion error: Execution error: named_struct requires an even 
number of arguments, got 1 instead
+select named_struct(1);
+
+# error on odd number of arguments #3
+query error DataFusion error: Execution error: named_struct requires an even 
number of arguments, got 1 instead
+select named_struct(values.a) from values;
+
+# error on odd number of arguments #4
+query error DataFusion error: Execution error: named_struct requires an even 
number of arguments, got 3 instead
+select named_struct('a', 1, 'b');
+
+# error on even argument not a string literal #1
+query error DataFusion error: Execution error: named_struct even arguments 
must be string literals, got Int64\(1\) instead at position 0
+select named_struct(1, 'a');
+
+# error on even argument not a string literal #2
+query error DataFusion error: Execution error: named_struct even arguments 
must be string literals, got Int64\(0\) instead at position 2
+select named_struct('corret', 1, 0, 'wrong');
+
+# error on even argument not a string literal #3
+query error DataFusion error: Execution error: named_struct even arguments 
must be string literals, got values\.a instead at position 0
+select named_struct(values.a, 'a') from values;
+
+# error on even argument not a string literal #4
+query error DataFusion error: Execution error: named_struct even arguments 
must be string literals, got values\.c instead at position 0
+select named_struct(values.c, 'c') from values;
+
+# named_struct with mixed scalar and array values #1
+query ?
+select named_struct('scalar', 27, 'array', values.a, 'null', NULL) from values;
+----
+{scalar: 27, array: 1, null: }
+{scalar: 27, array: 2, null: }
+{scalar: 27, array: 3, null: }
+
+# named_struct with mixed scalar and array values #2
+query ?
+select named_struct('array', values.a, 'scalar', 27, 'null', NULL) from values;
+----
+{array: 1, scalar: 27, null: }
+{array: 2, scalar: 27, null: }
+{array: 3, scalar: 27, null: }
+
+# named_struct with mixed scalar and array values #3
+query ?
+select named_struct('null', NULL, 'array', values.a, 'scalar', 27) from values;
+----
+{null: , array: 1, scalar: 27}
+{null: , array: 2, scalar: 27}
+{null: , array: 3, scalar: 27}
+
+# named_struct with mixed scalar and array values #4
+query ?
+select named_struct('null_array', values.n, 'array', values.a, 'scalar', 27, 
'null', NULL) from values;
+----
+{null_array: , array: 1, scalar: 27, null: }
+{null_array: , array: 2, scalar: 27, null: }
+{null_array: , array: 3, scalar: 27, null: }
+
+# named_struct arrays only
+query ?
+select named_struct('field_a', a, 'field_b', b) from values;
+----
+{field_a: 1, field_b: 1.1}
+{field_a: 2, field_b: 2.2}
+{field_a: 3, field_b: 3.3}
+
+# named_struct scalars only
+query ?
+select named_struct('field_a', 1, 'field_b', 2);
+----
+{field_a: 1, field_b: 2}
+
 statement ok
 drop table values;
+
+query T
+select arrow_typeof(named_struct('first', 1, 'second', 2, 'third', 3));
+----
+Struct([Field { name: "first", data_type: Int64, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: {} }, Field { name: "second", data_type: 
Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 
Field { name: "third", data_type: Int64, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: {} }])
\ No newline at end of file
diff --git a/docs/source/user-guide/sql/scalar_functions.md 
b/docs/source/user-guide/sql/scalar_functions.md
index 52edf4bb72..e2e129a2e2 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -3312,11 +3312,12 @@ are not allowed
 ## Struct Functions
 
 - [struct](#struct)
+- [named_struct](#named_struct)
 
 ### `struct`
 
-Returns an Arrow struct using the specified input expressions.
-Fields in the returned struct use the `cN` naming convention.
+Returns an Arrow struct using the specified input expressions optionally named.
+Fields in the returned struct use the optional name or the `cN` naming 
convention.
 For example: `c0`, `c1`, `c2`, etc.
 
 ```
@@ -3324,7 +3325,7 @@ struct(expression1[, ..., expression_n])
 ```
 
 For example, this query converts two columns `a` and `b` to a single column 
with
-a struct type of fields `c0` and `c1`:
+a struct type of fields `field_a` and `c1`:
 
 ```
 select * from t;
@@ -3335,18 +3336,55 @@ select * from t;
 | 3 | 4 |
 +---+---+
 
-select struct(a, b) from t;
-+-----------------+
-| struct(t.a,t.b) |
-+-----------------+
-| {c0: 1, c1: 2}  |
-| {c0: 3, c1: 4}  |
-+-----------------+
+select struct(a as field_a, b) from t;
++--------------------------------------------------+
+| named_struct(Utf8("field_a"),t.a,Utf8("c1"),t.b) |
++--------------------------------------------------+
+| {field_a: 1, c1: 2}                              |
+| {field_a: 3, c1: 4}                              |
++--------------------------------------------------+
 ```
 
 #### Arguments
 
 - **expression_n**: Expression to include in the output struct.
+  Can be a constant, column, or function, any combination of arithmetic or
+  string operators, or a named expression of previous listed .
+
+### `named_struct`
+
+Returns an Arrow struct using the specified name and input expressions pairs.
+
+```
+named_struct(expression1_name, expression1_input[, ..., expression_n_name, 
expression_n_input])
+```
+
+For example, this query converts two columns `a` and `b` to a single column 
with
+a struct type of fields `field_a` and `field_b`:
+
+```
+select * from t;
++---+---+
+| a | b |
++---+---+
+| 1 | 2 |
+| 3 | 4 |
++---+---+
+
+select named_struct('field_a', a, 'field_b', b) from t;
++-------------------------------------------------------+
+| named_struct(Utf8("field_a"),t.a,Utf8("field_b"),t.b) |
++-------------------------------------------------------+
+| {field_a: 1, field_b: 2}                              |
+| {field_a: 3, field_b: 4}                              |
++-------------------------------------------------------+
+```
+
+#### Arguments
+
+- **expression_n_name**: Name of the column field.
+  Must be a constant string.
+- **expression_n_input**: Expression to include in the output struct.
   Can be a constant, column, or function, and any combination of arithmetic or
   string operators.
 

Reply via email to