This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new d844f8687a Add:arrow_metadata() UDF (#19435)
d844f8687a is described below

commit d844f8687a549f87bbdaa8e8e202015525c8ee6b
Author: xonx <[email protected]>
AuthorDate: Tue Dec 23 09:39:25 2025 +0530

    Add:arrow_metadata() UDF (#19435)
    
    ## Which issue does this PR close?
    Closes #19356
    
    ## Rationale for this change
    This PR implements the arrow_metadata UDF as requested in issue #19356.
    
    ## What changes are included in this PR?
    Added arrow_metadata UDF
    Refactored Tests
    
    ## Are these changes tested?
    Yes.
    
    ## Are there any user-facing changes?
    Yes.
---
 datafusion/functions/src/core/arrow_metadata.rs | 148 ++++++++++++++++++++++++
 datafusion/functions/src/core/mod.rs            |   7 ++
 datafusion/sqllogictest/src/test_context.rs     |  56 +--------
 datafusion/sqllogictest/test_files/metadata.slt |  16 ++-
 docs/source/user-guide/sql/scalar_functions.md  |  31 +++++
 5 files changed, 198 insertions(+), 60 deletions(-)

diff --git a/datafusion/functions/src/core/arrow_metadata.rs b/datafusion/functions/src/core/arrow_metadata.rs
new file mode 100644
index 0000000000..92873889b0
--- /dev/null
+++ b/datafusion/functions/src/core/arrow_metadata.rs
@@ -0,0 +1,148 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{MapBuilder, StringBuilder};
+use arrow::datatypes::{DataType, Field, Fields};
+use datafusion_common::{Result, ScalarValue, exec_err};
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+    Volatility,
+};
+use datafusion_macros::user_doc;
+use std::any::Any;
+use std::sync::Arc;
+
+#[user_doc(
+    doc_section(label = "Other Functions"),
+    description = "Returns the metadata of the input expression. If a key is provided, returns the value for that key. If no key is provided, returns a Map of all metadata.",
+    syntax_example = "arrow_metadata(expression, [key])",
+    sql_example = r#"```sql
+> select arrow_metadata(col) from table;
++----------------------------+
+| arrow_metadata(table.col)  |
++----------------------------+
+| {k: v}                     |
++----------------------------+
+> select arrow_metadata(col, 'k') from table;
++-------------------------------+
+| arrow_metadata(table.col, 'k')|
++-------------------------------+
+| v                             |
++-------------------------------+
+```"#,
+    argument(
+        name = "expression",
+        description = "The expression to retrieve metadata from. Can be a column or other expression."
+    ),
+    argument(
+        name = "key",
+        description = "Optional. The specific metadata key to retrieve."
+    )
+)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct ArrowMetadataFunc {
+    signature: Signature,
+}
+
+impl ArrowMetadataFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::variadic_any(Volatility::Immutable),
+        }
+    }
+}
+
+impl Default for ArrowMetadataFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ScalarUDFImpl for ArrowMetadataFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "arrow_metadata"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        if arg_types.len() == 2 {
+            Ok(DataType::Utf8)
+        } else if arg_types.len() == 1 {
+            Ok(DataType::Map(
+                Arc::new(Field::new(
+                    "entries",
+                    DataType::Struct(Fields::from(vec![
+                        Field::new("keys", DataType::Utf8, false),
+                        Field::new("values", DataType::Utf8, true),
+                    ])),
+                    false,
+                )),
+                false,
+            ))
+        } else {
+            exec_err!("arrow_metadata requires 1 or 2 arguments")
+        }
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let metadata = args.arg_fields[0].metadata();
+
+        if args.args.len() == 2 {
+            let key = match &args.args[1] {
+                ColumnarValue::Scalar(ScalarValue::Utf8(Some(k))) => k,
+                _ => {
+                    return exec_err!(
+                        "Second argument to arrow_metadata must be a string literal key"
+                    );
+                }
+            };
+            let value = metadata.get(key).cloned();
+            Ok(ColumnarValue::Scalar(ScalarValue::Utf8(value)))
+        } else if args.args.len() == 1 {
+            let mut map_builder =
+                MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
+
+            let mut entries: Vec<_> = metadata.iter().collect();
+            entries.sort_by_key(|(k, _)| *k);
+
+            for (k, v) in entries {
+                map_builder.keys().append_value(k);
+                map_builder.values().append_value(v);
+            }
+            map_builder.append(true)?;
+
+            let map_array = map_builder.finish();
+
+            Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(
+                &map_array, 0,
+            )?))
+        } else {
+            exec_err!("arrow_metadata requires 1 or 2 arguments")
+        }
+    }
+}
diff --git a/datafusion/functions/src/core/mod.rs b/datafusion/functions/src/core/mod.rs
index 0c569d3006..82b0097aa7 100644
--- a/datafusion/functions/src/core/mod.rs
+++ b/datafusion/functions/src/core/mod.rs
@@ -21,6 +21,7 @@ use datafusion_expr::ScalarUDF;
 use std::sync::Arc;
 
 pub mod arrow_cast;
+pub mod arrow_metadata;
 pub mod arrowtypeof;
 pub mod coalesce;
 pub mod expr_ext;
@@ -55,6 +56,7 @@ make_udf_function!(least::LeastFunc, least);
 make_udf_function!(union_extract::UnionExtractFun, union_extract);
 make_udf_function!(union_tag::UnionTagFunc, union_tag);
 make_udf_function!(version::VersionFunc, version);
+make_udf_function!(arrow_metadata::ArrowMetadataFunc, arrow_metadata);
 
 pub mod expr_fn {
     use datafusion_expr::{Expr, Literal};
@@ -83,6 +85,10 @@ pub mod expr_fn {
         arrow_typeof,
         "Returns the Arrow type of the input expression.",
         arg1
+    ),(
+        arrow_metadata,
+        "Returns the metadata of the input expression",
+        args,
     ),(
         r#struct,
         "Returns a struct with the given arguments",
@@ -127,6 +133,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
     vec![
         nullif(),
         arrow_cast(),
+        arrow_metadata(),
         nvl(),
         nvl2(),
         overlay(),
diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs
index dc6dc29fdc..9ec085b41e 100644
--- a/datafusion/sqllogictest/src/test_context.rs
+++ b/datafusion/sqllogictest/src/test_context.rs
@@ -32,7 +32,7 @@ use arrow::record_batch::RecordBatch;
 use datafusion::catalog::{
     CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, Session,
 };
-use datafusion::common::{DataFusionError, Result, ScalarValue, exec_err, not_impl_err};
+use datafusion::common::{DataFusionError, Result, not_impl_err};
 use datafusion::functions::math::abs;
 use datafusion::logical_expr::async_udf::{AsyncScalarUDF, AsyncScalarUDFImpl};
 use datafusion::logical_expr::{
@@ -398,60 +398,6 @@ pub async fn register_metadata_tables(ctx: &SessionContext) {
     .unwrap();
 
     ctx.register_batch("table_with_metadata", batch).unwrap();
-
-    // Register the get_metadata UDF for testing metadata preservation
-    ctx.register_udf(ScalarUDF::from(GetMetadataUdf::new()));
-}
-
-/// UDF to extract metadata from a field for testing purposes
-/// Usage: get_metadata(expr, 'key') -> returns the metadata value or NULL
-#[derive(Debug, PartialEq, Eq, Hash)]
-struct GetMetadataUdf {
-    signature: Signature,
-}
-
-impl GetMetadataUdf {
-    fn new() -> Self {
-        Self {
-            signature: Signature::any(2, Volatility::Immutable),
-        }
-    }
-}
-
-impl ScalarUDFImpl for GetMetadataUdf {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn name(&self) -> &str {
-        "get_metadata"
-    }
-
-    fn signature(&self) -> &Signature {
-        &self.signature
-    }
-
-    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
-        Ok(DataType::Utf8)
-    }
-
-    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
-        // Get the metadata key from the second argument (must be a string literal)
-        let key = match &args.args[1] {
-            ColumnarValue::Scalar(ScalarValue::Utf8(Some(k))) => k.clone(),
-            _ => {
-                return exec_err!(
-                    "get_metadata second argument must be a string literal"
-                );
-            }
-        };
-
-        // Get metadata from the first argument's field
-        let metadata_value = args.arg_fields[0].metadata().get(&key).cloned();
-
-        // Return as a scalar (same value for all rows)
-        Ok(ColumnarValue::Scalar(ScalarValue::Utf8(metadata_value)))
-    }
 }
 
 /// Create a UDF function named "example". See the `sample_udf.rs` example
diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt
index d7df46e4f9..41a511b5fa 100644
--- a/datafusion/sqllogictest/test_files/metadata.slt
+++ b/datafusion/sqllogictest/test_files/metadata.slt
@@ -237,21 +237,21 @@ NULL 1
 
 # Regression test: first_value should preserve metadata
 query IT
-select first_value(id order by id asc nulls last), get_metadata(first_value(id order by id asc nulls last), 'metadata_key')
+select first_value(id order by id asc nulls last), arrow_metadata(first_value(id order by id asc nulls last), 'metadata_key')
 from table_with_metadata;
 ----
 1 the id field
 
 # Regression test: last_value should preserve metadata
 query IT
-select last_value(id order by id asc nulls first), get_metadata(last_value(id order by id asc nulls first), 'metadata_key')
+select last_value(id order by id asc nulls first), arrow_metadata(last_value(id order by id asc nulls first), 'metadata_key')
 from table_with_metadata;
 ----
 3 the id field
 
 # Regression test: DISTINCT ON should preserve metadata (uses first_value internally)
 query ITTT
-select distinct on (id) id, get_metadata(id, 'metadata_key'), name, get_metadata(name, 'metadata_key')
+select distinct on (id) id, arrow_metadata(id, 'metadata_key'), name, arrow_metadata(name, 'metadata_key')
 from table_with_metadata order by id asc nulls last;
 ----
 1 the id field NULL the name field
@@ -263,7 +263,7 @@ query ITTT
 with res AS (
   select distinct id, name from table_with_metadata
 )
-select id, get_metadata(id, 'metadata_key'), name, get_metadata(name, 'metadata_key')
+select id, arrow_metadata(id, 'metadata_key'), name, arrow_metadata(name, 'metadata_key')
 from res
 order by id asc nulls last;
 ----
@@ -278,7 +278,7 @@ with res AS (
   from table_with_metadata
   group by id, name
 )
-select id, get_metadata(id, 'metadata_key'), name, get_metadata(name, 'metadata_key')
+select id, arrow_metadata(id, 'metadata_key'), name, arrow_metadata(name, 'metadata_key')
 from res
 order by id asc nulls last, name asc nulls last
 ----
@@ -286,5 +286,11 @@ order by id asc nulls last, name asc nulls last
 3 the id field baz the name field
 NULL the id field bar the name field
 
+# Test arrow_metadata with single argument (returns Map)
+query ?
+select arrow_metadata(id) from table_with_metadata limit 1;
+----
+{metadata_key: the id field}
+
 statement ok
 drop table table_with_metadata;
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md
index 89ebf09991..ce7ea85a81 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -4988,6 +4988,7 @@ union_tag(union_expression)
 ## Other Functions
 
 - [arrow_cast](#arrow_cast)
+- [arrow_metadata](#arrow_metadata)
 - [arrow_typeof](#arrow_typeof)
 - [get_field](#get_field)
 - [version](#version)
@@ -5030,6 +5031,36 @@ arrow_cast(expression, datatype)
 +---------------------------+---------------------+
 ```
 
+### `arrow_metadata`
+
+Returns the metadata of the input expression. If a key is provided, returns the value for that key. If no key is provided, returns a Map of all metadata.
+
+```sql
+arrow_metadata(expression, [key])
+```
+
+#### Arguments
+
+- **expression**: The expression to retrieve metadata from. Can be a column or other expression.
+- **key**: Optional. The specific metadata key to retrieve.
+
+#### Example
+
+```sql
+> select arrow_metadata(col) from table;
++----------------------------+
+| arrow_metadata(table.col)  |
++----------------------------+
+| {k: v}                     |
++----------------------------+
+> select arrow_metadata(col, 'k') from table;
++-------------------------------+
+| arrow_metadata(table.col, 'k')|
++-------------------------------+
+| v                             |
++-------------------------------+
+```
+
 ### `arrow_typeof`
 
 Returns the name of the underlying [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the expression.


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to