This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new d844f8687a Add:arrow_metadata() UDF (#19435)
d844f8687a is described below
commit d844f8687a549f87bbdaa8e8e202015525c8ee6b
Author: xonx <[email protected]>
AuthorDate: Tue Dec 23 09:39:25 2025 +0530
Add:arrow_metadata() UDF (#19435)
## Which issue does this PR close?
Closes #19356
## Rationale for this change
This PR implements the arrow_metadata UDF as requested in issue #19356.
## What changes are included in this PR?
Added arrow_metadata UDF
Refactored Tests
## Are these changes tested?
Yes.
## Are there any user-facing changes?
Yes.
---
datafusion/functions/src/core/arrow_metadata.rs | 148 ++++++++++++++++++++++++
datafusion/functions/src/core/mod.rs | 7 ++
datafusion/sqllogictest/src/test_context.rs | 56 +--------
datafusion/sqllogictest/test_files/metadata.slt | 16 ++-
docs/source/user-guide/sql/scalar_functions.md | 31 +++++
5 files changed, 198 insertions(+), 60 deletions(-)
diff --git a/datafusion/functions/src/core/arrow_metadata.rs b/datafusion/functions/src/core/arrow_metadata.rs
new file mode 100644
index 0000000000..92873889b0
--- /dev/null
+++ b/datafusion/functions/src/core/arrow_metadata.rs
@@ -0,0 +1,148 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{MapBuilder, StringBuilder};
+use arrow::datatypes::{DataType, Field, Fields};
+use datafusion_common::{Result, ScalarValue, exec_err};
+use datafusion_expr::{
+ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+ Volatility,
+};
+use datafusion_macros::user_doc;
+use std::any::Any;
+use std::sync::Arc;
+
+#[user_doc(
+    doc_section(label = "Other Functions"),
+    description = "Returns the metadata of the input expression. If a key is provided, returns the value for that key. If no key is provided, returns a Map of all metadata.",
+    syntax_example = "arrow_metadata(expression, [key])",
+    sql_example = r#"```sql
+> select arrow_metadata(col) from table;
++----------------------------+
+| arrow_metadata(table.col)  |
++----------------------------+
+| {k: v}                     |
++----------------------------+
+> select arrow_metadata(col, 'k') from table;
++-------------------------------+
+| arrow_metadata(table.col, 'k')|
++-------------------------------+
+| v                             |
++-------------------------------+
+```"#,
+    argument(
+        name = "expression",
+        description = "The expression to retrieve metadata from. Can be a column or other expression."
+    ),
+    argument(
+        name = "key",
+        description = "Optional. The specific metadata key to retrieve."
+    )
+)]
+/// Scalar UDF `arrow_metadata(expression, [key])`.
+///
+/// The user-facing description, syntax and SQL example are declared in the
+/// `#[user_doc]` attribute above; this struct only carries the function's
+/// [`Signature`].
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct ArrowMetadataFunc {
+    /// Signature reported through `ScalarUDFImpl::signature`.
+    signature: Signature,
+}
+
+impl ArrowMetadataFunc {
+    /// Builds the function with a `variadic_any` signature marked
+    /// [`Volatility::Immutable`].
+    pub fn new() -> Self {
+        let signature = Signature::variadic_any(Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl Default for ArrowMetadataFunc {
+    /// Same as [`ArrowMetadataFunc::new`].
+    fn default() -> Self {
+        ArrowMetadataFunc::new()
+    }
+}
+
+impl ScalarUDFImpl for ArrowMetadataFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Name under which the function is invoked.
+    fn name(&self) -> &str {
+        "arrow_metadata"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// Returns whatever `self.doc()` yields (presumably the documentation
+    /// generated from the `#[user_doc]` attribute on the struct).
+    fn documentation(&self) -> Option<&Documentation> {
+        self.doc()
+    }
+
+    /// Chooses the return type from the number of arguments.
+    ///
+    /// * two arguments (`expression, key`): `Utf8`
+    /// * one argument (`expression`): a map with non-null `Utf8` keys and
+    ///   nullable `Utf8` values
+    ///
+    /// # Errors
+    ///
+    /// Returns an execution error for any other argument count.
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        match arg_types.len() {
+            2 => Ok(DataType::Utf8),
+            1 => {
+                // This layout has to agree with the map array that
+                // `invoke_with_args` builds for the single-argument form.
+                let entries = Field::new(
+                    "entries",
+                    DataType::Struct(Fields::from(vec![
+                        Field::new("keys", DataType::Utf8, false),
+                        Field::new("values", DataType::Utf8, true),
+                    ])),
+                    false,
+                );
+                Ok(DataType::Map(Arc::new(entries), false))
+            }
+            _ => exec_err!("arrow_metadata requires 1 or 2 arguments"),
+        }
+    }
+
+    /// Reads the metadata attached to the first argument's field and returns
+    /// it as a scalar.
+    ///
+    /// * One argument: all metadata entries as a single map, sorted by key so
+    ///   the output order is deterministic.
+    /// * Two arguments: the value stored under the given key, or a null
+    ///   `Utf8` scalar when the key is not present. The key must be a
+    ///   non-null string scalar (`Utf8`, `LargeUtf8` or `Utf8View`).
+    ///
+    /// # Errors
+    ///
+    /// Returns an execution error when the argument count is not 1 or 2, or
+    /// when the second argument is not a non-null string scalar.
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        // Report a missing first argument as an error instead of panicking
+        // on an out-of-bounds index.
+        let Some(field) = args.arg_fields.first() else {
+            return exec_err!("arrow_metadata requires 1 or 2 arguments");
+        };
+        let metadata = field.metadata();
+
+        match args.args.as_slice() {
+            [_, key_arg] => {
+                let key = match key_arg {
+                    ColumnarValue::Scalar(
+                        ScalarValue::Utf8(Some(k))
+                        | ScalarValue::LargeUtf8(Some(k))
+                        | ScalarValue::Utf8View(Some(k)),
+                    ) => k,
+                    _ => {
+                        return exec_err!(
+                            "Second argument to arrow_metadata must be a string literal key"
+                        );
+                    }
+                };
+                let value = metadata.get(key).cloned();
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(value)))
+            }
+            [_] => {
+                let mut map_builder =
+                    MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
+
+                let mut entries: Vec<_> = metadata.iter().collect();
+                entries.sort_by_key(|(k, _)| *k);
+
+                for (k, v) in entries {
+                    map_builder.keys().append_value(k);
+                    map_builder.values().append_value(v);
+                }
+                // Close the single map row holding every entry appended above.
+                map_builder.append(true)?;
+
+                let map_array = map_builder.finish();
+
+                Ok(ColumnarValue::Scalar(ScalarValue::try_from_array(
+                    &map_array, 0,
+                )?))
+            }
+            _ => exec_err!("arrow_metadata requires 1 or 2 arguments"),
+        }
+    }
+}
diff --git a/datafusion/functions/src/core/mod.rs b/datafusion/functions/src/core/mod.rs
index 0c569d3006..82b0097aa7 100644
--- a/datafusion/functions/src/core/mod.rs
+++ b/datafusion/functions/src/core/mod.rs
@@ -21,6 +21,7 @@ use datafusion_expr::ScalarUDF;
use std::sync::Arc;
pub mod arrow_cast;
+pub mod arrow_metadata;
pub mod arrowtypeof;
pub mod coalesce;
pub mod expr_ext;
@@ -55,6 +56,7 @@ make_udf_function!(least::LeastFunc, least);
make_udf_function!(union_extract::UnionExtractFun, union_extract);
make_udf_function!(union_tag::UnionTagFunc, union_tag);
make_udf_function!(version::VersionFunc, version);
+make_udf_function!(arrow_metadata::ArrowMetadataFunc, arrow_metadata);
pub mod expr_fn {
use datafusion_expr::{Expr, Literal};
@@ -83,6 +85,10 @@ pub mod expr_fn {
arrow_typeof,
"Returns the Arrow type of the input expression.",
arg1
+ ),(
+ arrow_metadata,
+ "Returns the metadata of the input expression",
+ args,
),(
r#struct,
"Returns a struct with the given arguments",
@@ -127,6 +133,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
vec![
nullif(),
arrow_cast(),
+ arrow_metadata(),
nvl(),
nvl2(),
overlay(),
diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs
index dc6dc29fdc..9ec085b41e 100644
--- a/datafusion/sqllogictest/src/test_context.rs
+++ b/datafusion/sqllogictest/src/test_context.rs
@@ -32,7 +32,7 @@ use arrow::record_batch::RecordBatch;
use datafusion::catalog::{
CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, Session,
};
-use datafusion::common::{DataFusionError, Result, ScalarValue, exec_err,
not_impl_err};
+use datafusion::common::{DataFusionError, Result, not_impl_err};
use datafusion::functions::math::abs;
use datafusion::logical_expr::async_udf::{AsyncScalarUDF, AsyncScalarUDFImpl};
use datafusion::logical_expr::{
@@ -398,60 +398,6 @@ pub async fn register_metadata_tables(ctx: &SessionContext) {
.unwrap();
ctx.register_batch("table_with_metadata", batch).unwrap();
-
- // Register the get_metadata UDF for testing metadata preservation
- ctx.register_udf(ScalarUDF::from(GetMetadataUdf::new()));
-}
-
-/// UDF to extract metadata from a field for testing purposes
-/// Usage: get_metadata(expr, 'key') -> returns the metadata value or NULL
-#[derive(Debug, PartialEq, Eq, Hash)]
-struct GetMetadataUdf {
- signature: Signature,
-}
-
-impl GetMetadataUdf {
- fn new() -> Self {
- Self {
- signature: Signature::any(2, Volatility::Immutable),
- }
- }
-}
-
-impl ScalarUDFImpl for GetMetadataUdf {
- fn as_any(&self) -> &dyn Any {
- self
- }
-
- fn name(&self) -> &str {
- "get_metadata"
- }
-
- fn signature(&self) -> &Signature {
- &self.signature
- }
-
- fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
- Ok(DataType::Utf8)
- }
-
- fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
- // Get the metadata key from the second argument (must be a string
literal)
- let key = match &args.args[1] {
- ColumnarValue::Scalar(ScalarValue::Utf8(Some(k))) => k.clone(),
- _ => {
- return exec_err!(
- "get_metadata second argument must be a string literal"
- );
- }
- };
-
- // Get metadata from the first argument's field
- let metadata_value = args.arg_fields[0].metadata().get(&key).cloned();
-
- // Return as a scalar (same value for all rows)
- Ok(ColumnarValue::Scalar(ScalarValue::Utf8(metadata_value)))
- }
}
/// Create a UDF function named "example". See the `sample_udf.rs` example
diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt
index d7df46e4f9..41a511b5fa 100644
--- a/datafusion/sqllogictest/test_files/metadata.slt
+++ b/datafusion/sqllogictest/test_files/metadata.slt
@@ -237,21 +237,21 @@ NULL 1
# Regression test: first_value should preserve metadata
query IT
-select first_value(id order by id asc nulls last), get_metadata(first_value(id
order by id asc nulls last), 'metadata_key')
+select first_value(id order by id asc nulls last),
arrow_metadata(first_value(id order by id asc nulls last), 'metadata_key')
from table_with_metadata;
----
1 the id field
# Regression test: last_value should preserve metadata
query IT
-select last_value(id order by id asc nulls first), get_metadata(last_value(id
order by id asc nulls first), 'metadata_key')
+select last_value(id order by id asc nulls first),
arrow_metadata(last_value(id order by id asc nulls first), 'metadata_key')
from table_with_metadata;
----
3 the id field
# Regression test: DISTINCT ON should preserve metadata (uses first_value internally)
query ITTT
-select distinct on (id) id, get_metadata(id, 'metadata_key'), name,
get_metadata(name, 'metadata_key')
+select distinct on (id) id, arrow_metadata(id, 'metadata_key'), name,
arrow_metadata(name, 'metadata_key')
from table_with_metadata order by id asc nulls last;
----
1 the id field NULL the name field
@@ -263,7 +263,7 @@ query ITTT
with res AS (
select distinct id, name from table_with_metadata
)
-select id, get_metadata(id, 'metadata_key'), name, get_metadata(name,
'metadata_key')
+select id, arrow_metadata(id, 'metadata_key'), name, arrow_metadata(name,
'metadata_key')
from res
order by id asc nulls last;
----
@@ -278,7 +278,7 @@ with res AS (
from table_with_metadata
group by id, name
)
-select id, get_metadata(id, 'metadata_key'), name, get_metadata(name,
'metadata_key')
+select id, arrow_metadata(id, 'metadata_key'), name, arrow_metadata(name,
'metadata_key')
from res
order by id asc nulls last, name asc nulls last
----
@@ -286,5 +286,11 @@ order by id asc nulls last, name asc nulls last
3 the id field baz the name field
NULL the id field bar the name field
+# Test arrow_metadata with single argument (returns Map)
+query ?
+select arrow_metadata(id) from table_with_metadata limit 1;
+----
+{metadata_key: the id field}
+
statement ok
drop table table_with_metadata;
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md
index 89ebf09991..ce7ea85a81 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -4988,6 +4988,7 @@ union_tag(union_expression)
## Other Functions
- [arrow_cast](#arrow_cast)
+- [arrow_metadata](#arrow_metadata)
- [arrow_typeof](#arrow_typeof)
- [get_field](#get_field)
- [version](#version)
@@ -5030,6 +5031,36 @@ arrow_cast(expression, datatype)
+---------------------------+---------------------+
```
+### `arrow_metadata`
+
+Returns the metadata of the input expression. If a key is provided, returns the value for that key. If no key is provided, returns a Map of all metadata.
+
+```sql
+arrow_metadata(expression, [key])
+```
+
+#### Arguments
+
+- **expression**: The expression to retrieve metadata from. Can be a column or other expression.
+- **key**: Optional. The specific metadata key to retrieve.
+
+#### Example
+
+```sql
+> select arrow_metadata(col) from table;
++----------------------------+
+| arrow_metadata(table.col)  |
++----------------------------+
+| {k: v}                     |
++----------------------------+
+> select arrow_metadata(col, 'k') from table;
++-------------------------------+
+| arrow_metadata(table.col, 'k')|
++-------------------------------+
+| v                             |
++-------------------------------+
+```
+
### `arrow_typeof`
Returns the name of the underlying [Arrow data type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the expression.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]