paleolimbot commented on code in PR #21984:
URL: https://github.com/apache/datafusion/pull/21984#discussion_r3245386025
##########
datafusion/functions-nested/src/arrays_zip.rs:
##########
@@ -223,12 +264,17 @@ fn arrays_zip_inner(args: &[ArrayRef]) ->
Result<ArrayRef> {
.map(|v| v.as_ref().map(|view| view.values.to_data()))
.collect();
- let struct_fields: Fields = element_types
- .iter()
- .enumerate()
- .map(|(i, dt)| Field::new(format!("{}", i + 1), dt.clone(), true))
- .collect::<Vec<_>>()
- .into();
+ // Prefer the planning-time struct fields (which preserve input metadata)
+ // when available; fall back to building bare fields from element types.
+ let struct_fields: Fields = match inner_field.as_ref().map(|f|
f.data_type()) {
Review Comment:
If this can ever happen it would be helpful for future readers to elaborate
on how (as far as I know this is not supposed to happen?)
##########
datafusion/functions-nested/src/arrays_zip.rs:
##########
@@ -139,8 +140,45 @@ impl ScalarUDFImpl for ArraysZip {
))))
}
+ fn return_field_from_args(&self, args: ReturnFieldArgs) ->
Result<FieldRef> {
+ if args.arg_fields.is_empty() {
+ return exec_err!("arrays_zip requires at least one argument");
+ }
+
+ // For each input list-typed argument, take its element field and
+ // rename it to the positional struct member name (1-based), preserving
+ // metadata. For Null-typed inputs, build a bare Null field.
+ let struct_fields: Fields = args
+ .arg_fields
+ .iter()
+ .enumerate()
+ .map(|(i, arg_field)| {
+ let name = format!("{}", i + 1);
+ match arg_field.data_type() {
+ List(field) | LargeList(field) | FixedSizeList(field, _)
=> {
+ Ok(nullable_inner_field_from(field, &name))
+ }
+ Null => Ok(Arc::new(Field::new(name, Null, true))),
+ dt => exec_err!("arrays_zip expects array arguments, got
{dt}"),
+ }
+ })
+ .collect::<Result<Vec<_>>>()?
+ .into();
+ let struct_dt = DataType::Struct(struct_fields);
+ let inner = Arc::new(Field::new(Field::LIST_FIELD_DEFAULT_NAME,
struct_dt, true));
+ Ok(Arc::new(Field::new(self.name(), List(inner), true)))
+ }
+
fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
- make_scalar_function(arrays_zip_inner)(&args.args)
+ let inner_field = match args.return_field.data_type() {
+ List(field) | LargeList(field) | FixedSizeList(field, _) => {
+ Some(Arc::clone(field))
+ }
+ _ => None,
+ };
Review Comment:
Should this return `internal_err!()` for the case where the return field is
not a `DataType::List`?
##########
datafusion/functions-nested/src/make_array.rs:
##########
@@ -105,8 +107,34 @@ impl ScalarUDFImpl for MakeArray {
Ok(DataType::new_list(element_type, true))
}
+ fn return_field_from_args(&self, args: ReturnFieldArgs) ->
Result<FieldRef> {
+ // Pick the first non-Null argument's field as the source of element
+ // type and metadata; fall back to Null if all inputs are Null.
+ // Coercion has already unified element types, so any non-Null input is
+ // representative.
+ let inner = args
+ .arg_fields
+ .iter()
+ .find(|f| !f.data_type().is_null())
+ .map(nullable_list_item_field_from)
+ .unwrap_or_else(|| Arc::new(Field::new_list_field(Null, true)));
+ Ok(Arc::new(Field::new(
+ self.name(),
+ DataType::List(inner),
+ true,
+ )))
+ }
+
fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
- make_scalar_function(make_array_inner)(&args.args)
+ let inner_field = match args.return_field.data_type() {
+ DataType::List(field)
+ | DataType::LargeList(field)
+ | DataType::FixedSizeList(field, _) => Some(Arc::clone(field)),
+ _ => None,
Review Comment:
Should this return an internal_err for the case where the return type wasn't
computed to be a list?
##########
datafusion/functions-nested/src/map.rs:
##########
@@ -672,6 +772,60 @@ fn build_map_array(
#[cfg(test)]
mod tests {
use super::*;
+
+ /// Regression test for #21982: `map(...)` must propagate the input list
+ /// elements' metadata onto the map's `key` and `value` fields.
+ #[test]
+ fn map_preserves_key_value_field_metadata() -> Result<()> {
+ use datafusion_expr::ReturnFieldArgs;
+
+ let key_inner: FieldRef =
+ Arc::new(Field::new("e", DataType::Utf8, false).with_metadata(
+ std::collections::HashMap::from([(
+ "ARROW:extension:name".to_string(),
+ "arrow.uuid".to_string(),
+ )]),
+ ));
Review Comment:
This is my favourite way so far of constructing the test fields (no helper
function).
Also worth using arbitrary k/v metadata or a valid UUID storage here
##########
datafusion/functions-nested/src/make_array.rs:
##########
@@ -105,8 +107,34 @@ impl ScalarUDFImpl for MakeArray {
Ok(DataType::new_list(element_type, true))
}
+ fn return_field_from_args(&self, args: ReturnFieldArgs) ->
Result<FieldRef> {
+ // Pick the first non-Null argument's field as the source of element
+ // type and metadata; fall back to Null if all inputs are Null.
+ // Coercion has already unified element types, so any non-Null input is
+ // representative.
+ let inner = args
+ .arg_fields
+ .iter()
+ .find(|f| !f.data_type().is_null())
+ .map(nullable_list_item_field_from)
+ .unwrap_or_else(|| Arc::new(Field::new_list_field(Null, true)));
+ Ok(Arc::new(Field::new(
+ self.name(),
Review Comment:
I personally find the naming of return fields very confusing (nobody is sure
whether to name them or not). The documentation suggests the name doesn't
matter and will be ignored (and my understanding is that this name is always
obliterated).
##########
datafusion/sqllogictest/test_files/array_metadata_propagation.slt:
##########
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Regression tests for https://github.com/apache/datafusion/issues/21982:
+# UDFs/UDAFs that wrap an input column into a composite type (List, Struct,
+# Map, ...) must propagate the input field's metadata onto the inner field
+# of the output composite. The data type rendering used by `arrow_field`
+# includes inner-field metadata, so we verify by string-matching the
+# `data_type` projection.
+
+# make_array preserves inner field metadata
+query T
+SELECT arrow_field(make_array(with_metadata(1, 'k', 'v')))['data_type']
+----
+List(Int64, metadata: {"k": "v"})
Review Comment:
Do these go through physical execution or are they all executed via constant
folding? (In the other PR I'm working on the two methods give different
metadata for casting). If it's easy enough it might be worth having a version
with a VALUES clause for each of these.
##########
datafusion/common/src/utils/mod.rs:
##########
@@ -445,10 +448,17 @@ impl SingleRowListArrayBuilder {
self
}
- /// Copies field name and nullable from the specified field
- pub fn with_field(self, field: &Field) -> Self {
- self.with_field_name(Some(field.name().to_owned()))
- .with_nullable(field.is_nullable())
+ /// Copies field name, nullable, and metadata from the specified field.
+ ///
+ /// Propagating metadata is required for Arrow extension types
+ /// (`ARROW:extension:name` / `ARROW:extension:metadata`) to round-trip
+ /// through SQL list constructors (e.g. `make_array`, `array_agg`).
+ pub fn with_field(mut self, field: &Field) -> Self {
+ self.field_name = Some(field.name().to_owned());
Review Comment:
Should this use the default list item name ("item") canonically? (Using a
different name can sometimes cause equality issues)
##########
datafusion/functions-nested/src/arrays_zip.rs:
##########
Review Comment:
Not here, but for the perfect zip (all value arrays the same length, no
nulls with non-zero element slot lengths, no null padding needed) this should
ideally be just clones of the original arrayrefs
##########
datafusion/functions-nested/src/make_array.rs:
##########
@@ -130,6 +158,16 @@ impl ScalarUDFImpl for MakeArray {
/// Constructs an array using the input `data` as `ArrayRef`.
/// Returns a reference-counted `Array` instance result.
pub(crate) fn make_array_inner(arrays: &[ArrayRef]) -> Result<ArrayRef> {
+ make_array_inner_with_field(arrays, None)
+}
Review Comment:
Is it worth deleting this function to expose other places that are not
propagating list field metadata?
##########
datafusion/functions-nested/src/make_array.rs:
##########
@@ -256,3 +315,82 @@ pub fn coerce_types_inner(arg_types: &[DataType], name:
&str) -> Result<Vec<Data
)
}
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use arrow::array::Int64Array;
+ use datafusion_common::ScalarValue;
+ use datafusion_common::config::ConfigOptions;
+ use datafusion_expr::{ReturnFieldArgs, ScalarUDFImpl};
+ use std::collections::HashMap;
+
+ fn ext_field(data_type: DataType) -> FieldRef {
+ let metadata = HashMap::from([(
+ "ARROW:extension:name".to_string(),
+ "arrow.uuid".to_string(),
+ )]);
+ Arc::new(Field::new("v", data_type, true).with_metadata(metadata))
+ }
Review Comment:
Probably worth using arbitrary metadata unless we use the proper storage to
avoid confusion. Also possibly worth using the same structure for helper
functions for all of these tests (the one above is a lambda with configurable
k/v strings).
##########
datafusion/functions-nested/src/arrays_zip.rs:
##########
@@ -315,15 +361,60 @@ fn arrays_zip_inner(args: &[ArrayRef]) ->
Result<ArrayRef> {
let null_buffer = null_builder.finish();
- let result = ListArray::try_new(
+ let list_inner = inner_field.unwrap_or_else(|| {
Arc::new(Field::new_list_field(
struct_array.data_type().clone(),
true,
- )),
+ ))
+ });
+ let result = ListArray::try_new(
+ list_inner,
OffsetBuffer::new(offsets.into()),
Arc::new(struct_array),
null_buffer,
)?;
Ok(Arc::new(result))
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use std::collections::HashMap;
+
+ /// Regression test for #21982: `arrays_zip` must propagate each input
+ /// list's element-field metadata onto the corresponding struct member.
+ #[test]
+ fn arrays_zip_preserves_struct_member_metadata() -> Result<()> {
+ let with_meta = |k: &str, v: &str, dt: DataType| -> FieldRef {
+ let metadata = HashMap::from([(k.to_string(), v.to_string())]);
+ Arc::new(Field::new("e", dt, true).with_metadata(metadata))
+ };
+ let a_inner = with_meta("ARROW:extension:name", "arrow.uuid",
DataType::Int64);
+ let b_inner = with_meta("ARROW:extension:name", "arrow.json",
DataType::Utf8);
Review Comment:
If this uses official extension names it should probably use valid storage
to avoid confusing future humans or LLMs. Probably arbitrary key/value metadata
is equally illustrative here.
##########
datafusion/functions-nested/src/repeat.rs:
##########
@@ -320,3 +356,39 @@ fn get_count_with_validity(count_array: &Int64Array, idx:
usize) -> usize {
if c > 0 { c as usize } else { 0 }
}
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use datafusion_expr::ReturnFieldArgs;
+ use std::collections::HashMap;
+
+ /// Regression test for #21982: `array_repeat` must propagate the input
+ /// field's metadata onto the resulting list's inner field.
+ #[test]
+ fn array_repeat_preserves_inner_field_metadata() -> Result<()> {
+ let metadata = HashMap::from([(
+ "ARROW:extension:name".to_string(),
+ "arrow.uuid".to_string(),
+ )]);
Review Comment:
This should probably use a valid storage type or arbitrary key/value metadata
##########
datafusion/functions-nested/src/map.rs:
##########
@@ -419,6 +479,43 @@ fn get_element_type(data_type: &DataType) ->
Result<&DataType> {
}
}
+/// Like [`get_element_type`] but returns the full inner [`Field`] so callers
+/// can read its metadata (used to propagate Arrow extension types onto the
+/// resulting map's `key` / `value` fields).
+fn get_element_field(field: &FieldRef) -> Result<&FieldRef> {
+ match field.data_type() {
+ DataType::List(inner)
+ | DataType::LargeList(inner)
+ | DataType::FixedSizeList(inner, _) => Ok(inner),
Review Comment:
For some of these helpers it is probably worth putting ListView and
LargeListView in for the day when these are supported (they will error
elsewhere else for things like this)
##########
datafusion/functions-nested/src/range.rs:
##########
@@ -247,21 +247,73 @@ impl ScalarUDFImpl for Range {
}
}
+ fn return_field_from_args(&self, args: ReturnFieldArgs) ->
Result<FieldRef> {
+ if args.arg_fields.iter().any(|f| f.data_type().is_null()) {
+ return Ok(Arc::new(Field::new(self.name(), DataType::Null, true)));
+ }
+ // Reuse the data type computed by `return_type` so the date64→date32
+ // coercion and timezone preservation logic stays in one place. Then
+ // walk the result and replace the inner field with a metadata-bearing
+ // copy of the start arg's field so extension types flow through.
+ let arg_types: Vec<_> = args
Review Comment:
Do you want range() to propagate field metadata? The one extension type
example I have for this is a discrete grid system (S2) which is an extension on
top of a u64 where adding the step would result in an invalid or meaninglessly
related identifier. For extension types specifically this probably only makes
sense where the "+" operation is meaningful (and identical to the storage
type). I would personally rather this didn't propagate metadata.
##########
datafusion/sqllogictest/test_files/array_metadata_propagation.slt:
##########
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Regression tests for https://github.com/apache/datafusion/issues/21982:
+# UDFs/UDAFs that wrap an input column into a composite type (List, Struct,
+# Map, ...) must propagate the input field's metadata onto the inner field
+# of the output composite. The data type rendering used by `arrow_field`
+# includes inner-field metadata, so we verify by string-matching the
+# `data_type` projection.
+
+# make_array preserves inner field metadata
+query T
+SELECT arrow_field(make_array(with_metadata(1, 'k', 'v')))['data_type']
+----
+List(Int64, metadata: {"k": "v"})
+
+# array_repeat preserves inner field metadata
+query T
+SELECT arrow_field(array_repeat(with_metadata(1, 'k', 'v'), 2))['data_type']
+----
+List(Int64, metadata: {"k": "v"})
+
+# range and generate_series propagate the start argument's metadata
+query T
+SELECT arrow_field(range(with_metadata(1, 'k', 'v'), 4))['data_type']
+----
+List(Int64, metadata: {"k": "v"})
+
+query T
+SELECT arrow_field(generate_series(with_metadata(1, 'k', 'v'), 3))['data_type']
+----
+List(Int64, metadata: {"k": "v"})
+
+# struct preserves member field metadata
+query T
+SELECT arrow_field(struct(with_metadata(1, 'k', 'v')))['data_type']
+----
+Struct("c0": Int64, metadata: {"k": "v"})
+
+# arrays_zip preserves struct member metadata
+query T
+SELECT arrow_field(arrays_zip(make_array(with_metadata(1, 'k',
'v'))))['data_type']
+----
+List(Struct("1": Int64, metadata: {"k": "v"}))
+
+# map preserves key/value field metadata
+query T
+SELECT arrow_field(map(make_array(with_metadata('a', 'k', 'v')),
make_array(with_metadata(1, 'k2', 'v2'))))['data_type']
+----
+Map("entries": non-null Struct("key": non-null Utf8, metadata: {"k": "v"},
"value": Int64, metadata: {"k2": "v2"}), unsorted)
+
+# array_agg preserves inner field metadata
+query T
+SELECT arrow_field(array_agg(with_metadata(column1, 'k', 'v')))['data_type']
+FROM (VALUES (1), (2), (3))
+----
+List(Int64, metadata: {"k": "v"})
+
+# array_agg DISTINCT preserves inner field metadata
Review Comment:
FWIW "distinct" has varying degrees of meaningful (byte-for-byte storage
equality is usually fine though, if perhaps conservative in some cases)
##########
datafusion/functions/src/core/struct.rs:
##########
@@ -114,6 +117,25 @@ impl ScalarUDFImpl for StructFunc {
Ok(DataType::Struct(fields))
}
+ fn return_field_from_args(&self, args: ReturnFieldArgs) ->
Result<FieldRef> {
+ if args.arg_fields.is_empty() {
+ return exec_err!("struct requires at least one argument, got 0
instead");
+ }
+ // Preserve each input field's metadata on the corresponding struct
+ // member field so Arrow extension types survive `struct(...)` calls.
+ let fields: Fields = args
+ .arg_fields
+ .iter()
+ .enumerate()
+ .map(|(pos, f)| nullable_inner_field_from(f, &format!("c{pos}")))
Review Comment:
Should the inner fields always be nullable or inherit the nullability of the
argument?
##########
datafusion/functions-nested/src/map.rs:
##########
@@ -244,6 +262,7 @@ fn make_map_batch_internal(
values: &ArrayRef,
can_evaluate_to_const: bool,
data_type: &DataType,
+ entries: Option<FieldRef>,
Review Comment:
Wherever possible (e.g., local or crate local functions where we're not
breaking compatibility) it is probably better to have this be `FieldRef`
(non-optional) to make it harder for future contributors to stop propagating
the metadata.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]