This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 57eeb266af arrow-cast: Add ability to cast plain struct to dictionary
(#10039)
57eeb266af is described below
commit 57eeb266af09f7fee47b6d4265ed2bdff6746929
Author: Frederic Branczyk <[email protected]>
AuthorDate: Tue Jun 2 14:55:53 2026 +0000
arrow-cast: Add ability to cast plain struct to dictionary (#10039)
# Which issue does this PR close?
- Closes #10038
# What changes are included in this PR?
A naive implementation of casting plain structs to dictionaries that performs
no deduplication: every non-null source row gets its own dictionary key.
# Are these changes tested?
Unit tests added.
# Are there any user-facing changes?
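No breaking changes; the only user-facing change is the new feature (casting a plain struct array to a dictionary with struct values).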
@alamb @Jefffrey
---
arrow-cast/src/cast/dictionary.rs | 39 ++++++++++++
arrow-cast/src/cast/mod.rs | 126 ++++++++++++++++++++++++++++++++++++++
2 files changed, 165 insertions(+)
diff --git a/arrow-cast/src/cast/dictionary.rs
b/arrow-cast/src/cast/dictionary.rs
index 601f50a4d0..83aa691482 100644
--- a/arrow-cast/src/cast/dictionary.rs
+++ b/arrow-cast/src/cast/dictionary.rs
@@ -315,12 +315,51 @@ pub(crate) fn cast_to_dictionary<K:
ArrowDictionaryKeyType>(
FixedSizeBinary(byte_size) => {
pack_byte_to_fixed_size_dictionary::<K>(array, cast_options,
byte_size)
}
+ Struct(_) => pack_struct_to_dictionary::<K>(array, dict_value_type,
cast_options),
_ => Err(ArrowError::CastError(format!(
"Unsupported output type for dictionary packing: {dict_value_type}"
))),
}
}
+/// Wrap a struct-valued array as a `DictionaryArray<K, Struct>` with identity
+/// keys `[0, 1, ..., len-1]`. Unlike the primitive / byte packers above, no
+/// deduplication is performed, since struct values have no general hash/equality
+/// builder in arrow-rs.
+///
+/// Each child field of the source is recursively cast to the matching field of
+/// `dict_value_type` via `cast_with_options` before keys are emitted. If any
+/// child cast fails, the whole pack fails, the same contract as the primitive
+/// packers above.
+fn pack_struct_to_dictionary<K: ArrowDictionaryKeyType>(
+ array: &dyn Array,
+ dict_value_type: &DataType,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
+ let len = cast_values.len();
+
+ // Identity keys `[0, 1, ..., len-1]`, with null entries wherever the
+ // source row is null so the dictionary's logical null mask matches.
+ let mut builder = PrimitiveBuilder::<K>::with_capacity(len);
+ for i in 0..len {
+ if cast_values.is_null(i) {
+ builder.append_null();
+ } else {
+ let key = K::Native::from_usize(i).ok_or_else(|| {
+ ArrowError::CastError(format!(
+ "Cannot fit {len} dictionary keys in {:?}",
+ K::DATA_TYPE,
+ ))
+ })?;
+ builder.append_value(key);
+ }
+ }
+ let keys = builder.finish();
+
+ Ok(Arc::new(DictionaryArray::<K>::try_new(keys, cast_values)?))
+}
+
// Packs the data from the primitive array of type <V> to a
// DictionaryArray with keys of type K and values of value_type V
pub(crate) fn pack_numeric_to_dictionary<K, V>(
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 0367a54121..4d67703ea6 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -6278,6 +6278,132 @@ mod tests {
assert_ne!(keys.value(0), keys.value(1));
}
+ #[test]
+ fn test_cast_struct_array_to_dict_struct() {
+ // Cast a StructArray into Dictionary<UInt32, Struct{…}>. The dictionary
+ // value type's child fields may differ from the source's (here:
+ // Utf8 source → Utf8View child for `name`), so the per-field cast
+ // must run before identity keys are emitted. This is the "as long as
+ // the struct can be cast to the dict value" contract.
+ let names = StringArray::from(vec![Some("alpha"), None,
Some("gamma")]);
+ let ids = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
+ let source = StructArray::from(vec![
+ (
+ Arc::new(Field::new("name", DataType::Utf8, true)),
+ Arc::new(names) as ArrayRef,
+ ),
+ (
+ Arc::new(Field::new("id", DataType::Int32, false)),
+ Arc::new(ids) as ArrayRef,
+ ),
+ ]);
+
+ let target_value_type = DataType::Struct(
+ vec![
+ Field::new("name", DataType::Utf8View, true),
+ Field::new("id", DataType::Int64, false),
+ ]
+ .into(),
+ );
+ let cast_type = DataType::Dictionary(
+ Box::new(DataType::UInt32),
+ Box::new(target_value_type.clone()),
+ );
+ assert!(can_cast_types(source.data_type(), &cast_type));
+
+ let cast_array = cast(&source, &cast_type).unwrap();
+ assert_eq!(cast_array.data_type(), &cast_type);
+ assert_eq!(cast_array.len(), 3);
+
+ let dict = cast_array.as_dictionary::<UInt32Type>();
+ assert_eq!(dict.values().data_type(), &target_value_type);
+ // No dedup is performed for struct values — one row, one key.
+ assert_eq!(dict.values().len(), 3);
+
+ // Source row 1 was a `Utf8`-null in the `name` field but the whole
+ // struct row was valid (StructArray::from above takes per-field
+ // nulls only). The dictionary's logical null mask therefore mirrors
+ // the source struct's row-level null mask — all rows valid here.
+ let keys = dict.keys();
+ assert_eq!(keys.values(), &[0u32, 1, 2]);
+ assert_eq!(keys.null_count(), 0);
+
+ let struct_values = dict.values().as_struct();
+ let names_out = struct_values
+ .column_by_name("name")
+ .unwrap()
+ .as_string_view();
+ assert_eq!(names_out.value(0), "alpha");
+ assert!(names_out.is_null(1));
+ assert_eq!(names_out.value(2), "gamma");
+ let ids_out = struct_values
+ .column_by_name("id")
+ .unwrap()
+ .as_primitive::<Int64Type>();
+ assert_eq!(ids_out.values(), &[1i64, 2, 3]);
+ }
+
+ #[test]
+ fn test_cast_struct_array_to_dict_struct_row_nulls() {
+ // Row-level nulls on the source struct must surface as null keys on
+ // the dictionary, since the dictionary's logical null mask is
+ // determined by the keys.
+ let names = StringArray::from(vec![Some("alpha"), Some("beta"),
Some("gamma")]);
+ let ids = Int32Array::from(vec![Some(1), Some(2), Some(3)]);
+ let source = StructArray::try_new(
+ vec![
+ Field::new("name", DataType::Utf8, true),
+ Field::new("id", DataType::Int32, false),
+ ]
+ .into(),
+ vec![Arc::new(names) as ArrayRef, Arc::new(ids) as ArrayRef],
+ Some(NullBuffer::from(vec![true, false, true])),
+ )
+ .unwrap();
+
+ let target_value_type = DataType::Struct(
+ vec![
+ Field::new("name", DataType::Utf8, true),
+ Field::new("id", DataType::Int32, false),
+ ]
+ .into(),
+ );
+ let cast_type =
+ DataType::Dictionary(Box::new(DataType::UInt32),
Box::new(target_value_type));
+
+ let cast_array = cast(&source, &cast_type).unwrap();
+ let dict = cast_array.as_dictionary::<UInt32Type>();
+ assert_eq!(dict.len(), 3);
+ let keys = dict.keys();
+ assert!(!keys.is_null(0));
+ assert!(keys.is_null(1));
+ assert!(!keys.is_null(2));
+ }
+
+ #[test]
+ fn test_cast_struct_array_to_dict_struct_key_overflow() {
+ // Source has 300 rows but the dictionary key type is UInt8 (max 255).
+ // We must return a CastError instead of silently truncating.
+ let n = 300;
+ let names = StringArray::from((0..n).map(|i|
Some(format!("v{i}"))).collect::<Vec<_>>());
+ let source = StructArray::from(vec![(
+ Arc::new(Field::new("name", DataType::Utf8, true)),
+ Arc::new(names) as ArrayRef,
+ )]);
+
+ let cast_type = DataType::Dictionary(
+ Box::new(DataType::UInt8),
+ Box::new(DataType::Struct(
+ vec![Field::new("name", DataType::Utf8, true)].into(),
+ )),
+ );
+ let err = cast(&source, &cast_type).unwrap_err().to_string();
+ assert!(
+ err.contains("Cannot fit") && err.contains("dictionary keys"),
+ "expected key-overflow error, got: {err}"
+ );
+ }
+
#[test]
fn test_cast_empty_string_array_to_dict_utf8_view() {
let array = StringArray::from(Vec::<Option<&str>>::new());