alamb commented on code in PR #8839:
URL: https://github.com/apache/arrow-rs/pull/8839#discussion_r2543197226
##########
arrow-row/src/lib.rs:
##########
@@ -1762,6 +1907,110 @@ unsafe fn decode_column(
},
_ => unreachable!(),
},
+ Codec::Union(converters, null_rows, _mode) => {
+ let len = rows.len();
+
+ let DataType::Union(union_fields, mode) = &field.data_type else {
+ unreachable!()
+ };
+
+ let mut type_ids = Vec::with_capacity(len);
+ let mut rows_by_field: Vec<Vec<(usize, &[u8])>> = vec![Vec::new();
converters.len()];
+
+ for (idx, row) in rows.iter_mut().enumerate() {
+ // skip the null sentinel
+ let mut cursor = 1;
Review Comment:
I think you need to look at the null byte to recover nulls 🤔
##########
arrow-row/src/lib.rs:
##########
@@ -3598,4 +3847,144 @@ mod tests {
assert_eq!(unchecked_values_len, 13);
assert!(checked_values_len > unchecked_values_len);
}
+
+ #[test]
+ fn test_sparse_union() {
Review Comment:
can you please also add tests here for union arrays that have nulls?
Specifically for a union array that has a null buffer
##########
arrow-row/src/lib.rs:
##########
@@ -1637,6 +1733,55 @@ fn encode_column(
},
_ => unreachable!(),
},
+ Encoder::Union {
+ child_rows,
+ type_ids,
+ offsets: offsets_buf,
+ mode,
+ } => {
+ let union_array = as_union_array(column);
+ let null_sentinel = null_sentinel(opts);
+
+ offsets
+ .iter_mut()
+ .skip(1)
+ .enumerate()
+ .for_each(|(i, offset)| {
+ let sentinel = if union_array.is_valid(i) {
+ 0x01
+ } else {
+ null_sentinel
+ };
+
+ let type_id = type_ids[i];
+
+ let child_row_idx = match (mode, offsets_buf) {
+ (UnionMode::Dense, Some(o)) => o[i] as usize,
+ (UnionMode::Sparse, None) => i,
+ foreign => {
+ unreachable!("invalid union mode/offsets
combination: {foreign:?}")
Review Comment:
see above for a way to simplify this (don't hold mode too)
##########
arrow-row/src/lib.rs:
##########
@@ -592,6 +624,29 @@ impl Codec {
let rows =
converter.convert_columns(std::slice::from_ref(values))?;
Ok(Encoder::RunEndEncoded(rows))
}
+ Codec::Union(converters, _, mode) => {
+ let union_array = array
+ .as_any()
+ .downcast_ref::<UnionArray>()
+ .expect("expected Union array");
+
+ let type_ids = union_array.type_ids().clone();
+ let offsets = union_array.offsets().cloned();
+
+ let mut child_rows = Vec::with_capacity(converters.len());
+ for (type_id, converter) in converters.iter().enumerate() {
+ let child_array = union_array.child(type_id as i8);
Review Comment:
I think it makes sense because the type_id is the index of the child field
types
Maybe we can document better that `converters` is indexed by `type_id` 🤔
##########
arrow-row/src/lib.rs:
##########
@@ -622,6 +681,13 @@ enum Encoder<'a> {
List(Rows),
/// The row encoding of the values array
RunEndEncoded(Rows),
+ /// The row encoding of each union field's child array, type_ids buffer,
offsets buffer (for Dense), and mode
+ Union {
+ child_rows: Vec<Rows>,
+ type_ids: ScalarBuffer<i8>,
+ offsets: Option<ScalarBuffer<i32>>,
Review Comment:
strictly speaking the mode is redundant here -- if there are no offsets,
then the mode is sparse, otherwise the mode is dense. You could probably
simplify the code if you removed the redundancy
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]