This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new b00b5aa3bb fix(ipc): correct skip_field handling for V4 Union (#9829)
b00b5aa3bb is described below

commit b00b5aa3bbe3aab8ea6595fdd99794a11c1ca730
Author: pchintar <[email protected]>
AuthorDate: Mon Apr 27 16:08:19 2026 -0400

    fix(ipc): correct skip_field handling for V4 Union (#9829)
    
    # Which issue does this PR close?
    
    - Closes #9828.
    
    # Rationale for this change
    
    Currently, `skip_field` does not correctly handle the buffer layout of
    `Union` types for V4 IPC.
    
    In V4:
    
    * `Union` includes a null buffer + type_ids (+ offsets for dense)
    
    In V5:
    
    * `Union` has no null buffer, only type_ids (+ offsets for dense)
    
    `create_array` correctly handles this difference using a version check.
    However, `skip_field` always skips exactly one buffer (labelled as the
    null buffer) before the optional dense offsets, regardless of version.
    In V4 this leaves the `type_ids` buffer unskipped, leading to buffer
    misalignment when skipping a `Union` column.
    
    This can cause incorrect decoding or runtime errors for the projected
    columns that are read after the skipped `Union` column.
    
    # What changes are included in this PR?
    
    * Updated `skip_field` in `arrow-ipc/src/reader.rs` to:
    
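      * conditionally skip the null buffer only for pre-V5 metadata
        (`version < MetadataVersion::V5`, i.e. V4)
      * explicitly skip the `type_ids` buffer
      * keep skipping the offsets buffer for dense unions only (logic
        unchanged; the branch is now commented)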
    
    * Aligns `skip_field` behavior with `create_array` and actual IPC layout
    
    # Are these changes tested?
    
    Yes.
    
    * Added test: `test_projection_skip_union_v4`
    
    * The test:
    
      * writes IPC data using V4 metadata
      * includes a `Union` column followed by an `Int32` column
      * reads only the second column (skipping the `Union`)
      * verifies the output matches expected values
    
    * The test fails before the fix and passes after
    
    * All existing `arrow-ipc` tests pass (`cargo test -p arrow-ipc --lib`)
    
    # Are there any user-facing changes?
    
    No.
---
 arrow-ipc/src/reader.rs | 61 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 2 deletions(-)

diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs
index aa66696271..1d5e06c687 100644
--- a/arrow-ipc/src/reader.rs
+++ b/arrow-ipc/src/reader.rs
@@ -696,10 +696,13 @@ impl<'a> RecordBatchDecoder<'a> {
                 self.skip_buffer(); // Indices
             }
             Union(fields, mode) => {
-                self.skip_buffer(); // Nulls
+                if self.version < MetadataVersion::V5 {
+                    self.skip_buffer(); // Null buffer
+                }
+                self.skip_buffer(); // Type ids
 
                 match mode {
-                    UnionMode::Dense => self.skip_buffer(),
+                    UnionMode::Dense => self.skip_buffer(), // Offsets
                     UnionMode::Sparse => {}
                 };
 
@@ -3543,6 +3546,60 @@ mod tests {
         assert_eq!(read_batch.column(0).as_ref(), &values);
     }
 
+    // Tests reading a column when a preceding V4 Union column is skipped.
+    // V4 Union columns include a null buffer and type ids (and offsets for 
dense unions).
+    #[test]
+    fn test_projection_skip_union_v4() {
+        use crate::MetadataVersion;
+        use crate::reader::FileReader;
+        use crate::writer::{FileWriter, IpcWriteOptions};
+        use arrow_array::{
+            ArrayRef, Int32Array, RecordBatch, builder::UnionBuilder, 
types::Int32Type,
+        };
+        use arrow_schema::{DataType, Field, Schema};
+        use std::sync::Arc;
+
+        // Build a dense Union column with simple Int32 values
+        let mut builder = UnionBuilder::new_dense();
+        builder.append::<Int32Type>("a", 1).unwrap();
+        builder.append::<Int32Type>("a", 2).unwrap();
+        builder.append::<Int32Type>("a", 3).unwrap();
+        let union = builder.build().unwrap();
+
+        // Second column with known values to verify correctness after 
projection
+        let values = Int32Array::from(vec![10, 20, 30]);
+
+        // Schema: first column is Union (to be skipped), second is Int32 (to 
be read)
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("union", union.data_type().clone(), false),
+            Field::new("values", DataType::Int32, false),
+        ]));
+
+        // Create a batch containing both columns
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![Arc::new(union) as ArrayRef, Arc::new(values.clone())],
+        )
+        .unwrap();
+
+        // Write IPC using V4 metadata to trigger Union null buffer behavior
+        let mut buf = Vec::new();
+        {
+            let options = IpcWriteOptions::try_new(8, false, 
MetadataVersion::V4).unwrap();
+            let mut writer =
+                FileWriter::try_new_with_options(&mut buf, &batch.schema(), 
options).unwrap();
+            writer.write(&batch).unwrap();
+            writer.finish().unwrap();
+        }
+        // Read only the second column (skip the Union column)
+        let mut reader = FileReader::try_new(std::io::Cursor::new(buf), 
Some(vec![1])).unwrap();
+        let read_batch = reader.next().unwrap().unwrap();
+
+        // Verify that the projected column is read correctly after skipping 
Union
+        assert_eq!(read_batch.num_columns(), 1);
+        assert_eq!(read_batch.column(0).as_ref(), &values);
+    }
+
     // Tests reading a column when preceding fixed-width and boolean columns 
are skipped.
     // Covers all types that use the same two-buffer layout (null + values).
     // Verifies that skipping these types does not affect subsequent column 
decoding.

Reply via email to