This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 4fa8d2ff5f fix(ipc): replace wildcard in skip_field with explicit 
DataType handling (#9822)
4fa8d2ff5f is described below

commit 4fa8d2ff5f18f2d773f9642631715509f844a062
Author: pchintar <[email protected]>
AuthorDate: Sat Apr 25 11:52:47 2026 -0400

    fix(ipc): replace wildcard in skip_field with explicit DataType handling 
(#9822)
    
    # Which issue does this PR close?
    
    - Closes #9821 .
    
    # Rationale for this change
    
    `skip_field` currently uses a wildcard match (`_`) to skip remaining
    `DataType` variants by assuming a two-buffer layout. This is not robust,
    as new variants may have different buffer layouts, leading to incorrect
    skipping and buffer misalignment.
    
    Replacing the wildcard with explicit handling ensures the correct number
    of buffers are skipped and avoids silent errors for future types.
    
    # What changes are included in this PR?
    
    * Removed the wildcard (`_`) match arm in `skip_field`
    * Added explicit handling for all remaining fixed-width and boolean
    `DataType` variants
    * Each of these types now explicitly skips:
    
      * null buffer
      * values buffer
    
    File updated:
    
    * `arrow-ipc/src/reader.rs`
    
    # Are these changes tested?
    
    Yes.
    
    * Added test: `test_projection_skip_fixed_width_types` in
    `arrow-ipc/src/reader.rs`
    
    * The test iterates over all fixed-width and boolean `DataType` variants
    covered by this change
    
    * For each type:
    
      * writes a batch with `[skipped_column(type), values_column(Int32)]`
      * reads only the second column (skipping the first)
    * verifies whether the returned column exactly matches the original
    `Int32` values
    
    * This directly validates that skipping each of these types consumes the
    correct number of buffers
    
    * Also, all existing `arrow-ipc` tests pass (`cargo test -p arrow-ipc
    --lib`)
    
    # Are there any user-facing changes?
    
    No.
---
 arrow-ipc/src/reader.rs | 125 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 2 deletions(-)

diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs
index 5312fad3ae..aa66696271 100644
--- a/arrow-ipc/src/reader.rs
+++ b/arrow-ipc/src/reader.rs
@@ -707,8 +707,33 @@ impl<'a> RecordBatchDecoder<'a> {
                     self.skip_field(field, variadic_count)?
                 }
             }
-            Null => {} // No buffer increases
-            _ => {
+            // Null has no buffers to skip
+            Null => {}
+
+            // Fixed-width and boolean types: skip null buffer + values buffer
+            Boolean
+            | Int8
+            | Int16
+            | Int32
+            | Int64
+            | UInt8
+            | UInt16
+            | UInt32
+            | UInt64
+            | Float16
+            | Float32
+            | Float64
+            | Timestamp(_, _)
+            | Date32
+            | Date64
+            | Time32(_)
+            | Time64(_)
+            | Duration(_)
+            | Interval(_)
+            | Decimal32(_, _)
+            | Decimal64(_, _)
+            | Decimal128(_, _)
+            | Decimal256(_, _) => {
                 self.skip_buffer();
                 self.skip_buffer();
             }
@@ -3517,4 +3542,100 @@ mod tests {
         assert_eq!(read_batch.num_columns(), 1);
         assert_eq!(read_batch.column(0).as_ref(), &values);
     }
+
+    // Tests reading a column when preceding fixed-width and boolean columns 
are skipped.
+    // Covers all types that use the same two-buffer layout (null + values).
+    // Verifies that skipping these types does not affect subsequent column 
decoding.
+    #[test]
+    fn test_projection_skip_fixed_width_types() {
+        use std::sync::Arc;
+
+        use arrow_array::{ArrayRef, BooleanArray, Int32Array, RecordBatch, 
make_array};
+        use arrow_buffer::Buffer;
+        use arrow_data::ArrayData;
+        use arrow_schema::{DataType, Field, IntervalUnit, Schema, TimeUnit};
+
+        use crate::reader::FileReader;
+        use crate::writer::FileWriter;
+
+        // Create a minimal array for a given fixed-width or boolean type
+        fn make_array_for_type(data_type: DataType) -> ArrayRef {
+            let len = 3;
+
+            if matches!(data_type, DataType::Boolean) {
+                return Arc::new(BooleanArray::from(vec![true, false, true]));
+            }
+
+            let width = data_type.primitive_width().unwrap();
+            let data = ArrayData::builder(data_type)
+                .len(len)
+                .add_buffer(Buffer::from(vec![0_u8; len * width]))
+                .build()
+                .unwrap();
+
+            make_array(data)
+        }
+
+        // List of types that follow the same two-buffer layout (null + values)
+        let data_types = vec![
+            DataType::Boolean,
+            DataType::Int8,
+            DataType::Int16,
+            DataType::Int32,
+            DataType::Int64,
+            DataType::UInt8,
+            DataType::UInt16,
+            DataType::UInt32,
+            DataType::UInt64,
+            DataType::Float16,
+            DataType::Float32,
+            DataType::Float64,
+            DataType::Timestamp(TimeUnit::Second, None),
+            DataType::Date32,
+            DataType::Date64,
+            DataType::Time32(TimeUnit::Second),
+            DataType::Time64(TimeUnit::Microsecond),
+            DataType::Duration(TimeUnit::Second),
+            DataType::Interval(IntervalUnit::YearMonth),
+            DataType::Interval(IntervalUnit::DayTime),
+            DataType::Interval(IntervalUnit::MonthDayNano),
+            DataType::Decimal32(9, 2),
+            DataType::Decimal64(18, 2),
+            DataType::Decimal128(38, 2),
+            DataType::Decimal256(76, 2),
+        ];
+
+        // For each type:
+        // - write a batch with [skipped_column, values]
+        // - read only the second column
+        // - verify the result is correct
+        for data_type in data_types {
+            let skipped = make_array_for_type(data_type.clone());
+            let values = Int32Array::from(vec![10, 20, 30]);
+
+            let schema = Arc::new(Schema::new(vec![
+                Field::new("skipped", data_type, false),
+                Field::new("values", DataType::Int32, false),
+            ]));
+
+            let batch =
+                RecordBatch::try_new(schema, vec![skipped, 
Arc::new(values.clone())]).unwrap();
+
+            // Serialize the batch into IPC format
+            let mut buf = Vec::new();
+            {
+                let mut writer = FileWriter::try_new(&mut buf, 
&batch.schema()).unwrap();
+                writer.write(&batch).unwrap();
+                writer.finish().unwrap();
+            }
+
+            // Read back only the second column (skip the first)
+            let mut reader = FileReader::try_new(std::io::Cursor::new(buf), 
Some(vec![1])).unwrap();
+            let read_batch = reader.next().unwrap().unwrap();
+
+            // Verify that the returned column matches the original values 
column
+            assert_eq!(read_batch.num_columns(), 1);
+            assert_eq!(read_batch.column(0).as_ref(), &values);
+        }
+    }
 }

Reply via email to