This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 8de8900e07 arrow-ipc: Write 0 offset buffer for length-0 variable-size 
arrays (#9717)
8de8900e07 is described below

commit 8de8900e0720ad8fa459433cfc3046f1fd6a2265
Author: Atwam <[email protected]>
AuthorDate: Wed Apr 22 14:34:54 2026 +0100

    arrow-ipc: Write 0 offset buffer for length-0 variable-size arrays (#9717)
    
    # Which issue does this PR close?
    
    - Closes #9716 .
    
    # Rationale for this change
    
    Current version serializes a length-0 offsets buffer, and relies on the
    array constructor to set offsets to `[0]` for empty arrays. This does
    not conform with spec, which specifies that the offsets buffer should
    have `length + 1` elements.
    
    This correctly serializes a length-1 offsets buffer containing `[0]`. I
    have left the current behavior of filling-in offsets with `[0]` for
    empty arrays, so that future versions can still read IPC files
    serialized by previous versions.
    
    # What changes are included in this PR?
    
    Added tests for serialization of empty arrays, ensuring length-1 offsets
    buffer. Fixed serialization of empty arrays.
    This fixes serialization of empty Binary/Utf8/List/LargeList (and Map)
    arrays.
    
    # Are these changes tested?
    
    Yes
    
    # Are there any user-facing changes?
    
    There should not be any breaking change for users. Serialized files for
    empty record batches will be slightly different. Older serialized files
    should still be read fine.
    
    ---------
    
    Co-authored-by: Damien Couture <[email protected]>
---
 arrow-ipc/src/reader.rs | 127 ++++++++++++++++++++++++++++++++++++++++++++++++
 arrow-ipc/src/writer.rs |  73 +++++++++++++++++++++++++---
 2 files changed, 194 insertions(+), 6 deletions(-)

diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs
index 9afae78b06..411c1f14c2 100644
--- a/arrow-ipc/src/reader.rs
+++ b/arrow-ipc/src/reader.rs
@@ -2098,6 +2098,133 @@ mod tests {
         }
     }
 
+    /// Test that the reader can read legacy files where empty list arrays 
were written with a 0-byte offsets buffer.
+    #[test]
+    fn test_read_legacy_empty_list_without_offsets_buffer() {
+        use crate::r#gen::Message::*;
+        use flatbuffers::FlatBufferBuilder;
+
+        let schema = Arc::new(Schema::new(vec![Field::new_list(
+            "items",
+            Field::new_list_field(DataType::Int32, true),
+            true,
+        )]));
+
+        // Legacy arrow-rs versions wrote empty offsets buffers for empty list 
arrays.
+        // Keep reader compatibility with such files by accepting a 0-byte 
offsets buffer.
+        let mut fbb = FlatBufferBuilder::new();
+        let nodes = fbb.create_vector(&[
+            FieldNode::new(0, 0), // list node
+            FieldNode::new(0, 0), // child int32 node
+        ]);
+        let buffers = fbb.create_vector(&[
+            crate::Buffer::new(0, 0), // list validity
+            crate::Buffer::new(0, 0), // list offsets (legacy empty buffer)
+            crate::Buffer::new(0, 0), // child validity
+            crate::Buffer::new(0, 0), // child values
+        ]);
+        let batch_offset = RecordBatch::create(
+            &mut fbb,
+            &RecordBatchArgs {
+                length: 0,
+                nodes: Some(nodes),
+                buffers: Some(buffers),
+                compression: None,
+                variadicBufferCounts: None,
+            },
+        );
+        fbb.finish_minimal(batch_offset);
+        let batch_bytes = fbb.finished_data().to_vec();
+        let batch = flatbuffers::root::<RecordBatch>(&batch_bytes).unwrap();
+
+        let body = Buffer::from(Vec::<u8>::new());
+        let dictionaries: HashMap<i64, ArrayRef> = HashMap::new();
+        let metadata = MetadataVersion::V5;
+
+        let decoder =
+            RecordBatchDecoder::try_new(&body, batch, schema.clone(), 
&dictionaries, &metadata)
+                .unwrap();
+
+        let read_batch = decoder.read_record_batch().unwrap();
+        assert_eq!(read_batch.num_rows(), 0);
+
+        let list = read_batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<ListArray>()
+            .unwrap();
+        assert_eq!(list.len(), 0);
+        assert_eq!(list.values().len(), 0);
+    }
+
+    /// Test that the reader can read legacy files where empty Utf8/Binary 
arrays were written with a 0-byte offsets buffer.
+    #[test]
+    fn test_read_legacy_empty_utf8_and_binary_without_offsets_buffer() {
+        use crate::r#gen::Message::*;
+        use flatbuffers::FlatBufferBuilder;
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("name", DataType::Utf8, true),
+            Field::new("payload", DataType::Binary, true),
+        ]));
+
+        // Legacy arrow-rs versions wrote empty offsets buffers for empty 
Utf8/Binary arrays.
+        // Keep reader compatibility with such files by accepting 0-byte 
offsets buffers.
+        let mut fbb = FlatBufferBuilder::new();
+        let nodes = fbb.create_vector(&[
+            FieldNode::new(0, 0), // utf8 node
+            FieldNode::new(0, 0), // binary node
+        ]);
+        let buffers = fbb.create_vector(&[
+            crate::Buffer::new(0, 0), // utf8 validity
+            crate::Buffer::new(0, 0), // utf8 offsets (legacy empty buffer)
+            crate::Buffer::new(0, 0), // utf8 values
+            crate::Buffer::new(0, 0), // binary validity
+            crate::Buffer::new(0, 0), // binary offsets (legacy empty buffer)
+            crate::Buffer::new(0, 0), // binary values
+        ]);
+        let batch_offset = RecordBatch::create(
+            &mut fbb,
+            &RecordBatchArgs {
+                length: 0,
+                nodes: Some(nodes),
+                buffers: Some(buffers),
+                compression: None,
+                variadicBufferCounts: None,
+            },
+        );
+        fbb.finish_minimal(batch_offset);
+        let batch_bytes = fbb.finished_data().to_vec();
+        let batch = flatbuffers::root::<RecordBatch>(&batch_bytes).unwrap();
+
+        let body = Buffer::from(Vec::<u8>::new());
+        let dictionaries: HashMap<i64, ArrayRef> = HashMap::new();
+        let metadata = MetadataVersion::V5;
+
+        let decoder =
+            RecordBatchDecoder::try_new(&body, batch, schema.clone(), 
&dictionaries, &metadata)
+                .unwrap();
+
+        let read_batch = decoder.read_record_batch().unwrap();
+        assert_eq!(read_batch.num_rows(), 0);
+
+        let utf8 = read_batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        assert_eq!(utf8.len(), 0);
+        assert_eq!(utf8.value_offsets(), [0]);
+
+        let binary = read_batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<BinaryArray>()
+            .unwrap();
+        assert_eq!(binary.len(), 0);
+        assert_eq!(binary.value_offsets(), [0]);
+    }
+
     #[test]
     fn test_projection_array_values() {
         // define schema
diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index 5fc4027a8f..a05072a2c4 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -37,7 +37,7 @@ use arrow_array::cast::*;
 use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType};
 use arrow_array::*;
 use arrow_buffer::bit_util;
-use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
+use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, ToByteSlice};
 use arrow_data::{ArrayData, ArrayDataBuilder, BufferSpec, layout};
 use arrow_schema::*;
 
@@ -1722,7 +1722,11 @@ fn reencode_offsets<O: OffsetSizeTrait>(
 /// size of sliced arrays, as values that have been sliced away are not encoded
 fn get_byte_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, 
Buffer) {
     if data.is_empty() {
-        return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into());
+        // As per specification, offsets buffer has N+1 elements.
+        // So an empty array should still be encoded with a single 0 offset.
+        let mut offsets = MutableBuffer::new(size_of::<O>());
+        offsets.extend_from_slice(O::usize_as(0).to_byte_slice());
+        return (offsets.into(), MutableBuffer::new(0).into());
     }
 
     let (offsets, original_start_offset, len) = 
reencode_offsets::<O>(&data.buffers()[0], data);
@@ -1734,10 +1738,11 @@ fn get_byte_array_buffers<O: OffsetSizeTrait>(data: 
&ArrayData) -> (Buffer, Buff
 /// of a values buffer.
 fn get_list_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, 
ArrayData) {
     if data.is_empty() {
-        return (
-            MutableBuffer::new(0).into(),
-            data.child_data()[0].slice(0, 0),
-        );
+        // As per specification, offsets buffer has N+1 elements.
+        // So an empty array should still be encoded with a single 0 offset.
+        let mut offsets = MutableBuffer::new(size_of::<O>());
+        offsets.extend_from_slice(O::usize_as(0).to_byte_slice());
+        return (offsets.into(), data.child_data()[0].slice(0, 0));
     }
 
     let (offsets, original_start_offset, len) = 
reencode_offsets::<O>(&data.buffers()[0], data);
@@ -2370,6 +2375,62 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_empty_utf8_ipc_writes_nonempty_offsets_buffer() {
+        let name = StringArray::from(Vec::<String>::new());
+        let (offsets, values) = get_byte_array_buffers::<i32>(&name.to_data());
+
+        assert_eq!(name.len(), 0);
+        assert_eq!(
+            offsets.len(),
+            std::mem::size_of::<i32>(),
+            "offsets buffer should contain one zero i32 offset"
+        );
+        assert_eq!(values.len(), 0, "values buffer should remain empty");
+    }
+
+    #[test]
+    fn test_empty_large_utf8_ipc_writes_nonempty_offsets_buffer() {
+        let name = LargeStringArray::from(Vec::<String>::new());
+        let (offsets, values) = get_byte_array_buffers::<i64>(&name.to_data());
+
+        assert_eq!(name.len(), 0);
+        assert_eq!(
+            offsets.len(),
+            std::mem::size_of::<i64>(),
+            "offsets buffer should contain one zero i64 offset"
+        );
+        assert_eq!(values.len(), 0, "values buffer should remain empty");
+    }
+
+    #[test]
+    fn test_empty_list_ipc_writes_nonempty_offsets_buffer() {
+        let list = GenericListBuilder::<i32, 
_>::new(UInt32Builder::new()).finish();
+        let (offsets, child_data) = 
get_list_array_buffers::<i32>(&list.to_data());
+
+        assert_eq!(list.len(), 0);
+        assert_eq!(
+            offsets.len(),
+            std::mem::size_of::<i32>(),
+            "offsets buffer should contain one zero i32 offset"
+        );
+        assert_eq!(child_data.len(), 0, "child data should remain empty");
+    }
+
+    #[test]
+    fn test_empty_large_list_ipc_writes_nonempty_offsets_buffer() {
+        let list = GenericListBuilder::<i64, 
_>::new(UInt32Builder::new()).finish();
+        let (offsets, child_data) = 
get_list_array_buffers::<i64>(&list.to_data());
+
+        assert_eq!(list.len(), 0);
+        assert_eq!(
+            offsets.len(),
+            std::mem::size_of::<i64>(),
+            "offsets buffer should contain one zero i64 offset"
+        );
+        assert_eq!(child_data.len(), 0, "child data should remain empty");
+    }
+
     fn write_null_file(options: IpcWriteOptions) {
         let schema = Schema::new(vec![
             Field::new("nulls", DataType::Null, true),

Reply via email to