This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 8de8900e07 arrow-ipc: Write 0 offset buffer for length-0 variable-size
arrays (#9717)
8de8900e07 is described below
commit 8de8900e0720ad8fa459433cfc3046f1fd6a2265
Author: Atwam <[email protected]>
AuthorDate: Wed Apr 22 14:34:54 2026 +0100
arrow-ipc: Write 0 offset buffer for length-0 variable-size arrays (#9717)
# Which issue does this PR close?
- Closes #9716.
# Rationale for this change
The current version serializes a length-0 offsets buffer and relies on the
array constructor to set offsets to `[0]` for empty arrays. This does
not conform to the spec, which specifies that the offsets buffer should
have `length + 1` elements.
This PR correctly serializes a length-1 offsets buffer containing `[0]`. I
have left in place the current behavior of filling in offsets with `[0]` for
empty arrays, so that future versions can still read IPC files
serialized by previous versions.
# What changes are included in this PR?
Added tests for serialization of empty arrays, ensuring that a length-1 offsets
buffer is written. Fixed serialization of empty arrays.
This fixes serialization of empty Binary/Utf8/List/LargeList (and Map)
arrays.
# Are these changes tested?
Yes
# Are there any user-facing changes?
There should not be any breaking changes for users. Serialized files for
empty record batches will be slightly different. Older serialized files
can still be read.
---------
Co-authored-by: Damien Couture <[email protected]>
---
arrow-ipc/src/reader.rs | 127 ++++++++++++++++++++++++++++++++++++++++++++++++
arrow-ipc/src/writer.rs | 73 +++++++++++++++++++++++++---
2 files changed, 194 insertions(+), 6 deletions(-)
diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs
index 9afae78b06..411c1f14c2 100644
--- a/arrow-ipc/src/reader.rs
+++ b/arrow-ipc/src/reader.rs
@@ -2098,6 +2098,133 @@ mod tests {
}
}
+ /// Test that the reader can read legacy files where empty list arrays
were written with a 0-byte offsets buffer.
+ #[test]
+ fn test_read_legacy_empty_list_without_offsets_buffer() {
+ use crate::r#gen::Message::*;
+ use flatbuffers::FlatBufferBuilder;
+
+ let schema = Arc::new(Schema::new(vec![Field::new_list(
+ "items",
+ Field::new_list_field(DataType::Int32, true),
+ true,
+ )]));
+
+ // Legacy arrow-rs versions wrote empty offsets buffers for empty list
arrays.
+ // Keep reader compatibility with such files by accepting a 0-byte
offsets buffer.
+ let mut fbb = FlatBufferBuilder::new();
+ let nodes = fbb.create_vector(&[
+ FieldNode::new(0, 0), // list node
+ FieldNode::new(0, 0), // child int32 node
+ ]);
+ let buffers = fbb.create_vector(&[
+ crate::Buffer::new(0, 0), // list validity
+ crate::Buffer::new(0, 0), // list offsets (legacy empty buffer)
+ crate::Buffer::new(0, 0), // child validity
+ crate::Buffer::new(0, 0), // child values
+ ]);
+ let batch_offset = RecordBatch::create(
+ &mut fbb,
+ &RecordBatchArgs {
+ length: 0,
+ nodes: Some(nodes),
+ buffers: Some(buffers),
+ compression: None,
+ variadicBufferCounts: None,
+ },
+ );
+ fbb.finish_minimal(batch_offset);
+ let batch_bytes = fbb.finished_data().to_vec();
+ let batch = flatbuffers::root::<RecordBatch>(&batch_bytes).unwrap();
+
+ let body = Buffer::from(Vec::<u8>::new());
+ let dictionaries: HashMap<i64, ArrayRef> = HashMap::new();
+ let metadata = MetadataVersion::V5;
+
+ let decoder =
+ RecordBatchDecoder::try_new(&body, batch, schema.clone(),
&dictionaries, &metadata)
+ .unwrap();
+
+ let read_batch = decoder.read_record_batch().unwrap();
+ assert_eq!(read_batch.num_rows(), 0);
+
+ let list = read_batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<ListArray>()
+ .unwrap();
+ assert_eq!(list.len(), 0);
+ assert_eq!(list.values().len(), 0);
+ }
+
+ /// Test that the reader can read legacy files where empty Utf8/Binary
arrays were written with a 0-byte offsets buffer.
+ #[test]
+ fn test_read_legacy_empty_utf8_and_binary_without_offsets_buffer() {
+ use crate::r#gen::Message::*;
+ use flatbuffers::FlatBufferBuilder;
+
+ let schema = Arc::new(Schema::new(vec![
+ Field::new("name", DataType::Utf8, true),
+ Field::new("payload", DataType::Binary, true),
+ ]));
+
+ // Legacy arrow-rs versions wrote empty offsets buffers for empty
Utf8/Binary arrays.
+ // Keep reader compatibility with such files by accepting 0-byte
offsets buffers.
+ let mut fbb = FlatBufferBuilder::new();
+ let nodes = fbb.create_vector(&[
+ FieldNode::new(0, 0), // utf8 node
+ FieldNode::new(0, 0), // binary node
+ ]);
+ let buffers = fbb.create_vector(&[
+ crate::Buffer::new(0, 0), // utf8 validity
+ crate::Buffer::new(0, 0), // utf8 offsets (legacy empty buffer)
+ crate::Buffer::new(0, 0), // utf8 values
+ crate::Buffer::new(0, 0), // binary validity
+ crate::Buffer::new(0, 0), // binary offsets (legacy empty buffer)
+ crate::Buffer::new(0, 0), // binary values
+ ]);
+ let batch_offset = RecordBatch::create(
+ &mut fbb,
+ &RecordBatchArgs {
+ length: 0,
+ nodes: Some(nodes),
+ buffers: Some(buffers),
+ compression: None,
+ variadicBufferCounts: None,
+ },
+ );
+ fbb.finish_minimal(batch_offset);
+ let batch_bytes = fbb.finished_data().to_vec();
+ let batch = flatbuffers::root::<RecordBatch>(&batch_bytes).unwrap();
+
+ let body = Buffer::from(Vec::<u8>::new());
+ let dictionaries: HashMap<i64, ArrayRef> = HashMap::new();
+ let metadata = MetadataVersion::V5;
+
+ let decoder =
+ RecordBatchDecoder::try_new(&body, batch, schema.clone(),
&dictionaries, &metadata)
+ .unwrap();
+
+ let read_batch = decoder.read_record_batch().unwrap();
+ assert_eq!(read_batch.num_rows(), 0);
+
+ let utf8 = read_batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .unwrap();
+ assert_eq!(utf8.len(), 0);
+ assert_eq!(utf8.value_offsets(), [0]);
+
+ let binary = read_batch
+ .column(1)
+ .as_any()
+ .downcast_ref::<BinaryArray>()
+ .unwrap();
+ assert_eq!(binary.len(), 0);
+ assert_eq!(binary.value_offsets(), [0]);
+ }
+
#[test]
fn test_projection_array_values() {
// define schema
diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index 5fc4027a8f..a05072a2c4 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -37,7 +37,7 @@ use arrow_array::cast::*;
use arrow_array::types::{Int16Type, Int32Type, Int64Type, RunEndIndexType};
use arrow_array::*;
use arrow_buffer::bit_util;
-use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
+use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, ToByteSlice};
use arrow_data::{ArrayData, ArrayDataBuilder, BufferSpec, layout};
use arrow_schema::*;
@@ -1722,7 +1722,11 @@ fn reencode_offsets<O: OffsetSizeTrait>(
/// size of sliced arrays, as values that have been sliced away are not encoded
fn get_byte_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer,
Buffer) {
if data.is_empty() {
- return (MutableBuffer::new(0).into(), MutableBuffer::new(0).into());
+ // As per specification, offsets buffer has N+1 elements.
+ // So an empty array should still be encoded with a single 0 offset.
+ let mut offsets = MutableBuffer::new(size_of::<O>());
+ offsets.extend_from_slice(O::usize_as(0).to_byte_slice());
+ return (offsets.into(), MutableBuffer::new(0).into());
}
let (offsets, original_start_offset, len) =
reencode_offsets::<O>(&data.buffers()[0], data);
@@ -1734,10 +1738,11 @@ fn get_byte_array_buffers<O: OffsetSizeTrait>(data:
&ArrayData) -> (Buffer, Buff
/// of a values buffer.
fn get_list_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer,
ArrayData) {
if data.is_empty() {
- return (
- MutableBuffer::new(0).into(),
- data.child_data()[0].slice(0, 0),
- );
+ // As per specification, offsets buffer has N+1 elements.
+ // So an empty array should still be encoded with a single 0 offset.
+ let mut offsets = MutableBuffer::new(size_of::<O>());
+ offsets.extend_from_slice(O::usize_as(0).to_byte_slice());
+ return (offsets.into(), data.child_data()[0].slice(0, 0));
}
let (offsets, original_start_offset, len) =
reencode_offsets::<O>(&data.buffers()[0], data);
@@ -2370,6 +2375,62 @@ mod tests {
}
}
+ #[test]
+ fn test_empty_utf8_ipc_writes_nonempty_offsets_buffer() {
+ let name = StringArray::from(Vec::<String>::new());
+ let (offsets, values) = get_byte_array_buffers::<i32>(&name.to_data());
+
+ assert_eq!(name.len(), 0);
+ assert_eq!(
+ offsets.len(),
+ std::mem::size_of::<i32>(),
+ "offsets buffer should contain one zero i32 offset"
+ );
+ assert_eq!(values.len(), 0, "values buffer should remain empty");
+ }
+
+ #[test]
+ fn test_empty_large_utf8_ipc_writes_nonempty_offsets_buffer() {
+ let name = LargeStringArray::from(Vec::<String>::new());
+ let (offsets, values) = get_byte_array_buffers::<i64>(&name.to_data());
+
+ assert_eq!(name.len(), 0);
+ assert_eq!(
+ offsets.len(),
+ std::mem::size_of::<i64>(),
+ "offsets buffer should contain one zero i64 offset"
+ );
+ assert_eq!(values.len(), 0, "values buffer should remain empty");
+ }
+
+ #[test]
+ fn test_empty_list_ipc_writes_nonempty_offsets_buffer() {
+ let list = GenericListBuilder::<i32,
_>::new(UInt32Builder::new()).finish();
+ let (offsets, child_data) =
get_list_array_buffers::<i32>(&list.to_data());
+
+ assert_eq!(list.len(), 0);
+ assert_eq!(
+ offsets.len(),
+ std::mem::size_of::<i32>(),
+ "offsets buffer should contain one zero i32 offset"
+ );
+ assert_eq!(child_data.len(), 0, "child data should remain empty");
+ }
+
+ #[test]
+ fn test_empty_large_list_ipc_writes_nonempty_offsets_buffer() {
+ let list = GenericListBuilder::<i64,
_>::new(UInt32Builder::new()).finish();
+ let (offsets, child_data) =
get_list_array_buffers::<i64>(&list.to_data());
+
+ assert_eq!(list.len(), 0);
+ assert_eq!(
+ offsets.len(),
+ std::mem::size_of::<i64>(),
+ "offsets buffer should contain one zero i64 offset"
+ );
+ assert_eq!(child_data.len(), 0, "child data should remain empty");
+ }
+
fn write_null_file(options: IpcWriteOptions) {
let schema = Schema::new(vec![
Field::new("nulls", DataType::Null, true),