This is an automated email from the ASF dual-hosted git repository.
jbonofre pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-java.git
The following commit(s) were added to refs/heads/main by this push:
new 77df3ecb2 GH-343: Fix BaseVariableWidthVector and
BaseLargeVariableWidthVector offset buffer serialization (#989)
77df3ecb2 is described below
commit 77df3ecb2cf5517fb5d37a4b2806844e3b4700df
Author: Yicong Huang <[email protected]>
AuthorDate: Thu Mar 12 01:39:44 2026 -0700
GH-343: Fix BaseVariableWidthVector and BaseLargeVariableWidthVector offset
buffer serialization (#989)
## What's Changed
Fix `BaseVariableWidthVector`/`BaseLargeVariableWidthVector` IPC
serialization when `valueCount` is 0.
### Problem
When `valueCount == 0`, `setReaderAndWriterIndex()` set
`offsetBuffer.writerIndex(0)`, so `readableBytes() == 0`. The IPC
serializer uses `readableBytes()` to determine the buffer size, so 0 bytes
were written to the IPC stream. This crashes IPC readers in other
libraries because the Arrow spec requires the offset buffer to have at
least one entry, `[0]`.
This is a follow-up to #967, which fixed the same issue in
`ListVector`/`LargeListVector`.
### Fix
Modify `setReaderAndWriterIndex()` to always set the offset buffer's
`writerIndex` to `(valueCount + 1) * OFFSET_WIDTH`, moving that call out of
the if/else branches. When the offset buffer's capacity is insufficient
(e.g., an empty buffer from the constructor or one loaded via
`loadFieldBuffers()`), a properly sized buffer is reallocated on demand.
### Testing
Added tests for empty `VarCharVector` and `LargeVarCharVector` verifying
that the offset buffer has at least `OFFSET_WIDTH` `readableBytes()` and
that offset[0] is 0 after `setValueCount(0)`.
Closes #343
---------
Co-authored-by: Yicong Huang <[email protected]>
---
.../arrow/adapter/jdbc/ResultSetUtilityTest.java | 22 ++++++++-----
.../arrow/vector/BaseLargeVariableWidthVector.java | 16 +++++++--
.../arrow/vector/BaseVariableWidthVector.java | 16 +++++++--
.../org/apache/arrow/vector/TestValueVector.java | 38 ++++++++++++++++++++++
4 files changed, 79 insertions(+), 13 deletions(-)
diff --git
a/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java
b/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java
index c7dc9b279..e5039ccf5 100644
---
a/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java
+++
b/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtilityTest.java
@@ -43,15 +43,19 @@ public class ResultSetUtilityTest {
.setReuseVectorSchemaRoot(reuseVectorSchemaRoot)
.build();
- ArrowVectorIterator iter = JdbcToArrow.sqlToArrowVectorIterator(rs,
config);
- assertTrue(iter.hasNext(), "Iterator on zero row ResultSet should
haveNext() before use");
- VectorSchemaRoot root = iter.next();
- assertNotNull(root, "VectorSchemaRoot from first next() result should
never be null");
- assertEquals(
- 0, root.getRowCount(), "VectorSchemaRoot from empty ResultSet
should have zero rows");
- assertFalse(
- iter.hasNext(),
- "hasNext() should return false on empty ResultSets after initial
next() call");
+ try (ArrowVectorIterator iter =
JdbcToArrow.sqlToArrowVectorIterator(rs, config)) {
+ assertTrue(iter.hasNext(), "Iterator on zero row ResultSet should
haveNext() before use");
+ VectorSchemaRoot root = iter.next();
+ assertNotNull(root, "VectorSchemaRoot from first next() result
should never be null");
+ assertEquals(
+ 0, root.getRowCount(), "VectorSchemaRoot from empty ResultSet
should have zero rows");
+ assertFalse(
+ iter.hasNext(),
+ "hasNext() should return false on empty ResultSets after initial
next() call");
+ if (!reuseVectorSchemaRoot) {
+ root.close();
+ }
+ }
}
}
}
diff --git
a/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java
b/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java
index 6c451f10a..3fac19578 100644
---
a/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java
+++
b/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java
@@ -373,14 +373,26 @@ public abstract class BaseLargeVariableWidthVector
extends BaseValueVector
valueBuffer.readerIndex(0);
if (valueCount == 0) {
validityBuffer.writerIndex(0);
- offsetBuffer.writerIndex(0);
valueBuffer.writerIndex(0);
} else {
final long lastDataOffset = getStartOffset(valueCount);
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
- offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
valueBuffer.writerIndex(lastDataOffset);
}
+ // IPC serializer will determine readable bytes based on `readerIndex` and
`writerIndex`.
+ // Both are set to 0 means 0 bytes are written to the IPC stream which
will crash IPC readers
+ // in other libraries. According to Arrow spec, we should still output the
offset buffer which
+ // is [0].
+ final long requiredOffsetBufferSize = (long) (valueCount + 1) *
OFFSET_WIDTH;
+ if (offsetBuffer.capacity() < requiredOffsetBufferSize) {
+ ArrowBuf newOffsetBuffer =
allocateOffsetBuffer(requiredOffsetBufferSize);
+ if (offsetBuffer.capacity() > 0) {
+ newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
+ }
+ offsetBuffer.getReferenceManager().release();
+ offsetBuffer = newOffsetBuffer;
+ }
+ offsetBuffer.writerIndex(requiredOffsetBufferSize);
}
/** Same as {@link #allocateNewSafe()}. */
diff --git
a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
index 96e2afbd2..d5bd16725 100644
--- a/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
+++ b/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java
@@ -389,14 +389,26 @@ public abstract class BaseVariableWidthVector extends
BaseValueVector
valueBuffer.readerIndex(0);
if (valueCount == 0) {
validityBuffer.writerIndex(0);
- offsetBuffer.writerIndex(0);
valueBuffer.writerIndex(0);
} else {
final int lastDataOffset = getStartOffset(valueCount);
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
- offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
valueBuffer.writerIndex(lastDataOffset);
}
+ // IPC serializer will determine readable bytes based on `readerIndex` and
`writerIndex`.
+ // Both are set to 0 means 0 bytes are written to the IPC stream which
will crash IPC readers
+ // in other libraries. According to Arrow spec, we should still output the
offset buffer which
+ // is [0].
+ final long requiredOffsetBufferSize = (long) (valueCount + 1) *
OFFSET_WIDTH;
+ if (offsetBuffer.capacity() < requiredOffsetBufferSize) {
+ ArrowBuf newOffsetBuffer =
allocateOffsetBuffer(requiredOffsetBufferSize);
+ if (offsetBuffer.capacity() > 0) {
+ newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
+ }
+ offsetBuffer.getReferenceManager().release();
+ offsetBuffer = newOffsetBuffer;
+ }
+ offsetBuffer.writerIndex(requiredOffsetBufferSize);
}
/** Same as {@link #allocateNewSafe()}. */
diff --git a/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
b/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
index df42d04e6..22c93b0cb 100644
--- a/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
+++ b/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
@@ -3940,4 +3940,42 @@ public class TestValueVector {
}
}
}
+
+ @Test
+ public void testEmptyVarCharOffsetBuffer() {
+ // Validates that offset buffer has at least OFFSET_WIDTH bytes (for
offset[0]=0)
+ // even when valueCount is 0, per Arrow specification.
+ try (VarCharVector vector = newVarCharVector("varchar", allocator)) {
+ vector.allocateNew();
+ vector.setValueCount(0);
+
+ List<ArrowBuf> buffers = vector.getFieldBuffers();
+ // buffers: [validity, offset, data]
+ assertTrue(
+ buffers.get(1).readableBytes() >=
BaseVariableWidthVector.OFFSET_WIDTH,
+ "Offset buffer should have at least "
+ + BaseVariableWidthVector.OFFSET_WIDTH
+ + " bytes for offset[0]");
+ assertEquals(0, vector.getOffsetBuffer().getInt(0));
+ }
+ }
+
+ @Test
+ public void testEmptyLargeVarCharOffsetBuffer() {
+ // Validates that offset buffer has at least OFFSET_WIDTH bytes (for
offset[0]=0)
+ // even when valueCount is 0, per Arrow specification.
+ try (LargeVarCharVector vector = new LargeVarCharVector("largevarchar",
allocator)) {
+ vector.allocateNew();
+ vector.setValueCount(0);
+
+ List<ArrowBuf> buffers = vector.getFieldBuffers();
+ // buffers: [validity, offset, data]
+ assertTrue(
+ buffers.get(1).readableBytes() >=
BaseLargeVariableWidthVector.OFFSET_WIDTH,
+ "Offset buffer should have at least "
+ + BaseLargeVariableWidthVector.OFFSET_WIDTH
+ + " bytes for offset[0]");
+ assertEquals(0, vector.getOffsetBuffer().getLong(0));
+ }
+ }
}