This is an automated email from the ASF dual-hosted git repository.
CurtHagenlocher pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-dotnet.git
The following commit(s) were added to refs/heads/main by this push:
new 7949fb6 fix: handle file reader cancellation and empty nested concat
(#327)
7949fb6 is described below
commit 7949fb671ccc0610a4589f58c2c5ff5e801751c0
Author: InCerryGit <[email protected]>
AuthorDate: Sat Apr 25 00:41:53 2026 +0800
fix: handle file reader cancellation and empty nested concat (#327)
## Summary
This PR fixes two correctness edge cases found while reviewing the IPC
reader and array concatenation paths.
### File reader cancellation
`ArrowFileReaderImplementation.ReadRecordBatchAsync(...)` and
`ReadNextRecordBatchAsync(...)` accepted a caller cancellation token,
but the
schema/footer loading step still called `ReadSchemaAsync()` without
passing that
token.
That meant a canceled async file read could still continue through
schema/footer
I/O before cancellation was observed later in the record batch path.
This PR
passes the caller token into schema loading so cancellation is honored
before
dictionary and record batch reads begin.
### Empty nested array concatenation
`ArrayDataConcatenator` could drop the required child array structure
when
concatenating nested list arrays whose parent arrays were all empty.
Nested Arrow
arrays still need a valid zero-length child `ArrayData`; otherwise
downstream
array construction can fail or produce structurally invalid nested
arrays.
This PR preserves a zero-length child for all-empty nested/list inputs.
### Null-only ListView concatenation
For `ListView` and `LargeListView`, null rows can have `size == 0`, and
their
offset values should not contribute to the child slice range. The
previous logic
used every offset when computing the child bounds, even for zero-size
entries.
This PR computes list-view child bounds only from entries with `size >
0`, so
null-only inputs keep an empty child slice instead of deriving a range
from null
rows.
## Validation
- `dotnet test test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj -c
Release --filter
"FullyQualifiedName~ArrowFileReaderTests.ReadRecordBatchAsync_HonorsPreCanceledTokenDuringSchemaRead|FullyQualifiedName~ArrowFileReaderTests.ReadNextRecordBatchAsync_HonorsPreCanceledTokenDuringSchemaRead|FullyQualifiedName~ArrowArrayConcatenatorTests.TestConcatenateAllEmpty|FullyQualifiedName~ArrowArrayConcatenatorTests.TestConcatenateNullOnly"`
- `dotnet build Apache.Arrow.sln -c Release`
---
src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs | 76 +++++++--
.../Ipc/ArrowFileReaderImplementation.cs | 4 +-
.../ArrowArrayConcatenatorTests.cs | 189 +++++++++++++++++++++
3 files changed, 255 insertions(+), 14 deletions(-)
diff --git a/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
b/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
index 764292a..cc9572a 100644
--- a/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
+++ b/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
@@ -133,13 +133,29 @@ namespace Apache.Arrow
var child = arrayData.Children[0];
ReadOnlySpan<long> offsets =
arrayData.Buffers[1].Span.CastTo<long>().Slice(arrayData.Offset,
arrayData.Length);
ReadOnlySpan<long> sizes =
arrayData.Buffers[2].Span.CastTo<long>().Slice(arrayData.Offset,
arrayData.Length);
- var minOffset = offsets[0];
+ long minOffset = 0;
long maxEnd = 0;
+ bool hasNonEmptyValue = false;
for (int i = 0; i < arrayData.Length; ++i)
{
- minOffset = Math.Min(minOffset, offsets[i]);
- maxEnd = Math.Max(maxEnd, offsets[i] + sizes[i]);
+ if (sizes[i] > 0)
+ {
+ minOffset = hasNonEmptyValue ? Math.Min(minOffset,
offsets[i]) : offsets[i];
+ maxEnd = Math.Max(maxEnd, offsets[i] + sizes[i]);
+ hasNonEmptyValue = true;
+ }
+ }
+
+ if (!hasNonEmptyValue)
+ {
+ for (int i = 0; i < offsets.Length; ++i)
+ {
+ offsetsBuilder.Append(baseOffset);
+ }
+
+ children.Add(child.Slice(0, 0));
+ continue;
}
foreach (long offset in offsets)
@@ -158,7 +174,7 @@ namespace Apache.Arrow
}
ArrowBuffer offsetBuffer = offsetsBuilder.Build(_allocator);
- ArrayData combinedChild = Concatenate(children, _allocator);
+ ArrayData combinedChild = ConcatenateChildrenOrEmpty(children);
Result = new ArrayData(type, _totalLength, _totalNullCount, 0,
new ArrowBuffer[] { validityBuffer, offsetBuffer, sizesBuffer }, new[] {
combinedChild });
}
@@ -185,13 +201,29 @@ namespace Apache.Arrow
var child = arrayData.Children[0];
ReadOnlySpan<int> offsets =
arrayData.Buffers[1].Span.CastTo<int>().Slice(arrayData.Offset,
arrayData.Length);
ReadOnlySpan<int> sizes =
arrayData.Buffers[2].Span.CastTo<int>().Slice(arrayData.Offset,
arrayData.Length);
- var minOffset = offsets[0];
- var maxEnd = 0;
+ int minOffset = 0;
+ int maxEnd = 0;
+ bool hasNonEmptyValue = false;
for (int i = 0; i < arrayData.Length; ++i)
{
- minOffset = Math.Min(minOffset, offsets[i]);
- maxEnd = Math.Max(maxEnd, offsets[i] + sizes[i]);
+ if (sizes[i] > 0)
+ {
+ minOffset = hasNonEmptyValue ? Math.Min(minOffset,
offsets[i]) : offsets[i];
+ maxEnd = Math.Max(maxEnd, offsets[i] + sizes[i]);
+ hasNonEmptyValue = true;
+ }
+ }
+
+ if (!hasNonEmptyValue)
+ {
+ for (int i = 0; i < offsets.Length; ++i)
+ {
+ offsetsBuilder.Append(baseOffset);
+ }
+
+ children.Add(child.Slice(0, 0));
+ continue;
}
foreach (int offset in offsets)
@@ -210,7 +242,7 @@ namespace Apache.Arrow
}
ArrowBuffer offsetBuffer = offsetsBuilder.Build(_allocator);
- ArrayData combinedChild = Concatenate(children, _allocator);
+ ArrayData combinedChild = ConcatenateChildrenOrEmpty(children);
Result = new ArrayData(type, _totalLength, _totalNullCount, 0,
new ArrowBuffer[] { validityBuffer, offsetBuffer, sizesBuffer }, new[] {
combinedChild });
}
@@ -237,7 +269,7 @@ namespace Apache.Arrow
children.Add(child);
}
- ArrayData combinedChild = Concatenate(children, _allocator);
+ ArrayData combinedChild = ConcatenateChildrenOrEmpty(children);
Result = new ArrayData(type, _totalLength, _totalNullCount, 0,
new ArrowBuffer[] { validityBuffer }, new[] { combinedChild });
}
@@ -707,7 +739,7 @@ namespace Apache.Arrow
children.Add(child);
}
- ArrayData combinedChild = Concatenate(children, _allocator);
+ ArrayData combinedChild = ConcatenateChildrenOrEmpty(children);
Result = new ArrayData(type, _totalLength, _totalNullCount, 0,
new ArrowBuffer[] { validityBuffer, offsetBuffer }, new[] { combinedChild });
}
@@ -748,11 +780,31 @@ namespace Apache.Arrow
children.Add(child);
}
- ArrayData combinedChild = Concatenate(children, _allocator);
+ ArrayData combinedChild = ConcatenateChildrenOrEmpty(children);
Result = new ArrayData(type, _totalLength, _totalNullCount, 0,
new ArrowBuffer[] { validityBuffer, offsetBuffer }, new[] { combinedChild });
}
+ private ArrayData ConcatenateChildrenOrEmpty(List<ArrayData>
children)
+ {
+ if (children.Count > 0)
+ {
+ return Concatenate(children, _allocator);
+ }
+
+ foreach (ArrayData arrayData in _arrayDataList)
+ {
+ if (arrayData.Children?.Length > 0 &&
arrayData.Children[0] != null)
+ {
+ // All parent arrays are empty, but the nested array
still needs a real
+ // zero-length child to preserve Arrow's structural
invariant.
+ return arrayData.Children[0].Slice(0, 0);
+ }
+ }
+
+ throw new InvalidOperationException("Nested array data must
contain child data.");
+ }
+
private ArrowBuffer ConcatenateLargeOffsetBuffer()
{
var builder = new ArrowBuffer.Builder<long>(_totalLength + 1);
diff --git a/src/Apache.Arrow/Ipc/ArrowFileReaderImplementation.cs
b/src/Apache.Arrow/Ipc/ArrowFileReaderImplementation.cs
index 1498c85..7217b08 100644
--- a/src/Apache.Arrow/Ipc/ArrowFileReaderImplementation.cs
+++ b/src/Apache.Arrow/Ipc/ArrowFileReaderImplementation.cs
@@ -151,7 +151,7 @@ namespace Apache.Arrow.Ipc
public async ValueTask<RecordBatch> ReadRecordBatchAsync(int index,
CancellationToken cancellationToken)
{
- await ReadSchemaAsync().ConfigureAwait(false);
+ await ReadSchemaAsync(cancellationToken).ConfigureAwait(false);
await
ReadDictionariesAsync(cancellationToken).ConfigureAwait(false);
if (index >= _footer.RecordBatchCount)
@@ -185,7 +185,7 @@ namespace Apache.Arrow.Ipc
public override async ValueTask<RecordBatch>
ReadNextRecordBatchAsync(CancellationToken cancellationToken)
{
- await ReadSchemaAsync().ConfigureAwait(false);
+ await ReadSchemaAsync(cancellationToken).ConfigureAwait(false);
await
ReadDictionariesAsync(cancellationToken).ConfigureAwait(false);
if (_recordBatchIndex >= _footer.RecordBatchCount)
diff --git a/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs
b/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs
index 14ae6ec..7702fde 100644
--- a/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs
+++ b/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs
@@ -60,6 +60,195 @@ namespace Apache.Arrow.Tests
ArrowReaderVerifier.CompareArrays(array, actualArray);
}
+ [Fact]
+ public void TestConcatenateAllEmptyListArraysKeepsEmptyChild()
+ {
+ AssertConcatenateAllEmptyNestedArraysKeepsChild(
+ new ListArray.Builder(Int32Type.Default).Build(),
+ new ListArray.Builder(Int32Type.Default).Build());
+ }
+
+ [Fact]
+ public void TestConcatenateAllEmptyLargeListArraysKeepsEmptyChild()
+ {
+ AssertConcatenateAllEmptyNestedArraysKeepsChild(
+ new LargeListArray.Builder(Int32Type.Default).Build(),
+ new LargeListArray.Builder(Int32Type.Default).Build());
+ }
+
+ [Fact]
+ public void TestConcatenateAllEmptyListViewArraysKeepsEmptyChild()
+ {
+ AssertConcatenateAllEmptyNestedArraysKeepsChild(
+ new ListViewArray.Builder(Int32Type.Default).Build(),
+ new ListViewArray.Builder(Int32Type.Default).Build());
+ }
+
+ [Fact]
+ public void TestConcatenateAllEmptyLargeListViewArraysKeepsEmptyChild()
+ {
+ AssertConcatenateAllEmptyNestedArraysKeepsChild(
+ new LargeListViewArray.Builder(Int32Type.Default).Build(),
+ new LargeListViewArray.Builder(Int32Type.Default).Build());
+ }
+
+ [Fact]
+ public void TestConcatenateNullOnlyListViewArraysKeepsEmptyChild()
+ {
+ var first = new
ListViewArray.Builder(Int32Type.Default).AppendNull().Build();
+ var second = new
ListViewArray.Builder(Int32Type.Default).AppendNull().Build();
+
+ AssertConcatenateNullOnlyListViewArraysKeepsChild(first, second);
+ }
+
+ [Fact]
+ public void TestConcatenateNullOnlyLargeListViewArraysKeepsEmptyChild()
+ {
+ var first = new
LargeListViewArray.Builder(Int32Type.Default).AppendNull().Build();
+ var second = new
LargeListViewArray.Builder(Int32Type.Default).AppendNull().Build();
+
+ AssertConcatenateNullOnlyListViewArraysKeepsChild(first, second);
+ }
+
+ [Fact]
+ public void
TestConcatenateSlicedNullOnlyListViewArraysNormalizesOffsets()
+ {
+ var builder = new ListViewArray.Builder(Int32Type.Default);
+ var valueBuilder = (Int32Array.Builder)builder.ValueBuilder;
+ builder.Append();
+ valueBuilder.Append(42);
+ builder.AppendNull();
+ var sliced = ArrowArrayFactory.Slice(builder.Build(), 1, 1);
+
+ var result =
Assert.IsType<ListViewArray>(ArrowArrayConcatenator.Concatenate(new[] { sliced,
sliced }));
+
+ Assert.Equal(new[] { 0, 0 }, result.ValueOffsets.ToArray());
+ Assert.Equal(new[] { 0, 0 }, result.Sizes.ToArray());
+ Assert.Equal(0, result.Data.Children[0].Length);
+ }
+
+ [Fact]
+ public void
TestConcatenateSlicedNullOnlyLargeListViewArraysNormalizesOffsets()
+ {
+ var builder = new LargeListViewArray.Builder(Int32Type.Default);
+ var valueBuilder = (Int32Array.Builder)builder.ValueBuilder;
+ builder.Append();
+ valueBuilder.Append(42);
+ builder.AppendNull();
+ var sliced = ArrowArrayFactory.Slice(builder.Build(), 1, 1);
+
+ var result =
Assert.IsType<LargeListViewArray>(ArrowArrayConcatenator.Concatenate(new[] {
sliced, sliced }));
+
+ Assert.Equal(new[] { 0L, 0L }, result.ValueOffsets.ToArray());
+ Assert.Equal(new[] { 0L, 0L }, result.Sizes.ToArray());
+ Assert.Equal(0, result.Data.Children[0].Length);
+ }
+
+ [Fact]
+ public void TestConcatenateSlicedEmptyListViewArraysNormalizesOffsets()
+ {
+ var builder = new ListViewArray.Builder(Int32Type.Default);
+ var valueBuilder = (Int32Array.Builder)builder.ValueBuilder;
+ builder.Append();
+ valueBuilder.Append(42);
+ builder.Append();
+ var sliced = ArrowArrayFactory.Slice(builder.Build(), 1, 1);
+
+ var result =
Assert.IsType<ListViewArray>(ArrowArrayConcatenator.Concatenate(new[] { sliced,
sliced }));
+
+ Assert.Equal(new[] { 0, 0 }, result.ValueOffsets.ToArray());
+ Assert.Equal(new[] { 0, 0 }, result.Sizes.ToArray());
+ Assert.Equal(0, result.NullCount);
+ Assert.Equal(0, result.Data.Children[0].Length);
+ }
+
+ [Fact]
+ public void
TestConcatenateSlicedEmptyLargeListViewArraysNormalizesOffsets()
+ {
+ var builder = new LargeListViewArray.Builder(Int32Type.Default);
+ var valueBuilder = (Int32Array.Builder)builder.ValueBuilder;
+ builder.Append();
+ valueBuilder.Append(42);
+ builder.Append();
+ var sliced = ArrowArrayFactory.Slice(builder.Build(), 1, 1);
+
+ var result =
Assert.IsType<LargeListViewArray>(ArrowArrayConcatenator.Concatenate(new[] {
sliced, sliced }));
+
+ Assert.Equal(new[] { 0L, 0L }, result.ValueOffsets.ToArray());
+ Assert.Equal(new[] { 0L, 0L }, result.Sizes.ToArray());
+ Assert.Equal(0, result.NullCount);
+ Assert.Equal(0, result.Data.Children[0].Length);
+ }
+
+ [Fact]
+ public void
TestConcatenateListViewZeroSizeSliceAfterNonEmptyUsesBaseOffset()
+ {
+ var nonEmptyBuilder = new ListViewArray.Builder(Int32Type.Default);
+ var nonEmptyValueBuilder =
(Int32Array.Builder)nonEmptyBuilder.ValueBuilder;
+ nonEmptyBuilder.Append();
+ nonEmptyValueBuilder.Append(7);
+ var nonEmpty = nonEmptyBuilder.Build();
+
+ var emptyBuilder = new ListViewArray.Builder(Int32Type.Default);
+ var emptyValueBuilder =
(Int32Array.Builder)emptyBuilder.ValueBuilder;
+ emptyBuilder.Append();
+ emptyValueBuilder.Append(42);
+ emptyBuilder.Append();
+ var sliced = ArrowArrayFactory.Slice(emptyBuilder.Build(), 1, 1);
+
+ var result =
Assert.IsType<ListViewArray>(ArrowArrayConcatenator.Concatenate(new[] {
nonEmpty, sliced, sliced }));
+
+ Assert.Equal(new[] { 0, 1, 1 }, result.ValueOffsets.ToArray());
+ Assert.Equal(new[] { 1, 0, 0 }, result.Sizes.ToArray());
+ Assert.Equal(1, result.Data.Children[0].Length);
+ }
+
+ [Fact]
+ public void
TestConcatenateLargeListViewZeroSizeSliceAfterNonEmptyUsesBaseOffset()
+ {
+ var nonEmptyBuilder = new
LargeListViewArray.Builder(Int32Type.Default);
+ var nonEmptyValueBuilder =
(Int32Array.Builder)nonEmptyBuilder.ValueBuilder;
+ nonEmptyBuilder.Append();
+ nonEmptyValueBuilder.Append(7);
+ var nonEmpty = nonEmptyBuilder.Build();
+
+ var emptyBuilder = new
LargeListViewArray.Builder(Int32Type.Default);
+ var emptyValueBuilder =
(Int32Array.Builder)emptyBuilder.ValueBuilder;
+ emptyBuilder.Append();
+ emptyValueBuilder.Append(42);
+ emptyBuilder.Append();
+ var sliced = ArrowArrayFactory.Slice(emptyBuilder.Build(), 1, 1);
+
+ var result =
Assert.IsType<LargeListViewArray>(ArrowArrayConcatenator.Concatenate(new[] {
nonEmpty, sliced, sliced }));
+
+ Assert.Equal(new[] { 0L, 1L, 1L }, result.ValueOffsets.ToArray());
+ Assert.Equal(new[] { 1L, 0L, 0L }, result.Sizes.ToArray());
+ Assert.Equal(1, result.Data.Children[0].Length);
+ }
+
+ private static void
AssertConcatenateAllEmptyNestedArraysKeepsChild(IArrowArray first, IArrowArray
second)
+ {
+ IArrowArray result = ArrowArrayConcatenator.Concatenate(new[] {
first, second });
+
+ Assert.Equal(0, result.Length);
+ Assert.NotNull(result.Data.Children);
+ Assert.Single(result.Data.Children);
+ Assert.NotNull(result.Data.Children[0]);
+ Assert.Equal(0, result.Data.Children[0].Length);
+ }
+
+ private static void
AssertConcatenateNullOnlyListViewArraysKeepsChild(IArrowArray first,
IArrowArray second)
+ {
+ IArrowArray result = ArrowArrayConcatenator.Concatenate(new[] {
first, second });
+
+ Assert.Equal(2, result.Length);
+ Assert.Equal(2, result.NullCount);
+ Assert.NotNull(result.Data.Children);
+ Assert.Single(result.Data.Children);
+ Assert.NotNull(result.Data.Children[0]);
+ Assert.Equal(0, result.Data.Children[0].Length);
+ }
+
[Fact]
public void TestRunEndEncodedInt32RunEnds()
{