This is an automated email from the ASF dual-hosted git repository.
curth pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-dotnet.git
The following commit(s) were added to refs/heads/main by this push:
new c0f957a feat: Add support for Run-End Encoded arrays (#308)
c0f957a is described below
commit c0f957ad48495aaf82e8f3be473e7815f762e8b9
Author: Curt Hagenlocher <[email protected]>
AuthorDate: Wed Apr 8 04:54:15 2026 -0700
feat: Add support for Run-End Encoded arrays (#308)
## What's Changed
This PR adds basic support for Run-End Encoded arrays by following
established codebase patterns.
Notably:
- New `ArrowTypeId` added.
- New array type `RunEndEncodedArray` added.
- New visitor method to handle the new array type.
- New entry in the IPC serializer field type switch.
- New `RunEndEncodedType` nested type.
- Basic feature tests.
- C API support.
- Concatenation support.
Co-authored-by: Jorge Candeias <[email protected]>
Supersedes #260
---------
Co-authored-by: Jorge Candeias <[email protected]>
---
src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs | 206 +++++++++-
src/Apache.Arrow/Arrays/ArrowArrayFactory.cs | 2 +
src/Apache.Arrow/Arrays/RunEndEncodedArray.cs | 399 +++++++++++++++++++
src/Apache.Arrow/C/CArrowArrayImporter.cs | 5 +
src/Apache.Arrow/C/CArrowSchemaExporter.cs | 1 +
src/Apache.Arrow/C/CArrowSchemaImporter.cs | 9 +
src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs | 12 +
src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 54 ++-
src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs | 9 +
src/Apache.Arrow/Ipc/MessageSerializer.cs | 6 +
src/Apache.Arrow/Types/IArrowType.cs | 1 +
src/Apache.Arrow/Types/RunEndEncodedType.cs | 84 ++++
test/Apache.Arrow.IntegrationTest/JsonFile.cs | 28 ++
.../ArrowArrayConcatenatorTests.cs | 175 +++++++++
test/Apache.Arrow.Tests/ArrowReaderVerifier.cs | 20 +
test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs | 433 +++++++++++++++++++++
test/Apache.Arrow.Tests/TableTests.cs | 4 +-
test/Apache.Arrow.Tests/TestData.cs | 68 ++++
18 files changed, 1512 insertions(+), 4 deletions(-)
diff --git a/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
b/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
index 68f9fff..6cd1caa 100644
--- a/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
+++ b/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
@@ -59,7 +59,8 @@ namespace Apache.Arrow
IArrowTypeVisitor<LargeStringType>,
IArrowTypeVisitor<LargeListType>,
IArrowTypeVisitor<LargeListViewType>,
- IArrowTypeVisitor<MapType>
+ IArrowTypeVisitor<MapType>,
+ IArrowTypeVisitor<RunEndEncodedType>
{
public ArrayData Result { get; private set; }
private readonly IReadOnlyList<ArrayData> _arrayDataList;
@@ -286,6 +287,209 @@ namespace Apache.Arrow
public void Visit(MapType type) =>
ConcatenateLists(type.UnsortedKey()); /* Can't tell if the output is still
sorted */
+ public void Visit(RunEndEncodedType type)
+ {
+ ArrowTypeId runEndsTypeId = type.RunEndsDataType.TypeId;
+ if (runEndsTypeId != ArrowTypeId.Int16 &&
+ runEndsTypeId != ArrowTypeId.Int32 &&
+ runEndsTypeId != ArrowTypeId.Int64)
+ {
+ throw new InvalidOperationException(
+ $"Run-ends array must be Int16, Int32, or Int64, but
got {runEndsTypeId}");
+ }
+
+ var slicedValues = new List<ArrayData>(_arrayDataList.Count);
+ ArrowBuffer.Builder<short> int16Builder = null;
+ ArrowBuffer.Builder<int> int32Builder = null;
+ ArrowBuffer.Builder<long> int64Builder = null;
+
+ switch (runEndsTypeId)
+ {
+ case ArrowTypeId.Int16: int16Builder = new
ArrowBuffer.Builder<short>(); break;
+ case ArrowTypeId.Int32: int32Builder = new
ArrowBuffer.Builder<int>(); break;
+ case ArrowTypeId.Int64: int64Builder = new
ArrowBuffer.Builder<long>(); break;
+ }
+
+ long baseOffset = 0;
+ int physicalRunCount = 0;
+
+ foreach (ArrayData arrayData in _arrayDataList)
+ {
+ arrayData.EnsureDataType(type.TypeId);
+
+ ArrayData runEndsData = arrayData.Children[0];
+ ArrayData valuesData = arrayData.Children[1];
+
+ if (runEndsData.DataType.TypeId != runEndsTypeId)
+ {
+ throw new ArgumentException(
+ $"All run-end encoded arrays must have the same
run-ends type. Expected <{runEndsTypeId}> but got
<{runEndsData.DataType.TypeId}>.");
+ }
+ if (valuesData.DataType.TypeId !=
type.ValuesDataType.TypeId)
+ {
+ throw new ArgumentException(
+ $"All run-end encoded arrays must have the same
values type. Expected <{type.ValuesDataType.TypeId}> but got
<{valuesData.DataType.TypeId}>.");
+ }
+
+ if (arrayData.Length == 0)
+ {
+ continue;
+ }
+
+ int logicalOffset = arrayData.Offset;
+ int logicalLength = arrayData.Length;
+ int logicalEnd = logicalOffset + logicalLength;
+
+ int physicalStart;
+ int physicalEndExclusive;
+
+ switch (runEndsTypeId)
+ {
+ case ArrowTypeId.Int16:
+ {
+ ReadOnlySpan<short> re =
runEndsData.Buffers[1].Span.CastTo<short>()
+ .Slice(runEndsData.Offset,
runEndsData.Length);
+ physicalStart = FindPhysicalStartInt16(re,
logicalOffset);
+ physicalEndExclusive =
FindPhysicalEndExclusiveInt16(re, logicalEnd, physicalStart);
+ for (int p = physicalStart; p <
physicalEndExclusive; p++)
+ {
+ int adjustedEnd = Math.Min((int)re[p],
logicalEnd) - logicalOffset;
+
int16Builder.Append(checked((short)(baseOffset + adjustedEnd)));
+ }
+ break;
+ }
+ case ArrowTypeId.Int32:
+ {
+ ReadOnlySpan<int> re =
runEndsData.Buffers[1].Span.CastTo<int>()
+ .Slice(runEndsData.Offset,
runEndsData.Length);
+ physicalStart = FindPhysicalStartInt32(re,
logicalOffset);
+ physicalEndExclusive =
FindPhysicalEndExclusiveInt32(re, logicalEnd, physicalStart);
+ for (int p = physicalStart; p <
physicalEndExclusive; p++)
+ {
+ int adjustedEnd = Math.Min(re[p],
logicalEnd) - logicalOffset;
+
int32Builder.Append(checked((int)(baseOffset + adjustedEnd)));
+ }
+ break;
+ }
+ default: // Int64
+ {
+ ReadOnlySpan<long> re =
runEndsData.Buffers[1].Span.CastTo<long>()
+ .Slice(runEndsData.Offset,
runEndsData.Length);
+ physicalStart = FindPhysicalStartInt64(re,
logicalOffset);
+ physicalEndExclusive =
FindPhysicalEndExclusiveInt64(re, logicalEnd, physicalStart);
+ for (int p = physicalStart; p <
physicalEndExclusive; p++)
+ {
+ long adjustedEnd = Math.Min(re[p],
(long)logicalEnd) - logicalOffset;
+ int64Builder.Append(baseOffset +
adjustedEnd);
+ }
+ break;
+ }
+ }
+
+ int physicalCount = physicalEndExclusive - physicalStart;
+ physicalRunCount += physicalCount;
+ slicedValues.Add(valuesData.Slice(physicalStart,
physicalCount));
+ baseOffset += logicalLength;
+ }
+
+ ArrowBuffer runEndsValueBuffer = runEndsTypeId switch
+ {
+ ArrowTypeId.Int16 => int16Builder.Build(_allocator),
+ ArrowTypeId.Int32 => int32Builder.Build(_allocator),
+ _ => int64Builder.Build(_allocator),
+ };
+
+ ArrayData runEndsResult = new ArrayData(
+ type.RunEndsDataType, physicalRunCount, 0, 0,
+ new[] { ArrowBuffer.Empty, runEndsValueBuffer });
+
+ ArrayData valuesResult;
+ if (slicedValues.Count == 0)
+ {
+ // All inputs were empty. Reuse the first input's values
child sliced to length
+ // 0 so we get a valid ArrayData with the correct
buffer/child layout for the
+ // values type, regardless of what that type is.
+ valuesResult = _arrayDataList[0].Children[1].Slice(0, 0);
+ }
+ else
+ {
+ valuesResult = Concatenate(slicedValues, _allocator);
+ }
+
+ Result = new ArrayData(
+ type, _totalLength, 0, 0,
+ System.Array.Empty<ArrowBuffer>(),
+ new[] { runEndsResult, valuesResult });
+ }
+
+ private static int FindPhysicalStartInt16(ReadOnlySpan<short>
runEnds, int logicalOffset)
+ {
+ // Smallest physical index p where runEnds[p] > logicalOffset
+ int lo = 0, hi = runEnds.Length;
+ while (lo < hi)
+ {
+ int mid = lo + (hi - lo) / 2;
+ if (runEnds[mid] > logicalOffset) hi = mid; else lo = mid
+ 1;
+ }
+ return lo;
+ }
+
+ private static int
FindPhysicalEndExclusiveInt16(ReadOnlySpan<short> runEnds, int logicalEnd, int
physicalStart)
+ {
+ // Smallest physical index p (>= physicalStart) where
runEnds[p] >= logicalEnd, then p+1
+ int lo = physicalStart, hi = runEnds.Length;
+ while (lo < hi)
+ {
+ int mid = lo + (hi - lo) / 2;
+ if (runEnds[mid] >= logicalEnd) hi = mid; else lo = mid +
1;
+ }
+ return Math.Min(lo + 1, runEnds.Length);
+ }
+
+ private static int FindPhysicalStartInt32(ReadOnlySpan<int>
runEnds, int logicalOffset)
+ {
+ int lo = 0, hi = runEnds.Length;
+ while (lo < hi)
+ {
+ int mid = lo + (hi - lo) / 2;
+ if (runEnds[mid] > logicalOffset) hi = mid; else lo = mid
+ 1;
+ }
+ return lo;
+ }
+
+ private static int FindPhysicalEndExclusiveInt32(ReadOnlySpan<int>
runEnds, int logicalEnd, int physicalStart)
+ {
+ int lo = physicalStart, hi = runEnds.Length;
+ while (lo < hi)
+ {
+ int mid = lo + (hi - lo) / 2;
+ if (runEnds[mid] >= logicalEnd) hi = mid; else lo = mid +
1;
+ }
+ return Math.Min(lo + 1, runEnds.Length);
+ }
+
+ private static int FindPhysicalStartInt64(ReadOnlySpan<long>
runEnds, int logicalOffset)
+ {
+ int lo = 0, hi = runEnds.Length;
+ while (lo < hi)
+ {
+ int mid = lo + (hi - lo) / 2;
+ if (runEnds[mid] > logicalOffset) hi = mid; else lo = mid
+ 1;
+ }
+ return lo;
+ }
+
+ private static int
FindPhysicalEndExclusiveInt64(ReadOnlySpan<long> runEnds, int logicalEnd, int
physicalStart)
+ {
+ int lo = physicalStart, hi = runEnds.Length;
+ while (lo < hi)
+ {
+ int mid = lo + (hi - lo) / 2;
+ if (runEnds[mid] >= logicalEnd) hi = mid; else lo = mid +
1;
+ }
+ return Math.Min(lo + 1, runEnds.Length);
+ }
+
public void Visit(IArrowType type)
{
throw new NotImplementedException($"Concatenation for
{type.Name} is not supported yet.");
diff --git a/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs
b/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs
index 0f81336..7c8a32e 100644
--- a/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs
+++ b/src/Apache.Arrow/Arrays/ArrowArrayFactory.cs
@@ -114,6 +114,8 @@ namespace Apache.Arrow
var storageData = new ArrayData(extType.StorageType,
data.Length, data.NullCount, data.Offset, data.Buffers, data.Children,
data.Dictionary);
IArrowArray storageArray = BuildArray(storageData);
return extType.CreateArray(storageArray);
+ case ArrowTypeId.RunEndEncoded:
+ return new RunEndEncodedArray(data);
default:
throw new NotSupportedException($"An ArrowArray cannot be
built for type {data.DataType.TypeId}.");
}
diff --git a/src/Apache.Arrow/Arrays/RunEndEncodedArray.cs
b/src/Apache.Arrow/Arrays/RunEndEncodedArray.cs
new file mode 100644
index 0000000..290c3a0
--- /dev/null
+++ b/src/Apache.Arrow/Arrays/RunEndEncodedArray.cs
@@ -0,0 +1,399 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using Apache.Arrow.Memory;
+using Apache.Arrow.Types;
+
+namespace Apache.Arrow;
+
+/// <summary>
+/// Represents a run-end encoded array.
+/// A run-end encoded array stores consecutive runs of the same value more
efficiently.
+/// It contains two child arrays: run_ends (Int16/Int32/Int64) and values (any
type).
+/// The run_ends array stores the cumulative end positions of each run.
+/// </summary>
+public class RunEndEncodedArray : Array
+{
+ /// <summary>
+ /// Gets the run ends array (Int16Array, Int32Array, or Int64Array).
+ /// This array contains the cumulative end indices for each run.
+ /// </summary>
+ public IArrowArray RunEnds { get; }
+
+ /// <summary>
+ /// Gets the values array.
+ /// This array contains the actual values that are run-length encoded.
+ /// </summary>
+ public IArrowArray Values { get; }
+
+ /// <summary>
+ /// Creates a new RunEndEncodedArray from ArrayData.
+ /// </summary>
+ /// <param name="data">The array data containing run ends and values as
children.</param>
+ public RunEndEncodedArray(ArrayData data)
+ : this(data, ArrowArrayFactory.BuildArray(data.Children[0]),
ArrowArrayFactory.BuildArray(data.Children[1]))
+ {
+ }
+
+ /// <summary>
+ /// Creates a new RunEndEncodedArray with specified run ends and values
arrays.
+ /// </summary>
+ /// <param name="runEnds">The run ends array (must be Int16Array,
Int32Array, or Int64Array).</param>
+ /// <param name="values">The values array (can be any type).</param>
+ public RunEndEncodedArray(IArrowArray runEnds, IArrowArray values)
+ : this(CreateArrayData(runEnds, values), runEnds, values)
+ {
+ }
+
+ private RunEndEncodedArray(ArrayData data, IArrowArray runEnds,
IArrowArray values)
+ : base(data)
+ {
+ data.EnsureBufferCount(0); // REE arrays have no buffers, only children
+ data.EnsureDataType(ArrowTypeId.RunEndEncoded);
+
+ if (data.NullCount != 0)
+ {
+ throw new ArgumentException(
+ $"Run-end encoded arrays have no top-level validity bitmap and
must report null count 0, but got {data.NullCount}.",
+ nameof(data));
+ }
+
+ ValidateRunEndsType(runEnds);
+
+ if (runEnds.Length != values.Length)
+ {
+ throw new ArgumentException(
+ $"Run ends array length ({runEnds.Length}) must equal values
array length ({values.Length}).");
+ }
+ if (runEnds.NullCount != 0)
+ {
+ throw new ArgumentException(
+ $"Run ends array must not contain nulls, but had
{runEnds.NullCount} null(s).",
+ nameof(runEnds));
+ }
+
+ RunEnds = runEnds;
+ Values = values;
+ }
+
+ private static ArrayData CreateArrayData(IArrowArray runEnds, IArrowArray
values)
+ {
+ ValidateRunEndsType(runEnds);
+
+ if (runEnds.Length != values.Length)
+ {
+ throw new ArgumentException(
+ $"Run ends array length ({runEnds.Length}) must equal values
array length ({values.Length}).");
+ }
+ if (runEnds.NullCount != 0)
+ {
+ throw new ArgumentException(
+ $"Run ends array must not contain nulls, but had
{runEnds.NullCount} null(s).",
+ nameof(runEnds));
+ }
+
+ // The logical length of a REE array is determined by the last value
in run_ends
+ int logicalLength = GetLogicalLength(runEnds);
+
+ var dataType = new RunEndEncodedType(runEnds.Data.DataType,
values.Data.DataType);
+
+ return new ArrayData(
+ dataType,
+ logicalLength,
+ nullCount: 0, // REE arrays don't have a validity bitmap
+ offset: 0,
+ buffers: [],
+ children: [runEnds.Data, values.Data]);
+ }
+
+ private static void ValidateRunEndsType(IArrowArray runEnds)
+ {
+ ArrowTypeId typeId = runEnds.Data.DataType.TypeId;
+ if (typeId != ArrowTypeId.Int16 &&
+ typeId != ArrowTypeId.Int32 &&
+ typeId != ArrowTypeId.Int64)
+ {
+ throw new ArgumentException(
+ $"Run ends array must be Int16, Int32, or Int64, but got
{typeId}",
+ nameof(runEnds));
+ }
+ }
+
+ private static int GetLogicalLength(IArrowArray runEnds)
+ {
+ if (runEnds.Length == 0)
+ {
+ return 0;
+ }
+
+ // Get the last run end value which represents the logical length
+ switch (runEnds)
+ {
+ case Int16Array int16Array:
+ return int16Array.GetValue(int16Array.Length - 1) ?? throw new
ArgumentException("invalid length");
+ case Int32Array int32Array:
+ return int32Array.GetValue(int32Array.Length - 1) ?? throw new
ArgumentException("invalid length");
+ case Int64Array int64Array:
+ {
+ long? lastValue = int64Array.GetValue(int64Array.Length -
1);
+ if (lastValue.HasValue && lastValue.Value > int.MaxValue)
+ {
+ throw new ArgumentException("Run ends value exceeds
maximum supported length.");
+ }
+ return (int)(lastValue ?? throw new
ArgumentException("invalid length"));
+ }
+ default:
+ throw new InvalidOperationException($"Unexpected run ends
array type: {runEnds.Data.DataType.TypeId}");
+ }
+ }
+
+ /// <summary>
+ /// Finds the physical index in the run_ends array that contains the
specified logical index.
+ /// </summary>
+ /// <param name="logicalIndex">The logical index in the decoded
array.</param>
+ /// <returns>The physical index in the run_ends/values arrays.</returns>
+ public int FindPhysicalIndex(int logicalIndex)
+ {
+ if (logicalIndex < 0 || logicalIndex >= Length)
+ {
+ throw new ArgumentOutOfRangeException(nameof(logicalIndex));
+ }
+
+ // Run ends are stored as cumulative positions in the underlying
physical array,
+ // so the search target must be expressed in those same coordinates by
adding
+ // the slice's logical offset.
+ int searchIndex = logicalIndex + Data.Offset;
+
+ // Binary search to find the run that contains this logical index
+ return RunEnds switch
+ {
+ Int16Array int16Array => BinarySearchRunEnds(int16Array,
searchIndex),
+ Int32Array int32Array => BinarySearchRunEnds(int32Array,
searchIndex),
+ Int64Array int64Array => BinarySearchRunEnds(int64Array,
searchIndex),
+ _ => throw new InvalidOperationException($"Unexpected run ends
array type: {RunEnds.GetType()}"),
+ };
+ }
+
+ private static int BinarySearchRunEnds(Int16Array runEnds, int
logicalIndex)
+ {
+ int left = 0;
+ int right = runEnds.Length - 1;
+
+ while (left < right)
+ {
+ int mid = left + (right - left) / 2;
+ int runEnd = runEnds.GetValue(mid) ?? throw new
ArgumentException("invalid length");
+
+ if (logicalIndex < runEnd)
+ {
+ right = mid;
+ }
+ else
+ {
+ left = mid + 1;
+ }
+ }
+
+ return left;
+ }
+
+ private static int BinarySearchRunEnds(Int32Array runEnds, int
logicalIndex)
+ {
+ int left = 0;
+ int right = runEnds.Length - 1;
+
+ while (left < right)
+ {
+ int mid = left + (right - left) / 2;
+ int runEnd = runEnds.GetValue(mid) ?? throw new
ArgumentException("invalid length");
+
+ if (logicalIndex < runEnd)
+ {
+ right = mid;
+ }
+ else
+ {
+ left = mid + 1;
+ }
+ }
+
+ return left;
+ }
+
+ private static int BinarySearchRunEnds(Int64Array runEnds, int
logicalIndex)
+ {
+ int left = 0;
+ int right = runEnds.Length - 1;
+
+ while (left < right)
+ {
+ int mid = left + (right - left) / 2;
+ long runEnd = runEnds.GetValue(mid) ?? throw new
ArgumentException("invalid length");
+
+ if (logicalIndex < runEnd)
+ {
+ right = mid;
+ }
+ else
+ {
+ left = mid + 1;
+ }
+ }
+
+ return left;
+ }
+
+ /// <summary>
+ /// Returns a logically equivalent <see cref="RunEndEncodedArray"/> whose
underlying
+ /// children have been normalized to the slice range:
+ /// <list type="bullet">
+ /// <item><description><c>Offset</c> is 0,</description></item>
+ /// <item><description>the run-ends array contains only the runs
covering this slice,
+ /// with values shifted so that the last run-end equals the slice
<c>Length</c>,</description></item>
+ /// <item><description>the values array is sliced to the corresponding
physical range.</description></item>
+ /// </list>
+ /// <para>
+ /// The returned array is independently disposable: it owns its own
references to the
+ /// underlying buffers via reference counting, so it remains valid even if
this
+ /// instance is later disposed. Callers are responsible for disposing the
returned
+ /// array when done with it. The result may or may not be the same
instance as
+ /// <c>this</c>; either way, it carries an independent reference.
+ /// </para>
+ /// </summary>
+ public RunEndEncodedArray Normalize()
+ {
+ if (Offset == 0 && GetLogicalLength(RunEnds) == Length)
+ {
+ // Already normalized — return an independently-owned reference so
the caller
+ // can dispose the result without affecting this instance.
+ return (RunEndEncodedArray)ArrowArrayFactory.SliceShared(this, 0,
Length);
+ }
+ if (Length == 0)
+ {
+ return new RunEndEncodedArray(
+ ArrowArrayFactory.SliceShared(RunEnds, 0, 0),
+ ArrowArrayFactory.SliceShared(Values, 0, 0));
+ }
+
+ int logicalEnd = Offset + Length;
+ int physicalStart;
+ int physicalEndExclusive;
+ IArrowArray normalizedRunEnds;
+
+ switch (RunEnds)
+ {
+ case Int16Array re16:
+ {
+ ReadOnlySpan<short> re = re16.Values;
+ physicalStart = FindNormalizePhysicalStart(re, Offset);
+ physicalEndExclusive =
FindNormalizePhysicalEndExclusive(re, logicalEnd, physicalStart);
+ int count = physicalEndExclusive - physicalStart;
+
+ using var native = new NativeBuffer<short,
NoOpAllocationTracker>(count, zeroFill: false);
+ Span<short> dst = native.Span;
+ for (int p = 0; p < count; p++)
+ {
+ int adjusted = Math.Min((int)re[physicalStart + p],
logicalEnd) - Offset;
+ dst[p] = checked((short)adjusted);
+ }
+ normalizedRunEnds = new Int16Array(native.Build(),
ArrowBuffer.Empty, count, 0, 0);
+ break;
+ }
+ case Int32Array re32:
+ {
+ ReadOnlySpan<int> re = re32.Values;
+ physicalStart = FindNormalizePhysicalStart(re, Offset);
+ physicalEndExclusive =
FindNormalizePhysicalEndExclusive(re, logicalEnd, physicalStart);
+ int count = physicalEndExclusive - physicalStart;
+
+ using var native = new NativeBuffer<int,
NoOpAllocationTracker>(count, zeroFill: false);
+ Span<int> dst = native.Span;
+ for (int p = 0; p < count; p++)
+ {
+ dst[p] = Math.Min(re[physicalStart + p], logicalEnd) -
Offset;
+ }
+ normalizedRunEnds = new Int32Array(native.Build(),
ArrowBuffer.Empty, count, 0, 0);
+ break;
+ }
+ case Int64Array re64:
+ {
+ ReadOnlySpan<long> re = re64.Values;
+ physicalStart = FindNormalizePhysicalStart(re, Offset);
+ physicalEndExclusive =
FindNormalizePhysicalEndExclusive(re, logicalEnd, physicalStart);
+ int count = physicalEndExclusive - physicalStart;
+
+ using var native = new NativeBuffer<long,
NoOpAllocationTracker>(count, zeroFill: false);
+ Span<long> dst = native.Span;
+ for (int p = 0; p < count; p++)
+ {
+ dst[p] = Math.Min(re[physicalStart + p],
(long)logicalEnd) - Offset;
+ }
+ normalizedRunEnds = new Int64Array(native.Build(),
ArrowBuffer.Empty, count, 0, 0);
+ break;
+ }
+ default:
+ throw new InvalidOperationException($"Unexpected run-ends
array type: {RunEnds.Data.DataType.TypeId}");
+ }
+
+ int physicalCount = physicalEndExclusive - physicalStart;
+ IArrowArray normalizedValues = ArrowArrayFactory.SliceShared(Values,
physicalStart, physicalCount);
+ return new RunEndEncodedArray(normalizedRunEnds, normalizedValues);
+ }
+
+ private static int FindNormalizePhysicalStart(ReadOnlySpan<short> runEnds,
int logicalOffset)
+ {
+ int lo = 0, hi = runEnds.Length;
+ while (lo < hi) { int mid = lo + (hi - lo) / 2; if (runEnds[mid] >
logicalOffset) hi = mid; else lo = mid + 1; }
+ return lo;
+ }
+
+ private static int FindNormalizePhysicalEndExclusive(ReadOnlySpan<short>
runEnds, int logicalEnd, int physicalStart)
+ {
+ int lo = physicalStart, hi = runEnds.Length;
+ while (lo < hi) { int mid = lo + (hi - lo) / 2; if (runEnds[mid] >=
logicalEnd) hi = mid; else lo = mid + 1; }
+ return Math.Min(lo + 1, runEnds.Length);
+ }
+
+ private static int FindNormalizePhysicalStart(ReadOnlySpan<int> runEnds,
int logicalOffset)
+ {
+ int lo = 0, hi = runEnds.Length;
+ while (lo < hi) { int mid = lo + (hi - lo) / 2; if (runEnds[mid] >
logicalOffset) hi = mid; else lo = mid + 1; }
+ return lo;
+ }
+
+ private static int FindNormalizePhysicalEndExclusive(ReadOnlySpan<int>
runEnds, int logicalEnd, int physicalStart)
+ {
+ int lo = physicalStart, hi = runEnds.Length;
+ while (lo < hi) { int mid = lo + (hi - lo) / 2; if (runEnds[mid] >=
logicalEnd) hi = mid; else lo = mid + 1; }
+ return Math.Min(lo + 1, runEnds.Length);
+ }
+
+ private static int FindNormalizePhysicalStart(ReadOnlySpan<long> runEnds,
int logicalOffset)
+ {
+ int lo = 0, hi = runEnds.Length;
+ while (lo < hi) { int mid = lo + (hi - lo) / 2; if (runEnds[mid] >
logicalOffset) hi = mid; else lo = mid + 1; }
+ return lo;
+ }
+
+ private static int FindNormalizePhysicalEndExclusive(ReadOnlySpan<long>
runEnds, int logicalEnd, int physicalStart)
+ {
+ int lo = physicalStart, hi = runEnds.Length;
+ while (lo < hi) { int mid = lo + (hi - lo) / 2; if (runEnds[mid] >=
logicalEnd) hi = mid; else lo = mid + 1; }
+ return Math.Min(lo + 1, runEnds.Length);
+ }
+
+ public override void Accept(IArrowArrayVisitor visitor) => Accept(this,
visitor);
+}
diff --git a/src/Apache.Arrow/C/CArrowArrayImporter.cs
b/src/Apache.Arrow/C/CArrowArrayImporter.cs
index e87f3ca..d519df6 100644
--- a/src/Apache.Arrow/C/CArrowArrayImporter.cs
+++ b/src/Apache.Arrow/C/CArrowArrayImporter.cs
@@ -206,6 +206,11 @@ namespace Apache.Arrow.C
children = ProcessListChildren(cArray,
mapType.Fields[0].DataType);
buffers = ImportListBuffers(cArray);
break;
+ case ArrowTypeId.RunEndEncoded:
+ RunEndEncodedType reeType =
(RunEndEncodedType)storageType;
+ children = ProcessStructChildren(cArray,
reeType.Fields);
+ buffers = System.Array.Empty<ArrowBuffer>();
+ break;
case ArrowTypeId.Null:
buffers = System.Array.Empty<ArrowBuffer>();
break;
diff --git a/src/Apache.Arrow/C/CArrowSchemaExporter.cs
b/src/Apache.Arrow/C/CArrowSchemaExporter.cs
index 3206642..acc44e5 100644
--- a/src/Apache.Arrow/C/CArrowSchemaExporter.cs
+++ b/src/Apache.Arrow/C/CArrowSchemaExporter.cs
@@ -237,6 +237,7 @@ namespace Apache.Arrow.C
case StructType _: return "+s";
case UnionType u: return FormatUnion(u);
case MapType _: return "+m";
+ case RunEndEncodedType _: return "+r";
// Dictionary
case DictionaryType dictionaryType:
return GetFormat(dictionaryType.IndexType);
diff --git a/src/Apache.Arrow/C/CArrowSchemaImporter.cs
b/src/Apache.Arrow/C/CArrowSchemaImporter.cs
index 706890b..c9559ba 100644
--- a/src/Apache.Arrow/C/CArrowSchemaImporter.cs
+++ b/src/Apache.Arrow/C/CArrowSchemaImporter.cs
@@ -239,6 +239,15 @@ namespace Apache.Arrow.C
ParseChildren("map").Single(),
(_cSchema->flags &
CArrowSchema.ArrowFlagMapKeysSorted) != 0);
}
+ else if (format == "+r")
+ {
+ if (_cSchema->n_children != 2)
+ {
+ throw new InvalidDataException("Expected run-end
encoded type to have exactly two children.");
+ }
+ List<Field> reeChildren = ParseChildren("run-end encoded");
+ return new RunEndEncodedType(reeChildren[0],
reeChildren[1]);
+ }
// TODO: Large list type
diff --git a/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs
b/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs
index 28757b9..799a488 100644
--- a/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs
+++ b/src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs
@@ -276,6 +276,18 @@ namespace Apache.Arrow.Ipc
{
case ArrowTypeId.Null:
return new ArrayData(field.DataType, fieldLength,
fieldNullCount, 0, System.Array.Empty<ArrowBuffer>());
+ case ArrowTypeId.RunEndEncoded:
+ if (fieldNullCount != 0)
+ {
+ // REE arrays have no top-level validity bitmap, so
the field node's
+ // null count must be 0. A non-zero value would
produce an ArrayData
+ // whose NullCount != 0 with zero buffers, leading to
IndexOutOfRange
+ // when consumers later access the (nonexistent)
validity bitmap.
+ throw new InvalidDataException(
+ $"Run-end encoded array field node has non-zero
null count ({fieldNullCount}); REE arrays have no top-level validity bitmap and
must report null count 0.");
+ }
+ buffers = 0;
+ break;
case ArrowTypeId.Union:
if (version < MetadataVersion.V5)
{
diff --git a/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
b/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
index 18cc538..1f95c35 100644
--- a/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
+++ b/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
@@ -74,7 +74,9 @@ namespace Apache.Arrow.Ipc
IArrowArrayVisitor<Decimal128Array>,
IArrowArrayVisitor<Decimal256Array>,
IArrowArrayVisitor<DictionaryArray>,
- IArrowArrayVisitor<NullArray>
+ IArrowArrayVisitor<RunEndEncodedArray>,
+ IArrowArrayVisitor<NullArray>,
+ IDisposable
{
public readonly struct FieldNode
{
@@ -108,6 +110,10 @@ namespace Apache.Arrow.Ipc
private readonly ICompressionCodec _compressionCodec;
private readonly MemoryAllocator _allocator;
private readonly MemoryStream _fallbackCompressionStream;
+ // Holds visitor-owned arrays (e.g., normalized REE slices) whose
buffers are
+ // referenced by entries in _buffers. They must outlive
WriteBufferData and be
+ // disposed afterwards.
+ private List<IDisposable> _deferredDisposals;
public IReadOnlyList<FieldNode> FieldNodes => _fieldNodes;
public IReadOnlyList<Buffer> Buffers => _buffers;
@@ -125,6 +131,26 @@ namespace Apache.Arrow.Ipc
TotalLength = 0;
}
+ public void Dispose()
+ {
+ if (_deferredDisposals == null)
+ {
+ return;
+ }
+
+ foreach (IDisposable disposable in _deferredDisposals)
+ {
+ disposable.Dispose();
+ }
+ _deferredDisposals.Clear();
+ }
+
+ private void DeferDisposal(IDisposable disposable)
+ {
+ _deferredDisposals ??= new List<IDisposable>();
+ _deferredDisposals.Add(disposable);
+ }
+
public void VisitArray(IArrowArray array)
{
_fieldNodes.Add(new FieldNode(array.Length, array.NullCount));
@@ -358,6 +384,24 @@ namespace Apache.Arrow.Ipc
array.Indices.Accept(this);
}
+ public void Visit(RunEndEncodedArray array)
+ {
+ // REE arrays have no buffers at the top level, only child
arrays.
+ // The IPC stream format always encodes arrays with offset=0,
so a sliced REE
+ // must be normalized to the slice range before writing —
otherwise the
+ // deserialized array's logical position 0 would map to the
underlying array's
+ // physical run 0 instead of the slice's start.
+ //
+ // Normalize() returns an independently-owned array (via
SliceShared on the
+ // values child). The visitor records ReadOnlyMemory<byte>
references to those
+ // buffers in _buffers, so the normalized array must remain
alive until after
+ // WriteBufferData runs. Defer disposal until then.
+ RunEndEncodedArray normalized = array.Normalize();
+ DeferDisposal(normalized);
+ VisitArray(normalized.RunEnds);
+ VisitArray(normalized.Values);
+ }
+
public void Visit(NullArray array)
{
// There are no buffers for a NullArray
@@ -781,6 +825,8 @@ namespace Apache.Arrow.Ipc
(ArrowRecordBatchFlatBufferBuilder recordBatchBuilder,
VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) =
PrepareWritingRecordBatch(recordBatch);
+ using var builder = recordBatchBuilder;
+
VectorOffset buffersVectorOffset = Builder.EndVector();
// Serialize record batch
@@ -820,6 +866,8 @@ namespace Apache.Arrow.Ipc
(ArrowRecordBatchFlatBufferBuilder recordBatchBuilder,
VectorOffset fieldNodesVectorOffset, VectorOffset variadicCountsOffset) =
PrepareWritingRecordBatch(recordBatch);
+ using var builder = recordBatchBuilder;
+
VectorOffset buffersVectorOffset = Builder.EndVector();
// Serialize record batch
@@ -982,6 +1030,8 @@ namespace Apache.Arrow.Ipc
(ArrowRecordBatchFlatBufferBuilder recordBatchBuilder,
Offset<Flatbuf.DictionaryBatch> dictionaryBatchOffset) =
CreateDictionaryBatchOffset(id, valueType, dictionary);
+ using var builder = recordBatchBuilder;
+
long metadataLength =
WriteMessage(Flatbuf.MessageHeader.DictionaryBatch,
dictionaryBatchOffset, recordBatchBuilder.TotalLength);
@@ -1006,6 +1056,8 @@ namespace Apache.Arrow.Ipc
(ArrowRecordBatchFlatBufferBuilder recordBatchBuilder,
Offset<Flatbuf.DictionaryBatch> dictionaryBatchOffset) =
CreateDictionaryBatchOffset(id, valueType, dictionary);
+ using var builder = recordBatchBuilder;
+
long metadataLength = await
WriteMessageAsync(Flatbuf.MessageHeader.DictionaryBatch,
dictionaryBatchOffset, recordBatchBuilder.TotalLength,
cancellationToken).ConfigureAwait(false);
diff --git a/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs
b/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs
index 893ae63..1f283d9 100644
--- a/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs
+++ b/src/Apache.Arrow/Ipc/ArrowTypeFlatbufferBuilder.cs
@@ -82,6 +82,7 @@ namespace Apache.Arrow.Ipc
IArrowTypeVisitor<DictionaryType>,
IArrowTypeVisitor<FixedSizeBinaryType>,
IArrowTypeVisitor<MapType>,
+ IArrowTypeVisitor<RunEndEncodedType>,
IArrowTypeVisitor<NullType>
{
private FlatBufferBuilder Builder { get; }
@@ -344,6 +345,14 @@ namespace Apache.Arrow.Ipc
Flatbuf.Map.CreateMap(Builder, type.KeySorted));
}
+ public void Visit(RunEndEncodedType type)
+ {
+ Flatbuf.RunEndEncoded.StartRunEndEncoded(Builder);
+ Result = FieldType.Build(
+ Flatbuf.Type.RunEndEncoded,
+ Flatbuf.RunEndEncoded.EndRunEndEncoded(Builder));
+ }
+
public void Visit(NullType type)
{
Flatbuf.Null.StartNull(Builder);
diff --git a/src/Apache.Arrow/Ipc/MessageSerializer.cs
b/src/Apache.Arrow/Ipc/MessageSerializer.cs
index 645e6c8..b455958 100644
--- a/src/Apache.Arrow/Ipc/MessageSerializer.cs
+++ b/src/Apache.Arrow/Ipc/MessageSerializer.cs
@@ -264,6 +264,12 @@ namespace Apache.Arrow.Ipc
}
Flatbuf.Map meta = field.Type<Flatbuf.Map>().Value;
return new Types.MapType(childFields[0], meta.KeysSorted);
+ case Flatbuf.Type.RunEndEncoded:
+ if (childFields == null || childFields.Length != 2)
+ {
+ throw new InvalidDataException($"Run-end encoded type
must have exactly two children (run_ends and values).");
+ }
+ return new Types.RunEndEncodedType(childFields[0],
childFields[1]);
default:
throw new InvalidDataException($"Arrow primitive
'{field.TypeType}' is unsupported.");
}
diff --git a/src/Apache.Arrow/Types/IArrowType.cs b/src/Apache.Arrow/Types/IArrowType.cs
index ef1ec10..1ee95e1 100644
--- a/src/Apache.Arrow/Types/IArrowType.cs
+++ b/src/Apache.Arrow/Types/IArrowType.cs
@@ -60,6 +60,7 @@ namespace Apache.Arrow.Types
Decimal64,
Extension,
LargeListView,
+ RunEndEncoded,
}
public interface IArrowType
diff --git a/src/Apache.Arrow/Types/RunEndEncodedType.cs b/src/Apache.Arrow/Types/RunEndEncodedType.cs
new file mode 100644
index 0000000..5a5cd66
--- /dev/null
+++ b/src/Apache.Arrow/Types/RunEndEncodedType.cs
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+
+namespace Apache.Arrow.Types;
+
+/// <summary>
+/// Represents the run-end encoded (REE) array type.
+/// The type has exactly two child fields, in this order: <c>run_ends</c> and <c>values</c>.
+/// The run ends child must be a 16-, 32- or 64-bit signed integer type; each entry is the
+/// logical index at which the run holding the value at the same position in the values
+/// child ends.
+/// </summary>
+public sealed class RunEndEncodedType : NestedType
+{
+    public override ArrowTypeId TypeId => ArrowTypeId.RunEndEncoded;
+    public override string Name => "run_end_encoded";
+
+    /// <summary>
+    /// Gets the run ends field (must be Int16, Int32, or Int64).
+    /// </summary>
+    public Field RunEndsField => Fields[0];
+
+    /// <summary>
+    /// Gets the values field (can be any type).
+    /// </summary>
+    public Field ValuesField => Fields[1];
+
+    /// <summary>
+    /// Gets the data type of the run ends array.
+    /// </summary>
+    public IArrowType RunEndsDataType => RunEndsField.DataType;
+
+    /// <summary>
+    /// Gets the data type of the values array.
+    /// </summary>
+    public IArrowType ValuesDataType => ValuesField.DataType;
+
+    /// <summary>
+    /// Creates a new RunEndEncodedType with the specified run ends and values fields.
+    /// </summary>
+    /// <param name="runEndsField">The run ends field (must be Int16, Int32, or Int64).</param>
+    /// <param name="valuesField">The values field (can be any type).</param>
+    /// <exception cref="ArgumentNullException">
+    /// <paramref name="runEndsField"/> or <paramref name="valuesField"/> is null.
+    /// </exception>
+    /// <exception cref="ArgumentException">
+    /// The run ends field is not of type Int16, Int32, or Int64.
+    /// </exception>
+    public RunEndEncodedType(Field runEndsField, Field valuesField)
+        : base(ValidateFields(runEndsField, valuesField))
+    {
+    }
+
+    /// <summary>
+    /// Creates a new RunEndEncodedType with the specified run ends and values data types.
+    /// Uses default field names "run_ends" and "values".
+    /// </summary>
+    /// <param name="runEndsDataType">The run ends data type (must be Int16, Int32, or Int64).</param>
+    /// <param name="valuesDataType">The values data type (can be any type).</param>
+    /// <exception cref="ArgumentException">
+    /// The run ends data type is not Int16, Int32, or Int64.
+    /// </exception>
+    public RunEndEncodedType(IArrowType runEndsDataType, IArrowType valuesDataType)
+        : this(new Field("run_ends", runEndsDataType, nullable: false),
+               new Field("values", valuesDataType, nullable: true))
+    {
+    }
+
+    public override void Accept(IArrowTypeVisitor visitor) => Accept(this, visitor);
+
+    /// <summary>
+    /// Checks both child fields and returns them in child order (run ends, then values).
+    /// Runs before the base constructor so that an invalid argument surfaces as an
+    /// argument exception instead of a NullReferenceException raised while reading
+    /// <c>runEndsField.DataType</c>, and so that a null values field is never stored.
+    /// </summary>
+    private static Field[] ValidateFields(Field runEndsField, Field valuesField)
+    {
+        if (runEndsField is null)
+        {
+            throw new ArgumentNullException(nameof(runEndsField));
+        }
+
+        if (valuesField is null)
+        {
+            throw new ArgumentNullException(nameof(valuesField));
+        }
+
+        var typeId = runEndsField.DataType.TypeId;
+        if (typeId != ArrowTypeId.Int16 &&
+            typeId != ArrowTypeId.Int32 &&
+            typeId != ArrowTypeId.Int64)
+        {
+            throw new ArgumentException(
+                $"Run ends type must be Int16, Int32, or Int64, but got {typeId}",
+                nameof(runEndsField));
+        }
+
+        return new[] { runEndsField, valuesField };
+    }
+}
diff --git a/test/Apache.Arrow.IntegrationTest/JsonFile.cs b/test/Apache.Arrow.IntegrationTest/JsonFile.cs
index 71831a7..af89ba3 100644
--- a/test/Apache.Arrow.IntegrationTest/JsonFile.cs
+++ b/test/Apache.Arrow.IntegrationTest/JsonFile.cs
@@ -196,6 +196,7 @@ namespace Apache.Arrow.IntegrationTest
"struct" => ToStructArrowType(type, children),
"union" => ToUnionArrowType(type, children),
"map" => ToMapArrowType(type, children),
+ "runendencoded" => ToRunEndEncodedArrowType(type, children),
"null" => NullType.Default,
_ => throw new NotSupportedException($"JsonArrowType not
supported: {type.Name}")
};
@@ -346,6 +347,11 @@ namespace Apache.Arrow.IntegrationTest
{
return new MapType(children[0], type.KeysSorted);
}
+
+        /// <summary>
+        /// Converts a JSON "runendencoded" type description into a <see cref="RunEndEncodedType"/>.
+        /// </summary>
+        /// <param name="type">The JSON type node (not read here; kept for signature parity with the sibling converters).</param>
+        /// <param name="children">The already-converted child fields: run ends first, values second.</param>
+        /// <exception cref="System.IO.InvalidDataException">
+        /// <paramref name="children"/> is null or does not contain exactly two fields.
+        /// </exception>
+        private static IArrowType ToRunEndEncodedArrowType(JsonArrowType type, Field[] children)
+        {
+            // Same arity rule the IPC reader applies, so a malformed JSON file is reported
+            // clearly instead of failing with an IndexOutOfRangeException or NullReferenceException.
+            if (children == null || children.Length != 2)
+            {
+                throw new System.IO.InvalidDataException(
+                    "Run-end encoded type must have exactly two children (run_ends and values).");
+            }
+
+            return new RunEndEncodedType(children[0], children[1]);
+        }
}
public class JsonField
@@ -495,6 +501,7 @@ namespace Apache.Arrow.IntegrationTest
IArrowTypeVisitor<UnionType>,
IArrowTypeVisitor<MapType>,
IArrowTypeVisitor<DictionaryType>,
+ IArrowTypeVisitor<RunEndEncodedType>,
IArrowTypeVisitor<NullType>
{
private JsonFieldData JsonFieldData { get; set; }
@@ -974,6 +981,27 @@ namespace Apache.Arrow.IntegrationTest
Array = new DictionaryArray(type, Array,
this.dictionaries(type));
}
+            /// <summary>
+            /// Builds a run-end encoded array from the current JSON node: each of the two
+            /// JSON children is visited with its declared data type, then the resulting
+            /// child data is wrapped in a buffer-less parent whose length is the JSON count.
+            /// </summary>
+            public void Visit(RunEndEncodedType type)
+            {
+                JsonFieldData parent = JsonFieldData;
+
+                // Children are visited in order: run ends (index 0), then values (index 1).
+                ArrayData runEndsData = BuildChild(parent.Children[0], type.RunEndsDataType);
+                ArrayData valuesData = BuildChild(parent.Children[1], type.ValuesDataType);
+
+                // Point the visitor back at the parent node before reading its count.
+                JsonFieldData = parent;
+
+                var reeData = new ArrayData(
+                    type, JsonFieldData.Count, nullCount: 0, offset: 0,
+                    buffers: System.Array.Empty<ArrowBuffer>(),
+                    children: new[] { runEndsData, valuesData });
+                Array = new RunEndEncodedArray(reeData);
+
+                // Visits one child node and returns the data of the array it produced.
+                ArrayData BuildChild(JsonFieldData childJson, IArrowType childType)
+                {
+                    JsonFieldData = childJson;
+                    childType.Accept(this);
+                    return Array.Data;
+                }
+            }
+
private ArrayData[] GetChildren(NestedType type)
{
ArrayData[] children = new ArrayData[type.Fields.Count];
diff --git a/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs b/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs
index e15e8e0..3067ac4 100644
--- a/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs
+++ b/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs
@@ -59,6 +59,181 @@ namespace Apache.Arrow.Tests
ArrowReaderVerifier.CompareArrays(array, actualArray);
}
+        /// <summary>
+        /// Concatenating two Int32-run-end REE arrays shifts the second array's run ends
+        /// by the first array's logical length and appends its values.
+        /// </summary>
+        [Fact]
+        public void TestRunEndEncodedInt32RunEnds()
+        {
+            // Left input: run ends [3, 7] over values ["A", "B"] (7 logical elements).
+            Int32Array leftRunEnds = new Int32Array.Builder().AppendRange(new[] { 3, 7 }).Build();
+            StringArray leftValues = new StringArray.Builder().AppendRange(new[] { "A", "B" }).Build();
+            var left = new RunEndEncodedArray(leftRunEnds, leftValues);
+
+            // Right input: run ends [2, 5] over values ["C", "D"] (5 logical elements).
+            Int32Array rightRunEnds = new Int32Array.Builder().AppendRange(new[] { 2, 5 }).Build();
+            StringArray rightValues = new StringArray.Builder().AppendRange(new[] { "C", "D" }).Build();
+            var right = new RunEndEncodedArray(rightRunEnds, rightValues);
+
+            IArrowArray[] inputs = { left, right };
+            var merged = (RunEndEncodedArray)ArrowArrayConcatenator.Concatenate(inputs);
+
+            Assert.Equal(12, merged.Length);
+            Assert.Equal(new[] { 3, 7, 9, 12 }, ((Int32Array)merged.RunEnds).Values.ToArray());
+
+            var mergedValues = (StringArray)merged.Values;
+            Assert.Equal(4, mergedValues.Length);
+            string[] expectedValues = { "A", "B", "C", "D" };
+            for (int i = 0; i < expectedValues.Length; i++)
+            {
+                Assert.Equal(expectedValues[i], mergedValues.GetString(i));
+            }
+        }
+
+        /// <summary>
+        /// Same as the Int32 case, but with Int16 run ends and Int32 values.
+        /// </summary>
+        [Fact]
+        public void TestRunEndEncodedInt16RunEnds()
+        {
+            Int16Array leftRunEnds = new Int16Array.Builder().AppendRange(new short[] { 2, 5 }).Build();
+            Int32Array leftValues = new Int32Array.Builder().AppendRange(new[] { 10, 20 }).Build();
+            var left = new RunEndEncodedArray(leftRunEnds, leftValues);
+
+            Int16Array rightRunEnds = new Int16Array.Builder().AppendRange(new short[] { 1, 4 }).Build();
+            Int32Array rightValues = new Int32Array.Builder().AppendRange(new[] { 30, 40 }).Build();
+            var right = new RunEndEncodedArray(rightRunEnds, rightValues);
+
+            IArrowArray[] inputs = { left, right };
+            var merged = (RunEndEncodedArray)ArrowArrayConcatenator.Concatenate(inputs);
+
+            Assert.Equal(9, merged.Length);
+
+            var mergedRunEnds = (Int16Array)merged.RunEnds;
+            Assert.Equal(new short[] { 2, 5, 6, 9 }, mergedRunEnds.Values.ToArray());
+
+            var mergedValues = (Int32Array)merged.Values;
+            Assert.Equal(new[] { 10, 20, 30, 40 }, mergedValues.Values.ToArray());
+        }
+
+        /// <summary>
+        /// Concatenates three Int64-run-end REE arrays (logical lengths 4, 6 and 3).
+        /// </summary>
+        [Fact]
+        public void TestRunEndEncodedInt64RunEnds()
+        {
+            Int64Array firstRunEnds = new Int64Array.Builder().AppendRange(new long[] { 4 }).Build();
+            Int64Array firstValues = new Int64Array.Builder().AppendRange(new long[] { 100 }).Build();
+            var first = new RunEndEncodedArray(firstRunEnds, firstValues);
+
+            Int64Array secondRunEnds = new Int64Array.Builder().AppendRange(new long[] { 2, 6 }).Build();
+            Int64Array secondValues = new Int64Array.Builder().AppendRange(new long[] { 200, 300 }).Build();
+            var second = new RunEndEncodedArray(secondRunEnds, secondValues);
+
+            Int64Array thirdRunEnds = new Int64Array.Builder().AppendRange(new long[] { 3 }).Build();
+            Int64Array thirdValues = new Int64Array.Builder().AppendRange(new long[] { 400 }).Build();
+            var third = new RunEndEncodedArray(thirdRunEnds, thirdValues);
+
+            IArrowArray[] inputs = { first, second, third };
+            var merged = (RunEndEncodedArray)ArrowArrayConcatenator.Concatenate(inputs);
+
+            Assert.Equal(13, merged.Length);
+
+            var mergedRunEnds = (Int64Array)merged.RunEnds;
+            Assert.Equal(new long[] { 4, 6, 10, 13 }, mergedRunEnds.Values.ToArray());
+
+            var mergedValues = (Int64Array)merged.Values;
+            Assert.Equal(new long[] { 100, 200, 300, 400 }, mergedValues.Values.ToArray());
+        }
+
+        /// <summary>
+        /// Concatenating sliced REE arrays must yield run ends relative to each slice,
+        /// not to the unsliced underlying arrays.
+        /// </summary>
+        [Fact]
+        public void TestRunEndEncodedSlicedInputs()
+        {
+            // Underlying array: run ends [3, 7, 10] over ["A", "B", "C"], logical length 10.
+            // Slicing to logical [2, 8) keeps position 2 of run "A", positions 3..6 of run "B"
+            // and position 7 of run "C", for a new logical length of 6.
+            Int32Array firstRunEnds = new Int32Array.Builder().AppendRange(new[] { 3, 7, 10 }).Build();
+            StringArray firstValues = new StringArray.Builder().AppendRange(new[] { "A", "B", "C" }).Build();
+            var firstWhole = new RunEndEncodedArray(firstRunEnds, firstValues);
+            var firstPart = (RunEndEncodedArray)ArrowArrayFactory.Slice(firstWhole, 2, 6);
+
+            // Underlying array: run ends [4, 8] over ["X", "Y"], logical length 8.
+            // Slicing to logical [3, 6) keeps position 3 of "X" and positions 4..5 of "Y",
+            // for a new logical length of 3.
+            Int32Array secondRunEnds = new Int32Array.Builder().AppendRange(new[] { 4, 8 }).Build();
+            StringArray secondValues = new StringArray.Builder().AppendRange(new[] { "X", "Y" }).Build();
+            var secondWhole = new RunEndEncodedArray(secondRunEnds, secondValues);
+            var secondPart = (RunEndEncodedArray)ArrowArrayFactory.Slice(secondWhole, 3, 3);
+
+            IArrowArray[] inputs = { firstPart, secondPart };
+            var merged = (RunEndEncodedArray)ArrowArrayConcatenator.Concatenate(inputs);
+
+            Assert.Equal(9, merged.Length);
+
+            // Run lengths after concatenation: A=1, B=4, C=1, X=1, Y=2,
+            // giving cumulative run ends 1, 5, 6, 7, 9.
+            Assert.Equal(new[] { 1, 5, 6, 7, 9 }, ((Int32Array)merged.RunEnds).Values.ToArray());
+
+            var mergedValues = (StringArray)merged.Values;
+            Assert.Equal(5, mergedValues.Length);
+            string[] expectedValues = { "A", "B", "C", "X", "Y" };
+            for (int i = 0; i < expectedValues.Length; i++)
+            {
+                Assert.Equal(expectedValues[i], mergedValues.GetString(i));
+            }
+        }
+
+        /// <summary>
+        /// Inputs whose run-ends types differ (Int32 vs Int16) must be rejected.
+        /// </summary>
+        [Fact]
+        public void TestRunEndEncodedMismatchedRunEndsTypeThrows()
+        {
+            Int32Array wideRunEnds = new Int32Array.Builder().AppendRange(new[] { 2 }).Build();
+            StringArray wideValues = new StringArray.Builder().AppendRange(new[] { "A" }).Build();
+            var withInt32RunEnds = new RunEndEncodedArray(wideRunEnds, wideValues);
+
+            Int16Array narrowRunEnds = new Int16Array.Builder().AppendRange(new short[] { 3 }).Build();
+            StringArray narrowValues = new StringArray.Builder().AppendRange(new[] { "B" }).Build();
+            var withInt16RunEnds = new RunEndEncodedArray(narrowRunEnds, narrowValues);
+
+            IArrowArray[] inputs = { withInt32RunEnds, withInt16RunEnds };
+
+            ArgumentException error = Assert.Throws<ArgumentException>(
+                () => ArrowArrayConcatenator.Concatenate(inputs));
+            Assert.Contains("run-ends type", error.Message);
+        }
+
+        /// <summary>
+        /// Inputs whose values types differ (String vs Int32) must be rejected.
+        /// </summary>
+        [Fact]
+        public void TestRunEndEncodedMismatchedValuesTypeThrows()
+        {
+            Int32Array textRunEnds = new Int32Array.Builder().AppendRange(new[] { 2 }).Build();
+            StringArray textValues = new StringArray.Builder().AppendRange(new[] { "A" }).Build();
+            var withStringValues = new RunEndEncodedArray(textRunEnds, textValues);
+
+            Int32Array numberRunEnds = new Int32Array.Builder().AppendRange(new[] { 3 }).Build();
+            Int32Array numberValues = new Int32Array.Builder().AppendRange(new[] { 99 }).Build();
+            var withInt32Values = new RunEndEncodedArray(numberRunEnds, numberValues);
+
+            IArrowArray[] inputs = { withStringValues, withInt32Values };
+
+            ArgumentException error = Assert.Throws<ArgumentException>(
+                () => ArrowArrayConcatenator.Concatenate(inputs));
+            Assert.Contains("values type", error.Message);
+        }
+
+        /// <summary>
+        /// Concatenating only empty REE arrays yields an empty REE array whose children
+        /// keep their concrete types.
+        /// </summary>
+        [Fact]
+        public void TestRunEndEncodedAllEmptyInputs()
+        {
+            // String values (3 buffers) and Int32 run ends (2 buffers) are used so that an
+            // empty placeholder with the wrong buffer shape would crash array construction.
+            var inputs = new IArrowArray[3];
+            for (int i = 0; i < inputs.Length; i++)
+            {
+                inputs[i] = new RunEndEncodedArray(
+                    new Int32Array.Builder().Build(),
+                    new StringArray.Builder().Build());
+            }
+
+            var merged = (RunEndEncodedArray)ArrowArrayConcatenator.Concatenate(inputs);
+
+            Assert.Equal(0, merged.Length);
+            Assert.Equal(0, merged.RunEnds.Length);
+            Assert.Equal(0, merged.Values.Length);
+            Assert.IsType<Int32Array>(merged.RunEnds);
+            Assert.IsType<StringArray>(merged.Values);
+        }
+
+        /// <summary>
+        /// Empty inputs placed around a non-empty one must not contribute runs or values.
+        /// </summary>
+        [Fact]
+        public void TestRunEndEncodedWithEmptyInput()
+        {
+            Int32Array noRunEnds = new Int32Array.Builder().Build();
+            StringArray noValues = new StringArray.Builder().Build();
+            var empty = new RunEndEncodedArray(noRunEnds, noValues);
+
+            Int32Array someRunEnds = new Int32Array.Builder().AppendRange(new[] { 2, 4 }).Build();
+            StringArray someValues = new StringArray.Builder().AppendRange(new[] { "A", "B" }).Build();
+            var nonEmpty = new RunEndEncodedArray(someRunEnds, someValues);
+
+            // The same empty instance is deliberately used on both sides.
+            IArrowArray[] inputs = { empty, nonEmpty, empty };
+            var merged = (RunEndEncodedArray)ArrowArrayConcatenator.Concatenate(inputs);
+
+            Assert.Equal(4, merged.Length);
+            Assert.Equal(new[] { 2, 4 }, ((Int32Array)merged.RunEnds).Values.ToArray());
+
+            var mergedValues = (StringArray)merged.Values;
+            Assert.Equal("A", mergedValues.GetString(0));
+            Assert.Equal("B", mergedValues.GetString(1));
+        }
+
private static IEnumerable<Tuple<List<IArrowArray>, IArrowArray>>
GenerateTestData(bool slicedArrays = false)
{
var targetTypes = new List<IArrowType>() {
diff --git a/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
index 2f647ae..b024338 100644
--- a/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
+++ b/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
@@ -112,6 +112,7 @@ namespace Apache.Arrow.Tests
IArrowArrayVisitor<Decimal128Array>,
IArrowArrayVisitor<Decimal256Array>,
IArrowArrayVisitor<DictionaryArray>,
+ IArrowArrayVisitor<RunEndEncodedArray>,
IArrowArrayVisitor<NullArray>
{
private readonly IArrowArray _expectedArray;
@@ -246,6 +247,25 @@ namespace Apache.Arrow.Tests
array.Dictionary.Accept(dictionaryComparer);
}
+            /// <summary>
+            /// Compares a run-end encoded array against the expected one: data type first,
+            /// then logical length, then the run-ends and values children of the
+            /// normalized form of each array.
+            /// </summary>
+            public void Visit(RunEndEncodedArray array)
+            {
+                RunEndEncodedArray expectedArray =
+                    Assert.IsAssignableFrom<RunEndEncodedArray>(_expectedArray);
+
+                array.Data.DataType.Accept(_arrayTypeComparer);
+
+                Assert.Equal(expectedArray.Length, array.Length);
+
+                // The IPC writer normalizes sliced REE arrays, so deserialized children are
+                // slice-relative while the expected in-memory array may still reference the
+                // unsliced underlying children. Comparing normalized forms makes identical
+                // logical content match regardless of physical layout.
+                using (RunEndEncodedArray expectedNormalized = expectedArray.Normalize())
+                using (RunEndEncodedArray actualNormalized = array.Normalize())
+                {
+                    var runEndsComparer = new ArrayComparer(expectedNormalized.RunEnds, _strictCompare);
+                    actualNormalized.RunEnds.Accept(runEndsComparer);
+
+                    var valuesComparer = new ArrayComparer(expectedNormalized.Values, _strictCompare);
+                    actualNormalized.Values.Accept(valuesComparer);
+                }
+            }
+
public void Visit(NullArray array)
{
Assert.IsAssignableFrom<NullArray>(_expectedArray);
diff --git a/test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs b/test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs
new file mode 100644
index 0000000..fdf5311
--- /dev/null
+++ b/test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs
@@ -0,0 +1,433 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.IO;
+using Apache.Arrow.Ipc;
+using Apache.Arrow.Types;
+using Xunit;
+
+namespace Apache.Arrow.Tests;
+
+/// <summary>
+/// Unit tests for <see cref="RunEndEncodedType"/> and <see cref="RunEndEncodedArray"/>:
+/// type construction and validation, array construction, logical-to-physical index
+/// lookup (including slices), and IPC stream round-trips.
+/// </summary>
+public class RunEndEncodedArrayTests
+{
+    /// <summary>Builds an Int32 array holding exactly the given items.</summary>
+    private static Int32Array Int32s(params int[] items) =>
+        new Int32Array.Builder().AppendRange(items).Build();
+
+    /// <summary>Builds a string array holding exactly the given items.</summary>
+    private static StringArray Strings(params string[] items) =>
+        new StringArray.Builder().AppendRange(items).Build();
+
+    [Fact]
+    public void TestRunEndEncodedTypeCreation()
+    {
+        // Construct the type from explicitly built child fields.
+        var runEndsField = new Field("run_ends", Int32Type.Default, nullable: false);
+        var valuesField = new Field("values", StringType.Default, nullable: true);
+
+        var type = new RunEndEncodedType(runEndsField, valuesField);
+
+        Assert.Equal(ArrowTypeId.RunEndEncoded, type.TypeId);
+        Assert.Equal("run_end_encoded", type.Name);
+        Assert.Equal(runEndsField, type.RunEndsField);
+        Assert.Equal(valuesField, type.ValuesField);
+        Assert.Equal(Int32Type.Default.TypeId, type.RunEndsDataType.TypeId);
+        Assert.Equal(StringType.Default.TypeId, type.ValuesDataType.TypeId);
+    }
+
+    [Fact]
+    public void TestRunEndEncodedTypeCreationWithDataTypes()
+    {
+        // The data-type overload supplies the default child field names.
+        var type = new RunEndEncodedType(Int32Type.Default, StringType.Default);
+
+        Assert.Equal(ArrowTypeId.RunEndEncoded, type.TypeId);
+        Assert.Equal("run_ends", type.RunEndsField.Name);
+        Assert.Equal("values", type.ValuesField.Name);
+    }
+
+    [Fact]
+    public void TestRunEndEncodedTypeValidation()
+    {
+        // Run ends must be Int16, Int32 or Int64; anything else is rejected.
+        IArrowType[] rejected = [Int8Type.Default, FloatType.Default, StringType.Default];
+        foreach (IArrowType runEndsType in rejected)
+        {
+            Assert.Throws<ArgumentException>(
+                () => new RunEndEncodedType(runEndsType, StringType.Default));
+        }
+
+        // The three supported widths construct without throwing.
+        IArrowType[] accepted = [Int16Type.Default, Int32Type.Default, Int64Type.Default];
+        foreach (IArrowType runEndsType in accepted)
+        {
+            Assert.NotNull(new RunEndEncodedType(runEndsType, StringType.Default));
+        }
+    }
+
+    [Fact]
+    public void TestRunEndEncodedArrayWithInt32RunEnds()
+    {
+        // Run ends [3, 7, 10, 15] describe 3 'A's, 4 'B's, 3 'C's and 5 'D's.
+        Int32Array runEnds = Int32s(3, 7, 10, 15);
+        StringArray values = Strings("A", "B", "C", "D");
+
+        var ree = new RunEndEncodedArray(runEnds, values);
+
+        // The logical length is the last run end; REE has no top-level nulls.
+        Assert.Equal(15, ree.Length);
+        Assert.Equal(0, ree.NullCount);
+        Assert.Equal(runEnds, ree.RunEnds);
+        Assert.Equal(values, ree.Values);
+    }
+
+    [Fact]
+    public void TestRunEndEncodedArrayWithInt16RunEnds()
+    {
+        Int16Array runEnds = new Int16Array.Builder().AppendRange(new short[] { 2, 5, 8 }).Build();
+        Int32Array values = Int32s(100, 200, 300);
+
+        var ree = new RunEndEncodedArray(runEnds, values);
+
+        Assert.Equal(8, ree.Length);
+        Assert.Equal(runEnds, ree.RunEnds);
+        Assert.Equal(values, ree.Values);
+    }
+
+    [Fact]
+    public void TestRunEndEncodedArrayWithInt64RunEnds()
+    {
+        Int64Array runEnds = new Int64Array.Builder().AppendRange(new long[] { 1000, 2000, 3000 }).Build();
+        DoubleArray values = new DoubleArray.Builder().AppendRange(new[] { 1.5, 2.5, 3.5 }).Build();
+
+        var ree = new RunEndEncodedArray(runEnds, values);
+
+        Assert.Equal(3000, ree.Length);
+        Assert.Equal(runEnds, ree.RunEnds);
+        Assert.Equal(values, ree.Values);
+    }
+
+    [Fact]
+    public void TestRunEndEncodedArrayInvalidRunEndsType()
+    {
+        // Int8 is not an allowed run-ends type.
+        Int8Array badRunEnds = new Int8Array.Builder().AppendRange(new sbyte[] { 1, 2, 3 }).Build();
+        StringArray values = Strings("A", "B", "C");
+
+        Assert.Throws<ArgumentException>(() => new RunEndEncodedArray(badRunEnds, values));
+    }
+
+    [Fact]
+    public void TestRunEndEncodedArrayEmpty()
+    {
+        Int32Array runEnds = new Int32Array.Builder().Build();
+        StringArray values = new StringArray.Builder().Build();
+
+        var ree = new RunEndEncodedArray(runEnds, values);
+
+        Assert.Equal(0, ree.Length);
+    }
+
+    [Fact]
+    public void TestFindPhysicalIndexInt32()
+    {
+        // Run ends [3, 7, 10, 15] map logical indices to physical ones as follows:
+        //   0..2   -> 0 ('A')
+        //   3..6   -> 1 ('B')
+        //   7..9   -> 2 ('C')
+        //   10..14 -> 3 ('D')
+        var ree = new RunEndEncodedArray(Int32s(3, 7, 10, 15), Strings("A", "B", "C", "D"));
+
+        int[] logical = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14];
+        int[] physical = [0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3];
+
+        for (int i = 0; i < logical.Length; i++)
+        {
+            Assert.Equal(physical[i], ree.FindPhysicalIndex(logical[i]));
+        }
+    }
+
+    [Fact]
+    public void TestRunEndEncodedLengthMismatchThrows()
+    {
+        // Two run ends but three values.
+        Int32Array runEnds = Int32s(3, 7);
+        StringArray values = Strings("A", "B", "C");
+
+        ArgumentException error = Assert.Throws<ArgumentException>(
+            () => new RunEndEncodedArray(runEnds, values));
+        Assert.Contains("length", error.Message);
+    }
+
+    [Fact]
+    public void TestRunEndEncodedNullRunEndsThrows()
+    {
+        // A null entry in the run-ends child is invalid.
+        Int32Array runEnds = new Int32Array.Builder().Append(3).AppendNull().Build();
+        StringArray values = Strings("A", "B");
+
+        ArgumentException error = Assert.Throws<ArgumentException>(
+            () => new RunEndEncodedArray(runEnds, values));
+        Assert.Contains("null", error.Message);
+    }
+
+    [Fact]
+    public void TestFindPhysicalIndexOnSlicedArray()
+    {
+        // Run ends [3, 7, 10]: run 0 (A) covers 0..2, run 1 (B) covers 3..6, run 2 (C) covers 7..9.
+        var ree = new RunEndEncodedArray(Int32s(3, 7, 10), Strings("A", "B", "C"));
+
+        // The slice covers underlying positions 2..7 (length 6), so slice positions map to
+        // 0 -> A(2), 1 -> B(3), 2 -> B(4), 3 -> B(5), 4 -> B(6), 5 -> C(7).
+        var sliced = (RunEndEncodedArray)ArrowArrayFactory.Slice(ree, 2, 6);
+
+        Assert.Equal(6, sliced.Length);
+
+        int[] physicalBySlicePosition = [0, 1, 1, 1, 1, 2];
+        for (int position = 0; position < physicalBySlicePosition.Length; position++)
+        {
+            Assert.Equal(physicalBySlicePosition[position], sliced.FindPhysicalIndex(position));
+        }
+
+        // Position 6 is past the slice length even though it would be valid against the
+        // underlying physical array.
+        Assert.Throws<ArgumentOutOfRangeException>(() => sliced.FindPhysicalIndex(6));
+    }
+
+    [Fact]
+    public void TestFindPhysicalIndexOutOfRange()
+    {
+        var ree = new RunEndEncodedArray(Int32s(3, 7), Strings("A", "B"));
+
+        int[] invalidIndices = [-1, 7, 100];
+        foreach (int invalid in invalidIndices)
+        {
+            Assert.Throws<ArgumentOutOfRangeException>(() => ree.FindPhysicalIndex(invalid));
+        }
+    }
+
+    [Fact]
+    public void TestRunEndEncodedArraySerialization()
+    {
+        // Wrap a REE array in a single-column record batch.
+        var ree = new RunEndEncodedArray(Int32s(3, 7, 10), Strings("foo", "bar", "baz"));
+        var column = new Field("ree_column", ree.Data.DataType, nullable: false);
+        var schema = new Schema([column], null);
+        var batch = new RecordBatch(schema, [ree], ree.Length);
+
+        // Write the batch to an in-memory stream, then read it back.
+        using var stream = new MemoryStream();
+        using (var writer = new ArrowStreamWriter(stream, schema, leaveOpen: true))
+        {
+            writer.WriteRecordBatch(batch);
+            writer.WriteEnd();
+        }
+
+        stream.Position = 0;
+
+        using var reader = new ArrowStreamReader(stream);
+        RecordBatch roundTripped = reader.ReadNextRecordBatch();
+
+        Assert.NotNull(roundTripped);
+        Assert.Equal(1, roundTripped.ColumnCount);
+        Assert.Equal(10, roundTripped.Length);
+
+        var readArray = roundTripped.Column(0) as RunEndEncodedArray;
+        Assert.NotNull(readArray);
+        Assert.Equal(10, readArray.Length);
+        Assert.Equal(ArrowTypeId.RunEndEncoded, readArray.Data.DataType.TypeId);
+
+        // Run ends survive unchanged.
+        var readRunEnds = readArray.RunEnds as Int32Array;
+        Assert.NotNull(readRunEnds);
+        Assert.Equal(3, readRunEnds.Length);
+        Assert.Equal(3, readRunEnds.GetValue(0));
+        Assert.Equal(7, readRunEnds.GetValue(1));
+        Assert.Equal(10, readRunEnds.GetValue(2));
+
+        // Values survive unchanged.
+        var readValues = readArray.Values as StringArray;
+        Assert.NotNull(readValues);
+        Assert.Equal(3, readValues.Length);
+        Assert.Equal("foo", readValues.GetString(0));
+        Assert.Equal("bar", readValues.GetString(1));
+        Assert.Equal("baz", readValues.GetString(2));
+    }
+
+    [Fact]
+    public void TestRunEndEncodedSlicedArraySerialization()
+    {
+        // Underlying array: run ends [3, 7, 10] over [foo, bar, baz], logical length 10.
+        // The slice covers underlying positions 2..7 (length 6):
+        //   slice position 0 -> foo, 1..4 -> bar, 5 -> baz.
+        var ree = new RunEndEncodedArray(Int32s(3, 7, 10), Strings("foo", "bar", "baz"));
+        var sliced = (RunEndEncodedArray)ArrowArrayFactory.Slice(ree, 2, 6);
+
+        var column = new Field("ree_column", sliced.Data.DataType, nullable: false);
+        var schema = new Schema([column], null);
+        var batch = new RecordBatch(schema, [sliced], sliced.Length);
+
+        using var stream = new MemoryStream();
+        using (var writer = new ArrowStreamWriter(stream, schema, leaveOpen: true))
+        {
+            writer.WriteRecordBatch(batch);
+            writer.WriteEnd();
+        }
+
+        stream.Position = 0;
+        using var reader = new ArrowStreamReader(stream);
+        RecordBatch roundTripped = reader.ReadNextRecordBatch();
+
+        Assert.NotNull(roundTripped);
+        Assert.Equal(6, roundTripped.Length);
+
+        var readArray = (RunEndEncodedArray)roundTripped.Column(0);
+        Assert.Equal(6, readArray.Length);
+        Assert.Equal(0, readArray.Offset);
+
+        // The written run ends are relative to the slice and finish at the slice length.
+        var readRunEnds = (Int32Array)readArray.RunEnds;
+        Assert.Equal(new[] { 1, 5, 6 }, readRunEnds.Values.ToArray());
+
+        var readValues = (StringArray)readArray.Values;
+        Assert.Equal(3, readValues.Length);
+        Assert.Equal("foo", readValues.GetString(0));
+        Assert.Equal("bar", readValues.GetString(1));
+        Assert.Equal("baz", readValues.GetString(2));
+
+        // Index lookup on the deserialized array follows the slice positions.
+        Assert.Equal(0, readArray.FindPhysicalIndex(0));
+        Assert.Equal(1, readArray.FindPhysicalIndex(1));
+        Assert.Equal(1, readArray.FindPhysicalIndex(4));
+        Assert.Equal(2, readArray.FindPhysicalIndex(5));
+    }
+
+    [Fact]
+    public void TestRunEndEncodedConstructorRejectsNonZeroNullCount()
+    {
+        // REE has no validity bitmap, so the ArrayData constructor path must reject any
+        // non-zero top-level null count.
+        Int32Array runEnds = Int32s(3, 7, 10);
+        StringArray values = Strings("foo", "bar", "baz");
+        var type = new RunEndEncodedType(Int32Type.Default, StringType.Default);
+
+        var malformed = new ArrayData(
+            type, length: 10, nullCount: 1, offset: 0,
+            buffers: System.Array.Empty<ArrowBuffer>(),
+            children: new[] { runEnds.Data, values.Data });
+
+        ArgumentException error = Assert.Throws<ArgumentException>(
+            () => new RunEndEncodedArray(malformed));
+        Assert.Contains("null count", error.Message, StringComparison.OrdinalIgnoreCase);
+    }
+
+    [Fact]
+    public void TestRunEndEncodedArrayWithDifferentValueTypes()
+    {
+        // Boolean values.
+        Int32Array boolRunEnds = Int32s(5, 10);
+        BooleanArray boolValues = new BooleanArray.Builder().AppendRange(new[] { true, false }).Build();
+        var boolRee = new RunEndEncodedArray(boolRunEnds, boolValues);
+        Assert.Equal(10, boolRee.Length);
+
+        // Double values.
+        Int32Array doubleRunEnds = Int32s(3, 8);
+        DoubleArray doubleValues = new DoubleArray.Builder().AppendRange(new[] { 1.5, 2.5 }).Build();
+        var doubleRee = new RunEndEncodedArray(doubleRunEnds, doubleValues);
+        Assert.Equal(8, doubleRee.Length);
+
+        // List values: two lists, [1, 2] and [3, 4].
+        var listBuilder = new ListArray.Builder(Int32Type.Default);
+        var itemBuilder = (Int32Array.Builder)listBuilder.ValueBuilder;
+        listBuilder.Append();
+        itemBuilder.Append(1);
+        itemBuilder.Append(2);
+        listBuilder.Append();
+        itemBuilder.Append(3);
+        itemBuilder.Append(4);
+        ListArray listValues = listBuilder.Build();
+
+        Int32Array listRunEnds = Int32s(2, 5);
+        var listRee = new RunEndEncodedArray(listRunEnds, listValues);
+        Assert.Equal(5, listRee.Length);
+    }
+
+    [Fact]
+    public void TestRunEndEncodedArrayFromArrayData()
+    {
+        Int32Array runEnds = Int32s(2, 5);
+        StringArray values = Strings("X", "Y");
+
+        // Assemble the ArrayData by hand rather than through the convenience constructor.
+        var type = new RunEndEncodedType(Int32Type.Default, StringType.Default);
+        var data = new ArrayData(
+            type,
+            length: 5,
+            nullCount: 0,
+            offset: 0,
+            buffers: [],
+            children: [runEnds.Data, values.Data]);
+
+        var ree = new RunEndEncodedArray(data);
+
+        Assert.Equal(5, ree.Length);
+        Assert.Equal(0, ree.NullCount);
+        Assert.IsType<Int32Array>(ree.RunEnds);
+        Assert.IsType<StringArray>(ree.Values);
+    }
+
+    [Fact]
+    public void TestRunEndEncodedArrayFactoryBuild()
+    {
+        // ArrowArrayFactory must dispatch REE ArrayData to RunEndEncodedArray.
+        Int32Array runEnds = Int32s(3, 6);
+        Int64Array values = new Int64Array.Builder().AppendRange(new long[] { 100, 200 }).Build();
+
+        var type = new RunEndEncodedType(Int32Type.Default, Int64Type.Default);
+        var data = new ArrayData(
+            type,
+            length: 6,
+            nullCount: 0,
+            offset: 0,
+            buffers: [],
+            children: [runEnds.Data, values.Data]);
+
+        IArrowArray built = ArrowArrayFactory.BuildArray(data);
+
+        Assert.IsType<RunEndEncodedArray>(built);
+        var ree = (RunEndEncodedArray)built;
+        Assert.Equal(6, ree.Length);
+    }
+}
diff --git a/test/Apache.Arrow.Tests/TableTests.cs b/test/Apache.Arrow.Tests/TableTests.cs
index 309ef7b..2dde48f 100644
--- a/test/Apache.Arrow.Tests/TableTests.cs
+++ b/test/Apache.Arrow.Tests/TableTests.cs
@@ -63,9 +63,9 @@ namespace Apache.Arrow.Tests
Table table1 = Table.TableFromRecordBatches(recordBatch1.Schema, recordBatches);
Assert.Equal(20, table1.RowCount);
#if NET5_0_OR_GREATER
- Assert.Equal(41, table1.ColumnCount);
+ Assert.Equal(44, table1.ColumnCount);
#else
- Assert.Equal(40, table1.ColumnCount);
+ Assert.Equal(43, table1.ColumnCount);
#endif
Assert.Equal("ChunkedArray: Length=20, DataType=list", table1.Column(0).Data.ToString());
diff --git a/test/Apache.Arrow.Tests/TestData.cs b/test/Apache.Arrow.Tests/TestData.cs
index 08205cb..1b0897d 100644
--- a/test/Apache.Arrow.Tests/TestData.cs
+++ b/test/Apache.Arrow.Tests/TestData.cs
@@ -91,6 +91,18 @@ namespace Apache.Arrow.Tests
AddField(CreateField(new DictionaryType(Int32Type.Default, StringType.Default, false), i));
AddField(CreateField(new LargeBinaryType(), i));
AddField(CreateField(new LargeStringType(), i));
+ AddField(new Field(
+ $"run_end_encoded_i16_{i}",
+ new RunEndEncodedType(Int16Type.Default, Int32Type.Default),
+ nullable: false));
+ AddField(new Field(
+ $"run_end_encoded_i32_{i}",
+ new RunEndEncodedType(Int32Type.Default, StringType.Default),
+ nullable: false));
+ AddField(new Field(
+ $"run_end_encoded_i64_{i}",
+ new RunEndEncodedType(Int64Type.Default, Int64Type.Default),
+ nullable: false));
}
Schema schema = builder.Build();
@@ -170,6 +182,7 @@ namespace Apache.Arrow.Tests
IArrowTypeVisitor<FixedSizeBinaryType>,
IArrowTypeVisitor<MapType>,
IArrowTypeVisitor<IntervalType>,
+ IArrowTypeVisitor<RunEndEncodedType>,
#if NET5_0_OR_GREATER
IArrowTypeVisitor<HalfFloatType>,
#endif
@@ -813,6 +826,61 @@ namespace Apache.Arrow.Tests
}
}
+ public void Visit(RunEndEncodedType type)
+ {
+ // Build a small set of runs whose cumulative end equals Length.
+ int[] cumulativeRunEnds;
+ if (Length == 0)
+ {
+ cumulativeRunEnds = System.Array.Empty<int>();
+ }
+ else if (Length <= 2)
+ {
+ cumulativeRunEnds = new[] { Length };
+ }
+ else
+ {
+ int third = Math.Max(1, Length / 3);
+ cumulativeRunEnds = new[] { third, 2 * third, Length };
+ }
+ int runCount = cumulativeRunEnds.Length;
+
+ IArrowArray runEnds;
+ switch (type.RunEndsDataType.TypeId)
+ {
+ case ArrowTypeId.Int16:
+ {
+ var b = new Int16Array.Builder().Reserve(runCount);
+ foreach (var v in cumulativeRunEnds) b.Append((short)v);
+ runEnds = b.Build();
+ break;
+ }
+ case ArrowTypeId.Int32:
+ {
+ var b = new Int32Array.Builder().Reserve(runCount);
+ foreach (var v in cumulativeRunEnds) b.Append(v);
+ runEnds = b.Build();
+ break;
+ }
+ case ArrowTypeId.Int64:
+ {
+ var b = new Int64Array.Builder().Reserve(runCount);
+ foreach (var v in cumulativeRunEnds) b.Append((long)v);
+ runEnds = b.Build();
+ break;
+ }
+ default:
+ throw new InvalidOperationException(
+ $"Unsupported run-ends type {type.RunEndsDataType.TypeId}");
+ }
+
+ IArrowArray values = CreateArray(
+ new Field("values", type.ValuesDataType, nullable: true),
+ runCount);
+
+ Array = new RunEndEncodedArray(runEnds, values);
+ }
+
public void Visit(NullType type)
{
Array = new NullArray(Length);