This is an automated email from the ASF dual-hosted git repository. curth pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push: new 9ba789d111 GH-43267: [C#] Correctly import sliced arrays through the C Data interface (#44117) 9ba789d111 is described below commit 9ba789d1115494b00e6772a3170c8ba2f1a9a02c Author: Curt Hagenlocher <c...@hagenlocher.org> AuthorDate: Sun Sep 15 19:20:59 2024 -0700 GH-43267: [C#] Correctly import sliced arrays through the C Data interface (#44117) ### What changes are included in this PR? Changes to the C Data importer to correctly handle nonzero offsets. ### Are these changes tested? Yes ### Are there any user-facing changes? No Closes #43267 * GitHub Issue: #43267 Authored-by: Curt Hagenlocher <c...@hagenlocher.org> Signed-off-by: Curt Hagenlocher <c...@hagenlocher.org> --- csharp/src/Apache.Arrow/Apache.Arrow.csproj | 6 +-- csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs | 22 +++++------ csharp/src/Apache.Arrow/RecordBatch.cs | 11 ++++++ csharp/src/Apache.Arrow/Utility.cs | 2 - .../test/Apache.Arrow.Tests/ArrowReaderVerifier.cs | 4 +- .../Apache.Arrow.Tests/CDataInterfaceDataTests.cs | 18 +++++++++ .../CDataInterfacePythonTests.cs | 43 ++++++++++++++++++++++ 7 files changed, 88 insertions(+), 18 deletions(-) diff --git a/csharp/src/Apache.Arrow/Apache.Arrow.csproj b/csharp/src/Apache.Arrow/Apache.Arrow.csproj index 034876a114..a845f8e693 100644 --- a/csharp/src/Apache.Arrow/Apache.Arrow.csproj +++ b/csharp/src/Apache.Arrow/Apache.Arrow.csproj @@ -7,18 +7,16 @@ <Description>Apache Arrow is a cross-language development platform for in-memory data. It specifies a standardized language-independent columnar memory format for flat and hierarchical data, organized for efficient analytic operations on modern hardware.</Description> </PropertyGroup> - <PropertyGroup Condition="'$(IsWindows)'=='true'"> + <PropertyGroup> <TargetFrameworks>netstandard2.0;net6.0;net8.0;net462</TargetFrameworks> </PropertyGroup> - <PropertyGroup Condition="'$(IsWindows)'!='true'"> - <TargetFrameworks>netstandard2.0;net6.0;net8.0</TargetFrameworks> - </PropertyGroup> <ItemGroup Condition="'$(TargetFrameworkIdentifier)' == '.NETStandard' or '$(TargetFramework)' == 'net462'"> <PackageReference Include="System.Buffers" Version="4.5.1" /> <PackageReference Include="System.Memory" Version="4.5.5" /> <PackageReference Include="System.Runtime.CompilerServices.Unsafe" Version="4.7.1" /> <PackageReference Include="System.Threading.Tasks.Extensions" Version="4.5.4" /> + <PackageReference Include="System.ValueTuple" Version="4.5.0" /> </ItemGroup> <ItemGroup> diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs index 68b67f3d7c..c454380e17 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs @@ -260,7 +260,7 @@ namespace Apache.Arrow.C private ArrowBuffer ImportValidityBuffer(CArrowArray* cArray) { - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int validityLength = checked((int)BitUtility.RoundUpToMultipleOf8(length) / 8); return (cArray->buffers[0] == null) ? ArrowBuffer.Empty : new ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, validityLength)); } @@ -285,7 +285,7 @@ namespace Apache.Arrow.C throw new InvalidOperationException("Byte arrays are expected to have exactly three buffers"); } - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int offsetsLength = (length + 1) * 4; int* offsets = (int*)cArray->buffers[1]; Debug.Assert(offsets != null); @@ -306,7 +306,7 @@ namespace Apache.Arrow.C throw new InvalidOperationException("Byte array views are expected to have at least three buffers"); } - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int viewsLength = length * 16; long* bufferLengths = (long*)cArray->buffers[cArray->n_buffers - 1]; @@ -336,7 +336,7 @@ namespace Apache.Arrow.C $"is greater than the maximum supported large byte array length ({maxLength})"); } - int length = (int)cArray->length; + int length = checked((int)cArray->offset + (int)cArray->length); int offsetsLength = (length + 1) * 8; long* offsets = (long*)cArray->buffers[1]; Debug.Assert(offsets != null); @@ -364,7 +364,7 @@ namespace Apache.Arrow.C throw new InvalidOperationException("List arrays are expected to have exactly two buffers"); } - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int offsetsLength = (length + 1) * 4; ArrowBuffer[] buffers = new ArrowBuffer[2]; @@ -381,7 +381,7 @@ namespace Apache.Arrow.C throw new InvalidOperationException("List view arrays are expected to have exactly three buffers"); } - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int offsetsLength = length * 4; ArrowBuffer[] buffers = new ArrowBuffer[3]; @@ -407,7 +407,7 @@ namespace Apache.Arrow.C $"is greater than the maximum supported large list array length ({maxLength})"); } - int length = (int)cArray->length; + int length = checked((int)cArray->offset + (int)cArray->length); int offsetsLength = (length + 1) * 8; ArrowBuffer[] buffers = new ArrowBuffer[2]; @@ -436,7 +436,7 @@ namespace Apache.Arrow.C { throw new InvalidOperationException("Dense union arrays are expected to have exactly two children"); } - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int offsetsLength = length * 4; ArrowBuffer[] buffers = new ArrowBuffer[2]; @@ -454,7 +454,7 @@ namespace Apache.Arrow.C } ArrowBuffer[] buffers = new ArrowBuffer[1]; - buffers[0] = ImportCArrayBuffer(cArray, 0, checked((int)cArray->length)); + buffers[0] = ImportCArrayBuffer(cArray, 0, checked((int)cArray->offset + (int)cArray->length)); return buffers; } @@ -467,10 +467,10 @@ namespace Apache.Arrow.C } // validity, data - int length = checked((int)cArray->length); + int length = checked((int)cArray->offset + (int)cArray->length); int valuesLength; if (bitWidth >= 8) - valuesLength = checked((int)(cArray->length * bitWidth / 8)); + valuesLength = checked(length * bitWidth / 8); else valuesLength = checked((int)BitUtility.RoundUpToMultipleOf8(length) / 8); diff --git a/csharp/src/Apache.Arrow/RecordBatch.cs b/csharp/src/Apache.Arrow/RecordBatch.cs index 9cc81b1648..4067ba9ac6 100644 --- a/csharp/src/Apache.Arrow/RecordBatch.cs +++ b/csharp/src/Apache.Arrow/RecordBatch.cs @@ -100,6 +100,17 @@ namespace Apache.Arrow return new RecordBatch(Schema, arrays, Length); } + public RecordBatch Slice(int offset, int length) + { + if (offset > Length) + { + throw new ArgumentException($"Offset {offset} cannot be greater than Length {Length} for RecordBatch.Slice"); + } + + length = Math.Min(Length - offset, length); + return new RecordBatch(Schema, _arrays.Select(a => ArrowArrayFactory.Slice(a, offset, length)), length); + } + public void Accept(IArrowArrayVisitor visitor) { switch (visitor) diff --git a/csharp/src/Apache.Arrow/Utility.cs b/csharp/src/Apache.Arrow/Utility.cs index c4e5732e6e..22b3ff15f1 100644 --- a/csharp/src/Apache.Arrow/Utility.cs +++ b/csharp/src/Apache.Arrow/Utility.cs @@ -13,10 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -using Apache.Arrow.Flatbuf; using System; using System.Collections.Generic; -using System.Text; namespace Apache.Arrow { diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index 85f7b75f93..35b2c4e7f2 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -566,7 +566,9 @@ namespace Apache.Arrow.Tests var listSize = ((FixedSizeListType)expectedArray.Data.DataType).ListSize; var expectedValuesSlice = ArrowArrayFactory.Slice( expectedArray.Values, expectedArray.Offset * listSize, expectedArray.Length * listSize); - actualArray.Values.Accept(new ArrayComparer(expectedValuesSlice, _strictCompare)); + var actualValuesSlice = ArrowArrayFactory.Slice( + actualArray.Values, actualArray.Offset * listSize, actualArray.Length * listSize); + actualValuesSlice.Accept(new ArrayComparer(expectedValuesSlice, _strictCompare)); } private void CompareValidityBuffer(int nullCount, int arrayLength, ArrowBuffer expectedValidityBuffer, int expectedBufferOffset, ArrowBuffer actualValidityBuffer, int actualBufferOffset) diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs index 2bd4d4d661..70ab1fdae2 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs @@ -92,5 +92,23 @@ namespace Apache.Arrow.Tests GC.KeepAlive(releaseCallback); } #endif + + [Fact] + public unsafe void RoundTripInt32ArrayWithOffset() + { + Int32Array array = new Int32Array.Builder() + .AppendRange(new[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }) + .Build(); + IArrowArray sliced = array.Slice(2, 6); + CArrowArray* cArray = CArrowArray.Create(); + CArrowArrayExporter.ExportArray(sliced, cArray); + using (var importedSlice = (Int32Array)CArrowArrayImporter.ImportArray(cArray, array.Data.DataType)) + { + Assert.Equal(6, importedSlice.Length); + Assert.Equal(2, importedSlice.Offset); + Assert.Equal(2, importedSlice.GetValue(0)); + } + CArrowArray.Free(cArray); + } } } diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs index fee18d165c..638cbfb272 100644 --- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs +++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs @@ -792,6 +792,49 @@ namespace Apache.Arrow.Tests CArrowSchema.Free(cImportSchema); } + [SkippableFact] + public unsafe void RoundTripTestSlicedBatch() + { + // TODO: Enable these once this the version of pyarrow referenced during testing supports them + HashSet<ArrowTypeId> unsupported = new HashSet<ArrowTypeId> { ArrowTypeId.ListView, ArrowTypeId.BinaryView, ArrowTypeId.StringView }; + RecordBatch batch1 = TestData.CreateSampleRecordBatch(4, excludedTypes: unsupported); + RecordBatch batch1slice = batch1.Slice(1, 2); + RecordBatch batch2 = batch1slice.Clone(); + + CArrowArray* cExportArray = CArrowArray.Create(); + CArrowArrayExporter.ExportRecordBatch(batch1slice, cExportArray); + + CArrowSchema* cExportSchema = CArrowSchema.Create(); + CArrowSchemaExporter.ExportSchema(batch1.Schema, cExportSchema); + + CArrowArray* cImportArray = CArrowArray.Create(); + CArrowSchema* cImportSchema = CArrowSchema.Create(); + + // For Python, we need to provide the pointers + long exportArrayPtr = ((IntPtr)cExportArray).ToInt64(); + long exportSchemaPtr = ((IntPtr)cExportSchema).ToInt64(); + long importArrayPtr = ((IntPtr)cImportArray).ToInt64(); + long importSchemaPtr = ((IntPtr)cImportSchema).ToInt64(); + + using (Py.GIL()) + { + dynamic pa = Py.Import("pyarrow"); + dynamic exportedPyArray = pa.RecordBatch._import_from_c(exportArrayPtr, exportSchemaPtr); + exportedPyArray._export_to_c(importArrayPtr, importSchemaPtr); + } + + Schema schema = CArrowSchemaImporter.ImportSchema(cImportSchema); + RecordBatch importedBatch = CArrowArrayImporter.ImportRecordBatch(cImportArray, schema); + + ArrowReaderVerifier.CompareBatches(batch2, importedBatch, strictCompare: false); // Non-strict because span lengths won't match. + + // Since we allocated, we are responsible for freeing the pointer. + CArrowArray.Free(cExportArray); + CArrowSchema.Free(cExportSchema); + CArrowArray.Free(cImportArray); + CArrowSchema.Free(cImportSchema); + } + [SkippableFact] public unsafe void ExportBatchReader() {