This is an automated email from the ASF dual-hosted git repository.
CurtHagenlocher pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-dotnet.git
The following commit(s) were added to refs/heads/main by this push:
new ea58c94 perf: improve StringArray GetString decoding (#334)
ea58c94 is described below
commit ea58c946d7400349b6d6d14d4f124b42433fefc6
Author: InCerryGit <[email protected]>
AuthorDate: Mon Apr 27 12:01:09 2026 +0800
perf: improve StringArray GetString decoding (#334)
## Summary
`StringArray.GetString` previously routed through `GetBytes`, which
repeated bounds/null/offset work before decoding the returned byte span.
This PR decodes directly from the array's offsets and value buffer while
preserving the materialized-string fast path.
## Benchmark
BenchmarkDotNet, `StringArrayGetStringBenchmark`, Count=1024:
| Method | Before | After |
|---|---:|---:|
| `GetString` | 23.23 us / 48.08 KB | 18.04 us / 48.08 KB |
| `GetStringFromSlice` | 23.90 us / 48.00 KB | 18.05 us / 48.00 KB |
## Validation
- `dotnet format Apache.Arrow.sln --include
src/Apache.Arrow/Arrays/StringArray.cs
test/Apache.Arrow.Tests/StringArrayTests.cs
test/Apache.Arrow.Benchmarks/StringArrayGetStringBenchmark.cs
--no-restore`
- `dotnet test test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj -c
Release --filter
"FullyQualifiedName~Apache.Arrow.Tests.StringArrayTests"`
- `dotnet build Apache.Arrow.sln -c Release`
---
src/Apache.Arrow/Arrays/StringArray.cs | 21 +++-
.../StringArrayGetStringBenchmark.cs | 133 +++++++++++++++++++++
test/Apache.Arrow.Tests/StringArrayTests.cs | 99 +++++++++++++++
3 files changed, 248 insertions(+), 5 deletions(-)
diff --git a/src/Apache.Arrow/Arrays/StringArray.cs b/src/Apache.Arrow/Arrays/StringArray.cs
index 4998fae..a0961ef 100644
--- a/src/Apache.Arrow/Arrays/StringArray.cs
+++ b/src/Apache.Arrow/Arrays/StringArray.cs
@@ -124,22 +124,33 @@ namespace Apache.Arrow
return materializedStrings[index];
}
- ReadOnlySpan<byte> bytes = GetBytes(index, out bool isNull);
+ if (index < 0 || index >= Length)
+ {
+ throw new ArgumentOutOfRangeException(nameof(index));
+ }
- if (isNull)
+ if (IsNull(index))
{
return null;
}
- if (bytes.Length == 0)
+ ReadOnlySpan<int> offsets = ValueOffsets;
+ int valueOffset = offsets[index];
+ int valueLength = offsets[index + 1] - valueOffset;
+
+ if (valueLength == 0)
{
return string.Empty;
}
+ ReadOnlySpan<byte> values = Values;
+
+ // Decode directly from the shared value buffer so the hot path only pays one
+ // bounds/null/offset pass before handing off to the requested encoding.
unsafe
{
- fixed (byte* data = &MemoryMarshal.GetReference(bytes))
- return encoding.GetString(data, bytes.Length);
+ fixed (byte* data = &MemoryMarshal.GetReference(values))
+ return encoding.GetString(data + valueOffset, valueLength);
}
}
diff --git a/test/Apache.Arrow.Benchmarks/StringArrayGetStringBenchmark.cs b/test/Apache.Arrow.Benchmarks/StringArrayGetStringBenchmark.cs
new file mode 100644
index 0000000..3c49971
--- /dev/null
+++ b/test/Apache.Arrow.Benchmarks/StringArrayGetStringBenchmark.cs
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Text;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Jobs;
+
+namespace Apache.Arrow.Benchmarks
+{
+ [MemoryDiagnoser]
+ [ShortRunJob]
+ public class StringArrayGetStringBenchmark
+ {
+ private StringArray _array;
+ private StringArray _slice;
+
+ [Params(1_024)]
+ public int Count { get; set; }
+
+ [GlobalSetup]
+ public void GlobalSetup()
+ {
+ var builder = new StringArray.Builder();
+ builder.Append("prefix");
+
+ for (int i = 0; i < Count; i++)
+ {
+ if ((i & 7) == 0)
+ {
+ builder.AppendNull();
+ }
+ else if ((i & 7) == 1)
+ {
+ builder.Append(string.Empty);
+ }
+ else
+ {
+ builder.Append($"value-{i:0000}-payload");
+ }
+ }
+
+ builder.Append("suffix");
+
+ _array = builder.Build();
+ _slice = (StringArray)_array.Slice(1, Count);
+ }
+
+ [GlobalCleanup]
+ public void GlobalCleanup()
+ {
+ _slice.Dispose();
+ _array.Dispose();
+ }
+
+ [Benchmark(Baseline = true)]
+ public int LegacyGetString()
+ {
+ int totalLength = 0;
+ for (int i = 0; i < _array.Length; i++)
+ {
+ totalLength += GetStringLegacy(_array, i)?.Length ?? 0;
+ }
+
+ return totalLength;
+ }
+
+ [Benchmark]
+ public int GetString()
+ {
+ int totalLength = 0;
+ for (int i = 0; i < _array.Length; i++)
+ {
+ totalLength += _array.GetString(i)?.Length ?? 0;
+ }
+
+ return totalLength;
+ }
+
+ [Benchmark]
+ public int LegacyGetStringFromSlice()
+ {
+ int totalLength = 0;
+ for (int i = 0; i < _slice.Length; i++)
+ {
+ totalLength += GetStringLegacy(_slice, i)?.Length ?? 0;
+ }
+
+ return totalLength;
+ }
+
+ [Benchmark]
+ public int GetStringFromSlice()
+ {
+ int totalLength = 0;
+ for (int i = 0; i < _slice.Length; i++)
+ {
+ totalLength += _slice.GetString(i)?.Length ?? 0;
+ }
+
+ return totalLength;
+ }
+
+ private static string GetStringLegacy(StringArray array, int index)
+ {
+ ReadOnlySpan<byte> bytes = array.GetBytes(index, out bool isNull);
+
+ if (isNull)
+ {
+ return null;
+ }
+
+ if (bytes.Length == 0)
+ {
+ return string.Empty;
+ }
+
+ return Encoding.UTF8.GetString(bytes);
+ }
+ }
+}
diff --git a/test/Apache.Arrow.Tests/StringArrayTests.cs b/test/Apache.Arrow.Tests/StringArrayTests.cs
index d79726f..95b0caa 100644
--- a/test/Apache.Arrow.Tests/StringArrayTests.cs
+++ b/test/Apache.Arrow.Tests/StringArrayTests.cs
@@ -82,6 +82,105 @@ namespace Apache.Arrow.Tests
Assert.True(array.IsMaterialized());
Assert.Equal(firstValue, retrievedValue);
}
+
+ [Fact]
+ public void ReturnsAppendedValueForSlice()
+ {
+ // Arrange
+ var array = new StringArray.Builder()
+ .Append("prefix")
+ .Append("value")
+ .AppendNull()
+ .Append(string.Empty)
+ .Build();
+
+ var slice = (StringArray)array.Slice(1, 3);
+
+ // Act / Assert
+ Assert.Equal("value", slice.GetString(0));
+ Assert.Null(slice.GetString(1));
+ Assert.Equal(string.Empty, slice.GetString(2));
+ }
+
+ [Fact]
+ public void ReturnsAppendedValueForSliceAfterMaterialize()
+ {
+ // Arrange
+ var array = new StringArray.Builder()
+ .Append("prefix")
+ .Append("value")
+ .AppendNull()
+ .Append(string.Empty)
+ .Build();
+
+ var slice = (StringArray)array.Slice(1, 3);
+
+ // Act
+ slice.Materialize();
+
+ // Assert
+ Assert.True(slice.IsMaterialized());
+ Assert.Equal("value", slice.GetString(0));
+ Assert.Null(slice.GetString(1));
+ Assert.Equal(string.Empty, slice.GetString(2));
+ }
+
+ [Fact]
+ public void ReturnsAppendedValueWithCustomEncoding()
+ {
+ // Arrange
+ const string expected = "héllø";
+ var array = new StringArray.Builder()
+ .Append(expected, Encoding.Unicode)
+ .Build();
+
+ // Act
+ var retrievedValue = array.GetString(0, Encoding.Unicode);
+
+ // Assert
+ Assert.Equal(expected, retrievedValue);
+ }
+
+ [Fact]
+ public void ReturnsAppendedValueWithCustomEncodingAfterMaterialize()
+ {
+ // Arrange
+ const string expected = "héllø";
+ var array = new StringArray.Builder()
+ .Append(expected, Encoding.Unicode)
+ .Build();
+
+ // Act
+ array.Materialize(Encoding.Unicode);
+ var retrievedValue = array.GetString(0, Encoding.Unicode);
+
+ // Assert
+ Assert.True(array.IsMaterialized(Encoding.Unicode));
+ Assert.Equal(expected, retrievedValue);
+ }
+
+ [Fact]
+ public void ReturnsAppendedValueForCustomEncodingSliceAfterMaterialize()
+ {
+ // Arrange
+ var array = new StringArray.Builder()
+ .Append("prefix", Encoding.Unicode)
+ .Append("héllø", Encoding.Unicode)
+ .AppendNull()
+ .Append(string.Empty, Encoding.Unicode)
+ .Build();
+
+ var slice = (StringArray)array.Slice(1, 3);
+
+ // Act
+ slice.Materialize(Encoding.Unicode);
+
+ // Assert
+ Assert.True(slice.IsMaterialized(Encoding.Unicode));
+ Assert.Equal("héllø", slice.GetString(0, Encoding.Unicode));
+ Assert.Null(slice.GetString(1, Encoding.Unicode));
+ Assert.Equal(string.Empty, slice.GetString(2, Encoding.Unicode));
+ }
}
public class Builder