This is an automated email from the ASF dual-hosted git repository.
CurtHagenlocher pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-dotnet.git
The following commit(s) were added to refs/heads/main by this push:
new c0e20b6 perf: improve StringArray builder append paths (#331)
c0e20b6 is described below
commit c0e20b608ccd2ed92d0867d6205ec8bf1c708ac3
Author: InCerryGit <[email protected]>
AuthorDate: Sun Apr 26 23:40:20 2026 +0800
perf: improve StringArray builder append paths (#331)
## Summary
- Avoid temporary byte-array allocations in
`StringArray.Builder.Append(string)` by encoding each value directly
into the builder's value buffer before committing it.
- Pre-reserve offsets, validity, and value-buffer capacity for
known-count `AppendRange` inputs.
- Add focused correctness coverage for nulls, empty strings, custom
encodings, large-string fallback, collection inputs, and non-collection
enumerables.
When the `IEnumerable<string>` passed to `AppendRange` is a non-empty
`ICollection<string>`, a counting prepass now reserves offsets, validity,
and value-buffer capacity before appending, so collection inputs are
enumerated twice by design.
## Benchmark
BenchmarkDotNet ShortRun, `StringBuilderAppendBenchmark`, 10,000 ASCII
strings of length 32:
| Method | Before | After |
| --- | ---: | ---: |
| `AppendSmallStrings` | 432.0 us / 1.66 MB | 341.0 us / 1157.5 KB |
| `AppendRangeSmallStrings` | 426.2 us / 1.66 MB | 311.8 us / 353.68 KB |
## Validation
- `dotnet test test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj -c Release --filter "FullyQualifiedName~Apache.Arrow.Tests.StringArrayTests"`
- `rtk dotnet build "Apache.Arrow.sln" -c Release`
- LSP diagnostics clean on changed files
- Code review completed before commit; no blockers found
---
src/Apache.Arrow/Arrays/BinaryArray.cs | 39 +++++++++++
src/Apache.Arrow/Arrays/StringArray.cs | 40 ++++++++++-
.../StringBuilderAppendBenchmark.cs | 64 +++++++++++++++++
test/Apache.Arrow.Tests/StringArrayTests.cs | 80 ++++++++++++++++++++++
4 files changed, 221 insertions(+), 2 deletions(-)
diff --git a/src/Apache.Arrow/Arrays/BinaryArray.cs
b/src/Apache.Arrow/Arrays/BinaryArray.cs
index 2d11207..80985d0 100644
--- a/src/Apache.Arrow/Arrays/BinaryArray.cs
+++ b/src/Apache.Arrow/Arrays/BinaryArray.cs
@@ -60,6 +60,7 @@ namespace Apache.Arrow
protected ArrowBuffer.BitmapBuilder ValidityBuffer { get; }
protected int Offset { get; set; }
protected int NullCount => this.ValidityBuffer.UnsetBitCount;
+ private int _availableValueBufferByteCount;
protected BuilderBase(IArrowType dataType)
{
@@ -82,6 +83,44 @@ namespace Apache.Arrow
protected abstract TArray Build(ArrayData data);
+ /// <summary>
+ /// Returns writable value-buffer space without changing the
committed buffer length.
+ /// </summary>
+ /// <param name="sizeHint">The minimum number of writable bytes
required.</param>
+ /// <returns>A span starting at the first uncommitted byte in the
value buffer.</returns>
+ protected Span<byte> GetValueBufferSpan(int sizeHint)
+ {
+ if (sizeHint < 0)
+ {
+ throw new ArgumentOutOfRangeException(nameof(sizeHint));
+ }
+
+ ValueBuffer.Reserve(sizeHint);
+ Span<byte> span = ValueBuffer.Span.Slice(ValueBuffer.Length);
+ _availableValueBufferByteCount = span.Length;
+ return span;
+ }
+
+ /// <summary>
+ /// Commits bytes previously written into the span returned by
<see cref="GetValueBufferSpan"/>.
+ /// </summary>
+ /// <param name="count">The number of bytes written to the span
returned by the latest <see cref="GetValueBufferSpan"/> call.</param>
+ protected void AdvanceValueBuffer(int count)
+ {
+ if (count < 0)
+ {
+ throw new ArgumentOutOfRangeException(nameof(count));
+ }
+
+ if (count > _availableValueBufferByteCount)
+ {
+ throw new ArgumentOutOfRangeException(nameof(count));
+ }
+
+ ValueBuffer.Resize(checked(ValueBuffer.Length + count));
+ _availableValueBufferByteCount = 0;
+ }
+
/// <summary>
/// Gets the length of the array built so far.
/// </summary>
diff --git a/src/Apache.Arrow/Arrays/StringArray.cs
b/src/Apache.Arrow/Arrays/StringArray.cs
index 918f828..4998fae 100644
--- a/src/Apache.Arrow/Arrays/StringArray.cs
+++ b/src/Apache.Arrow/Arrays/StringArray.cs
@@ -43,13 +43,49 @@ namespace Apache.Arrow
{
return AppendNull();
}
+
encoding = encoding ?? DefaultEncoding;
- byte[] span = encoding.GetBytes(value);
- return Append(span.AsSpan());
+
+ int byteCount = encoding.GetByteCount(value);
+ Span<byte> destination =
GetValueBufferSpan(byteCount).Slice(0, byteCount);
+
+ if (byteCount > 0)
+ {
+ unsafe
+ {
+ fixed (char* chars = value)
+ fixed (byte* data = destination)
+ encoding.GetBytes(chars, value.Length, data,
byteCount);
+ }
+ }
+
+ AdvanceValueBuffer(byteCount);
+ ValidityBuffer.Append(true);
+ Offset += byteCount;
+ ValueOffsets.Append(Offset);
+ return this;
}
public Builder AppendRange(IEnumerable<string> values, Encoding
encoding = null)
{
+ encoding = encoding ?? DefaultEncoding;
+
+ if (values is ICollection<string> collection &&
collection.Count > 0)
+ {
+ int totalByteCount = 0;
+ foreach (string value in collection)
+ {
+ if (value != null)
+ {
+ totalByteCount = checked(totalByteCount +
encoding.GetByteCount(value));
+ }
+ }
+
+ ValueOffsets.Reserve(collection.Count);
+ ValidityBuffer.Reserve(collection.Count);
+ ValueBuffer.Reserve(totalByteCount);
+ }
+
foreach (string value in values)
{
Append(value, encoding);
diff --git a/test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs
b/test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs
new file mode 100644
index 0000000..4da75e8
--- /dev/null
+++ b/test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using BenchmarkDotNet.Attributes;
+
+namespace Apache.Arrow.Benchmarks
+{
+ [MemoryDiagnoser]
+ [ShortRunJob]
+ public class StringBuilderAppendBenchmark
+ {
+ private const int Count = 10_000;
+ private string _payload;
+ private string[] _values;
+
+ [GlobalSetup]
+ public void GlobalSetup()
+ {
+ _payload = new string('a', 32);
+ _values = new string[Count];
+
+ for (int i = 0; i < _values.Length; i++)
+ {
+ _values[i] = _payload;
+ }
+ }
+
+ [Benchmark]
+ public int AppendSmallStrings()
+ {
+ var builder = new StringArray.Builder();
+
+ for (int i = 0; i < Count; i++)
+ {
+ builder.Append(_payload);
+ }
+
+ using StringArray array = builder.Build();
+ return array.Length;
+ }
+
+ [Benchmark]
+ public int AppendRangeSmallStrings()
+ {
+ using StringArray array = new StringArray.Builder()
+ .AppendRange(_values)
+ .Build();
+
+ return array.Length;
+ }
+ }
+}
diff --git a/test/Apache.Arrow.Tests/StringArrayTests.cs
b/test/Apache.Arrow.Tests/StringArrayTests.cs
index b197315..d79726f 100644
--- a/test/Apache.Arrow.Tests/StringArrayTests.cs
+++ b/test/Apache.Arrow.Tests/StringArrayTests.cs
@@ -13,6 +13,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+using System.Collections.Generic;
+using System.Text;
using Xunit;
namespace Apache.Arrow.Tests
@@ -81,5 +83,83 @@ namespace Apache.Arrow.Tests
Assert.Equal(firstValue, retrievedValue);
}
}
+
+ public class Builder
+ {
+ [Fact]
+ public void AppendUsesCustomEncoding()
+ {
+ const string expected = "héllø";
+
+ var array = new StringArray.Builder()
+ .Append(expected, Encoding.Unicode)
+ .Build();
+
+ Assert.Equal(expected, array.GetString(0, Encoding.Unicode));
+ }
+
+ [Fact]
+ public void AppendLargeStringUsesFallbackPath()
+ {
+ string expected = new string('x', 512);
+
+ var array = new StringArray.Builder()
+ .Append(expected)
+ .Build();
+
+ Assert.Equal(expected, array.GetString(0));
+ }
+
+ [Fact]
+ public void AppendRangePreservesCollectionValues()
+ {
+ string[] values = { "first", null, string.Empty, "last" };
+
+ var array = new StringArray.Builder()
+ .AppendRange(values)
+ .Build();
+
+ Assert.Equal("first", array.GetString(0));
+ Assert.Null(array.GetString(1));
+ Assert.Equal(string.Empty, array.GetString(2));
+ Assert.Equal("last", array.GetString(3));
+ }
+
+ [Fact]
+ public void
AppendRangePreservesCollectionValuesWithCustomEncoding()
+ {
+ string[] values = { "héllø", null, string.Empty, "wørld" };
+
+ var array = new StringArray.Builder()
+ .AppendRange(values, Encoding.Unicode)
+ .Build();
+
+ Assert.Equal("héllø", array.GetString(0, Encoding.Unicode));
+ Assert.Null(array.GetString(1, Encoding.Unicode));
+ Assert.Equal(string.Empty, array.GetString(2,
Encoding.Unicode));
+ Assert.Equal("wørld", array.GetString(3, Encoding.Unicode));
+ }
+
+ [Fact]
+ public void AppendRangePreservesEnumerableValues()
+ {
+ var array = new StringArray.Builder()
+ .AppendRange(YieldValues())
+ .Build();
+
+ Assert.Equal("first", array.GetString(0));
+ Assert.Null(array.GetString(1));
+ Assert.Equal(string.Empty, array.GetString(2));
+ Assert.Equal("last", array.GetString(3));
+ }
+
+ private static IEnumerable<string> YieldValues()
+ {
+ yield return "first";
+ yield return null;
+ yield return string.Empty;
+ yield return "last";
+ }
+ }
}
}