This is an automated email from the ASF dual-hosted git repository.

CurtHagenlocher pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-dotnet.git


The following commit(s) were added to refs/heads/main by this push:
     new c0e20b6  perf: improve StringArray builder append paths (#331)
c0e20b6 is described below

commit c0e20b608ccd2ed92d0867d6205ec8bf1c708ac3
Author: InCerryGit <[email protected]>
AuthorDate: Sun Apr 26 23:40:20 2026 +0800

    perf: improve StringArray builder append paths (#331)
    
    ## Summary
    
    - Avoid temporary byte-array allocations for small
    `StringArray.Builder.Append(string)` values by encoding into stack
    memory before appending.
    - Pre-reserve offsets, validity, and value-buffer capacity for
    known-count `AppendRange` inputs.
    - Add focused correctness coverage for nulls, empty strings, custom
    encodings, large-string fallback, collection inputs, and non-collection
    enumerables.
    
    `AppendRange(ICollection<string>)` now performs a counting prepass to
    reserve value-buffer capacity before appending, so collection inputs are
    enumerated twice by design.
    
    ## Benchmark
    
    BenchmarkDotNet ShortRun, `StringBuilderAppendBenchmark`, 10,000 ASCII
    strings of length 32:
    
    | Method | Before | After |
    | --- | ---: | ---: |
    | `AppendSmallStrings` | 432.0 us / 1.66 MB | 341.0 us / 1157.5 KB |
    | `AppendRangeSmallStrings` | 426.2 us / 1.66 MB | 311.8 us / 353.68 KB |
    
    ## Validation
    
    - `dotnet test test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj -c
    Release --filter
    "FullyQualifiedName~Apache.Arrow.Tests.StringArrayTests"`
    - `rtk dotnet build "Apache.Arrow.sln" -c Release`
    - LSP diagnostics clean on changed files
    - Code review completed before commit; no blockers found
---
 src/Apache.Arrow/Arrays/BinaryArray.cs             | 39 +++++++++++
 src/Apache.Arrow/Arrays/StringArray.cs             | 40 ++++++++++-
 .../StringBuilderAppendBenchmark.cs                | 64 +++++++++++++++++
 test/Apache.Arrow.Tests/StringArrayTests.cs        | 80 ++++++++++++++++++++++
 4 files changed, 221 insertions(+), 2 deletions(-)

diff --git a/src/Apache.Arrow/Arrays/BinaryArray.cs b/src/Apache.Arrow/Arrays/BinaryArray.cs
index 2d11207..80985d0 100644
--- a/src/Apache.Arrow/Arrays/BinaryArray.cs
+++ b/src/Apache.Arrow/Arrays/BinaryArray.cs
@@ -60,6 +60,7 @@ namespace Apache.Arrow
             protected ArrowBuffer.BitmapBuilder ValidityBuffer { get; }
             protected int Offset { get; set; }
             protected int NullCount => this.ValidityBuffer.UnsetBitCount;
+            private int _availableValueBufferByteCount;
 
             protected BuilderBase(IArrowType dataType)
             {
@@ -82,6 +83,44 @@ namespace Apache.Arrow
 
             protected abstract TArray Build(ArrayData data);
 
+            /// <summary>
+            /// Returns writable value-buffer space without changing the 
committed buffer length.
+            /// </summary>
+            /// <param name="sizeHint">The minimum number of writable bytes 
required.</param>
+            /// <returns>A span starting at the first uncommitted byte in the 
value buffer.</returns>
+            protected Span<byte> GetValueBufferSpan(int sizeHint)
+            {
+                if (sizeHint < 0)
+                {
+                    throw new ArgumentOutOfRangeException(nameof(sizeHint));
+                }
+
+                ValueBuffer.Reserve(sizeHint);
+                Span<byte> span = ValueBuffer.Span.Slice(ValueBuffer.Length);
+                _availableValueBufferByteCount = span.Length;
+                return span;
+            }
+
+            /// <summary>
+            /// Commits bytes previously written into the span returned by 
<see cref="GetValueBufferSpan"/>.
+            /// </summary>
+            /// <param name="count">The number of bytes written to the span 
returned by the latest <see cref="GetValueBufferSpan"/> call.</param>
+            protected void AdvanceValueBuffer(int count)
+            {
+                if (count < 0)
+                {
+                    throw new ArgumentOutOfRangeException(nameof(count));
+                }
+
+                if (count > _availableValueBufferByteCount)
+                {
+                    throw new ArgumentOutOfRangeException(nameof(count));
+                }
+
+                ValueBuffer.Resize(checked(ValueBuffer.Length + count));
+                _availableValueBufferByteCount = 0;
+            }
+
             /// <summary>
             /// Gets the length of the array built so far.
             /// </summary>
diff --git a/src/Apache.Arrow/Arrays/StringArray.cs b/src/Apache.Arrow/Arrays/StringArray.cs
index 918f828..4998fae 100644
--- a/src/Apache.Arrow/Arrays/StringArray.cs
+++ b/src/Apache.Arrow/Arrays/StringArray.cs
@@ -43,13 +43,49 @@ namespace Apache.Arrow
                 {
                     return AppendNull();
                 }
+
                 encoding = encoding ?? DefaultEncoding;
-                byte[] span = encoding.GetBytes(value);
-                return Append(span.AsSpan());
+
+                int byteCount = encoding.GetByteCount(value);
+                Span<byte> destination = 
GetValueBufferSpan(byteCount).Slice(0, byteCount);
+
+                if (byteCount > 0)
+                {
+                    unsafe
+                    {
+                        fixed (char* chars = value)
+                        fixed (byte* data = destination)
+                            encoding.GetBytes(chars, value.Length, data, 
byteCount);
+                    }
+                }
+
+                AdvanceValueBuffer(byteCount);
+                ValidityBuffer.Append(true);
+                Offset += byteCount;
+                ValueOffsets.Append(Offset);
+                return this;
             }
 
             public Builder AppendRange(IEnumerable<string> values, Encoding 
encoding = null)
             {
+                encoding = encoding ?? DefaultEncoding;
+
+                if (values is ICollection<string> collection && 
collection.Count > 0)
+                {
+                    int totalByteCount = 0;
+                    foreach (string value in collection)
+                    {
+                        if (value != null)
+                        {
+                            totalByteCount = checked(totalByteCount + 
encoding.GetByteCount(value));
+                        }
+                    }
+
+                    ValueOffsets.Reserve(collection.Count);
+                    ValidityBuffer.Reserve(collection.Count);
+                    ValueBuffer.Reserve(totalByteCount);
+                }
+
                 foreach (string value in values)
                 {
                     Append(value, encoding);
diff --git a/test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs b/test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs
new file mode 100644
index 0000000..4da75e8
--- /dev/null
+++ b/test/Apache.Arrow.Benchmarks/StringBuilderAppendBenchmark.cs
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using BenchmarkDotNet.Attributes;
+
+namespace Apache.Arrow.Benchmarks
+{
+    [MemoryDiagnoser]
+    [ShortRunJob]
+    public class StringBuilderAppendBenchmark
+    {
+        private const int Count = 10_000;
+        private string _payload;
+        private string[] _values;
+
+        [GlobalSetup]
+        public void GlobalSetup()
+        {
+            _payload = new string('a', 32);
+            _values = new string[Count];
+
+            for (int i = 0; i < _values.Length; i++)
+            {
+                _values[i] = _payload;
+            }
+        }
+
+        [Benchmark]
+        public int AppendSmallStrings()
+        {
+            var builder = new StringArray.Builder();
+
+            for (int i = 0; i < Count; i++)
+            {
+                builder.Append(_payload);
+            }
+
+            using StringArray array = builder.Build();
+            return array.Length;
+        }
+
+        [Benchmark]
+        public int AppendRangeSmallStrings()
+        {
+            using StringArray array = new StringArray.Builder()
+                .AppendRange(_values)
+                .Build();
+
+            return array.Length;
+        }
+    }
+}
diff --git a/test/Apache.Arrow.Tests/StringArrayTests.cs b/test/Apache.Arrow.Tests/StringArrayTests.cs
index b197315..d79726f 100644
--- a/test/Apache.Arrow.Tests/StringArrayTests.cs
+++ b/test/Apache.Arrow.Tests/StringArrayTests.cs
@@ -13,6 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+using System.Collections.Generic;
+using System.Text;
 using Xunit;
 
 namespace Apache.Arrow.Tests
@@ -81,5 +83,83 @@ namespace Apache.Arrow.Tests
                 Assert.Equal(firstValue, retrievedValue);
             }
         }
+
+        public class Builder
+        {
+            [Fact]
+            public void AppendUsesCustomEncoding()
+            {
+                const string expected = "héllø";
+
+                var array = new StringArray.Builder()
+                    .Append(expected, Encoding.Unicode)
+                    .Build();
+
+                Assert.Equal(expected, array.GetString(0, Encoding.Unicode));
+            }
+
+            [Fact]
+            public void AppendLargeStringUsesFallbackPath()
+            {
+                string expected = new string('x', 512);
+
+                var array = new StringArray.Builder()
+                    .Append(expected)
+                    .Build();
+
+                Assert.Equal(expected, array.GetString(0));
+            }
+
+            [Fact]
+            public void AppendRangePreservesCollectionValues()
+            {
+                string[] values = { "first", null, string.Empty, "last" };
+
+                var array = new StringArray.Builder()
+                    .AppendRange(values)
+                    .Build();
+
+                Assert.Equal("first", array.GetString(0));
+                Assert.Null(array.GetString(1));
+                Assert.Equal(string.Empty, array.GetString(2));
+                Assert.Equal("last", array.GetString(3));
+            }
+
+            [Fact]
+            public void 
AppendRangePreservesCollectionValuesWithCustomEncoding()
+            {
+                string[] values = { "héllø", null, string.Empty, "wørld" };
+
+                var array = new StringArray.Builder()
+                    .AppendRange(values, Encoding.Unicode)
+                    .Build();
+
+                Assert.Equal("héllø", array.GetString(0, Encoding.Unicode));
+                Assert.Null(array.GetString(1, Encoding.Unicode));
+                Assert.Equal(string.Empty, array.GetString(2, 
Encoding.Unicode));
+                Assert.Equal("wørld", array.GetString(3, Encoding.Unicode));
+            }
+
+            [Fact]
+            public void AppendRangePreservesEnumerableValues()
+            {
+                var array = new StringArray.Builder()
+                    .AppendRange(YieldValues())
+                    .Build();
+
+                Assert.Equal("first", array.GetString(0));
+                Assert.Null(array.GetString(1));
+                Assert.Equal(string.Empty, array.GetString(2));
+                Assert.Equal("last", array.GetString(3));
+            }
+
+            private static IEnumerable<string> YieldValues()
+            {
+                yield return "first";
+                yield return null;
+                yield return string.Empty;
+                yield return "last";
+            }
+        }
     }
 }

Reply via email to