This is an automated email from the ASF dual-hosted git repository.

CurtHagenlocher pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-dotnet.git


The following commit(s) were added to refs/heads/main by this push:
     new ea58c94  perf: improve StringArray GetString decoding (#334)
ea58c94 is described below

commit ea58c946d7400349b6d6d14d4f124b42433fefc6
Author: InCerryGit <[email protected]>
AuthorDate: Mon Apr 27 12:01:09 2026 +0800

    perf: improve StringArray GetString decoding (#334)
    
    ## Summary
    
    `StringArray.GetString` previously routed through `GetBytes`, which
    repeated bounds/null/offset work before decoding the returned byte span.
    This PR decodes directly from the array's offsets and value buffer while
    preserving the materialized-string fast path.
    
    ## Benchmark
    
    BenchmarkDotNet, `StringArrayGetStringBenchmark`, Count=1024:
    
    | Method | Before | After |
    |---|---:|---:|
    | `GetString` | 23.23 us / 48.08 KB | 18.04 us / 48.08 KB |
    | `GetStringFromSlice` | 23.90 us / 48.00 KB | 18.05 us / 48.00 KB |
    
    ## Validation
    
    - `dotnet format Apache.Arrow.sln --include
    src/Apache.Arrow/Arrays/StringArray.cs
    test/Apache.Arrow.Tests/StringArrayTests.cs
    test/Apache.Arrow.Benchmarks/StringArrayGetStringBenchmark.cs
    --no-restore`
    - `dotnet test test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj -c
    Release --filter
    "FullyQualifiedName~Apache.Arrow.Tests.StringArrayTests"`
    - `dotnet build Apache.Arrow.sln -c Release`
---
 src/Apache.Arrow/Arrays/StringArray.cs             |  21 +++-
 .../StringArrayGetStringBenchmark.cs               | 133 +++++++++++++++++++++
 test/Apache.Arrow.Tests/StringArrayTests.cs        |  99 +++++++++++++++
 3 files changed, 248 insertions(+), 5 deletions(-)

diff --git a/src/Apache.Arrow/Arrays/StringArray.cs b/src/Apache.Arrow/Arrays/StringArray.cs
index 4998fae..a0961ef 100644
--- a/src/Apache.Arrow/Arrays/StringArray.cs
+++ b/src/Apache.Arrow/Arrays/StringArray.cs
@@ -124,22 +124,33 @@ namespace Apache.Arrow
                 return materializedStrings[index];
             }
 
-            ReadOnlySpan<byte> bytes = GetBytes(index, out bool isNull);
+            if (index < 0 || index >= Length)
+            {
+                throw new ArgumentOutOfRangeException(nameof(index));
+            }
 
-            if (isNull)
+            if (IsNull(index))
             {
                 return null;
             }
 
-            if (bytes.Length == 0)
+            ReadOnlySpan<int> offsets = ValueOffsets;
+            int valueOffset = offsets[index];
+            int valueLength = offsets[index + 1] - valueOffset;
+
+            if (valueLength == 0)
             {
                 return string.Empty;
             }
 
+            ReadOnlySpan<byte> values = Values;
+
+            // Decode directly from the shared value buffer so the hot path only pays one
+            // bounds/null/offset pass before handing off to the requested encoding.
             unsafe
             {
-                fixed (byte* data = &MemoryMarshal.GetReference(bytes))
-                    return encoding.GetString(data, bytes.Length);
+                fixed (byte* data = &MemoryMarshal.GetReference(values))
+                    return encoding.GetString(data + valueOffset, valueLength);
             }
         }
 
diff --git a/test/Apache.Arrow.Benchmarks/StringArrayGetStringBenchmark.cs b/test/Apache.Arrow.Benchmarks/StringArrayGetStringBenchmark.cs
new file mode 100644
index 0000000..3c49971
--- /dev/null
+++ b/test/Apache.Arrow.Benchmarks/StringArrayGetStringBenchmark.cs
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Text;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Jobs;
+
+namespace Apache.Arrow.Benchmarks
+{
+    [MemoryDiagnoser]
+    [ShortRunJob]
+    public class StringArrayGetStringBenchmark
+    {
+        private StringArray _array;
+        private StringArray _slice;
+
+        [Params(1_024)]
+        public int Count { get; set; }
+
+        [GlobalSetup]
+        public void GlobalSetup()
+        {
+            var builder = new StringArray.Builder();
+            builder.Append("prefix");
+
+            for (int i = 0; i < Count; i++)
+            {
+                if ((i & 7) == 0)
+                {
+                    builder.AppendNull();
+                }
+                else if ((i & 7) == 1)
+                {
+                    builder.Append(string.Empty);
+                }
+                else
+                {
+                    builder.Append($"value-{i:0000}-payload");
+                }
+            }
+
+            builder.Append("suffix");
+
+            _array = builder.Build();
+            _slice = (StringArray)_array.Slice(1, Count);
+        }
+
+        [GlobalCleanup]
+        public void GlobalCleanup()
+        {
+            _slice.Dispose();
+            _array.Dispose();
+        }
+
+        [Benchmark(Baseline = true)]
+        public int LegacyGetString()
+        {
+            int totalLength = 0;
+            for (int i = 0; i < _array.Length; i++)
+            {
+                totalLength += GetStringLegacy(_array, i)?.Length ?? 0;
+            }
+
+            return totalLength;
+        }
+
+        [Benchmark]
+        public int GetString()
+        {
+            int totalLength = 0;
+            for (int i = 0; i < _array.Length; i++)
+            {
+                totalLength += _array.GetString(i)?.Length ?? 0;
+            }
+
+            return totalLength;
+        }
+
+        [Benchmark]
+        public int LegacyGetStringFromSlice()
+        {
+            int totalLength = 0;
+            for (int i = 0; i < _slice.Length; i++)
+            {
+                totalLength += GetStringLegacy(_slice, i)?.Length ?? 0;
+            }
+
+            return totalLength;
+        }
+
+        [Benchmark]
+        public int GetStringFromSlice()
+        {
+            int totalLength = 0;
+            for (int i = 0; i < _slice.Length; i++)
+            {
+                totalLength += _slice.GetString(i)?.Length ?? 0;
+            }
+
+            return totalLength;
+        }
+
+        private static string GetStringLegacy(StringArray array, int index)
+        {
+            ReadOnlySpan<byte> bytes = array.GetBytes(index, out bool isNull);
+
+            if (isNull)
+            {
+                return null;
+            }
+
+            if (bytes.Length == 0)
+            {
+                return string.Empty;
+            }
+
+            return Encoding.UTF8.GetString(bytes);
+        }
+    }
+}
diff --git a/test/Apache.Arrow.Tests/StringArrayTests.cs b/test/Apache.Arrow.Tests/StringArrayTests.cs
index d79726f..95b0caa 100644
--- a/test/Apache.Arrow.Tests/StringArrayTests.cs
+++ b/test/Apache.Arrow.Tests/StringArrayTests.cs
@@ -82,6 +82,105 @@ namespace Apache.Arrow.Tests
                 Assert.True(array.IsMaterialized());
                 Assert.Equal(firstValue, retrievedValue);
             }
+
+            [Fact]
+            public void ReturnsAppendedValueForSlice()
+            {
+                // Arrange
+                var array = new StringArray.Builder()
+                    .Append("prefix")
+                    .Append("value")
+                    .AppendNull()
+                    .Append(string.Empty)
+                    .Build();
+
+                var slice = (StringArray)array.Slice(1, 3);
+
+                // Act / Assert
+                Assert.Equal("value", slice.GetString(0));
+                Assert.Null(slice.GetString(1));
+                Assert.Equal(string.Empty, slice.GetString(2));
+            }
+
+            [Fact]
+            public void ReturnsAppendedValueForSliceAfterMaterialize()
+            {
+                // Arrange
+                var array = new StringArray.Builder()
+                    .Append("prefix")
+                    .Append("value")
+                    .AppendNull()
+                    .Append(string.Empty)
+                    .Build();
+
+                var slice = (StringArray)array.Slice(1, 3);
+
+                // Act
+                slice.Materialize();
+
+                // Assert
+                Assert.True(slice.IsMaterialized());
+                Assert.Equal("value", slice.GetString(0));
+                Assert.Null(slice.GetString(1));
+                Assert.Equal(string.Empty, slice.GetString(2));
+            }
+
+            [Fact]
+            public void ReturnsAppendedValueWithCustomEncoding()
+            {
+                // Arrange
+                const string expected = "héllø";
+                var array = new StringArray.Builder()
+                    .Append(expected, Encoding.Unicode)
+                    .Build();
+
+                // Act
+                var retrievedValue = array.GetString(0, Encoding.Unicode);
+
+                // Assert
+                Assert.Equal(expected, retrievedValue);
+            }
+
+            [Fact]
+            public void ReturnsAppendedValueWithCustomEncodingAfterMaterialize()
+            {
+                // Arrange
+                const string expected = "héllø";
+                var array = new StringArray.Builder()
+                    .Append(expected, Encoding.Unicode)
+                    .Build();
+
+                // Act
+                array.Materialize(Encoding.Unicode);
+                var retrievedValue = array.GetString(0, Encoding.Unicode);
+
+                // Assert
+                Assert.True(array.IsMaterialized(Encoding.Unicode));
+                Assert.Equal(expected, retrievedValue);
+            }
+
+            [Fact]
+            public void ReturnsAppendedValueForCustomEncodingSliceAfterMaterialize()
+            {
+                // Arrange
+                var array = new StringArray.Builder()
+                    .Append("prefix", Encoding.Unicode)
+                    .Append("héllø", Encoding.Unicode)
+                    .AppendNull()
+                    .Append(string.Empty, Encoding.Unicode)
+                    .Build();
+
+                var slice = (StringArray)array.Slice(1, 3);
+
+                // Act
+                slice.Materialize(Encoding.Unicode);
+
+                // Assert
+                Assert.True(slice.IsMaterialized(Encoding.Unicode));
+                Assert.Equal("héllø", slice.GetString(0, Encoding.Unicode));
+                Assert.Null(slice.GetString(1, Encoding.Unicode));
+                Assert.Equal(string.Empty, slice.GetString(2, Encoding.Unicode));
+            }
         }
 
         public class Builder

Reply via email to