This is an automated email from the ASF dual-hosted git repository.

lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new 19da8eb156 [arrow] Optimize Arrow string write performance (#6240)
19da8eb156 is described below

commit 19da8eb1568a5e8b5af03de5435073c7ed03c3f4
Author: Jingsong Lee <[email protected]>
AuthorDate: Thu Sep 11 17:14:54 2025 +0800

    [arrow] Optimize Arrow string write performance (#6240)
---
 .../paimon/arrow/writer/ArrowFieldWriters.java     | 22 ++++++-
 paimon-benchmark/paimon-micro-benchmarks/pom.xml   |  6 ++
 .../benchmark/arrow/ArrowWriteBenchmark.java       | 68 ++++++++++++++++++++++
 3 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriters.java b/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriters.java
index feb3bc2460..1c7bb742f5 100644
--- a/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriters.java
+++ b/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriters.java
@@ -19,6 +19,7 @@
 package org.apache.paimon.arrow.writer;
 
 import org.apache.paimon.arrow.ArrowUtils;
+import org.apache.paimon.data.BinaryString;
 import org.apache.paimon.data.DataGetters;
 import org.apache.paimon.data.InternalArray;
 import org.apache.paimon.data.InternalMap;
@@ -39,6 +40,7 @@ import org.apache.paimon.data.columnar.RowColumnVector;
 import org.apache.paimon.data.columnar.ShortColumnVector;
 import org.apache.paimon.data.columnar.TimestampColumnVector;
 import org.apache.paimon.data.columnar.VectorizedColumnBatch;
+import org.apache.paimon.memory.MemorySegment;
 import org.apache.paimon.utils.IntArrayList;
 
 import org.apache.arrow.vector.BigIntVector;
@@ -94,7 +96,25 @@ public class ArrowFieldWriters {
 
         @Override
         protected void doWrite(int rowIndex, DataGetters getters, int pos) {
-            ((VarCharVector) fieldVector).setSafe(rowIndex, getters.getString(pos).toBytes());
+            BinaryString binaryString = getters.getString(pos);
+            MemorySegment[] segments = binaryString.getSegments();
+
+            // Very important performance optimization, which can avoid copying out new byte array
+            if (segments.length == 1) {
+                byte[] heapMemory = segments[0].getHeapMemory();
+                if (heapMemory != null) {
+                    ((VarCharVector) fieldVector)
+                            .setSafe(
+                                    rowIndex,
+                                    heapMemory,
+                                    binaryString.getOffset(),
+                                    binaryString.getSizeInBytes());
+                    return;
+                }
+            }
+
+            // Else copy new byte array
+            ((VarCharVector) fieldVector).setSafe(rowIndex, binaryString.toBytes());
         }
     }
 
diff --git a/paimon-benchmark/paimon-micro-benchmarks/pom.xml b/paimon-benchmark/paimon-micro-benchmarks/pom.xml
index 319433f41f..ddac785a1e 100644
--- a/paimon-benchmark/paimon-micro-benchmarks/pom.xml
+++ b/paimon-benchmark/paimon-micro-benchmarks/pom.xml
@@ -146,6 +146,12 @@ under the License.
             <version>${project.version}</version>
         </dependency>
 
+        <dependency>
+            <groupId>org.apache.paimon</groupId>
+            <artifactId>paimon-arrow</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+
         <!-- Test -->
 
         <dependency>
diff --git a/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/arrow/ArrowWriteBenchmark.java b/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/arrow/ArrowWriteBenchmark.java
new file mode 100644
index 0000000000..8bdedbc0fd
--- /dev/null
+++ b/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/arrow/ArrowWriteBenchmark.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.benchmark.arrow;
+
+import org.apache.paimon.arrow.vector.ArrowFormatWriter;
+import org.apache.paimon.benchmark.Benchmark;
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.data.GenericRow;
+import org.apache.paimon.types.DataTypes;
+import org.apache.paimon.types.RowType;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.concurrent.ThreadLocalRandom;
+
+/** Benchmark for arrow write. */
+public class ArrowWriteBenchmark {
+
+    @Test
+    public void testWrite() {
+        int batch = 1024 * 20;
+        Benchmark benchmark =
+                new Benchmark("read", batch * batch)
+                        .setNumWarmupIters(1)
+                        .setOutputPerIteration(true);
+        RowType rowType =
+                RowType.of(
+                        DataTypes.STRING(),
+                        DataTypes.STRING(),
+                        DataTypes.STRING(),
+                        DataTypes.STRING());
+        ThreadLocalRandom rnd = ThreadLocalRandom.current();
+        byte[] bytes = new byte[100];
+        rnd.nextBytes(bytes);
+        BinaryString binaryString = BinaryString.fromBytes(bytes, 1, 99);
+        GenericRow row = GenericRow.of(binaryString, binaryString, binaryString, binaryString);
+        benchmark.addCase(
+                "write",
+                1,
+                () -> {
+                    try (ArrowFormatWriter writer = new ArrowFormatWriter(rowType, batch, true)) {
+                        for (int i = 0; i < batch; i++) {
+                            writer.reset();
+                            for (int j = 0; j < batch; j++) {
+                                writer.write(row);
+                            }
+                        }
+                    }
+                });
+        benchmark.run();
+    }
+}

Reply via email to