(incubator-gluten) branch main updated: [VL] Fix inconsistent issue of PartitionFile path unescaping & GPL issue (#8793)

felixybw Thu, 20 Feb 2025 12:48:37 -0800

This is an automated email from the ASF dual-hosted git repository.

felixybw pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git



The following commit(s) were added to refs/heads/main by this push:
     new 18cbb27673 [VL] Fix inconsistent issue of PartitionFile path unescaping 
& GPL issue (#8793)
18cbb27673 is described below

commit 18cbb276737bfe533069f2b9928e0aa3a2879f96
Author: Kent Yao <[email protected]>
AuthorDate: Fri Feb 21 04:48:01 2025 +0800

    [VL] Fix inconsistent issue of PartitionFile path unescaping & GPL issue 
(#8793)
    
    GlutenURLDecoder.java is copied from OpenJDK and is licensed under GPL v2, which 
belongs to Category X, so we can't have it in Apache releases.
    
    URLDecoder decode/encode is not fully compatible with the Hive catalog path 
escaping/unescaping, which Spark also follows.
    
    Besides, apache/spark#46938 has improved unescapePathName's speed on the 
Spark side by ~10x, so this PR also helps Gluten gain performance when handling 
datasets with large numbers of partitions.
---
 .../org/apache/gluten/utils/GlutenURLDecoder.java  | 118 ---------------------
 .../backendsapi/velox/VeloxIteratorApi.scala       |  56 ++++++++--
 2 files changed, 50 insertions(+), 124 deletions(-)

diff --git 
a/backends-velox/src/main/java/org/apache/gluten/utils/GlutenURLDecoder.java 
b/backends-velox/src/main/java/org/apache/gluten/utils/GlutenURLDecoder.java
deleted file mode 100644
index 856ddf1597..0000000000
--- a/backends-velox/src/main/java/org/apache/gluten/utils/GlutenURLDecoder.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.gluten.utils;
-
-import java.io.*;
-import java.net.URLEncoder;
-
-public class GlutenURLDecoder {
-  // This Util is copied from
-  // 
https://github.com/openjdk-mirror/jdk7u-jdk/blob/master/src/share/classes/java/net/URLDecoder.java
-  // Just remove the special handle for '+'
-  /**
-   * Decodes a <code>application/x-www-form-urlencoded</code> string using a 
specific encoding
-   * scheme. The supplied encoding is used to determine what characters are 
represented by any
-   * consecutive sequences of the form "<code>%<i>xy</i></code>".
-   *
-   * <p><em><strong>Note:</strong> The <a href=
-   * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars";> World 
Wide Web Consortium
-   * Recommendation</a> states that UTF-8 should be used. Not doing so may 
introduce
-   * incompatibilities.</em>
-   *
-   * @param s the <code>String</code> to decode
-   * @param enc The name of a supported <a 
href="../lang/package-summary.html#charenc">character
-   *     encoding</a>.
-   * @return the newly decoded <code>String</code>
-   * @exception UnsupportedEncodingException If character encoding needs to be 
consulted, but named
-   *     character encoding is not supported
-   * @see URLEncoder#encode(java.lang.String, java.lang.String)
-   * @since 1.4
-   */
-  public static String decode(String s, String enc) throws 
UnsupportedEncodingException {
-
-    boolean needToChange = false;
-    int numChars = s.length();
-    StringBuffer sb = new StringBuffer(numChars > 500 ? numChars / 2 : 
numChars);
-    int i = 0;
-
-    if (enc.length() == 0) {
-      throw new UnsupportedEncodingException("URLDecoder: empty string enc 
parameter");
-    }
-
-    char c;
-    byte[] bytes = null;
-    while (i < numChars) {
-      c = s.charAt(i);
-      switch (c) {
-        case '%':
-          /*
-           * Starting with this instance of %, process all
-           * consecutive substrings of the form %xy. Each
-           * substring %xy will yield a byte. Convert all
-           * consecutive  bytes obtained this way to whatever
-           * character(s) they represent in the provided
-           * encoding.
-           */
-
-          try {
-
-            // (numChars-i)/3 is an upper bound for the number
-            // of remaining bytes
-            if (bytes == null) {
-              bytes = new byte[(numChars - i) / 3];
-            }
-            int pos = 0;
-
-            while (((i + 2) < numChars) && (c == '%')) {
-              int v = Integer.parseInt(s.substring(i + 1, i + 3), 16);
-              if (v < 0) {
-                throw new IllegalArgumentException(
-                    "URLDecoder: Illegal "
-                        + "hex characters in escape (%) pattern - negative 
value");
-              }
-              bytes[pos++] = (byte) v;
-              i += 3;
-              if (i < numChars) {
-                c = s.charAt(i);
-              }
-            }
-
-            // A trailing, incomplete byte encoding such as
-            // "%x" will cause an exception to be thrown
-
-            if ((i < numChars) && (c == '%')) {
-              throw new IllegalArgumentException(
-                  "URLDecoder: Incomplete trailing escape (%) pattern");
-            }
-
-            sb.append(new String(bytes, 0, pos, enc));
-          } catch (NumberFormatException e) {
-            throw new IllegalArgumentException(
-                "URLDecoder: Illegal hex characters in escape (%) pattern - " 
+ e.getMessage());
-          }
-          needToChange = true;
-          break;
-        default:
-          sb.append(c);
-          i++;
-          break;
-      }
-    }
-
-    return (needToChange ? sb.toString() : s);
-  }
-}
diff --git 
a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxIteratorApi.scala
 
b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxIteratorApi.scala
index badea33f58..27b721d1cd 100644
--- 
a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxIteratorApi.scala
+++ 
b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxIteratorApi.scala
@@ -17,6 +17,7 @@
 package org.apache.gluten.backendsapi.velox
 
 import org.apache.gluten.backendsapi.{BackendsApiManager, IteratorApi}
+import org.apache.gluten.backendsapi.velox.VeloxIteratorApi.unescapePathName
 import org.apache.gluten.config.GlutenNumaBindingInfo
 import org.apache.gluten.execution._
 import org.apache.gluten.iterator.Iterators
@@ -25,7 +26,6 @@ import org.apache.gluten.sql.shims.SparkShimLoader
 import org.apache.gluten.substrait.plan.PlanNode
 import org.apache.gluten.substrait.rel.{LocalFilesBuilder, LocalFilesNode, 
SplitInfo}
 import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat
-import org.apache.gluten.utils._
 import org.apache.gluten.vectorized._
 
 import org.apache.spark.{SparkConf, TaskContext}
@@ -119,11 +119,7 @@ class VeloxIteratorApi extends IteratorApi with Logging {
     val metadataColumns = new JArrayList[JMap[String, String]]
     files.foreach {
       file =>
-        // The "file.filePath" in PartitionedFile is not the original encoded 
path, so the decoded
-        // path is incorrect in some cases and here fix the case of ' ' by 
using GlutenURLDecoder
-        paths.add(
-          GlutenURLDecoder
-            .decode(file.filePath.toString, StandardCharsets.UTF_8.name()))
+        paths.add(unescapePathName(file.filePath.toString))
         starts.add(JLong.valueOf(file.start))
         lengths.add(JLong.valueOf(file.length))
         val (fileSize, modificationTime) =
@@ -269,3 +265,51 @@ class VeloxIteratorApi extends IteratorApi with Logging {
   }
   // scalastyle:on argcount
 }
+
+object VeloxIteratorApi {
+  // lookup table to translate '0' -> 0 ... 'F'/'f' -> 15
+  private val unhexDigits = {
+    val array = Array.fill[Byte](128)(-1)
+    (0 to 9).foreach(i => array('0' + i) = i.toByte)
+    (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
+    (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
+    array
+  }
+
+  def unescapePathName(path: String): String = {
+    if (path == null || path.isEmpty) {
+      return path
+    }
+    var plaintextEndIdx = path.indexOf('%')
+    val length = path.length
+    if (plaintextEndIdx == -1 || plaintextEndIdx + 2 >= length) {
+      // fast path, no %xx encoding found then return the string identity
+      path
+    } else {
+      val sb = new java.lang.StringBuilder(length)
+      var plaintextStartIdx = 0
+      while (plaintextEndIdx != -1 && plaintextEndIdx + 2 < length) {
+        if (plaintextEndIdx > plaintextStartIdx) sb.append(path, 
plaintextStartIdx, plaintextEndIdx)
+        val high = path.charAt(plaintextEndIdx + 1)
+        if ((high >>> 8) == 0 && unhexDigits(high) != -1) {
+          val low = path.charAt(plaintextEndIdx + 2)
+          if ((low >>> 8) == 0 && unhexDigits(low) != -1) {
+            sb.append((unhexDigits(high) << 4 | 
unhexDigits(low)).asInstanceOf[Char])
+            plaintextStartIdx = plaintextEndIdx + 3
+          } else {
+            sb.append('%')
+            plaintextStartIdx = plaintextEndIdx + 1
+          }
+        } else {
+          sb.append('%')
+          plaintextStartIdx = plaintextEndIdx + 1
+        }
+        plaintextEndIdx = path.indexOf('%', plaintextStartIdx)
+      }
+      if (plaintextStartIdx < length) {
+        sb.append(path, plaintextStartIdx, length)
+      }
+      sb.toString
+    }
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(incubator-gluten) branch main updated: [VL] Fix inconsistent issue of PartitionFile path unescaping & GPL issue (#8793)

Reply via email to