This is an automated email from the ASF dual-hosted git repository.
felixybw pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 18cbb27673 [VL] Fix incosistent issue of PartitionFile path unescaping
& GPL issue (#8793)
18cbb27673 is described below
commit 18cbb276737bfe533069f2b9928e0aa3a2879f96
Author: Kent Yao <[email protected]>
AuthorDate: Fri Feb 21 04:48:01 2025 +0800
[VL] Fix incosistent issue of PartitionFile path unescaping & GPL issue
(#8793)
GlutenURLDecoder.java is copied from OpenJDK and it's under GPL v2 which
belongs to Category X, we can't have it in Apache Releases.
URLDecoder decode/encode is not fully compatible with the Hive catalog path
escaping/unescaping, which Spark also follows.
Besides, apache/spark#46938 has improved unescapePathName's speed at the
Spark side by ~10x. So This PR also helps gluten gain perf which handles
datasets w/ large partition numbers.
---
.../org/apache/gluten/utils/GlutenURLDecoder.java | 118 ---------------------
.../backendsapi/velox/VeloxIteratorApi.scala | 56 ++++++++--
2 files changed, 50 insertions(+), 124 deletions(-)
diff --git
a/backends-velox/src/main/java/org/apache/gluten/utils/GlutenURLDecoder.java
b/backends-velox/src/main/java/org/apache/gluten/utils/GlutenURLDecoder.java
deleted file mode 100644
index 856ddf1597..0000000000
--- a/backends-velox/src/main/java/org/apache/gluten/utils/GlutenURLDecoder.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.gluten.utils;
-
-import java.io.*;
-import java.net.URLEncoder;
-
-public class GlutenURLDecoder {
- // This Util is copied from
- //
https://github.com/openjdk-mirror/jdk7u-jdk/blob/master/src/share/classes/java/net/URLDecoder.java
- // Just remove the special handle for '+'
- /**
- * Decodes a <code>application/x-www-form-urlencoded</code> string using a
specific encoding
- * scheme. The supplied encoding is used to determine what characters are
represented by any
- * consecutive sequences of the form "<code>%<i>xy</i></code>".
- *
- * <p><em><strong>Note:</strong> The <a href=
- * "http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars"> World
Wide Web Consortium
- * Recommendation</a> states that UTF-8 should be used. Not doing so may
introduce
- * incompatibilities.</em>
- *
- * @param s the <code>String</code> to decode
- * @param enc The name of a supported <a
href="../lang/package-summary.html#charenc">character
- * encoding</a>.
- * @return the newly decoded <code>String</code>
- * @exception UnsupportedEncodingException If character encoding needs to be
consulted, but named
- * character encoding is not supported
- * @see URLEncoder#encode(java.lang.String, java.lang.String)
- * @since 1.4
- */
- public static String decode(String s, String enc) throws
UnsupportedEncodingException {
-
- boolean needToChange = false;
- int numChars = s.length();
- StringBuffer sb = new StringBuffer(numChars > 500 ? numChars / 2 :
numChars);
- int i = 0;
-
- if (enc.length() == 0) {
- throw new UnsupportedEncodingException("URLDecoder: empty string enc
parameter");
- }
-
- char c;
- byte[] bytes = null;
- while (i < numChars) {
- c = s.charAt(i);
- switch (c) {
- case '%':
- /*
- * Starting with this instance of %, process all
- * consecutive substrings of the form %xy. Each
- * substring %xy will yield a byte. Convert all
- * consecutive bytes obtained this way to whatever
- * character(s) they represent in the provided
- * encoding.
- */
-
- try {
-
- // (numChars-i)/3 is an upper bound for the number
- // of remaining bytes
- if (bytes == null) {
- bytes = new byte[(numChars - i) / 3];
- }
- int pos = 0;
-
- while (((i + 2) < numChars) && (c == '%')) {
- int v = Integer.parseInt(s.substring(i + 1, i + 3), 16);
- if (v < 0) {
- throw new IllegalArgumentException(
- "URLDecoder: Illegal "
- + "hex characters in escape (%) pattern - negative
value");
- }
- bytes[pos++] = (byte) v;
- i += 3;
- if (i < numChars) {
- c = s.charAt(i);
- }
- }
-
- // A trailing, incomplete byte encoding such as
- // "%x" will cause an exception to be thrown
-
- if ((i < numChars) && (c == '%')) {
- throw new IllegalArgumentException(
- "URLDecoder: Incomplete trailing escape (%) pattern");
- }
-
- sb.append(new String(bytes, 0, pos, enc));
- } catch (NumberFormatException e) {
- throw new IllegalArgumentException(
- "URLDecoder: Illegal hex characters in escape (%) pattern - "
+ e.getMessage());
- }
- needToChange = true;
- break;
- default:
- sb.append(c);
- i++;
- break;
- }
- }
-
- return (needToChange ? sb.toString() : s);
- }
-}
diff --git
a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxIteratorApi.scala
b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxIteratorApi.scala
index badea33f58..27b721d1cd 100644
---
a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxIteratorApi.scala
+++
b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxIteratorApi.scala
@@ -17,6 +17,7 @@
package org.apache.gluten.backendsapi.velox
import org.apache.gluten.backendsapi.{BackendsApiManager, IteratorApi}
+import org.apache.gluten.backendsapi.velox.VeloxIteratorApi.unescapePathName
import org.apache.gluten.config.GlutenNumaBindingInfo
import org.apache.gluten.execution._
import org.apache.gluten.iterator.Iterators
@@ -25,7 +26,6 @@ import org.apache.gluten.sql.shims.SparkShimLoader
import org.apache.gluten.substrait.plan.PlanNode
import org.apache.gluten.substrait.rel.{LocalFilesBuilder, LocalFilesNode,
SplitInfo}
import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat
-import org.apache.gluten.utils._
import org.apache.gluten.vectorized._
import org.apache.spark.{SparkConf, TaskContext}
@@ -119,11 +119,7 @@ class VeloxIteratorApi extends IteratorApi with Logging {
val metadataColumns = new JArrayList[JMap[String, String]]
files.foreach {
file =>
- // The "file.filePath" in PartitionedFile is not the original encoded
path, so the decoded
- // path is incorrect in some cases and here fix the case of ' ' by
using GlutenURLDecoder
- paths.add(
- GlutenURLDecoder
- .decode(file.filePath.toString, StandardCharsets.UTF_8.name()))
+ paths.add(unescapePathName(file.filePath.toString))
starts.add(JLong.valueOf(file.start))
lengths.add(JLong.valueOf(file.length))
val (fileSize, modificationTime) =
@@ -269,3 +265,51 @@ class VeloxIteratorApi extends IteratorApi with Logging {
}
// scalastyle:on argcount
}
+
+object VeloxIteratorApi {
+ // lookup table to translate '0' -> 0 ... 'F'/'f' -> 15
+ private val unhexDigits = {
+ val array = Array.fill[Byte](128)(-1)
+ (0 to 9).foreach(i => array('0' + i) = i.toByte)
+ (0 to 5).foreach(i => array('A' + i) = (i + 10).toByte)
+ (0 to 5).foreach(i => array('a' + i) = (i + 10).toByte)
+ array
+ }
+
+ def unescapePathName(path: String): String = {
+ if (path == null || path.isEmpty) {
+ return path
+ }
+ var plaintextEndIdx = path.indexOf('%')
+ val length = path.length
+ if (plaintextEndIdx == -1 || plaintextEndIdx + 2 >= length) {
+ // fast path, no %xx encoding found then return the string identity
+ path
+ } else {
+ val sb = new java.lang.StringBuilder(length)
+ var plaintextStartIdx = 0
+ while (plaintextEndIdx != -1 && plaintextEndIdx + 2 < length) {
+ if (plaintextEndIdx > plaintextStartIdx) sb.append(path,
plaintextStartIdx, plaintextEndIdx)
+ val high = path.charAt(plaintextEndIdx + 1)
+ if ((high >>> 8) == 0 && unhexDigits(high) != -1) {
+ val low = path.charAt(plaintextEndIdx + 2)
+ if ((low >>> 8) == 0 && unhexDigits(low) != -1) {
+ sb.append((unhexDigits(high) << 4 |
unhexDigits(low)).asInstanceOf[Char])
+ plaintextStartIdx = plaintextEndIdx + 3
+ } else {
+ sb.append('%')
+ plaintextStartIdx = plaintextEndIdx + 1
+ }
+ } else {
+ sb.append('%')
+ plaintextStartIdx = plaintextEndIdx + 1
+ }
+ plaintextEndIdx = path.indexOf('%', plaintextStartIdx)
+ }
+ if (plaintextStartIdx < length) {
+ sb.append(path, plaintextStartIdx, length)
+ }
+ sb.toString
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]