This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 348d88161a9 [SPARK-46173][SQL] Skipping trimAll call during date 
parsing
348d88161a9 is described below

commit 348d88161a9bc98b37c0e097c81c9ecabeb2d76a
Author: Aleksandar Tomic <aleksandar.to...@databricks.com>
AuthorDate: Wed Dec 6 07:15:53 2023 -0800

    [SPARK-46173][SQL] Skipping trimAll call during date parsing
    
    ### What changes were proposed in this pull request?
    
    This PR is a response to a user complaint that stated that string to date
casts are too slow if the input string has many whitespace/ISO-control characters
as its prefix or suffix.
    
    The current behaviour is that in the StringToDate function we first call the
trimAll function to remove any whitespace/ISO-control chars. trimAll creates a new
string and then we work against that string. This is not really needed, since
StringToDate can simply skip these characters and do all the processing in a
single loop without extra allocations. The proposed fix does exactly that.
    
    ### Why are the changes needed?
    
    These changes should drastically improve the edge case where the input string
in a cast to date has a lot of whitespace as its prefix/suffix.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    Extending existing tests to cover both prefix and suffix cases with 
whitespaces/control chars.
    
    The change also includes a small unit benchmark for this particular case.
    Looking for feedback on whether we want to keep the benchmark, given that this
is a rather esoteric edge case.
    
    The benchmark measures the time needed to do 10k calls of the stringToDate
function against input strings in the format <65k whitespace><correct_date><65k
whitespace> and <correct_date><65k whitespace>. Here are the results:
    
    | input example | prior to change | after the change |
    | -------------- | ------------------ | ------------- |
    | 65k whitespace prefix + 65k suffix | 3250ms | 484ms |
    | 65k whitespace suffix | 1572ms | <1ms |
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #44110 from dbatomic/SPARK-46173-trim-all-skip.
    
    Authored-by: Aleksandar Tomic <aleksandar.to...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../org/apache/spark/unsafe/types/UTF8String.java  | 16 ++++++++--------
 .../sql/catalyst/util/SparkDateTimeUtils.scala     | 22 ++++++++++++++++++----
 .../sql/catalyst/util/DateTimeUtilsSuite.scala     | 20 +++++++++++++++++++-
 3 files changed, 45 insertions(+), 13 deletions(-)

diff --git 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index e362a13eb5f..a2dd03ff18b 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -148,6 +148,14 @@ public final class UTF8String implements 
Comparable<UTF8String>, Externalizable,
     return fromBytes(spaces);
   }
 
+  /**
+   * Determines if the specified character (Unicode code point) is white space 
or an ISO control
+   * character according to Java.
+   */
+  public static boolean isWhitespaceOrISOControl(int codePoint) {
+    return Character.isWhitespace(codePoint) || 
Character.isISOControl(codePoint);
+  }
+
   private UTF8String(Object base, long offset, int numBytes) {
     this.base = base;
     this.offset = offset;
@@ -498,14 +506,6 @@ public final class UTF8String implements 
Comparable<UTF8String>, Externalizable,
     return UTF8String.fromBytes(newBytes);
   }
 
-  /**
-   * Determines if the specified character (Unicode code point) is white space 
or an ISO control
-   * character according to Java.
-   */
-  private boolean isWhitespaceOrISOControl(int codePoint) {
-    return Character.isWhitespace(codePoint) || 
Character.isISOControl(codePoint);
-  }
-
   /**
    * Trims space characters (ASCII 32) from both ends of this string.
    *
diff --git 
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala
 
b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala
index 5e9fb0dd25f..35118b449e2 100644
--- 
a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala
+++ 
b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala
@@ -305,21 +305,35 @@ trait SparkDateTimeUtils {
       (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
         (segment != 0 && digits > 0 && digits <= 2)
     }
-    if (s == null || s.trimAll().numBytes() == 0) {
+    if (s == null) {
       return None
     }
+
     val segments: Array[Int] = Array[Int](1, 1, 1)
     var sign = 1
     var i = 0
     var currentSegmentValue = 0
     var currentSegmentDigits = 0
-    val bytes = s.trimAll().getBytes
+    val bytes = s.getBytes
     var j = 0
+    var strEndTrimmed = bytes.length
+
+    while (j < bytes.length && UTF8String.isWhitespaceOrISOControl(bytes(j))) {
+      j += 1;
+    }
+    if (j == bytes.length) {
+      return None;
+    }
+
+    while (strEndTrimmed > j && 
UTF8String.isWhitespaceOrISOControl(bytes(strEndTrimmed - 1))) {
+      strEndTrimmed -= 1;
+    }
+
     if (bytes(j) == '-' || bytes(j) == '+') {
       sign = if (bytes(j) == '-') -1 else 1
       j += 1
     }
-    while (j < bytes.length && (i < 3 && !(bytes(j) == ' ' || bytes(j) == 
'T'))) {
+    while (j < strEndTrimmed && (i < 3 && !(bytes(j) == ' ' || bytes(j) == 
'T'))) {
       val b = bytes(j)
       if (i < 2 && b == '-') {
         if (!isValidDigits(i, currentSegmentDigits)) {
@@ -343,7 +357,7 @@ trait SparkDateTimeUtils {
     if (!isValidDigits(i, currentSegmentDigits)) {
       return None
     }
-    if (i < 2 && j < bytes.length) {
+    if (i < 2 && j < strEndTrimmed) {
       // For the `yyyy` and `yyyy-[m]m` formats, entire input must be consumed.
       return None
     }
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index d12d01460f7..178d24352df 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -128,7 +128,21 @@ class DateTimeUtilsSuite extends SparkFunSuite with 
Matchers with SQLHelper {
   }
 
   test("SPARK-32559: string to date trim Control Characters") {
-    assert(toDate("MIDDLE\u0003") === None)
+    Seq("2015-03-18", "2015-03-18T123321", " 2015-03-18 123321", "+2015-03-18")
+      .foreach {
+        input => Seq(input, "\u0003", "\u0003", " ", " ")
+          .permutations.map(p => p.mkString).foreach {
+          s => assert(toDate(s).get === days(2015, 3, 18))
+        }
+    }
+
+    Seq("INVALID_INPUT", " ", "1999-08-", "2015-03-18\u0003123321", 
"2015-03-18Q123321")
+      .foreach {
+        input =>
+          Seq(input, "\u0003", "\u0003", " ", " ").permutations.map(p => 
p.mkString).foreach {
+            s => assert(toDate(s).isEmpty)
+          }
+      }
   }
 
   test("string to date") {
@@ -154,6 +168,10 @@ class DateTimeUtilsSuite extends SparkFunSuite with 
Matchers with SQLHelper {
     assert(toDate("1999-08-").isEmpty)
     assert(toDate("").isEmpty)
     assert(toDate("   ").isEmpty)
+    assert(toDate("+").isEmpty)
+    assert(toDate("-").isEmpty)
+    assert(toDate("xxx2015-01-28").isEmpty)
+    assert(toDate("--2015-01-28").isEmpty)
   }
 
   test("SPARK-35780: support full range of date string") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to