This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new c832e2ac1d04 [SPARK-47492][SQL] Widen whitespace rules in lexer
c832e2ac1d04 is described below

commit c832e2ac1d04668c77493577662c639785808657
Author: Serge Rielau <srie...@users.noreply.github.com>
AuthorDate: Thu Mar 28 15:51:32 2024 -0700

    [SPARK-47492][SQL] Widen whitespace rules in lexer
    
    ### What changes were proposed in this pull request?
    
    In this PR we extend the Lexer's understanding of whitespace (what 
separates tokens) from the ASCII <SPACE>, <LF>, <TAB>, <CR> to the various Unicode 
flavors of "space" such as "narrow" and "wide".
    
    ### Why are the changes needed?
    
    SQL statements are frequently copy pasted from various sources. Many of 
these sources are "rich text" and based on Unicode.
    When doing so, it is inevitable that non-ASCII whitespace characters are 
copied.
    This results today in often incomprehensible syntax errors.
    Incomprehensible because the error message prints the "bad" whitespace just 
like an ASCII whitespace.
    So the user stands little chance of finding the root cause unless they use 
editor options to highlight non-ASCII spaces or they, by sheer luck, 
happen to remove the whitespace.
    
    So in this PR we acknowledge the reality and stop "discriminating" against 
non-ASCII whitespace.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Queries that used to fail before with a Syntax error, now succeed.
    
    ### How was this patch tested?
    
    Added a new set of unit tests in SparkSQLParserSuite
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #45620 from srielau/SPARK-47492-Widen-whitespace-rules-in-lexer.
    
    Lead-authored-by: Serge Rielau <srie...@users.noreply.github.com>
    Co-authored-by: Serge Rielau <se...@rielau.com>
    Signed-off-by: Gengliang Wang <gengli...@apache.org>
---
 .../spark/sql/catalyst/parser/SqlBaseLexer.g4      |  2 +-
 .../spark/sql/execution/SparkSqlParserSuite.scala  | 80 ++++++++++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git 
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
index 7c376e226850..f5565f0a63fb 100644
--- 
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
+++ 
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
@@ -554,7 +554,7 @@ BRACKETED_COMMENT
     ;
 
 WS
-    : [ \r\n\t]+ -> channel(HIDDEN)
+    : [ 
\t\n\f\r\u000B\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u202F\u205F\u3000]+
 -> channel(HIDDEN)
     ;
 
 // Catch-all for anything we can't recognize.
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala
index c3768afa90f1..f60df77b7e9b 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala
@@ -800,4 +800,84 @@ class SparkSqlParserSuite extends AnalysisTest with 
SharedSparkSession {
         start = 0,
         stop = 63))
   }
+
+  test("verify whitespace handling - standard whitespace") {
+    parser.parsePlan("SELECT 1") // ASCII space
+    parser.parsePlan("SELECT\r1") // ASCII carriage return
+    parser.parsePlan("SELECT\n1") // ASCII line feed
+    parser.parsePlan("SELECT\t1") // ASCII tab
+    parser.parsePlan("SELECT\u000B1") // ASCII vertical tab
+    parser.parsePlan("SELECT\f1") // ASCII form feed
+  }
+
+  // Need to switch off scala style for Unicode characters
+  // scalastyle:off
+  test("verify whitespace handling - Unicode no-break space") {
+    parser.parsePlan("SELECT\u00A01") // Unicode no-break space
+  }
+
+  test("verify whitespace handling - Unicode ogham space mark") {
+    parser.parsePlan("SELECT\u16801") // Unicode ogham space mark
+  }
+
+  test("verify whitespace handling - Unicode en quad") {
+    parser.parsePlan("SELECT\u20001") // Unicode en quad
+  }
+
+  test("verify whitespace handling - Unicode em quad") {
+    parser.parsePlan("SELECT\u20011") // Unicode em quad
+  }
+
+  test("verify whitespace handling - Unicode en space") {
+    parser.parsePlan("SELECT\u20021") // Unicode en space
+  }
+
+  test("verify whitespace handling - Unicode em space") {
+    parser.parsePlan("SELECT\u20031") // Unicode em space
+  }
+
+  test("verify whitespace handling - Unicode three-per-em space") {
+    parser.parsePlan("SELECT\u20041") // Unicode three-per-em space
+  }
+
+  test("verify whitespace handling - Unicode four-per-em space") {
+    parser.parsePlan("SELECT\u20051") // Unicode four-per-em space
+  }
+
+  test("verify whitespace handling - Unicode six-per-em space") {
+    parser.parsePlan("SELECT\u20061") // Unicode six-per-em space
+  }
+
+  test("verify whitespace handling - Unicode figure space") {
+    parser.parsePlan("SELECT\u20071") // Unicode figure space
+  }
+
+  test("verify whitespace handling - Unicode punctuation space") {
+    parser.parsePlan("SELECT\u20081") // Unicode punctuation space
+  }
+
+  test("verify whitespace handling - Unicode thin space") {
+    parser.parsePlan("SELECT\u20091") // Unicode thin space
+  }
+
+  test("verify whitespace handling - Unicode hair space") {
+    parser.parsePlan("SELECT\u200A1") // Unicode hair space
+  }
+
+  test("verify whitespace handling - Unicode line separator") {
+    parser.parsePlan("SELECT\u20281") // Unicode line separator
+  }
+
+  test("verify whitespace handling - Unicode narrow no-break space") {
+    parser.parsePlan("SELECT\u202F1") // Unicode narrow no-break space
+  }
+
+  test("verify whitespace handling - Unicode medium mathematical space") {
+    parser.parsePlan("SELECT\u205F1") // Unicode medium mathematical space
+  }
+
+  test("verify whitespace handling - Unicode ideographic space") {
+    parser.parsePlan("SELECT\u30001") // Unicode ideographic space
+  }
+  // scalastyle:on
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to