Repository: spark
Updated Branches:
  refs/heads/master 4a779bdac -> bb035f1ee

[SPARK-22169][SQL] support byte length literal as identifier

## What changes were proposed in this pull request?

By definition, a table name in Spark can be something like `123x`, `25a`, etc., with exceptions for literals like `12L`, `23BD`, etc. However, Spark SQL has a special byte length literal, which prevents users from using digits followed by `b`, `k`, `m`, or `g` as identifiers. The byte length literal is not a standard SQL literal and is only used in the `tableSample` parser rule. This PR moves the parsing of byte length literals from the lexer to the parser, so that users can use them as identifiers. A short usage sketch is appended after the diff below.

## How was this patch tested?

Regression tests in `PlanParserSuite` and `DDLParserSuite`.

Author: Wenchen Fan <wenc...@databricks.com>

Closes #19392 from cloud-fan/parser-bug.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bb035f1e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bb035f1e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bb035f1e

Branch: refs/heads/master
Commit: bb035f1ee5cdf88e476b7ed83d59140d669fbe12
Parents: 4a779bd
Author: Wenchen Fan <wenc...@databricks.com>
Authored: Wed Oct 4 13:13:51 2017 -0700
Committer: gatorsmile <gatorsm...@gmail.com>
Committed: Wed Oct 4 13:13:51 2017 -0700

----------------------------------------------------------------------
 .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 25 ++++++++-----------
 .../sql/catalyst/catalog/SessionCatalog.scala   |  2 +-
 .../spark/sql/catalyst/parser/AstBuilder.scala  | 26 ++++++++++++++------
 .../sql/catalyst/parser/PlanParserSuite.scala   |  1 +
 .../sql/execution/command/DDLParserSuite.scala  | 19 ++++++++++++++
 5 files changed, 49 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/bb035f1e/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
index d0a5428..17c8404 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4
@@ -25,7 +25,7 @@ grammar SqlBase;
  * For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'.
  * For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'.
  * For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'.
- * For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is folllowed
+ * For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is followed
  * by a space. 34.E2 is a valid decimal token because it is followed by symbol '+'
  * which is not a digit or letter or underscore.
  */
@@ -40,10 +40,6 @@ grammar SqlBase;
   }
 }
 
-tokens {
-    DELIMITER
-}
-
 singleStatement
     : statement EOF
     ;
@@ -447,12 +443,15 @@ joinCriteria
     ;
 
 sample
-    : TABLESAMPLE '('
-      ( (negativeSign=MINUS? percentage=(INTEGER_VALUE | DECIMAL_VALUE) sampleType=PERCENTLIT)
-      | (expression sampleType=ROWS)
-      | sampleType=BYTELENGTH_LITERAL
-      | (sampleType=BUCKET numerator=INTEGER_VALUE OUT OF denominator=INTEGER_VALUE (ON (identifier | qualifiedName '(' ')'))?))
-      ')'
+    : TABLESAMPLE '(' sampleMethod? ')'
+    ;
+
+sampleMethod
+    : negativeSign=MINUS? percentage=(INTEGER_VALUE | DECIMAL_VALUE) PERCENTLIT  #sampleByPercentile
+    | expression ROWS                                                            #sampleByRows
+    | sampleType=BUCKET numerator=INTEGER_VALUE OUT OF denominator=INTEGER_VALUE
+        (ON (identifier | qualifiedName '(' ')'))?                               #sampleByBucket
+    | bytes=expression                                                           #sampleByBytes
     ;
 
 identifierList
@@ -1004,10 +1003,6 @@ TINYINT_LITERAL
     : DIGIT+ 'Y'
     ;
 
-BYTELENGTH_LITERAL
-    : DIGIT+ ('B' | 'K' | 'M' | 'G')
-    ;
-
 INTEGER_VALUE
     : DIGIT+
     ;

http://git-wip-us.apache.org/repos/asf/spark/blob/bb035f1e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
index 6ba9ee5..95bc3d6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
@@ -99,7 +99,7 @@ class SessionCatalog(
   protected var currentDb: String = formatDatabaseName(DEFAULT_DATABASE)
 
   /**
-   * Checks if the given name conforms the Hive standard ("[a-zA-z_0-9]+"),
+   * Checks if the given name conforms the Hive standard ("[a-zA-Z_0-9]+"),
    * i.e. if this name only contains characters, numbers, and _.
    *
    * This method is intended to have the same behavior of

http://git-wip-us.apache.org/repos/asf/spark/blob/bb035f1e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index 85b492e..ce36714 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -699,20 +699,30 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
       Sample(0.0, fraction, withReplacement = false, (math.random * 1000).toInt, query)
     }
 
-    ctx.sampleType.getType match {
-      case SqlBaseParser.ROWS =>
+    if (ctx.sampleMethod() == null) {
+      throw new ParseException("TABLESAMPLE does not accept empty inputs.", ctx)
+    }
+
+    ctx.sampleMethod() match {
+      case ctx: SampleByRowsContext =>
         Limit(expression(ctx.expression), query)
 
-      case SqlBaseParser.PERCENTLIT =>
+      case ctx: SampleByPercentileContext =>
         val fraction = ctx.percentage.getText.toDouble
         val sign = if (ctx.negativeSign == null) 1 else -1
         sample(sign * fraction / 100.0d)
 
-      case SqlBaseParser.BYTELENGTH_LITERAL =>
-        throw new ParseException(
-          "TABLESAMPLE(byteLengthLiteral) is not supported", ctx)
+      case ctx: SampleByBytesContext =>
+        val bytesStr = ctx.bytes.getText
+        if (bytesStr.matches("[0-9]+[bBkKmMgG]")) {
+          throw new ParseException("TABLESAMPLE(byteLengthLiteral) is not supported", ctx)
+        } else {
+          throw new ParseException(
+            bytesStr + " is not a valid byte length literal, " +
+              "expected syntax: DIGIT+ ('B' | 'K' | 'M' | 'G')", ctx)
+        }
 
-      case SqlBaseParser.BUCKET if ctx.ON != null =>
+      case ctx: SampleByBucketContext if ctx.ON() != null =>
         if (ctx.identifier != null) {
           throw new ParseException(
             "TABLESAMPLE(BUCKET x OUT OF y ON colname) is not supported", ctx)
@@ -721,7 +731,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
             "TABLESAMPLE(BUCKET x OUT OF y ON function) is not supported", ctx)
         }
 
-      case SqlBaseParser.BUCKET =>
+      case ctx: SampleByBucketContext =>
         sample(ctx.numerator.getText.toDouble / ctx.denominator.getText.toDouble)
     }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/bb035f1e/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
index 306e6f2..d34a83c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
@@ -110,6 +110,7 @@ class PlanParserSuite extends AnalysisTest {
     assertEqual("select distinct a, b from db.c", Distinct(table("db", "c").select('a, 'b)))
     assertEqual("select all a, b from db.c", table("db", "c").select('a, 'b))
     assertEqual("select from tbl", OneRowRelation().select('from.as("tbl")))
+    assertEqual("select a from 1k.2m", table("1k", "2m").select('a))
   }
 
   test("reverse select query") {

http://git-wip-us.apache.org/repos/asf/spark/blob/bb035f1e/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala
index fa5172c..eb7c335 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala
@@ -525,6 +525,25 @@ class DDLParserSuite extends PlanTest with SharedSQLContext {
     assert(e.message.contains("you can only specify one of them."))
   }
 
+  test("create table - byte length literal table name") {
+    val sql = "CREATE TABLE 1m.2g(a INT) USING parquet"
+
+    val expectedTableDesc = CatalogTable(
+      identifier = TableIdentifier("2g", Some("1m")),
+      tableType = CatalogTableType.MANAGED,
+      storage = CatalogStorageFormat.empty,
+      schema = new StructType().add("a", IntegerType),
+      provider = Some("parquet"))
+
+    parser.parsePlan(sql) match {
+      case CreateTable(tableDesc, _, None) =>
+        assert(tableDesc == expectedTableDesc.copy(createTime = tableDesc.createTime))
+      case other =>
+        fail(s"Expected to parse ${classOf[CreateTableCommand].getClass.getName} from query," +
+          s"got ${other.getClass.getName}: $sql")
+    }
+  }
+
   test("insert overwrite directory") {
     val v1 = "INSERT OVERWRITE DIRECTORY '/tmp/file' USING parquet SELECT 1 as a"
     parser.parsePlan(v1) match {