This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new ebe9f669e3fd [SPARK-47344] Extend INVALID_IDENTIFIER error beyond catching '-' in an unquoted identifier and fix "IS ! NULL" et al. ebe9f669e3fd is described below commit ebe9f669e3fd4f391336c12c2e15df048eaa11bc Author: Serge Rielau <se...@rielau.com> AuthorDate: Wed Mar 13 13:19:29 2024 -0700 [SPARK-47344] Extend INVALID_IDENTIFIER error beyond catching '-' in an unquoted identifier and fix "IS ! NULL" et al. ### What changes were proposed in this pull request? In this PR we propose to extend the lexing of IDENTIFIER beyond what is legitimate for unquoted identifiers to include "plausible" identifiers. We then use the "exit" hook in the parser to raise the INVALID_IDENTIFIER error, which is more meaningful than a syntax error. Specifically we allow: * general letters beyond the ASCII a-z. This will catch locale-specific names * URIs which are used for tables represented by a path. As part of this PR we also found that rolling `NOT` and `!` into one token is a "bad idea". We allowed: CREATE TABLE t(c1 INT ! NULL); etc. This was clearly not intended. ! is now ONLY allowed as a boolean prefix operator. ### Why are the changes needed? This change improves the user experience in case of an error by returning a more meaningful error. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing test suite + new unit tests ### Was this patch authored or co-authored using generative AI tooling? No Closes #45470 from srielau/SPARK-47344. 
Lead-authored-by: Serge Rielau <se...@rielau.com> Co-authored-by: Wenchen Fan <cloud0...@gmail.com> Signed-off-by: Gengliang Wang <gengli...@apache.org> --- .../utils/src/main/resources/error/error-classes.json | 5 ++++- docs/sql-error-conditions.md | 5 ++++- .../apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 | 14 ++++++++++++-- .../apache/spark/sql/catalyst/parser/SqlBaseParser.g4 | 3 ++- .../org/apache/spark/sql/catalyst/parser/parsers.scala | 12 ++++++++++++ .../apache/spark/sql/errors/QueryParsingErrors.scala | 2 +- .../spark/sql/catalyst/parser/ErrorParserSuite.scala | 17 +++++++++++++++++ .../resources/sql-tests/results/ansi/keywords.sql.out | 2 ++ .../test/resources/sql-tests/results/keywords.sql.out | 1 + .../ThriftServerWithSparkContextSuite.scala | 2 +- 10 files changed, 56 insertions(+), 7 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 92c72e03e483..8272c442ddfa 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -2098,7 +2098,10 @@ }, "INVALID_IDENTIFIER" : { "message" : [ - "The identifier <ident> is invalid. Please, consider quoting it with back-quotes as `<ident>`." + "The unquoted identifier <ident> is invalid and must be back quoted as: `<ident>`.", + "Unquoted identifiers can only contain ASCII letters ('a' - 'z', 'A' - 'Z'), digits ('0' - '9'), and underbar ('_').", + "Unquoted identifiers must also not start with a digit.", + "Different data sources and meta stores may impose additional restrictions on valid identifiers." 
], "sqlState" : "42602" }, diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md index efead13251c1..dba87bf0136e 100644 --- a/docs/sql-error-conditions.md +++ b/docs/sql-error-conditions.md @@ -1240,7 +1240,10 @@ For more details see [INVALID_HANDLE](sql-error-conditions-invalid-handle-error- [SQLSTATE: 42602](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) -The identifier `<ident>` is invalid. Please, consider quoting it with back-quotes as ``<ident>``. +The unquoted identifier `<ident>` is invalid and must be back quoted as: ``<ident>``. +Unquoted identifiers can only contain ASCII letters ('a' - 'z', 'A' - 'Z'), digits ('0' - '9'), and underbar ('_'). +Unquoted identifiers must also not start with a digit. +Different data sources and meta stores may impose additional restrictions on valid identifiers. ### INVALID_INDEX_OF_ZERO diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 index 174887def66d..7c376e226850 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 @@ -79,6 +79,7 @@ COMMA: ','; DOT: '.'; LEFT_BRACKET: '['; RIGHT_BRACKET: ']'; +BANG: '!'; // NOTE: If you add a new token in the list below, you should update the list of keywords // and reserved tag in `docs/sql-ref-ansi-compliance.md#sql-keywords`, and @@ -273,7 +274,7 @@ NANOSECOND: 'NANOSECOND'; NANOSECONDS: 'NANOSECONDS'; NATURAL: 'NATURAL'; NO: 'NO'; -NOT: 'NOT' | '!'; +NOT: 'NOT'; NULL: 'NULL'; NULLS: 'NULLS'; NUMERIC: 'NUMERIC'; @@ -510,8 +511,13 @@ BIGDECIMAL_LITERAL | DECIMAL_DIGITS EXPONENT? 'BD' {isValidDecimal()}? 
; +// Generalize the identifier to give a sensible INVALID_IDENTIFIER error message: +// * Unicode letters rather than a-z and A-Z only +// * URI paths for table references using paths +// We then narrow down to ANSI rules in exitUnquotedIdentifier() in the parser. IDENTIFIER - : (LETTER | DIGIT | '_')+ + : (UNICODE_LETTER | DIGIT | '_')+ + | UNICODE_LETTER+ '://' (UNICODE_LETTER | DIGIT | '_' | '/' | '-' | '.' | '?' | '=' | '&' | '#' | '%')+ ; BACKQUOTED_IDENTIFIER @@ -535,6 +541,10 @@ fragment LETTER : [A-Z] ; +fragment UNICODE_LETTER + : [\p{L}] + ; + SIMPLE_COMMENT : '--' ('\\\n' | ~[\r\n])* '\r'? '\n'? -> channel(HIDDEN) ; diff --git a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 index 801cc62491a2..41a5ec241c64 100644 --- a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 +++ b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 @@ -388,6 +388,7 @@ describeFuncName | comparisonOperator | arithmeticOperator | predicateOperator + | BANG ; describeColName @@ -946,7 +947,7 @@ expressionSeq ; booleanExpression - : NOT booleanExpression #logicalNot + : (NOT | BANG) booleanExpression #logicalNot | EXISTS LEFT_PAREN query RIGHT_PAREN #exists | valueExpression predicate? 
#predicated | left=booleanExpression operator=AND right=booleanExpression #logicalBinary diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/parsers.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/parsers.scala index 2689e317128a..6cfa7ed195a7 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/parsers.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/parser/parsers.scala @@ -303,6 +303,18 @@ case object PostProcessor extends SqlBaseParserBaseListener { throw QueryParsingErrors.invalidIdentifierError(ident, ctx) } + /** Throws error message when unquoted identifier contains characters outside a-z, A-Z, 0-9, _ */ + override def exitUnquotedIdentifier(ctx: SqlBaseParser.UnquotedIdentifierContext): Unit = { + val ident = ctx.getText + if (ident.exists(c => + !(c >= 'a' && c <= 'z') && + !(c >= 'A' && c <= 'Z') && + !(c >= '0' && c <= '9') && + c != '_')) { + throw QueryParsingErrors.invalidIdentifierError(ident, ctx) + } + } + /** Remove the back ticks from an Identifier. 
*/ override def exitQuotedIdentifier(ctx: SqlBaseParser.QuotedIdentifierContext): Unit = { if (ctx.BACKQUOTED_IDENTIFIER() != null) { diff --git a/sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala b/sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala index 60899df3edac..9d0d4ea79974 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala @@ -446,7 +446,7 @@ private[sql] object QueryParsingErrors extends DataTypeErrorsBase { messageParameters = Map.empty) } - def invalidIdentifierError(ident: String, ctx: ErrorIdentContext): Throwable = { + def invalidIdentifierError(ident: String, ctx: ParserRuleContext): Throwable = { new ParseException( errorClass = "INVALID_IDENTIFIER", messageParameters = Map("ident" -> ident), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala index 7cf853b0812f..ac22f32c8523 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/ErrorParserSuite.scala @@ -39,6 +39,23 @@ class ErrorParserSuite extends AnalysisTest { context = ExpectedContext(fragment = "order by q\ncluster by q", start = 16, stop = 38)) } + test("Illegal characters in unquoted identifier") { + // scalastyle:off + checkError( + exception = parseException("USE \u0196pfel"), + errorClass = "INVALID_IDENTIFIER", + parameters = Map("ident" -> "\u0196pfel")) + checkError( + exception = parseException("USE \u88681"), + errorClass = "INVALID_IDENTIFIER", + parameters = Map("ident" -> "\u88681")) + // scalastyle:on + checkError( + exception = parseException("USE https://www.spa.rk/bucket/pa-th.json?=&#%"), + errorClass = "INVALID_IDENTIFIER", + parameters = Map("ident" -> 
"https://www.spa.rk/bucket/pa-th.json?=&#%")) + } + test("hyphen in identifier - DDL tests") { checkError( exception = parseException("USE test-test"), diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out index 87f9d262ec70..c0b3a1e8cc55 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/keywords.sql.out @@ -190,6 +190,7 @@ NANOSECOND false NANOSECONDS false NATURAL true NO false +NOT true NULL true NULLS false NUMERIC false @@ -389,6 +390,7 @@ LATERAL LEADING LEFT NATURAL +NOT NULL OFFSET ON diff --git a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out index fda2d6ead598..70df01e786ce 100644 --- a/sql/core/src/test/resources/sql-tests/results/keywords.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/keywords.sql.out @@ -190,6 +190,7 @@ NANOSECOND false NANOSECONDS false NATURAL false NO false +NOT false NULL false NULLS false NUMERIC false diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala index 800ac69fcfab..26cf62d2323c 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala @@ -214,7 +214,7 @@ trait ThriftServerWithSparkContextSuite extends SharedThriftServer { val sessionHandle = client.openSession(user, "") val infoValue = client.getInfo(sessionHandle, GetInfoType.CLI_ODBC_KEYWORDS) // scalastyle:off line.size.limit - assert(infoValue.getStringValue == 
"ADD,AFTER,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,AUTHORIZATION,BETWEEN,BIGINT,BINARY,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPUTE,CONCATENATE,CONSTRAINT,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,DATA,DA [...] + assert(infoValue.getStringValue == "ADD,AFTER,ALL,ALTER,ALWAYS,ANALYZE,AND,ANTI,ANY,ANY_VALUE,ARCHIVE,ARRAY,AS,ASC,AT,AUTHORIZATION,BETWEEN,BIGINT,BINARY,BOOLEAN,BOTH,BUCKET,BUCKETS,BY,BYTE,CACHE,CASCADE,CASE,CAST,CATALOG,CATALOGS,CHANGE,CHAR,CHARACTER,CHECK,CLEAR,CLUSTER,CLUSTERED,CODEGEN,COLLATE,COLLECTION,COLUMN,COLUMNS,COMMENT,COMMIT,COMPACT,COMPACTIONS,COMPUTE,CONCATENATE,CONSTRAINT,COST,CREATE,CROSS,CUBE,CURRENT,CURRENT_DATE,CURRENT_TIME,CURRENT_TIMESTAMP,CURRENT_USER,DATA,DA [...] // scalastyle:on line.size.limit } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org