This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new e470e7442f9 [SPARK-45887][SQL] Align codegen and non-codegen implementation of `Encode` e470e7442f9 is described below commit e470e7442f90f7189346993d76733cafe7469ada Author: Max Gekk <max.g...@gmail.com> AuthorDate: Thu Nov 23 18:33:13 2023 +0300 [SPARK-45887][SQL] Align codegen and non-codegen implementation of `Encode` ### What changes were proposed in this pull request? In the PR, I propose to change the implementation of interpretation mode, and make it consistent to codegen. Both implementation raise the same error with new error class `INVALID_PARAMETER_VALUE.CHARSET`. ### Why are the changes needed? To make codegen and non-codegen of the `Encode` expression consistent. So, users will observe the same behaviour in both modes. ### Does this PR introduce _any_ user-facing change? Yes, if user code depends on error from `encode()`. ### How was this patch tested? By running the following test suites: ``` $ PYSPARK_PYTHON=python3 build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z string-functions.sql" $ build/sbt "core/testOnly *SparkThrowableSuite" $ build/sbt "test:testOnly *.StringFunctionsSuite" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43759 from MaxGekk/restrict-charsets-in-encode. Authored-by: Max Gekk <max.g...@gmail.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../src/main/resources/error/error-classes.json | 5 ++++ ...nditions-invalid-parameter-value-error-class.md | 4 +++ .../catalyst/expressions/stringExpressions.scala | 19 ++++++++---- .../spark/sql/errors/QueryExecutionErrors.scala | 9 ++++++ .../analyzer-results/ansi/string-functions.sql.out | 15 ++++++++++ .../analyzer-results/string-functions.sql.out | 15 ++++++++++ .../sql-tests/inputs/string-functions.sql | 4 +++ .../results/ansi/string-functions.sql.out | 34 ++++++++++++++++++++++ .../sql-tests/results/string-functions.sql.out | 34 ++++++++++++++++++++++ 9 files changed, 134 insertions(+), 5 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index b0621c44532..19b70307a1c 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -2042,6 +2042,11 @@ "expects an integer value in [0, <upper>), but got <invalidValue>." ] }, + "CHARSET" : { + "message" : [ + "expects one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', but got <charset>." + ] + }, "DATETIME_UNIT" : { "message" : [ "expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal <invalidValue>." diff --git a/docs/sql-error-conditions-invalid-parameter-value-error-class.md b/docs/sql-error-conditions-invalid-parameter-value-error-class.md index d58d4fc2f59..8547d8b31f0 100644 --- a/docs/sql-error-conditions-invalid-parameter-value-error-class.md +++ b/docs/sql-error-conditions-invalid-parameter-value-error-class.md @@ -45,6 +45,10 @@ expects one of binary formats 'base64', 'hex', 'utf-8', but got `<invalidFormat> expects an integer value in [0, `<upper>`), but got `<invalidValue>`. +## CHARSET + +expects one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', but got `<charset>`. + ## DATETIME_UNIT expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string literal `<invalidValue>`. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 811d6e013ab..0d3239423b2 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.expressions +import java.io.UnsupportedEncodingException import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols} import java.util.{Base64 => JBase64} import java.util.{HashMap, Locale, Map => JMap} @@ -2694,17 +2695,25 @@ case class Encode(value: Expression, charset: Expression) protected override def nullSafeEval(input1: Any, input2: Any): Any = { val toCharset = input2.asInstanceOf[UTF8String].toString - input1.asInstanceOf[UTF8String].toString.getBytes(toCharset) + try { + input1.asInstanceOf[UTF8String].toString.getBytes(toCharset) + } catch { + case _: UnsupportedEncodingException => + throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset) + } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, (string, charset) => + nullSafeCodeGen(ctx, ev, (string, charset) => { + val toCharset = ctx.freshName("toCharset") s""" + String $toCharset = $charset.toString(); try { - ${ev.value} = $string.toString().getBytes($charset.toString()); + ${ev.value} = $string.toString().getBytes($toCharset); } catch (java.io.UnsupportedEncodingException e) { - org.apache.spark.unsafe.Platform.throwException(e); - }""") + throw QueryExecutionErrors.invalidCharsetError("$prettyName", $toCharset); + }""" + }) } override protected def withNewChildrenInternal( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 0abb202a10f..1aa25a51fa9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -2758,4 +2758,13 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE "upper" -> size.toString, "invalidValue" -> pos.toString)) } + + def invalidCharsetError(functionName: String, charset: String): RuntimeException = { + new SparkIllegalArgumentException( + errorClass = "INVALID_PARAMETER_VALUE.CHARSET", + messageParameters = Map( + "functionName" -> toSQLId(functionName), + "parameter" -> toSQLId("charset"), + "charset" -> charset)) + } } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index baed5a62772..9c210a713de 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -640,6 +640,21 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 5, abc)#x] +- OneRowRelation +-- !query +select encode('hello', 'Windows-xxx') +-- !query analysis +Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query select decode() -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index baed5a62772..9c210a713de 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -640,6 +640,21 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 5, abc)#x] +- OneRowRelation +-- !query +select encode('hello', 'Windows-xxx') +-- !query analysis +Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query select decode() -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 3ef0d0f0cfb..0fbf211ec5c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -117,6 +117,10 @@ SELECT lpad(x'57', 5, 'abc'); SELECT rpad('abc', 5, x'57'); SELECT rpad(x'57', 5, 'abc'); +-- encode +select encode('hello', 'Windows-xxx'); +select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol); + -- decode select decode(); select decode(encode('abc', 'utf-8')); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 61cea27d290..082ff03efac 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -803,6 +803,40 @@ struct<rpad(X'57', 5, abc):string> Wabca +-- !query +select encode('hello', 'Windows-xxx') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "Windows-xxx", + "functionName" : "`encode`", + "parameter" : "`charset`" + } +} + + +-- !query +select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "Windows-xxx", + "functionName" : "`encode`", + "parameter" : "`charset`" + } +} + + -- !query select decode() -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index d4d61ac9ef1..79140920378 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -735,6 +735,40 @@ struct<rpad(X'57', 5, abc):string> Wabca +-- !query +select encode('hello', 'Windows-xxx') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "Windows-xxx", + "functionName" : "`encode`", + "parameter" : "`charset`" + } +} + + +-- !query +select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "Windows-xxx", + "functionName" : "`encode`", + "parameter" : "`charset`" + } +} + + -- !query select decode() -- !query schema --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org