This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new e470e7442f9 [SPARK-45887][SQL] Align codegen and non-codegen 
implementation of `Encode`
e470e7442f9 is described below

commit e470e7442f90f7189346993d76733cafe7469ada
Author: Max Gekk <max.g...@gmail.com>
AuthorDate: Thu Nov 23 18:33:13 2023 +0300

    [SPARK-45887][SQL] Align codegen and non-codegen implementation of `Encode`
    
    ### What changes were proposed in this pull request?
    In the PR, I propose to change the implementation of interpretation mode, 
and make it consistent to codegen. Both implementation raise the same error 
with new error class `INVALID_PARAMETER_VALUE.CHARSET`.
    
    ### Why are the changes needed?
    To make codegen and non-codegen of the `Encode` expression consistent. So, 
users will observe the same behaviour in both modes.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, if user code depends on error from `encode()`.
    
    ### How was this patch tested?
    By running the following test suites:
    ```
    $ PYSPARK_PYTHON=python3 build/sbt "sql/testOnly 
org.apache.spark.sql.SQLQueryTestSuite -- -z string-functions.sql"
    $ build/sbt "core/testOnly *SparkThrowableSuite"
    $ build/sbt "test:testOnly *.StringFunctionsSuite"
    ```
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #43759 from MaxGekk/restrict-charsets-in-encode.
    
    Authored-by: Max Gekk <max.g...@gmail.com>
    Signed-off-by: Max Gekk <max.g...@gmail.com>
---
 .../src/main/resources/error/error-classes.json    |  5 ++++
 ...nditions-invalid-parameter-value-error-class.md |  4 +++
 .../catalyst/expressions/stringExpressions.scala   | 19 ++++++++----
 .../spark/sql/errors/QueryExecutionErrors.scala    |  9 ++++++
 .../analyzer-results/ansi/string-functions.sql.out | 15 ++++++++++
 .../analyzer-results/string-functions.sql.out      | 15 ++++++++++
 .../sql-tests/inputs/string-functions.sql          |  4 +++
 .../results/ansi/string-functions.sql.out          | 34 ++++++++++++++++++++++
 .../sql-tests/results/string-functions.sql.out     | 34 ++++++++++++++++++++++
 9 files changed, 134 insertions(+), 5 deletions(-)

diff --git a/common/utils/src/main/resources/error/error-classes.json 
b/common/utils/src/main/resources/error/error-classes.json
index b0621c44532..19b70307a1c 100644
--- a/common/utils/src/main/resources/error/error-classes.json
+++ b/common/utils/src/main/resources/error/error-classes.json
@@ -2042,6 +2042,11 @@
           "expects an integer value in [0, <upper>), but got <invalidValue>."
         ]
       },
+      "CHARSET" : {
+        "message" : [
+          "expects one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 
'UTF-16BE', 'UTF-16LE', 'UTF-16', but got <charset>."
+        ]
+      },
       "DATETIME_UNIT" : {
         "message" : [
           "expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, 
DAY, DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the 
string literal <invalidValue>."
diff --git a/docs/sql-error-conditions-invalid-parameter-value-error-class.md 
b/docs/sql-error-conditions-invalid-parameter-value-error-class.md
index d58d4fc2f59..8547d8b31f0 100644
--- a/docs/sql-error-conditions-invalid-parameter-value-error-class.md
+++ b/docs/sql-error-conditions-invalid-parameter-value-error-class.md
@@ -45,6 +45,10 @@ expects one of binary formats 'base64', 'hex', 'utf-8', but 
got `<invalidFormat>
 
 expects an integer value in [0, `<upper>`), but got `<invalidValue>`.
 
+## CHARSET
+
+expects one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 
'UTF-16LE', 'UTF-16', but got `<charset>`.
+
 ## DATETIME_UNIT
 
 expects one of the units without quotes YEAR, QUARTER, MONTH, WEEK, DAY, 
DAYOFYEAR, HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND, but got the string 
literal `<invalidValue>`.
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 811d6e013ab..0d3239423b2 100755
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import java.io.UnsupportedEncodingException
 import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols}
 import java.util.{Base64 => JBase64}
 import java.util.{HashMap, Locale, Map => JMap}
@@ -2694,17 +2695,25 @@ case class Encode(value: Expression, charset: 
Expression)
 
   protected override def nullSafeEval(input1: Any, input2: Any): Any = {
     val toCharset = input2.asInstanceOf[UTF8String].toString
-    input1.asInstanceOf[UTF8String].toString.getBytes(toCharset)
+    try {
+      input1.asInstanceOf[UTF8String].toString.getBytes(toCharset)
+    } catch {
+      case _: UnsupportedEncodingException =>
+        throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset)
+    }
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    nullSafeCodeGen(ctx, ev, (string, charset) =>
+    nullSafeCodeGen(ctx, ev, (string, charset) => {
+      val toCharset = ctx.freshName("toCharset")
       s"""
+        String $toCharset = $charset.toString();
         try {
-          ${ev.value} = $string.toString().getBytes($charset.toString());
+          ${ev.value} = $string.toString().getBytes($toCharset);
         } catch (java.io.UnsupportedEncodingException e) {
-          org.apache.spark.unsafe.Platform.throwException(e);
-        }""")
+          throw QueryExecutionErrors.invalidCharsetError("$prettyName", 
$toCharset);
+        }"""
+    })
   }
 
   override protected def withNewChildrenInternal(
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
index 0abb202a10f..1aa25a51fa9 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
@@ -2758,4 +2758,13 @@ private[sql] object QueryExecutionErrors extends 
QueryErrorsBase with ExecutionE
         "upper" -> size.toString,
         "invalidValue" -> pos.toString))
   }
+
+  def invalidCharsetError(functionName: String, charset: String): 
RuntimeException = {
+    new SparkIllegalArgumentException(
+      errorClass = "INVALID_PARAMETER_VALUE.CHARSET",
+      messageParameters = Map(
+        "functionName" -> toSQLId(functionName),
+        "parameter" -> toSQLId("charset"),
+        "charset" -> charset))
+  }
 }
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
 
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
index baed5a62772..9c210a713de 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
@@ -640,6 +640,21 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 
5, abc)#x]
 +- OneRowRelation
 
 
+-- !query
+select encode('hello', 'Windows-xxx')
+-- !query analysis
+Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
+-- !query analysis
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
++- SubqueryAlias t
+   +- LocalRelation [scol#x, ecol#x]
+
+
 -- !query
 select decode()
 -- !query analysis
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
 
b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
index baed5a62772..9c210a713de 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
@@ -640,6 +640,21 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 
5, abc)#x]
 +- OneRowRelation
 
 
+-- !query
+select encode('hello', 'Windows-xxx')
+-- !query analysis
+Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
+-- !query analysis
+Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
++- SubqueryAlias t
+   +- LocalRelation [scol#x, ecol#x]
+
+
 -- !query
 select decode()
 -- !query analysis
diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql 
b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
index 3ef0d0f0cfb..0fbf211ec5c 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
@@ -117,6 +117,10 @@ SELECT lpad(x'57', 5, 'abc');
 SELECT rpad('abc', 5, x'57');
 SELECT rpad(x'57', 5, 'abc');
 
+-- encode
+select encode('hello', 'Windows-xxx');
+select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol);
+
 -- decode
 select decode();
 select decode(encode('abc', 'utf-8'));
diff --git 
a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out 
b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
index 61cea27d290..082ff03efac 100644
--- 
a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
@@ -803,6 +803,40 @@ struct<rpad(X'57', 5, abc):string>
 Wabca
 
 
+-- !query
+select encode('hello', 'Windows-xxx')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkIllegalArgumentException
+{
+  "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
+  "sqlState" : "22023",
+  "messageParameters" : {
+    "charset" : "Windows-xxx",
+    "functionName" : "`encode`",
+    "parameter" : "`charset`"
+  }
+}
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkIllegalArgumentException
+{
+  "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
+  "sqlState" : "22023",
+  "messageParameters" : {
+    "charset" : "Windows-xxx",
+    "functionName" : "`encode`",
+    "parameter" : "`charset`"
+  }
+}
+
+
 -- !query
 select decode()
 -- !query schema
diff --git 
a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out 
b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
index d4d61ac9ef1..79140920378 100644
--- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
@@ -735,6 +735,40 @@ struct<rpad(X'57', 5, abc):string>
 Wabca
 
 
+-- !query
+select encode('hello', 'Windows-xxx')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkIllegalArgumentException
+{
+  "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
+  "sqlState" : "22023",
+  "messageParameters" : {
+    "charset" : "Windows-xxx",
+    "functionName" : "`encode`",
+    "parameter" : "`charset`"
+  }
+}
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkIllegalArgumentException
+{
+  "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
+  "sqlState" : "22023",
+  "messageParameters" : {
+    "charset" : "Windows-xxx",
+    "functionName" : "`encode`",
+    "parameter" : "`charset`"
+  }
+}
+
+
 -- !query
 select decode()
 -- !query schema


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to