This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 2430e87ac939 [SPARK-46115][SQL] Restrict charsets in `encode()`
2430e87ac939 is described below

commit 2430e87ac93952ae7e296faf49734f65af29f9ed
Author: Max Gekk <max.g...@gmail.com>
AuthorDate: Tue Nov 28 08:47:46 2023 +0900

    [SPARK-46115][SQL] Restrict charsets in `encode()`

    ### What changes were proposed in this pull request?
    In the PR, I propose to restrict the supported charsets in the `encode()` function to the list from [the doc](https://spark.apache.org/docs/latest/api/sql/#encode):
    ```
    'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'
    ```
    and to introduce the SQL config `spark.sql.legacy.javaCharsets` for restoring the previous behaviour.

    ### Why are the changes needed?
    Currently, the list of charsets supported by `encode()` is not stable and depends entirely on the JDK version in use. As a result, user code might stop working simply because an operator changed the Java version on the Spark cluster.

    ### Does this PR introduce _any_ user-facing change?
    Yes.

    ### How was this patch tested?
    By running new checks:
    ```
    $ PYSPARK_PYTHON=python3 build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z string-functions.sql"
    ```

    ### Was this patch authored or co-authored using generative AI tooling?
    No.

    Closes #44020 from MaxGekk/restrict-charsets-in-encode-2.

    Authored-by: Max Gekk <max.g...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 .../explain-results/function_encode.explain        |  2 +-
 .../function_to_binary_with_format.explain         |  2 +-
 docs/sql-migration-guide.md                        |  1 +
 python/pyspark/sql/tests/pandas/test_pandas_map.py |  2 +-
 .../catalyst/expressions/stringExpressions.scala   | 25 +++++++-
 .../org/apache/spark/sql/internal/SQLConf.scala    | 11 ++++
 .../analyzer-results/ansi/string-functions.sql.out | 54 ++++++++++++++++--
 .../analyzer-results/string-functions.sql.out      | 54 ++++++++++++++++--
 .../typeCoercion/native/concat.sql.out             | 18 +++---
 .../typeCoercion/native/elt.sql.out                |  8 +--
 .../sql-tests/inputs/string-functions.sql          |  6 ++
 .../results/ansi/string-functions.sql.out          | 66 ++++++++++++++++++++++
 .../sql-tests/results/string-functions.sql.out     | 66 ++++++++++++++++++++++
 .../scala/org/apache/spark/sql/ExplainSuite.scala  |  8 +--
 14 files changed, 288 insertions(+), 35 deletions(-)

diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain
index 56da919abf4c..2f6543605923 100644
--- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain
@@ -1,2 +1,2 @@
-Project [encode(g#0, UTF-8) AS encode(g, UTF-8)#0]
+Project [encode(g#0, UTF-8, false) AS encode(g, UTF-8)#0]
 +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain
index e9513f0103c8..b62ccccc0c15 100644
--- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain
@@ -1,2 +1,2 @@
-Project [encode(g#0, UTF-8) AS to_binary(g, utf-8)#0]
+Project [encode(g#0, UTF-8, false) AS to_binary(g, utf-8)#0]
 +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index 5c00ce655851..664bccf26651 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -29,6 +29,7 @@ license: |
 - Since Spark 4.0, `spark.sql.hive.metastore` drops the support of Hive prior to 2.0.0 as they require JDK 8 that Spark does not support anymore. Users should migrate to higher versions.
 - Since Spark 4.0, `spark.sql.parquet.compression.codec` drops the support of codec name `lz4raw`, please use `lz4_raw` instead.
 - Since Spark 4.0, when overflowing during casting timestamp to byte/short/int under non-ansi mode, Spark will return null instead a wrapping value.
+- Since Spark 4.0, the `encode()` function supports only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`.

 ## Upgrading from Spark SQL 3.4 to 3.5
diff --git a/python/pyspark/sql/tests/pandas/test_pandas_map.py b/python/pyspark/sql/tests/pandas/test_pandas_map.py
index 304b78049b20..ec9f208d08f9 100644
--- a/python/pyspark/sql/tests/pandas/test_pandas_map.py
+++ b/python/pyspark/sql/tests/pandas/test_pandas_map.py
@@ -110,7 +110,7 @@ class MapInPandasTestsMixin:
         df = (
             self.spark.range(10, numPartitions=3)
             .select(col("id").cast("string").alias("str"))
-            .withColumn("bin", encode(col("str"), "utf8"))
+            .withColumn("bin", encode(col("str"), "utf-8"))
         )
         actual = df.mapInPandas(func, "str string, bin binary").collect()
         expected = df.collect()
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 0d3239423b22..90cfd13875d0 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -2685,18 +2685,26 @@ case class StringDecode(bin: Expression, charset: Expression)
   since = "1.5.0",
   group = "string_funcs")
 // scalastyle:on line.size.limit
-case class Encode(value: Expression, charset: Expression)
+case class Encode(value: Expression, charset: Expression, legacyCharsets: Boolean)
   extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant {

+  def this(value: Expression, charset: Expression) =
+    this(value, charset, SQLConf.get.legacyJavaCharsets)
+
   override def left: Expression = value
   override def right: Expression = charset
   override def dataType: DataType = BinaryType
   override def inputTypes: Seq[DataType] = Seq(StringType, StringType)

+  private val supportedCharsets = Set(
+    "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16")
+
   protected override def nullSafeEval(input1: Any, input2: Any): Any = {
     val toCharset = input2.asInstanceOf[UTF8String].toString
     try {
-      input1.asInstanceOf[UTF8String].toString.getBytes(toCharset)
+      if (legacyCharsets || supportedCharsets.contains(toCharset.toUpperCase(Locale.ROOT))) {
+        input1.asInstanceOf[UTF8String].toString.getBytes(toCharset)
+      } else throw new UnsupportedEncodingException
     } catch {
       case _: UnsupportedEncodingException =>
         throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset)
@@ -2706,10 +2714,17 @@ case class Encode(value: Expression, charset: Expression)
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     nullSafeCodeGen(ctx, ev, (string, charset) => {
       val toCharset = ctx.freshName("toCharset")
+      val sc = JavaCode.global(
+        ctx.addReferenceObj("supportedCharsets", supportedCharsets),
+        supportedCharsets.getClass)
       s"""
         String $toCharset = $charset.toString();
         try {
-          ${ev.value} = $string.toString().getBytes($toCharset);
+          if ($legacyCharsets || $sc.contains($toCharset.toUpperCase(java.util.Locale.ROOT))) {
+            ${ev.value} = $string.toString().getBytes($toCharset);
+          } else {
+            throw new java.io.UnsupportedEncodingException();
+          }
         } catch (java.io.UnsupportedEncodingException e) {
           throw QueryExecutionErrors.invalidCharsetError("$prettyName", $toCharset);
         }"""
@@ -2720,6 +2735,10 @@ case class Encode(value: Expression, charset: Expression)
     newLeft: Expression, newRight: Expression): Encode = copy(value = newLeft, charset = newRight)
 }

+object Encode {
+  def apply(value: Expression, charset: Expression): Encode = new Encode(value, charset)
+}
+
 /**
  * Converts the input expression to a binary value based on the supplied format.
  */
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 5133c40bc6fa..d4e5c6a3d1e0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -4584,6 +4584,15 @@ object SQLConf {
     .checkValue(_ > 0, "The number of stack traces in the DataFrame context must be positive.")
     .createWithDefault(1)

+  val LEGACY_JAVA_CHARSETS = buildConf("spark.sql.legacy.javaCharsets")
+    .internal()
+    .doc("When set to true, the functions like `encode()` can use charsets from JDK while " +
+      "encoding or decoding string values. If it is false, such functions support only one of " +
+      "the charsets: 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'.")
+    .version("4.0.0")
+    .booleanConf
+    .createWithDefault(false)
+
   /**
    * Holds information about keys that have been deprecated.
    *
@@ -5474,6 +5483,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf {

   def stackTracesInDataFrameContext: Int = getConf(SQLConf.STACK_TRACES_IN_DATAFRAME_CONTEXT)

+  def legacyJavaCharsets: Boolean = getConf(SQLConf.LEGACY_JAVA_CHARSETS)
+
   /** ********************** SQLConf functionality methods ************ */

   /** Set Spark SQL configuration properties. */
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
index 9c210a713de3..9d8705e3e862 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out
@@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x]
 -- !query
 SELECT btrim(encode(" xyz ", 'utf-8'))
 -- !query analysis
-Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x]
+Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x]
 +- OneRowRelation


 -- !query
 SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'))
 -- !query analysis
-Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x]
+Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x]
 +- OneRowRelation


 -- !query
 SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'))
 -- !query analysis
-Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x]
+Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x]
 +- OneRowRelation


@@ -640,17 +640,59 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 5, abc)#x]
 +- OneRowRelation


+-- !query
+set spark.sql.legacy.javaCharsets=true
+-- !query analysis
+SetCommand (spark.sql.legacy.javaCharsets,Some(true))
+
+
+-- !query
+select encode('hello', 'WINDOWS-1252')
+-- !query analysis
+Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol)
+-- !query analysis
+Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x]
++- SubqueryAlias t
+   +- LocalRelation [scol#x, ecol#x]
+
+
+-- !query
+set spark.sql.legacy.javaCharsets=false
+-- !query analysis
+SetCommand (spark.sql.legacy.javaCharsets,Some(false))
+
+
+-- !query
+select encode('hello', 'WINDOWS-1252')
+-- !query analysis
+Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol)
+-- !query analysis
+Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x]
++- SubqueryAlias t
+   +- LocalRelation [scol#x, ecol#x]
+
+
 -- !query
 select encode('hello', 'Windows-xxx')
 -- !query analysis
-Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x]
+Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x]
 +- OneRowRelation


 -- !query
 select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
 -- !query analysis
-Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x]
 +- SubqueryAlias t
    +- LocalRelation [scol#x, ecol#x]


@@ -704,7 +746,7 @@ org.apache.spark.sql.AnalysisException
 -- !query
 select decode(encode('abc', 'utf-8'), 'utf-8')
 -- !query analysis
-Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8), utf-8)#x]
+Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x]
 +- OneRowRelation
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
index 9c210a713de3..9d8705e3e862 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
@@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x]
 -- !query
 SELECT btrim(encode(" xyz ", 'utf-8'))
 -- !query analysis
-Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x]
+Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x]
 +- OneRowRelation


 -- !query
 SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8'))
 -- !query analysis
-Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x]
+Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x]
 +- OneRowRelation


 -- !query
 SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8'))
 -- !query analysis
-Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x]
+Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x]
 +- OneRowRelation


@@ -640,17 +640,59 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 5, abc)#x]
 +- OneRowRelation


+-- !query
+set spark.sql.legacy.javaCharsets=true
+-- !query analysis
+SetCommand (spark.sql.legacy.javaCharsets,Some(true))
+
+
+-- !query
+select encode('hello', 'WINDOWS-1252')
+-- !query analysis
+Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol)
+-- !query analysis
+Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x]
++- SubqueryAlias t
+   +- LocalRelation [scol#x, ecol#x]
+
+
+-- !query
+set spark.sql.legacy.javaCharsets=false
+-- !query analysis
+SetCommand (spark.sql.legacy.javaCharsets,Some(false))
+
+
+-- !query
+select encode('hello', 'WINDOWS-1252')
+-- !query analysis
+Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x]
++- OneRowRelation
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol)
+-- !query analysis
+Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x]
++- SubqueryAlias t
+   +- LocalRelation [scol#x, ecol#x]
+
+
 -- !query
 select encode('hello', 'Windows-xxx')
 -- !query analysis
-Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x]
+Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x]
 +- OneRowRelation


 -- !query
 select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol)
 -- !query analysis
-Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x]
+Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x]
 +- SubqueryAlias t
    +- LocalRelation [scol#x, ecol#x]


@@ -704,7 +746,7 @@ org.apache.spark.sql.AnalysisException
 -- !query
 select decode(encode('abc', 'utf-8'), 'utf-8')
 -- !query analysis
-Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8), utf-8)#x]
+Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x]
 +- OneRowRelation
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out
index 676737a4fea8..1b19753b1f6d 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out
@@ -11,7 +11,7 @@ FROM (
 -- !query analysis
 Project [concat(concat(cast(col1#xL as string), col2#x), cast(col3#x as string)) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x]
+   +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x]
       +- Range (0, 10, step=1, splits=None)


@@ -29,7 +29,7 @@ FROM (
 -- !query analysis
 Project [concat(concat(concat(col1#x, cast(col2#xL as string)), concat(col3#x, cast(col4#x as string))), cast(col5#x as string)) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x]
+   +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x]
       +- Range (0, 10, step=1, splits=None)


@@ -46,7 +46,7 @@ FROM (
 -- !query analysis
 Project [concat(concat(col1#x, col2#x), cast(concat(col3#x, col4#x) as string)) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]
+   +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x]
       +- Range (0, 10, step=1, splits=None)


@@ -67,7 +67,7 @@ FROM (
 -- !query analysis
 Project [concat(cast(col1#x as string), cast(col2#x as string)) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x]
+   +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x]
       +- Range (0, 10, step=1, splits=None)


@@ -84,7 +84,7 @@ FROM (
 -- !query analysis
 Project [concat(concat(concat(cast(col1#x as string), cast(col2#x as string)), cast(col3#x as string)), cast(col4#x as string)) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]
+   +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x]
       +- Range (0, 10, step=1, splits=None)


@@ -101,7 +101,7 @@ FROM (
 -- !query analysis
 Project [concat(concat(cast(col1#x as string), cast(col2#x as string)), concat(cast(col3#x as string), cast(col4#x as string))) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]
+   +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x]
       +- Range (0, 10, step=1, splits=None)


@@ -122,7 +122,7 @@ FROM (
 -- !query analysis
 Project [concat(col1#x, col2#x) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x]
+   +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x]
       +- Range (0, 10, step=1, splits=None)


@@ -139,7 +139,7 @@ FROM (
 -- !query analysis
 Project [concat(concat(concat(col1#x, col2#x), col3#x), col4#x) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]
+   +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x]
       +- Range (0, 10, step=1, splits=None)


@@ -156,7 +156,7 @@ FROM (
 -- !query analysis
 Project [concat(concat(col1#x, col2#x), concat(col3#x, col4#x)) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]
+   +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x]
       +- Range (0, 10, step=1, splits=None)
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out
index 5a9b5ddbafa3..4d897a329cfe 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out
@@ -13,7 +13,7 @@ FROM (
 -- !query analysis
 Project [elt(2, col1#x, cast(col2#xL as string), col3#x, cast(col4#x as string), cast(col5#x as string), false) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x]
+   +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x]
       +- Range (0, 10, step=1, splits=None)


@@ -30,7 +30,7 @@ FROM (
 -- !query analysis
 Project [elt(3, col1#x, col2#x, cast(col3#x as string), cast(col4#x as string), false) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x]
+   +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x]
       +- Range (0, 10, step=1, splits=None)


@@ -51,7 +51,7 @@ FROM (
 -- !query analysis
 Project [elt(1, cast(col1#x as string), cast(col2#x as string), false) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x]
+   +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x]
       +- Range (0, 10, step=1, splits=None)


@@ -72,5 +72,5 @@ FROM (
 -- !query analysis
 Project [elt(2, col1#x, col2#x, false) AS col#x]
 +- SubqueryAlias __auto_generated_subquery_name
-   +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x]
+   +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x]
       +- Range (0, 10, step=1, splits=None)
diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
index 0fbf211ec5c5..645f6bcb8327 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
@@ -118,6 +118,12 @@ SELECT rpad('abc', 5, x'57');
 SELECT rpad(x'57', 5, 'abc');

 -- encode
+set spark.sql.legacy.javaCharsets=true;
+select encode('hello', 'WINDOWS-1252');
+select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol);
+set spark.sql.legacy.javaCharsets=false;
+select encode('hello', 'WINDOWS-1252');
+select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol);
 select encode('hello', 'Windows-xxx');
 select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol);
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
index 082ff03efacb..89bb20fc1bff 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out
@@ -803,6 +803,72 @@ struct<rpad(X'57', 5, abc):string>
 Wabca


+-- !query
+set spark.sql.legacy.javaCharsets=true
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.legacy.javaCharsets true
+
+
+-- !query
+select encode('hello', 'WINDOWS-1252')
+-- !query schema
+struct<encode(hello, WINDOWS-1252):binary>
+-- !query output
+hello
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol)
+-- !query schema
+struct<encode(scol, ecol):binary>
+-- !query output
+hello
+
+
+-- !query
+set spark.sql.legacy.javaCharsets=false
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.legacy.javaCharsets false
+
+
+-- !query
+select encode('hello', 'WINDOWS-1252')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkIllegalArgumentException
+{
+  "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
+  "sqlState" : "22023",
+  "messageParameters" : {
+    "charset" : "WINDOWS-1252",
+    "functionName" : "`encode`",
+    "parameter" : "`charset`"
+  }
+}
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkIllegalArgumentException
+{
+  "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
+  "sqlState" : "22023",
+  "messageParameters" : {
+    "charset" : "WINDOWS-1252",
+    "functionName" : "`encode`",
+    "parameter" : "`charset`"
+  }
+}
+
+
 -- !query
 select encode('hello', 'Windows-xxx')
 -- !query schema
diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
index 791409203788..6d90a5091578 100644
--- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
@@ -735,6 +735,72 @@ struct<rpad(X'57', 5, abc):string>
 Wabca


+-- !query
+set spark.sql.legacy.javaCharsets=true
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.legacy.javaCharsets true
+
+
+-- !query
+select encode('hello', 'WINDOWS-1252')
+-- !query schema
+struct<encode(hello, WINDOWS-1252):binary>
+-- !query output
+hello
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol)
+-- !query schema
+struct<encode(scol, ecol):binary>
+-- !query output
+hello
+
+
+-- !query
+set spark.sql.legacy.javaCharsets=false
+-- !query schema
+struct<key:string,value:string>
+-- !query output
+spark.sql.legacy.javaCharsets false
+
+
+-- !query
+select encode('hello', 'WINDOWS-1252')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkIllegalArgumentException
+{
+  "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
+  "sqlState" : "22023",
+  "messageParameters" : {
+    "charset" : "WINDOWS-1252",
+    "functionName" : "`encode`",
+    "parameter" : "`charset`"
+  }
+}
+
+
+-- !query
+select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkIllegalArgumentException
+{
+  "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET",
+  "sqlState" : "22023",
+  "messageParameters" : {
+    "charset" : "WINDOWS-1252",
+    "functionName" : "`encode`",
+    "parameter" : "`charset`"
+  }
+}
+
+
 -- !query
 select encode('hello', 'Windows-xxx')
 -- !query schema
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
index 8b5ffe560a1f..da04674b9920 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
@@ -193,8 +193,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite
       """.stripMargin)
     checkKeywordsExistsInExplain(df2,
       "Project [concat(cast(id#xL as string), cast((id#xL + 1) as string), " +
-        "cast(encode(cast((id#xL + 2) as string), utf-8) as string), " +
-        "cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]")
+        "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string), " +
+        "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string)) AS col#x]")

     val df3 = sql(
       """
@@ -209,8 +209,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite
       """.stripMargin)
     checkKeywordsExistsInExplain(df3,
       "Project [concat(cast(id#xL as string), " +
-        "cast(encode(cast((id#xL + 2) as string), utf-8) as string), " +
-        "cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]")
+        "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string), " +
+        "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string)) AS col#x]")
   }
 }
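Editor's note: to make the behaviour change in the commit message above concrete, here is a minimal, illustrative Scala sketch, not part of the commit itself. It assumes a Spark build that includes this patch; the object name `EncodeCharsetsDemo` is hypothetical, and the queries mirror the new tests in `string-functions.sql`.

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical demo object; assumes a local build of Spark master with SPARK-46115.
object EncodeCharsetsDemo extends App {
  val spark = SparkSession.builder()
    .master("local[*]")
    .appName("SPARK-46115-demo")
    .getOrCreate()

  // A charset from the documented list is always accepted.
  spark.sql("SELECT encode('hello', 'UTF-8')").show()

  // A JDK-specific charset is now rejected by default
  // (spark.sql.legacy.javaCharsets is false); per the new tests, the error
  // class is INVALID_PARAMETER_VALUE.CHARSET.
  try {
    spark.sql("SELECT encode('hello', 'WINDOWS-1252')").collect()
  } catch {
    case e: Exception => println(s"Rejected as expected: ${e.getMessage}")
  }

  // Restoring the legacy behaviour re-enables any charset the running JDK knows.
  spark.conf.set("spark.sql.legacy.javaCharsets", "true")
  spark.sql("SELECT encode('hello', 'WINDOWS-1252')").collect()

  spark.stop()
}
```

Note that the config is registered as `internal()` and defaults to `false`, so the legacy path is an escape hatch rather than a recommended mode; per the migration-guide entry in the diff, portable code should stick to the six documented charsets.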