This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.3 by this push: new 01cdfb4b785 [SPARK-38590][SQL] New SQL function: try_to_binary 01cdfb4b785 is described below commit 01cdfb4b7858e85a89162435ee176dc64b63b700 Author: Gengliang Wang <gengli...@apache.org> AuthorDate: Thu Apr 7 08:52:50 2022 +0300 [SPARK-38590][SQL] New SQL function: try_to_binary ### What changes were proposed in this pull request? Add a new SQL function: `try_to_binary`. It is identical to the function `to_binary`, except that it returns NULL results instead of throwing an exception on encoding errors. There is a similar function in Snowflake: https://docs.snowflake.com/en/sql-reference/functions/try_to_binary.html ### Why are the changes needed? With this function, users can run queries to completion without being interrupted by encoding errors. ### Does this PR introduce _any_ user-facing change? Yes, adding a new SQL function: `try_to_binary`. It is identical to the function `to_binary`, except that it returns NULL results instead of throwing an exception on encoding errors. ### How was this patch tested? Unit tests (UT). Closes #35897 from gengliangwang/try_to_binary. 
Authored-by: Gengliang Wang <gengli...@apache.org> Signed-off-by: Max Gekk <max.g...@gmail.com> (cherry picked from commit becda3339381b3975ed567c156260eda036d7a1b) Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../sql/catalyst/analysis/FunctionRegistry.scala | 1 + .../spark/sql/catalyst/expressions/TryEval.scala | 35 +++++++++++++++++++++ .../catalyst/expressions/stringExpressions.scala | 24 ++++++++------ .../sql-functions/sql-expression-schema.md | 3 +- .../sql-tests/inputs/string-functions.sql | 4 ++- .../sql-tests/inputs/try-string-functions.sql | 21 +++++++++++++ .../results/ansi/string-functions.sql.out | 11 ++++++- .../sql-tests/results/string-functions.sql.out | 11 ++++++- .../sql-tests/results/try-string-functions.sql.out | Bin 0 -> 1925 bytes 9 files changed, 97 insertions(+), 13 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index bb4aa701102..5befa779d16 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -454,6 +454,7 @@ object FunctionRegistry { expression[TryMultiply]("try_multiply"), expression[TryElementAt]("try_element_at"), expression[TrySum]("try_sum"), + expression[TryToBinary]("try_to_binary"), // aggregate functions expression[HyperLogLogPlusPlus]("approx_count_distinct"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TryEval.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TryEval.scala index 7a8a689a1bd..589e5801424 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TryEval.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/TryEval.scala @@ -181,3 +181,38 @@ case class TryMultiply(left: Expression, right: 
Expression, replacement: Express override protected def withNewChildInternal(newChild: Expression): Expression = this.copy(replacement = newChild) } + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = "_FUNC_(str[, fmt]) - This is a special version of `to_binary` that performs the same operation, but returns a NULL value instead of raising an error if the conversion cannot be performed.", + examples = """ + Examples: + > SELECT _FUNC_('abc', 'utf-8'); + abc + > select _FUNC_('a!', 'base64'); + NULL + > select _FUNC_('abc', 'invalidFormat'); + NULL + """, + since = "3.3.0", + group = "string_funcs") +// scalastyle:on line.size.limit +case class TryToBinary( + expr: Expression, + format: Option[Expression], + replacement: Expression) extends RuntimeReplaceable + with InheritAnalysisRules { + def this(expr: Expression) = + this(expr, None, TryEval(ToBinary(expr, None, nullOnInvalidFormat = true))) + + def this(expr: Expression, formatExpression: Expression) = + this(expr, Some(formatExpression), + TryEval(ToBinary(expr, Some(formatExpression), nullOnInvalidFormat = true))) + + override def prettyName: String = "try_to_binary" + + override def parameters: Seq[Expression] = expr +: format.toSeq + + override protected def withNewChildInternal(newChild: Expression): Expression = + this.copy(replacement = newChild) +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index a08ab84ac6f..88045f85bca 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2638,7 +2638,10 @@ case class Encode(value: Expression, charset: Expression) since = "3.3.0", group = "string_funcs") // scalastyle:on line.size.limit -case class ToBinary(expr: Expression, format: 
Option[Expression]) extends RuntimeReplaceable +case class ToBinary( + expr: Expression, + format: Option[Expression], + nullOnInvalidFormat: Boolean = false) extends RuntimeReplaceable with ImplicitCastInputTypes { override lazy val replacement: Expression = format.map { f => @@ -2651,6 +2654,7 @@ case class ToBinary(expr: Expression, format: Option[Expression]) extends Runtim case "hex" => Unhex(expr) case "utf-8" => Encode(expr, Literal("UTF-8")) case "base64" => UnBase64(expr) + case _ if nullOnInvalidFormat => Literal(null, BinaryType) case other => throw QueryCompilationErrors.invalidStringLiteralParameter( "to_binary", "format", other, Some("The value has to be a case-insensitive string literal of " + @@ -2659,16 +2663,18 @@ case class ToBinary(expr: Expression, format: Option[Expression]) extends Runtim } }.getOrElse(Unhex(expr)) - def this(expr: Expression) = this(expr, None) + def this(expr: Expression) = this(expr, None, false) def this(expr: Expression, format: Expression) = this(expr, Some({ - // We perform this check in the constructor to make it eager and not go through type coercion. - if (format.foldable && (format.dataType == StringType || format.dataType == NullType)) { - format - } else { - throw QueryCompilationErrors.requireLiteralParameter("to_binary", "format", "string") - } - })) + // We perform this check in the constructor to make it eager and not go through type coercion. 
+ if (format.foldable && (format.dataType == StringType || format.dataType == NullType)) { + format + } else { + throw QueryCompilationErrors.requireLiteralParameter("to_binary", "format", "string") + } + }), + false + ) override def prettyName: String = "to_binary" diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 644bfa926da..1dbf9678af9 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -1,6 +1,6 @@ <!-- Automatically generated by ExpressionsSchemaSuite --> ## Summary - - Number of queries: 385 + - Number of queries: 386 - Number of expressions that missing example: 12 - Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint ## Schema of Built-in Functions @@ -316,6 +316,7 @@ | org.apache.spark.sql.catalyst.expressions.TryElementAt | try_element_at | SELECT try_element_at(array(1, 2, 3), 2) | struct<try_element_at(array(1, 2, 3), 2):int> | | org.apache.spark.sql.catalyst.expressions.TryMultiply | try_multiply | SELECT try_multiply(2, 3) | struct<try_multiply(2, 3):int> | | org.apache.spark.sql.catalyst.expressions.TrySubtract | try_subtract | SELECT try_subtract(2, 1) | struct<try_subtract(2, 1):int> | +| org.apache.spark.sql.catalyst.expressions.TryToBinary | try_to_binary | SELECT try_to_binary('abc', 'utf-8') | struct<try_to_binary(abc, utf-8):binary> | | org.apache.spark.sql.catalyst.expressions.TypeOf | typeof | SELECT typeof(1) | struct<typeof(1):string> | | org.apache.spark.sql.catalyst.expressions.UnBase64 | unbase64 | SELECT unbase64('U3BhcmsgU1FM') | struct<unbase64(U3BhcmsgU1FM):binary> | | org.apache.spark.sql.catalyst.expressions.UnaryMinus | negative | SELECT negative(1) | struct<negative(1):int> | diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql 
b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 7d22e791570..0db28ad9f3e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -184,5 +184,7 @@ select to_binary(null, cast(null as string)); -- 'format' parameter must be string type or void type. select to_binary(null, cast(null as int)); select to_binary('abc', 1); --- invalid inputs. +-- invalid format select to_binary('abc', 'invalidFormat'); +-- invalid string input +select to_binary('a!', 'base64'); diff --git a/sql/core/src/test/resources/sql-tests/inputs/try-string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/try-string-functions.sql new file mode 100644 index 00000000000..20f02374e78 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/try-string-functions.sql @@ -0,0 +1,21 @@ +-- try_to_binary +select try_to_binary('abc'); +select try_to_binary('abc', 'utf-8'); +select try_to_binary('abc', 'base64'); +select try_to_binary('abc', 'hex'); +-- 'format' parameter can be any foldable string value, not just literal. +select try_to_binary('abc', concat('utf', '-8')); +-- 'format' parameter is case insensitive. +select try_to_binary('abc', 'Hex'); +-- null inputs lead to null result. +select try_to_binary('abc', null); +select try_to_binary(null, 'utf-8'); +select try_to_binary(null, null); +select try_to_binary(null, cast(null as string)); +-- 'format' parameter must be string type or void type. 
+select try_to_binary(null, cast(null as int)); +select try_to_binary('abc', 1); +-- invalid format +select try_to_binary('abc', 'invalidFormat'); +-- invalid string input +select try_to_binary('a!', 'base64'); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 01213bd57ad..52d70e22a44 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 141 +-- Number of queries: 142 -- !query @@ -1140,3 +1140,12 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException Invalid value for the 'format' parameter of function 'to_binary': invalidformat. The value has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'. + + +-- !query +select to_binary('a!', 'base64') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Last unit does not have enough valid bits diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 3a7f197e362..ff14da143da 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 141 +-- Number of queries: 142 -- !query @@ -1136,3 +1136,12 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException Invalid value for the 'format' parameter of function 'to_binary': invalidformat. The value has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'. 
+ + +-- !query +select to_binary('a!', 'base64') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Last unit does not have enough valid bits diff --git a/sql/core/src/test/resources/sql-tests/results/try-string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/try-string-functions.sql.out new file mode 100644 index 00000000000..bda723fd19e Binary files /dev/null and b/sql/core/src/test/resources/sql-tests/results/try-string-functions.sql.out differ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org