This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 0ffd9492b01 [SPARK-39795][SQL] New SQL function: try_to_timestamp
0ffd9492b01 is described below

commit 0ffd9492b01f87127e46cb0677ded3c5e26c89d6
Author: Gengliang Wang <gengli...@apache.org>
AuthorDate: Sat Jul 16 12:53:11 2022 +0300

    [SPARK-39795][SQL] New SQL function: try_to_timestamp

    ### What changes were proposed in this pull request?

    Add a new function `try_to_timestamp`. It is identical to the function
    `to_timestamp`, except that it returns `NULL` instead of throwing an
    exception on a string parsing error under ANSI SQL mode.

    ### Why are the changes needed?

    As with the other try_* functions from
    https://issues.apache.org/jira/browse/SPARK-35161, users can complete
    queries without interruption in ANSI mode. The function `to_timestamp`
    is popular, and Snowflake offers a `try_to_timestamp` function as well:
    https://docs.snowflake.com/en/sql-reference/functions/try_to_timestamp.html

    ### Does this PR introduce _any_ user-facing change?

    Yes, a new function `try_to_timestamp`. It is identical to the function
    `to_timestamp`, except that it returns `NULL` instead of throwing an
    exception on a string parsing error.

    ### How was this patch tested?

    Unit tests.

    Closes #37204 from gengliangwang/try_to_timestamp.

    Authored-by: Gengliang Wang <gengli...@apache.org>
    Signed-off-by: Max Gekk <max.g...@gmail.com>
---
 docs/sql-ref-ansi-compliance.md                    |  1 +
 .../sql/catalyst/analysis/FunctionRegistry.scala   |  1 +
 .../catalyst/expressions/datetimeExpressions.scala | 51 ++++++++++++++++++++--
 .../sql-functions/sql-expression-schema.md         |  1 +
 .../inputs/ansi/try_datetime_functions.sql         |  1 +
 .../sql-tests/inputs/try_datetime_functions.sql    |  6 +++
 .../results/ansi/try_datetime_functions.sql.out    | 49 +++++++++++++++++++++
 .../results/try_datetime_functions.sql.out         | 49 +++++++++++++++++++++
 8 files changed, 156 insertions(+), 3 deletions(-)

diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md
index bb55cec52f5..6ad8210ed7e 100644
--- a/docs/sql-ref-ansi-compliance.md
+++ b/docs/sql-ref-ansi-compliance.md
@@ -319,6 +319,7 @@ When ANSI mode is on, it throws exceptions for invalid operations. You can use t
   - `try_sum`: identical to the function `sum`, except that it returns `NULL` result instead of throwing an exception on integral/decimal/interval value overflow.
   - `try_avg`: identical to the function `avg`, except that it returns `NULL` result instead of throwing an exception on decimal/interval value overflow.
   - `try_element_at`: identical to the function `element_at`, except that it returns `NULL` result instead of throwing an exception on array's index out of bound or map's key not found.
+  - `try_to_timestamp`: identical to the function `to_timestamp`, except that it returns `NULL` result instead of throwing an exception on string parsing error.

 ### SQL Keywords (optional, disabled by default)
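For readers trying the feature, here is a minimal sketch of the new behavior
from a Scala program. The object name and local-mode session setup are
illustrative assumptions, not part of this commit; it requires Spark 3.4+.

import org.apache.spark.sql.SparkSession

object TryToTimestampDemo {
  def main(args: Array[String]): Unit = {
    // ANSI mode on, so plain to_timestamp would throw on a bad input string.
    val spark = SparkSession.builder()
      .appName("try_to_timestamp demo")
      .master("local[*]")
      .config("spark.sql.ansi.enabled", "true")
      .getOrCreate()

    // Valid input parses as usual: 2016-12-31 00:00:00.
    spark.sql("SELECT try_to_timestamp('2016-12-31', 'yyyy-MM-dd')").show()

    // 'foo' does not match the pattern: NULL instead of an exception.
    spark.sql("SELECT try_to_timestamp('foo', 'yyyy-MM-dd')").show()

    spark.stop()
  }
}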
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index d97b344d166..858f2841dcd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -456,6 +456,7 @@ object FunctionRegistry {
     expression[TryAverage]("try_avg"),
     expression[TrySum]("try_sum"),
     expression[TryToBinary]("try_to_binary"),
+    expressionBuilder("try_to_timestamp", TryToTimestampExpressionBuilder, setAlias = true),

     // aggregate functions
     expression[HyperLogLogPlusPlus]("approx_count_distinct"),
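For readers unfamiliar with expressionBuilder: registering with
setAlias = true stamps the user-facing name onto the built expression, which
is why the result schema later in this diff prints try_to_timestamp(...)
rather than the underlying expression's name. Below is a toy sketch of that
registration pattern; every type in it is invented for illustration and is
not Spark's real Expression or FunctionRegistry API.

// Invented stand-in types; not Spark's Expression or FunctionRegistry.
final case class ToyExpr(prettyName: String, args: Seq[String])

object ToyRegistry {
  private var builders = Map.empty[String, Seq[String] => ToyExpr]

  // Models setAlias = true: stamp the registered name onto the expression
  // so output columns print with the user-facing function name.
  def registerWithAlias(name: String)(build: Seq[String] => ToyExpr): Unit =
    builders += name -> (args => build(args).copy(prettyName = name))

  def build(name: String, args: Seq[String]): ToyExpr =
    builders.getOrElse(name, sys.error(s"undefined function: $name"))(args)

  def main(cmdArgs: Array[String]): Unit = {
    registerWithAlias("try_to_timestamp")(args => ToyExpr("parse_to_timestamp", args))
    println(build("try_to_timestamp", Seq("'2016-12-31'")))
    // prints: ToyExpr(try_to_timestamp,List('2016-12-31'))
  }
}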
+ """, + examples = """ + Examples: + > SELECT _FUNC_('2016-12-31 00:12:00'); + 2016-12-31 00:12:00 + > SELECT _FUNC_('2016-12-31', 'yyyy-MM-dd'); + 2016-12-31 00:00:00 + > SELECT _FUNC_('foo', 'yyyy-MM-dd'); + NULL + """, + group = "datetime_funcs", + since = "3.4.0") +// scalastyle:on line.size.limit +object TryToTimestampExpressionBuilder extends ExpressionBuilder { + override def build(funcName: String, expressions: Seq[Expression]): Expression = { + val numArgs = expressions.length + if (numArgs == 1 || numArgs == 2) { + ParseToTimestamp( + expressions.head, + expressions.drop(1).lastOption, + SQLConf.get.timestampType, + failOnError = false) + } else { + throw QueryCompilationErrors.invalidFunctionArgumentNumberError(Seq(1, 2), funcName, numArgs) + } + } +} + abstract class ToTimestamp extends BinaryExpression with TimestampFormatterHelper with ExpectsInputTypes { @@ -2048,12 +2092,13 @@ case class ParseToTimestamp( left: Expression, format: Option[Expression], override val dataType: DataType, - timeZoneId: Option[String] = None) + timeZoneId: Option[String] = None, + failOnError: Boolean = SQLConf.get.ansiEnabled) extends RuntimeReplaceable with ImplicitCastInputTypes with TimeZoneAwareExpression { override lazy val replacement: Expression = format.map { f => - GetTimestamp(left, f, dataType, timeZoneId) - }.getOrElse(Cast(left, dataType, timeZoneId)) + GetTimestamp(left, f, dataType, timeZoneId, failOnError = failOnError) + }.getOrElse(Cast(left, dataType, timeZoneId, ansiEnabled = failOnError)) def this(left: Expression, format: Expression) = { this(left, Option(format), SQLConf.get.timestampType) diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 78a9ce7c386..2859f7f7a60 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -320,6 +320,7 @@ | org.apache.spark.sql.catalyst.expressions.TrySubtract | try_subtract | SELECT try_subtract(2, 1) | struct<try_subtract(2, 1):int> | | org.apache.spark.sql.catalyst.expressions.TryToBinary | try_to_binary | SELECT try_to_binary('abc', 'utf-8') | struct<try_to_binary(abc, utf-8):binary> | | org.apache.spark.sql.catalyst.expressions.TryToNumber | try_to_number | SELECT try_to_number('454', '999') | struct<try_to_number(454, 999):decimal(3,0)> | +| org.apache.spark.sql.catalyst.expressions.TryToTimestampExpressionBuilder | try_to_timestamp | SELECT try_to_timestamp('2016-12-31 00:12:00') | struct<try_to_timestamp(2016-12-31 00:12:00):timestamp> | | org.apache.spark.sql.catalyst.expressions.TypeOf | typeof | SELECT typeof(1) | struct<typeof(1):string> | | org.apache.spark.sql.catalyst.expressions.UnBase64 | unbase64 | SELECT unbase64('U3BhcmsgU1FM') | struct<unbase64(U3BhcmsgU1FM):binary> | | org.apache.spark.sql.catalyst.expressions.UnaryMinus | negative | SELECT negative(1) | struct<negative(1):int> | diff --git a/sql/core/src/test/resources/sql-tests/inputs/ansi/try_datetime_functions.sql b/sql/core/src/test/resources/sql-tests/inputs/ansi/try_datetime_functions.sql new file mode 100644 index 00000000000..ede47f3eecb --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/ansi/try_datetime_functions.sql @@ -0,0 +1 @@ +--IMPORT try_datetime_functions.sql \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/try_datetime_functions.sql 
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 78a9ce7c386..2859f7f7a60 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -320,6 +320,7 @@
 | org.apache.spark.sql.catalyst.expressions.TrySubtract | try_subtract | SELECT try_subtract(2, 1) | struct<try_subtract(2, 1):int> |
 | org.apache.spark.sql.catalyst.expressions.TryToBinary | try_to_binary | SELECT try_to_binary('abc', 'utf-8') | struct<try_to_binary(abc, utf-8):binary> |
 | org.apache.spark.sql.catalyst.expressions.TryToNumber | try_to_number | SELECT try_to_number('454', '999') | struct<try_to_number(454, 999):decimal(3,0)> |
+| org.apache.spark.sql.catalyst.expressions.TryToTimestampExpressionBuilder | try_to_timestamp | SELECT try_to_timestamp('2016-12-31 00:12:00') | struct<try_to_timestamp(2016-12-31 00:12:00):timestamp> |
 | org.apache.spark.sql.catalyst.expressions.TypeOf | typeof | SELECT typeof(1) | struct<typeof(1):string> |
 | org.apache.spark.sql.catalyst.expressions.UnBase64 | unbase64 | SELECT unbase64('U3BhcmsgU1FM') | struct<unbase64(U3BhcmsgU1FM):binary> |
 | org.apache.spark.sql.catalyst.expressions.UnaryMinus | negative | SELECT negative(1) | struct<negative(1):int> |
diff --git a/sql/core/src/test/resources/sql-tests/inputs/ansi/try_datetime_functions.sql b/sql/core/src/test/resources/sql-tests/inputs/ansi/try_datetime_functions.sql
new file mode 100644
index 00000000000..ede47f3eecb
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/ansi/try_datetime_functions.sql
@@ -0,0 +1 @@
+--IMPORT try_datetime_functions.sql
\ No newline at end of file
diff --git a/sql/core/src/test/resources/sql-tests/inputs/try_datetime_functions.sql b/sql/core/src/test/resources/sql-tests/inputs/try_datetime_functions.sql
new file mode 100644
index 00000000000..7cf67dce2ae
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/inputs/try_datetime_functions.sql
@@ -0,0 +1,6 @@
+select try_to_timestamp(null), try_to_timestamp('2016-12-31 00:12:00'), try_to_timestamp('2016-12-31', 'yyyy-MM-dd');
+select try_to_timestamp(1);
+select try_to_timestamp('2016-12-31 abc');
+select try_to_timestamp('2019-10-06 10:11:12.', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]');
+select try_to_timestamp("02-29", "MM-dd");
+select try_to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEEE');
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/try_datetime_functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/try_datetime_functions.sql.out
new file mode 100644
index 00000000000..a2326ee0814
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/try_datetime_functions.sql.out
@@ -0,0 +1,49 @@
+-- Automatically generated by SQLQueryTestSuite
+-- !query
+select try_to_timestamp(null), try_to_timestamp('2016-12-31 00:12:00'), try_to_timestamp('2016-12-31', 'yyyy-MM-dd')
+-- !query schema
+struct<try_to_timestamp(NULL):timestamp,try_to_timestamp(2016-12-31 00:12:00):timestamp,try_to_timestamp(2016-12-31, yyyy-MM-dd):timestamp>
+-- !query output
+NULL	2016-12-31 00:12:00	2016-12-31 00:00:00
+
+
+-- !query
+select try_to_timestamp(1)
+-- !query schema
+struct<try_to_timestamp(1):timestamp>
+-- !query output
+1969-12-31 16:00:01
+
+
+-- !query
+select try_to_timestamp('2016-12-31 abc')
+-- !query schema
+struct<try_to_timestamp(2016-12-31 abc):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select try_to_timestamp('2019-10-06 10:11:12.', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]')
+-- !query schema
+struct<try_to_timestamp(2019-10-06 10:11:12., yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select try_to_timestamp("02-29", "MM-dd")
+-- !query schema
+struct<try_to_timestamp(02-29, MM-dd):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select try_to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEEE')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkUpgradeException
+[INCONSISTENT_BEHAVIOR_CROSS_VERSION.DATETIME_PATTERN_RECOGNITION] You may get a different result due to the upgrading to Spark >= 3.0:
+Fail to recognize 'dd MM yyyy EEEEEE' pattern in the DateTimeFormatter. 1) You can set "spark.sql.legacy.timeParserPolicy" to "LEGACY" to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
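One subtlety visible in the expected output above: try_to_timestamp only
suppresses failures in parsing the input value; an unrecognized datetime
pattern such as 'dd MM yyyy EEEEEE' still throws SparkUpgradeException when
the query is evaluated. Here is a sketch of folding both failure modes into
an Option on the caller side; the helper object and method name are
assumptions for illustration, and the inputs are assumed trusted since they
are interpolated directly into SQL.

import java.sql.Timestamp
import scala.util.Try
import org.apache.spark.sql.SparkSession

object TryParseHelper {
  // Returns None both when the value is unparsable (NULL result) and when
  // the pattern itself is invalid (exception thrown at evaluation time).
  def tryParse(spark: SparkSession, value: String, pattern: String): Option[Timestamp] =
    Try {
      // collect() sits inside the Try because a bad pattern fails when the
      // formatter is built during evaluation, not when the query is defined.
      spark.sql(s"SELECT try_to_timestamp('$value', '$pattern') AS ts")
        .collect().head.getAs[Timestamp]("ts")
    }.toOption.flatMap(Option(_)) // normalize Some(null) to None
}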
diff --git a/sql/core/src/test/resources/sql-tests/results/try_datetime_functions.sql.out b/sql/core/src/test/resources/sql-tests/results/try_datetime_functions.sql.out
new file mode 100644
index 00000000000..a2326ee0814
--- /dev/null
+++ b/sql/core/src/test/resources/sql-tests/results/try_datetime_functions.sql.out
@@ -0,0 +1,49 @@
+-- Automatically generated by SQLQueryTestSuite
+-- !query
+select try_to_timestamp(null), try_to_timestamp('2016-12-31 00:12:00'), try_to_timestamp('2016-12-31', 'yyyy-MM-dd')
+-- !query schema
+struct<try_to_timestamp(NULL):timestamp,try_to_timestamp(2016-12-31 00:12:00):timestamp,try_to_timestamp(2016-12-31, yyyy-MM-dd):timestamp>
+-- !query output
+NULL	2016-12-31 00:12:00	2016-12-31 00:00:00
+
+
+-- !query
+select try_to_timestamp(1)
+-- !query schema
+struct<try_to_timestamp(1):timestamp>
+-- !query output
+1969-12-31 16:00:01
+
+
+-- !query
+select try_to_timestamp('2016-12-31 abc')
+-- !query schema
+struct<try_to_timestamp(2016-12-31 abc):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select try_to_timestamp('2019-10-06 10:11:12.', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]')
+-- !query schema
+struct<try_to_timestamp(2019-10-06 10:11:12., yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select try_to_timestamp("02-29", "MM-dd")
+-- !query schema
+struct<try_to_timestamp(02-29, MM-dd):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select try_to_timestamp('22 05 2020 Friday', 'dd MM yyyy EEEEEE')
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkUpgradeException
+[INCONSISTENT_BEHAVIOR_CROSS_VERSION.DATETIME_PATTERN_RECOGNITION] You may get a different result due to the upgrading to Spark >= 3.0:
+Fail to recognize 'dd MM yyyy EEEEEE' pattern in the DateTimeFormatter. 1) You can set "spark.sql.legacy.timeParserPolicy" to "LEGACY" to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org