This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push: new cf5c90df3af [SPARK-42635][SQL] Fix the TimestampAdd expression cf5c90df3af is described below commit cf5c90df3afc7b68d61b11b78577f71fd4d85135 Author: Chenhao Li <chenhao...@databricks.com> AuthorDate: Fri Mar 3 09:38:44 2023 +0300 [SPARK-42635][SQL] Fix the TimestampAdd expression ### What changes were proposed in this pull request? This PR fixed the counter-intuitive behaviors of the `TimestampAdd` expression mentioned in https://issues.apache.org/jira/browse/SPARK-42635. See the following *user-facing* changes for details. ### Does this PR introduce _any_ user-facing change? Yes. This PR fixes the three problems mentioned in SPARK-42635: 1. When the time is close to daylight saving time transition, the result may be discontinuous and not monotonic. 2. Adding month, quarter, and year silently ignores `Int` overflow during unit conversion. 3. Adding sub-month units (week, day, hour, minute, second, millisecond, microsecond)silently ignores `Long` overflow during unit conversion. Some examples of the result changes: Old results: ``` // In America/Los_Angeles timezone: timestampadd(DAY, 1, 2011-03-12 03:00:00) = 2011-03-13 03:00:00 (this is correct, put it here for comparison) timestampadd(HOUR, 23, 2011-03-12 03:00:00) = 2011-03-13 03:00:00 timestampadd(HOUR, 24, 2011-03-12 03:00:00) = 2011-03-13 03:00:00 timestampadd(SECOND, 86400 - 1, 2011-03-12 03:00:00) = 2011-03-13 03:59:59 timestampadd(SECOND, 86400, 2011-03-12 03:00:00) = 2011-03-13 03:00:00 // In UTC timezone: timestampadd(quarter, 1431655764, 1970-01-01 00:00:00) = 1969-09-01 00:00:00 timestampadd(day, 106751992, 1970-01-01 00:00:00) = -290308-12-22 15:58:10.448384 ``` New results: ``` // In America/Los_Angeles timezone: timestampadd(DAY, 1, 2011-03-12 03:00:00) = 2011-03-13 03:00:00 timestampadd(HOUR, 23, 2011-03-12 03:00:00) = 2011-03-13 03:00:00 timestampadd(HOUR, 24, 2011-03-12 03:00:00) = 2011-03-13 04:00:00 timestampadd(SECOND, 86400 - 1, 2011-03-12 03:00:00) = 2011-03-13 03:59:59 timestampadd(SECOND, 86400, 2011-03-12 03:00:00) = 2011-03-13 04:00:00 // In UTC timezone: timestampadd(quarter, 1431655764, 1970-01-01 00:00:00) = throw overflow exception timestampadd(day, 106751992, 1970-01-01 00:00:00) = throw overflow exception ``` ### How was this patch tested? Pass existing tests and some new tests. Closes #40237 from chenhao-db/SPARK-42635. Authored-by: Chenhao Li <chenhao...@databricks.com> Signed-off-by: Max Gekk <max.g...@gmail.com> (cherry picked from commit 392bdc1595879ea03c90e3f4b550aa5ce7b32bdf) Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../spark/sql/catalyst/util/DateTimeUtils.scala | 22 +++-- .../expressions/DateExpressionsSuite.scala | 98 +++++++++++++++++++++- 2 files changed, 110 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index af0666a98fa..2ae11f6568a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -1227,25 +1227,29 @@ object DateTimeUtils { try { unit.toUpperCase(Locale.ROOT) match { case "MICROSECOND" => - timestampAddDayTime(micros, quantity, zoneId) + timestampAddInterval(micros, 0, 0, quantity, zoneId) case "MILLISECOND" => - timestampAddDayTime(micros, quantity * MICROS_PER_MILLIS, zoneId) + timestampAddInterval(micros, 0, 0, + Math.multiplyExact(quantity.toLong, MICROS_PER_MILLIS), zoneId) case "SECOND" => - timestampAddDayTime(micros, quantity * MICROS_PER_SECOND, zoneId) + timestampAddInterval(micros, 0, 0, + Math.multiplyExact(quantity.toLong, MICROS_PER_SECOND), zoneId) case "MINUTE" => - timestampAddDayTime(micros, quantity * MICROS_PER_MINUTE, zoneId) + timestampAddInterval(micros, 0, 0, + Math.multiplyExact(quantity.toLong, MICROS_PER_MINUTE), zoneId) case "HOUR" => - timestampAddDayTime(micros, quantity * MICROS_PER_HOUR, zoneId) + timestampAddInterval(micros, 0, 0, + Math.multiplyExact(quantity.toLong, MICROS_PER_HOUR), zoneId) case "DAY" | "DAYOFYEAR" => - timestampAddDayTime(micros, quantity * MICROS_PER_DAY, zoneId) + timestampAddInterval(micros, 0, quantity, 0, zoneId) case "WEEK" => - timestampAddDayTime(micros, quantity * MICROS_PER_DAY * DAYS_PER_WEEK, zoneId) + timestampAddInterval(micros, 0, Math.multiplyExact(quantity, DAYS_PER_WEEK), 0, zoneId) case "MONTH" => timestampAddMonths(micros, quantity, zoneId) case "QUARTER" => - timestampAddMonths(micros, quantity * 3, zoneId) + timestampAddMonths(micros, Math.multiplyExact(quantity, 3), zoneId) case "YEAR" => - timestampAddMonths(micros, quantity * MONTHS_PER_YEAR, zoneId) + timestampAddMonths(micros, Math.multiplyExact(quantity, MONTHS_PER_YEAR), zoneId) } } catch { case _: scala.MatchError => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index e39d1518990..d2010102690 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -28,7 +28,7 @@ import scala.language.postfixOps import scala.reflect.ClassTag import scala.util.Random -import org.apache.spark.{SparkDateTimeException, SparkFunSuite, SparkUpgradeException} +import org.apache.spark.{SparkArithmeticException, SparkDateTimeException, SparkFunSuite, SparkUpgradeException} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils, TimestampFormatter} @@ -1961,6 +1961,102 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } } + test("SPARK-42635: timestampadd near daylight saving transition") { + // In America/Los_Angeles timezone, timestamp value `skippedTime` is 2011-03-13 03:00:00. + // The next second of 2011-03-13 01:59:59 jumps to 2011-03-13 03:00:00. + val skippedTime = 1300010400000000L + // In America/Los_Angeles timezone, both timestamp range `[repeatedTime - MICROS_PER_HOUR, + // repeatedTime)` and `[repeatedTime, repeatedTime + MICROS_PER_HOUR)` map to + // [2011-11-06 01:00:00, 2011-11-06 02:00:00). + // The next second of 2011-11-06 01:59:59 (pre-transition) jumps back to 2011-11-06 01:00:00. + val repeatedTime = 1320570000000000L + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> LA.getId) { + // Adding one day is **not** equivalent to adding <unit>_PER_DAY time units, because not every + // day has 24 hours: 2011-03-13 has 23 hours, 2011-11-06 has 25 hours. + + // timestampadd(DAY, 1, 2011-03-12 03:00:00) = 2011-03-13 03:00:00 + checkEvaluation( + TimestampAdd("DAY", Literal(1), Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)), + skippedTime) + // timestampadd(HOUR, 24, 2011-03-12 03:00:00) = 2011-03-13 04:00:00 + checkEvaluation( + TimestampAdd("HOUR", Literal(24), + Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)), + skippedTime + MICROS_PER_HOUR) + // timestampadd(HOUR, 23, 2011-03-12 03:00:00) = 2011-03-13 03:00:00 + checkEvaluation( + TimestampAdd("HOUR", Literal(23), + Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)), + skippedTime) + // timestampadd(SECOND, SECONDS_PER_DAY, 2011-03-12 03:00:00) = 2011-03-13 04:00:00 + checkEvaluation( + TimestampAdd( + "SECOND", Literal(SECONDS_PER_DAY.toInt), + Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)), + skippedTime + MICROS_PER_HOUR) + // timestampadd(SECOND, SECONDS_PER_DAY, 2011-03-12 03:00:00) = 2011-03-13 03:59:59 + checkEvaluation( + TimestampAdd( + "SECOND", Literal(SECONDS_PER_DAY.toInt - 1), + Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)), + skippedTime + MICROS_PER_HOUR - MICROS_PER_SECOND) + + // timestampadd(DAY, 1, 2011-11-05 02:00:00) = 2011-11-06 02:00:00 + checkEvaluation( + TimestampAdd("DAY", Literal(1), + Literal(repeatedTime - 24 * MICROS_PER_HOUR, TimestampType)), + repeatedTime + MICROS_PER_HOUR) + // timestampadd(DAY, 1, 2011-11-05 01:00:00) = 2011-11-06 01:00:00 (pre-transition) + checkEvaluation( + TimestampAdd("DAY", Literal(1), + Literal(repeatedTime - 25 * MICROS_PER_HOUR, TimestampType)), + repeatedTime - MICROS_PER_HOUR) + // timestampadd(DAY, -1, 2011-11-07 01:00:00) = 2011-11-06 01:00:00 (post-transition) + checkEvaluation( + TimestampAdd("DAY", Literal(-1), + Literal(repeatedTime + 24 * MICROS_PER_HOUR, TimestampType)), + repeatedTime) + // timestampadd(MONTH, 1, 2011-10-06 01:00:00) = 2011-11-06 01:00:00 (pre-transition) + checkEvaluation( + TimestampAdd( + "MONTH", Literal(1), + Literal(repeatedTime - MICROS_PER_HOUR - 31 * MICROS_PER_DAY, TimestampType)), + repeatedTime - MICROS_PER_HOUR) + // timestampadd(MONTH, -1, 2011-12-06 01:00:00) = 2011-11-06 01:00:00 (post-transition) + checkEvaluation( + TimestampAdd( + "MONTH", Literal(-1), + Literal(repeatedTime + 30 * MICROS_PER_DAY, TimestampType)), + repeatedTime) + // timestampadd(HOUR, 23, 2011-11-05 02:00:00) = 2011-11-06 01:00:00 (pre-transition) + checkEvaluation( + TimestampAdd("HOUR", Literal(23), + Literal(repeatedTime - 24 * MICROS_PER_HOUR, TimestampType)), + repeatedTime - MICROS_PER_HOUR) + // timestampadd(HOUR, 24, 2011-11-05 02:00:00) = 2011-11-06 01:00:00 (post-transition) + checkEvaluation( + TimestampAdd("HOUR", Literal(24), + Literal(repeatedTime - 24 * MICROS_PER_HOUR, TimestampType)), + repeatedTime) + } + } + + test("SPARK-42635: timestampadd unit conversion overflow") { + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") { + checkErrorInExpression[SparkArithmeticException](TimestampAdd("DAY", + Literal(106751992), + Literal(0L, TimestampType)), + errorClass = "DATETIME_OVERFLOW", + parameters = Map("operation" -> "add 106751992 DAY to TIMESTAMP '1970-01-01 00:00:00'")) + checkErrorInExpression[SparkArithmeticException](TimestampAdd("QUARTER", + Literal(1431655764), + Literal(0L, TimestampType)), + errorClass = "DATETIME_OVERFLOW", + parameters = Map("operation" -> + "add 1431655764 QUARTER to TIMESTAMP '1970-01-01 00:00:00'")) + } + } + test("SPARK-38284: difference between two timestamps in units") { // Check case-insensitivity checkEvaluation( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org