This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.2 by this push: new 06857f3d70a [SPARK-39184][SQL] Handle undersized result array in date and timestamp sequences 06857f3d70a is described below commit 06857f3d70af468ef4d1911ebae3abfa50830ad0 Author: Bruce Robbins <bersprock...@gmail.com> AuthorDate: Tue Aug 16 11:53:39 2022 +0300 [SPARK-39184][SQL] Handle undersized result array in date and timestamp sequences ### What changes were proposed in this pull request? Add code to defensively check if the pre-allocated result array is big enough to handle the next element in a date or timestamp sequence. ### Why are the changes needed? `InternalSequenceBase.getSequenceLength` is a fast method for estimating the size of the result array. It uses an estimated step size in micros which is not always entirely accurate for the date/time/time-zone combination. As a result, `getSequenceLength` occasionally overestimates the size of the result array and also occasionally underestimates the size of the result array. `getSequenceLength` sometimes overestimates the size of the result array when the step size is in months (because `InternalSequenceBase` assumes 28 days per month). This case is handled: `InternalSequenceBase` will slice the array, if needed. `getSequenceLength` sometimes underestimates the size of the result array when the sequence crosses a DST "spring forward" without a corresponding "fall backward". This case is not handled (thus, this PR). For example: ``` select sequence( timestamp'2022-03-13 00:00:00', timestamp'2022-03-14 00:00:00', interval 1 day) as x; ``` In the America/Los_Angeles time zone, this results in the following error: ``` java.lang.ArrayIndexOutOfBoundsException: 1 at scala.runtime.ScalaRunTime$.array_update(ScalaRunTime.scala:77) ``` This happens because `InternalSequenceBase` calculates an estimated step size of 24 hours. 
If you add 24 hours to 2022-03-13 00:00:00 in the America/Los_Angeles time zone, you get 2022-03-14 01:00:00 (because 2022-03-13 has only 23 hours due to "spring forward"). Since 2022-03-14 01:00:00 is later than the specified stop value, `getSequenceLength` assumes the stop value is not included in the result. Therefore, `getSequenceLength` estimates an array size of 1. However, when actually creating the sequence, `InternalSequenceBase` does not use a step of 24 hours, but of 1 day. When you add 1 day to 2022-03-13 00:00:00, you get 2022-03-14 00:00:00. Now the stop value *is* included, and we overrun the end of the result array. The new unit test includes examples of problematic date sequences. This PR adds code to handle the underestimation case: it checks if we're about to overrun the array, and if so, gets a new array that's larger by 1. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? New unit test. Closes #37513 from bersprockets/date_sequence_array_size_issue. 
Authored-by: Bruce Robbins <bersprock...@gmail.com> Signed-off-by: Max Gekk <max.g...@gmail.com> (cherry picked from commit 3a1136aa05dd5e16de81c7ec804416b3498ca967) Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../expressions/collectionOperations.scala | 13 +++++-- .../expressions/CollectionExpressionsSuite.scala | 40 ++++++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 110745e3a5f..361137ad91b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -2921,17 +2921,23 @@ object Sequence { val startMicros: Long = toMicros(num.toLong(start), scale) val stopMicros: Long = toMicros(num.toLong(stop), scale) - val maxEstimatedArrayLength = + val estimatedArrayLength = getSequenceLength(startMicros, stopMicros, input3, intervalStepInMicros) val stepSign = if (intervalStepInMicros > 0) +1 else -1 val exclusiveItem = stopMicros + stepSign - val arr = new Array[T](maxEstimatedArrayLength) + var arr = new Array[T](estimatedArrayLength) var t = startMicros var i = 0 while (t < exclusiveItem ^ stepSign < 0) { val result = fromMicros(t, scale) + // if we've underestimated the size of the array, due to crossing a DST + // "spring forward" without a corresponding "fall back", make a copy + // that's larger by 1 + if (i == arr.length) { + arr = arr.padTo(estimatedArrayLength + 1, fromLong(0L)) + } arr(i) = fromLong(result) i += 1 t = addInterval(startMicros, i * stepMonths, i * stepDays, i * stepMicros, zoneId) @@ -3039,6 +3045,9 @@ object Sequence { | int $i = 0; | | while ($t < $exclusiveItem ^ $stepSign < 0) { + | if ($i == $arr.length) { + | $arr = java.util.Arrays.copyOf($arr, $i + 1); 
+ | } | $arr[$i] = $fromMicrosCode; | $i += 1; | $t = $addIntervalCode( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala index 201a7e131c7..bfba043a5a7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala @@ -2426,4 +2426,44 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper Literal.create(Seq(Double.NaN, 1d, 2d, null), ArrayType(DoubleType))), Seq(null, 1d, 2d, Double.NaN)) } + + test("SPARK-39184: Avoid ArrayIndexOutOfBoundsException when crossing DST boundary") { + DateTimeTestUtils.withDefaultTimeZone(LA) { + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2016-03-13 00:00:00")), + Literal(Timestamp.valueOf("2016-03-14 00:00:00")), + Literal(stringToInterval("interval 1 day"))), + Seq( + Timestamp.valueOf("2016-03-13 00:00:00"), + Timestamp.valueOf("2016-03-14 00:00:00"))) + + checkEvaluation(new Sequence( + Literal(Timestamp.valueOf("2016-03-14 00:00:00")), + Literal(Timestamp.valueOf("2016-03-13 00:00:00")), + Literal(stringToInterval("interval -1 days"))), + Seq( + Timestamp.valueOf("2016-03-14 00:00:00"), + Timestamp.valueOf("2016-03-13 00:00:00"))) + + checkEvaluation(new Sequence( + Literal(Date.valueOf("2016-03-13")), + Literal(Date.valueOf("2016-03-16")), + Literal(stringToInterval("interval 1 day 12 hour"))), + Seq( + Date.valueOf("2016-03-13"), + Date.valueOf("2016-03-14"), + Date.valueOf("2016-03-16"))) + + checkEvaluation(new Sequence( + Literal(Date.valueOf("2017-04-06")), + Literal(Date.valueOf("2017-02-12")), + Literal(stringToInterval("interval -13 days -6 hours"))), + Seq( + Date.valueOf("2017-04-06"), + Date.valueOf("2017-03-23"), + Date.valueOf("2017-03-10"), + 
Date.valueOf("2017-02-25"), + Date.valueOf("2017-02-12"))) + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org