This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 240706600db [SPARK-46070][SQL] Compile regex pattern in SparkDateTimeUtils.getZoneId outside the hot loop 240706600db is described below commit 240706600dbf257dfcf378acaadd608348d666aa Author: Tanel Kiis <tanel.k...@gmail.com> AuthorDate: Thu Nov 23 18:30:37 2023 +0300 [SPARK-46070][SQL] Compile regex pattern in SparkDateTimeUtils.getZoneId outside the hot loop ### What changes were proposed in this pull request? Compile the regex patterns used in `SparkDateTimeUtils.getZoneId` outside of the method, which can be called for each dataset row. ### Why are the changes needed? `String.replaceFirst` internally does `Pattern.compile(regex).matcher(this).replaceFirst(replacement)`. `Pattern.compile` is a very expensive method that should not be called in a loop. When using a method like `from_utc_timestamp` with a non-literal timezone, `SparkDateTimeUtils.getZoneId` is called for each row. In one of my use cases, adding `from_utc_timestamp` increased the runtime from 15min to 6h. ### Does this PR introduce _any_ user-facing change? No. Only a performance improvement. ### How was this patch tested? Existing UTs ### Was this patch authored or co-authored using generative AI tooling? No Closes #43976 from tanelk/SPARK-46070_precompile_regex. 
Authored-by: Tanel Kiis <tanel.k...@gmail.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala index f8a9274a564..5e9fb0dd25f 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala @@ -20,6 +20,7 @@ import java.sql.{Date, Timestamp} import java.time.{Instant, LocalDate, LocalDateTime, LocalTime, ZonedDateTime, ZoneId, ZoneOffset} import java.util.TimeZone import java.util.concurrent.TimeUnit.{MICROSECONDS, NANOSECONDS} +import java.util.regex.Pattern import scala.util.control.NonFatal @@ -36,12 +37,14 @@ trait SparkDateTimeUtils { final val TimeZoneUTC = TimeZone.getTimeZone("UTC") + final val singleHourTz = Pattern.compile("(\\+|\\-)(\\d):") + final val singleMinuteTz = Pattern.compile("(\\+|\\-)(\\d\\d):(\\d)$") + def getZoneId(timeZoneId: String): ZoneId = { - val formattedZoneId = timeZoneId - // To support the (+|-)h:mm format because it was supported before Spark 3.0. - .replaceFirst("(\\+|\\-)(\\d):", "$10$2:") - // To support the (+|-)hh:m format because it was supported before Spark 3.0. - .replaceFirst("(\\+|\\-)(\\d\\d):(\\d)$", "$1$2:0$3") + // To support the (+|-)h:mm format because it was supported before Spark 3.0. + var formattedZoneId = singleHourTz.matcher(timeZoneId).replaceFirst("$10$2:") + // To support the (+|-)hh:m format because it was supported before Spark 3.0. 
+ formattedZoneId = singleMinuteTz.matcher(formattedZoneId).replaceFirst("$1$2:0$3") ZoneId.of(formattedZoneId, ZoneId.SHORT_IDS) } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org