This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 5902a23 [SPARK-31076][SQL][FOLLOWUP] Incapsulate date rebasing to `DaysWritable` 5902a23 is described below commit 5902a232c14516f893895be0d673354933d8a5d0 Author: Maxim Gekk <max.g...@gmail.com> AuthorDate: Mon Mar 16 17:06:15 2020 +0800 [SPARK-31076][SQL][FOLLOWUP] Incapsulate date rebasing to `DaysWritable` ### What changes were proposed in this pull request? Move the code related to days rebasing from/to Julian calendar from `HiveInspectors` to new class `DaysWritable`. ### Why are the changes needed? To improve maintainability of the `HiveInspectors` trait which is already pretty complex. ### Does this PR introduce any user-facing change? No ### How was this patch tested? By `HiveOrcHadoopFsRelationSuite`. Closes #27890 from MaxGekk/replace-DateWritable-by-DaysWritable. Authored-by: Maxim Gekk <max.g...@gmail.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit 57854c736c2ca495eb03962f61857e1600864e95) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../org/apache/spark/sql/hive/DaysWritable.scala | 117 +++++++++++++++++++++ .../org/apache/spark/sql/hive/HiveInspectors.scala | 58 +--------- 2 files changed, 120 insertions(+), 55 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala new file mode 100644 index 0000000..53a0deb --- /dev/null +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/DaysWritable.scala @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive + +import java.io.{DataInput, DataOutput, IOException} +import java.sql.Date +import java.time.LocalDate +import java.util.Calendar + +import org.apache.hadoop.hive.serde2.io.DateWritable +import org.apache.hadoop.io.WritableUtils + +import org.apache.spark.sql.catalyst.util.{DateTimeConstants, DateTimeUtils} + +/** + * The class accepts/returns days in Gregorian calendar and rebases them + * via conversion to local date in Julian calendar for dates before 1582-10-15 + * in read/write for backward compatibility with Spark 2.4 and earlier versions. + * + * @param gregorianDays The number of days since the epoch 1970-01-01 in + * Gregorian calendar. + * @param julianDays The number of days since the epoch 1970-01-01 in + * Julian calendar. 
+ */ +private[hive] class DaysWritable( + var gregorianDays: Int, + var julianDays: Int) + extends DateWritable { + + def this(gregorianDays: Int) = + this(gregorianDays, DaysWritable.rebaseGregorianToJulianDays(gregorianDays)) + def this(dateWritable: DateWritable) = { + this( + gregorianDays = dateWritable match { + case daysWritable: DaysWritable => daysWritable.gregorianDays + case dateWritable: DateWritable => + DaysWritable.rebaseJulianToGregorianDays(dateWritable.getDays) + }, + julianDays = dateWritable.getDays) + } + + override def getDays: Int = julianDays + override def get(): Date = new Date(DateWritable.daysToMillis(julianDays)) + + @throws[IOException] + override def write(out: DataOutput): Unit = { + WritableUtils.writeVInt(out, julianDays) + } + + @throws[IOException] + override def readFields(in: DataInput): Unit = { + julianDays = WritableUtils.readVInt(in) + gregorianDays = DaysWritable.rebaseJulianToGregorianDays(julianDays) + } +} + +private[hive] object DaysWritable { + // Rebasing days since the epoch to store the same number of days + // as by Spark 2.4 and earlier versions. Spark 3.0 switched to + // Proleptic Gregorian calendar (see SPARK-26651), and as a consequence of that, + // this affects dates before 1582-10-15. Spark 2.4 and earlier versions use + // Julian calendar for dates before 1582-10-15. So, the same local date may + // be mapped to different number of days since the epoch in different calendars. + // For example: + // Proleptic Gregorian calendar: 1582-01-01 -> -141714 + // Julian calendar: 1582-01-01 -> -141704 + // The code below converts -141714 to -141704. 
+ def rebaseGregorianToJulianDays(daysSinceEpoch: Int): Int = { + if (daysSinceEpoch < DateTimeUtils.GREGORIAN_CUTOVER_DAY) { + val millis = Math.multiplyExact(daysSinceEpoch, DateTimeConstants.MILLIS_PER_DAY) + val utcCal = new Calendar.Builder() + .setCalendarType("gregory") + .setTimeZone(DateTimeUtils.TimeZoneUTC) + .setInstant(millis) + .build() + val localDate = LocalDate.of( + utcCal.get(Calendar.YEAR), + utcCal.get(Calendar.MONTH) + 1, + utcCal.get(Calendar.DAY_OF_MONTH)) + Math.toIntExact(localDate.toEpochDay) + } else { + daysSinceEpoch + } + } + + def rebaseJulianToGregorianDays(daysSinceEpoch: Int): Int = { + if (daysSinceEpoch < JULIAN_CUTOVER_DAY) { + val localDate = LocalDate.ofEpochDay(daysSinceEpoch) + val utcCal = new Calendar.Builder() + .setCalendarType("gregory") + .setTimeZone(DateTimeUtils.TimeZoneUTC) + .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth) + .build() + Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, DateTimeConstants.MILLIS_PER_DAY)) + } else { + daysSinceEpoch + } + } + + final val JULIAN_CUTOVER_DAY = + rebaseGregorianToJulianDays(DateTimeUtils.GREGORIAN_CUTOVER_DAY.toInt) +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index e217c52..e3e9a31 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -18,8 +18,6 @@ package org.apache.spark.sql.hive import java.lang.reflect.{ParameterizedType, Type, WildcardType} -import java.time.LocalDate -import java.util.Calendar import scala.collection.JavaConverters._ @@ -182,33 +180,6 @@ import org.apache.spark.unsafe.types.UTF8String */ private[hive] trait HiveInspectors { - private final val JULIAN_CUTOVER_DAY = - rebaseGregorianToJulianDays(DateTimeUtils.GREGORIAN_CUTOVER_DAY.toInt) - - private def 
rebaseJulianToGregorianDays(daysSinceEpoch: Int): Int = { - val localDate = LocalDate.ofEpochDay(daysSinceEpoch) - val utcCal = new Calendar.Builder() - .setCalendarType("gregory") - .setTimeZone(DateTimeUtils.TimeZoneUTC) - .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth) - .build() - Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, DateTimeConstants.MILLIS_PER_DAY)) - } - - private def rebaseGregorianToJulianDays(daysSinceEpoch: Int): Int = { - val millis = Math.multiplyExact(daysSinceEpoch, DateTimeConstants.MILLIS_PER_DAY) - val utcCal = new Calendar.Builder() - .setCalendarType("gregory") - .setTimeZone(DateTimeUtils.TimeZoneUTC) - .setInstant(millis) - .build() - val localDate = LocalDate.of( - utcCal.get(Calendar.YEAR), - utcCal.get(Calendar.MONTH) + 1, - utcCal.get(Calendar.DAY_OF_MONTH)) - Math.toIntExact(localDate.toEpochDay) - } - def javaTypeToDataType(clz: Type): DataType = clz match { // writable case c: Class[_] if c == classOf[hadoopIo.DoubleWritable] => DoubleType @@ -646,14 +617,7 @@ private[hive] trait HiveInspectors { case x: DateObjectInspector if x.preferWritable() => data: Any => { if (data != null) { - // Rebasing written days via conversion to local dates. - // See the comment for `getDateWritable()`. - val daysSinceEpoch = x.getPrimitiveWritableObject(data).getDays - if (daysSinceEpoch < JULIAN_CUTOVER_DAY) { - rebaseJulianToGregorianDays(daysSinceEpoch) - } else { - daysSinceEpoch - } + new DaysWritable(x.getPrimitiveWritableObject(data)).gregorianDays } else { null } @@ -1045,27 +1009,11 @@ private[hive] trait HiveInspectors { new hadoopIo.BytesWritable(value.asInstanceOf[Array[Byte]]) } - private def getDateWritable(value: Any): hiveIo.DateWritable = + private def getDateWritable(value: Any): DaysWritable = if (value == null) { null } else { - // Rebasing days since the epoch to store the same number of days - // as by Spark 2.4 and earlier versions. 
Spark 3.0 switched to - // Proleptic Gregorian calendar (see SPARK-26651), and as a consequence of that, - // this affects dates before 1582-10-15. Spark 2.4 and earlier versions use - // Julian calendar for dates before 1582-10-15. So, the same local date may - // be mapped to different number of days since the epoch in different calendars. - // For example: - // Proleptic Gregorian calendar: 1582-01-01 -> -141714 - // Julian calendar: 1582-01-01 -> -141704 - // The code below converts -141714 to -141704. - val daysSinceEpoch = value.asInstanceOf[Int] - val rebasedDays = if (daysSinceEpoch < DateTimeUtils.GREGORIAN_CUTOVER_DAY) { - rebaseGregorianToJulianDays(daysSinceEpoch) - } else { - daysSinceEpoch - } - new hiveIo.DateWritable(rebasedDays) + new DaysWritable(value.asInstanceOf[Int]) } private def getTimestampWritable(value: Any): hiveIo.TimestampWritable = --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org