deniskuzZ commented on code in PR #5868:
URL: https://github.com/apache/hive/pull/5868#discussion_r2166148509


##########
ql/src/java/org/apache/hadoop/hive/ql/io/BatchToRowReader.java:
##########
@@ -518,7 +521,8 @@ public static TimestampWritableV2 nextTimestamp(ColumnVector vector,
         result = (TimestampWritableV2) previous;
       }
       TimestampColumnVector tcv = (TimestampColumnVector) vector;
-      result.setInternal(tcv.time[row], tcv.nanos[row]);
+      result.set(Timestamp.ofEpochSecond(Math.floorDiv(tcv.time[row], 1000L), tcv.nanos[row],
+          tcv.isUTC() ? ZoneOffset.UTC : ZoneId.systemDefault()));

Review Comment:
   > The PR approach works with existing way how TimestampTreeReader sets isUTC flag in TimestampColumnVector
   
   I don't think so; TimestampTreeReader.nextVector sets the isUTC flag from the reader context:
   ````
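   // org.apache.orc.impl.TreeReaderFactory.TimestampTreeReader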
   public void nextVector(...) {
     ....
     result.setIsUTC(this.context.getUseUTCTimestamp());
     ....
   }
   ````
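   For context on the Math.floorDiv(tcv.time[row], 1000L) conversion used in this change: time[] holds epoch milliseconds, and plain integer division truncates toward zero, so pre-epoch (negative) values would land on the wrong second. A minimal sketch with plain java.time standing in for Hive's Timestamp (illustration only, not part of the patch):
   ````
   import java.time.Instant;

   public class FloorDivDemo {
     public static void main(String[] args) {
       long millis = -1L;       // 1969-12-31T23:59:59.999Z as epoch millis
       int nanos = 999_000_000; // the nanos[] side already carries the full nano-of-second

       // Math.floorDiv rounds toward negative infinity: -1 ms -> second -1
       System.out.println(Instant.ofEpochSecond(Math.floorDiv(millis, 1000L), nanos));
       // prints 1969-12-31T23:59:59.999Z

       // plain integer division truncates toward zero: -1 ms -> second 0, off by a whole second
       System.out.println(Instant.ofEpochSecond(millis / 1000L, nanos));
       // prints 1970-01-01T00:00:00.999Z
     }
   }
   ````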
   Here is an update: no new classes and no method signature changes (other than changing static methods to private).
   Note: the patch might look big, but that's mostly because it also removes dead code.
   ````
   Subject: [PATCH] refactor
   ---
   Index: ql/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampUtils.java
   ===================================================================
   diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampUtils.java
   --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampUtils.java  (revision fac17feed39473c4219f3b3a2c63aeed4fc92701)
   +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/TimestampUtils.java  (date 1750801652028)
   @@ -18,22 +18,15 @@
    
    package org.apache.hadoop.hive.ql.exec.vector;
    
   -import java.util.concurrent.TimeUnit;
   +import java.time.ZoneId;
   +import java.time.ZoneOffset;
    
    import org.apache.hadoop.hive.common.type.Timestamp;
   -import org.apache.hadoop.hive.serde2.io.DateWritableV2;
    import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable;
    import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
    
    public final class TimestampUtils {
    
   -  static final long MILLISECONDS_PER_SECOND = TimeUnit.SECONDS.toMillis(1);
   -  static final long NANOSECONDS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1);
   -
   -  public static long daysToNanoseconds(long daysSinceEpoch) {
   -    return DateWritableV2.daysToMillis((int) daysSinceEpoch) * NANOSECONDS_PER_MILLISECOND;
   -  }
   -
      public static TimestampWritableV2 timestampColumnVectorWritable(
          TimestampColumnVector timestampColVector, int elementNum,
          TimestampWritableV2 timestampWritable) {
   @@ -63,4 +56,9 @@
        }
        return o.toString();
      }
   +
   +  public static Timestamp timestamp(TimestampColumnVector tcv, int row) {
   +    return Timestamp.ofEpochSecond(Math.floorDiv(tcv.time[row], 1000L), tcv.nanos[row],
   +        tcv.isUTC() ? ZoneOffset.UTC : ZoneId.systemDefault());
   +  }
    }
   Index: ql/src/java/org/apache/hadoop/hive/ql/io/BatchToRowReader.java
   ===================================================================
   diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/BatchToRowReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/BatchToRowReader.java
   --- a/ql/src/java/org/apache/hadoop/hive/ql/io/BatchToRowReader.java (revision fac17feed39473c4219f3b3a2c63aeed4fc92701)
   +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/BatchToRowReader.java (date 1750801652033)
   @@ -44,6 +44,7 @@
    import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
   +import org.apache.hadoop.hive.ql.exec.vector.TimestampUtils;
    import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
   @@ -518,7 +519,7 @@
            result = (TimestampWritableV2) previous;
          }
          TimestampColumnVector tcv = (TimestampColumnVector) vector;
   -      result.setInternal(tcv.time[row], tcv.nanos[row]);
   +      result.set(TimestampUtils.timestamp(tcv, row));
          return result;
        } else {
          return null;
   Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
   ===================================================================
   diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
   --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java     (revision fac17feed39473c4219f3b3a2c63aeed4fc92701)
   +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java     (date 1750840195393)
   @@ -36,6 +36,7 @@
    import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
   +import org.apache.hadoop.hive.ql.exec.vector.TimestampUtils;
    import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.hadoop.hive.serde2.io.ByteWritable;
   @@ -53,19 +54,20 @@
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.orc.TypeDescription;
   -import org.slf4j.Logger;
   -import org.slf4j.LoggerFactory;
   +
   +import static org.apache.orc.impl.TreeReaderFactory.ReaderContext;
    
    public class RecordReaderImpl extends org.apache.orc.impl.RecordReaderImpl
                                  implements RecordReader {
   -  static final Logger LOG = LoggerFactory.getLogger(RecordReaderImpl.class);
      private final VectorizedRowBatch batch;
   +  private final ReaderContext context;
      private int rowInBatch;
      private long baseRow;
    
      protected RecordReaderImpl(ReaderImpl fileReader,
   -    Reader.Options options, final Configuration conf) throws IOException {
   +      Reader.Options options, final Configuration conf) throws IOException {
        super(fileReader, options);
   +    context = (new ReaderContext()).useUTCTimestamp(fileReader.isUTC());
        final boolean useDecimal64ColumnVectors = conf != null && HiveConf.getVar(conf,
          HiveConf.ConfVars.HIVE_VECTORIZED_INPUT_FORMAT_SUPPORTS_ENABLED).equalsIgnoreCase("decimal_64");
        if (useDecimal64ColumnVectors){
   @@ -444,7 +446,7 @@
        }
      }
    
   -  static TimestampWritableV2 nextTimestamp(ColumnVector vector,
   +  private TimestampWritableV2 nextTimestamp(ColumnVector vector,
                                               int row,
                                               Object previous) {
        if (vector.isRepeating) {
   @@ -458,14 +460,15 @@
            result = (TimestampWritableV2) previous;
          }
          TimestampColumnVector tcv = (TimestampColumnVector) vector;
   -      result.setInternal(tcv.time[row], tcv.nanos[row]);
   +      tcv.setIsUTC(context.getUseUTCTimestamp());
   +      result.set(TimestampUtils.timestamp(tcv, row));
          return result;
        } else {
          return null;
        }
      }
    
   -  static OrcStruct nextStruct(ColumnVector vector,
   +  private OrcStruct nextStruct(ColumnVector vector,
                                  int row,
                                  TypeDescription schema,
                                  Object previous) {
   @@ -493,7 +496,7 @@
        }
      }
    
   -  static OrcUnion nextUnion(ColumnVector vector,
   +  private OrcUnion nextUnion(ColumnVector vector,
                                int row,
                                TypeDescription schema,
                                Object previous) {
   @@ -518,7 +521,7 @@
        }
      }
    
   -  static ArrayList<Object> nextList(ColumnVector vector,
   +  private ArrayList<Object> nextList(ColumnVector vector,
                                        int row,
                                        TypeDescription schema,
                                        Object previous) {
   @@ -558,7 +561,7 @@
        }
      }
    
   -  static Map<Object,Object> nextMap(ColumnVector vector,
   +  private Map<Object,Object> nextMap(ColumnVector vector,
                                        int row,
                                        TypeDescription schema,
                                        Object previous) {
   @@ -573,7 +576,7 @@
          TypeDescription valueType = schema.getChildren().get(1);
          LinkedHashMap<Object,Object> result;
          if (previous == null || previous.getClass() != LinkedHashMap.class) {
   -        result = new LinkedHashMap<Object,Object>(length);
   +        result = new LinkedHashMap<>(length);
          } else {
            result = (LinkedHashMap<Object,Object>) previous;
            // I couldn't think of a good way to reuse the keys and value objects
   @@ -590,50 +593,31 @@
        }
      }
    
   -  static Object nextValue(ColumnVector vector,
   +  private Object nextValue(ColumnVector vector,
                              int row,
                              TypeDescription schema,
                              Object previous) {
   -    switch (schema.getCategory()) {
   -      case BOOLEAN:
   -        return nextBoolean(vector, row, previous);
   -      case BYTE:
   -        return nextByte(vector, row, previous);
   -      case SHORT:
   -        return nextShort(vector, row, previous);
   -      case INT:
   -        return nextInt(vector, row, previous);
   -      case LONG:
   -        return nextLong(vector, row, previous);
   -      case FLOAT:
   -        return nextFloat(vector, row, previous);
   -      case DOUBLE:
   -        return nextDouble(vector, row, previous);
   -      case STRING:
   -        return nextString(vector, row, previous);
   -      case CHAR:
   -        return nextChar(vector, row, schema.getMaxLength(), previous);
   -      case VARCHAR:
   -        return nextVarchar(vector, row, schema.getMaxLength(), previous);
   -      case BINARY:
   -        return nextBinary(vector, row, previous);
   -      case DECIMAL:
   -        return nextDecimal(vector, row, previous);
   -      case DATE:
   -        return nextDate(vector, row, previous);
   -      case TIMESTAMP:
   -        return nextTimestamp(vector, row, previous);
   -      case STRUCT:
   -        return nextStruct(vector, row, schema, previous);
   -      case UNION:
   -        return nextUnion(vector, row, schema, previous);
   -      case LIST:
   -        return nextList(vector, row, schema, previous);
   -      case MAP:
   -        return nextMap(vector, row, schema, previous);
   -      default:
   -        throw new IllegalArgumentException("Unknown type " + schema);
   -    }
   +      return switch (schema.getCategory()) {
   +          case BOOLEAN -> nextBoolean(vector, row, previous);
   +          case BYTE -> nextByte(vector, row, previous);
   +          case SHORT -> nextShort(vector, row, previous);
   +          case INT -> nextInt(vector, row, previous);
   +          case LONG -> nextLong(vector, row, previous);
   +          case FLOAT -> nextFloat(vector, row, previous);
   +          case DOUBLE -> nextDouble(vector, row, previous);
   +          case STRING -> nextString(vector, row, previous);
   +          case CHAR -> nextChar(vector, row, schema.getMaxLength(), previous);
   +          case VARCHAR -> nextVarchar(vector, row, schema.getMaxLength(), previous);
   +          case BINARY -> nextBinary(vector, row, previous);
   +          case DECIMAL -> nextDecimal(vector, row, previous);
   +          case DATE -> nextDate(vector, row, previous);
   +          case TIMESTAMP -> nextTimestamp(vector, row, previous);
   +          case STRUCT -> nextStruct(vector, row, schema, previous);
   +          case UNION -> nextUnion(vector, row, schema, previous);
   +          case LIST -> nextList(vector, row, schema, previous);
   +          case MAP -> nextMap(vector, row, schema, previous);
   +          default -> throw new IllegalArgumentException("Unknown type " + schema);
   +      };
      }
    
      /* Routines for copying between VectorizedRowBatches */
   @@ -789,19 +773,11 @@
        castedDestination.noNulls = castedSource.noNulls;
        if (source.isRepeating) {
          castedDestination.isNull[0] = castedSource.isNull[0];
   -      for(int c=0; c > castedSource.fields.length; ++c) {
   -        copyColumn(castedDestination.fields[c], castedSource.fields[c], 0, 1);
   -      }
        } else {
          if (!castedSource.noNulls) {
            for (int r = 0; r < length; ++r) {
            castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r];
            }
   -      } else {
   -        for (int c = 0; c > castedSource.fields.length; ++c) {
   -          copyColumn(castedDestination.fields[c], castedSource.fields[c],
   -              sourceOffset, length);
   -        }
          }
        }
      }
   @@ -829,14 +805,10 @@
              castedDestination.tags[r] = castedSource.tags[sourceOffset + r];
            }
          } else {
   -        for(int r=0; r < length; ++r) {
   +        for (int r=0; r < length; ++r) {
              castedDestination.tags[r] = castedSource.tags[sourceOffset + r];
            }
          }
   -      for(int c=0; c > castedSource.fields.length; ++c) {
   -        copyColumn(castedDestination.fields[c], castedSource.fields[c],
   -            sourceOffset, length);
   -      }
        }
      }
    
   @@ -954,7 +926,6 @@
       * @param destination the batch to copy into
       * @param source the batch to copy from
       * @param sourceStart the row number to start from in the source
   -   * @return the number of rows copied
       */
      void copyIntoBatch(VectorizedRowBatch destination,
                         VectorizedRowBatch source,
   Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
   ===================================================================
   diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
   --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java   (revision fac17feed39473c4219f3b3a2c63aeed4fc92701)
   +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java   (date 1750794090926)
   @@ -83,7 +83,9 @@
        return new RecordReaderImpl(this, options, conf);
      }
    
   -
   +  boolean isUTC() {
   +    return useUTCTimestamp;
   +  }
    
      @Override
      public RecordReader rows(boolean[] include) throws IOException {
   ````
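   For illustration, this is how a caller would end up using the new helper (hypothetical snippet; the names come from the patch above):
   ````
   // assumes the patch above is applied
   TimestampColumnVector tcv = new TimestampColumnVector(1);
   tcv.time[0] = -1L;          // epoch millis
   tcv.nanos[0] = 999_000_000; // nano-of-second
   tcv.setIsUTC(true);         // the tree reader sets this from the reader context
   Timestamp ts = TimestampUtils.timestamp(tcv, 0); // interpreted in UTC
   ````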
   


