deniskuzZ commented on code in PR #5868:
URL: https://github.com/apache/hive/pull/5868#discussion_r2164799550


##########
ql/src/java/org/apache/hadoop/hive/ql/io/BatchToRowReader.java:
##########
@@ -518,7 +521,8 @@ public static TimestampWritableV2 nextTimestamp(ColumnVector vector,
         result = (TimestampWritableV2) previous;
       }
       TimestampColumnVector tcv = (TimestampColumnVector) vector;
-      result.setInternal(tcv.time[row], tcv.nanos[row]);
+      result.set(Timestamp.ofEpochSecond(Math.floorDiv(tcv.time[row], 1000L), tcv.nanos[row],
+          tcv.isUTC() ? ZoneOffset.UTC : ZoneId.systemDefault()));

Review Comment:
   Attached is the diff for `RecordReaderImpl` with extra cleanup. Try applying it to the original version of the class, without your changes:
   ````
   Subject: [PATCH] refactor
   ---
   Index: ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
   IDEA additional info:
   Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
   <+>UTF-8
   ===================================================================
   diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java
   --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java     (revision fac17feed39473c4219f3b3a2c63aeed4fc92701)
   +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java     (date 1750795481725)
   @@ -18,12 +18,15 @@
    package org.apache.hadoop.hive.ql.io.orc;
    
    import java.io.IOException;
   +import java.time.ZoneId;
   +import java.time.ZoneOffset;
    import java.util.ArrayList;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;
    
    import org.apache.hadoop.conf.Configuration;
   +import org.apache.hadoop.hive.common.type.Timestamp;
    import org.apache.hadoop.hive.conf.HiveConf;
    import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
   @@ -53,15 +56,13 @@
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.orc.TypeDescription;
   -import org.slf4j.Logger;
   -import org.slf4j.LoggerFactory;
    
    public class RecordReaderImpl extends org.apache.orc.impl.RecordReaderImpl
                                  implements RecordReader {
   -  static final Logger LOG = LoggerFactory.getLogger(RecordReaderImpl.class);
      private final VectorizedRowBatch batch;
      private int rowInBatch;
      private long baseRow;
   +  private ReaderContext context;
    
      protected RecordReaderImpl(ReaderImpl fileReader,
        Reader.Options options, final Configuration conf) throws IOException {
   @@ -73,10 +74,23 @@
        } else {
          batch = this.schema.createRowBatch();
        }
   +    context = new ReaderContext(fileReader.isUTC());
        rowInBatch = 0;
      }
    
   -  /**
   +  private static class ReaderContext {
   +    private boolean useUTCTimestamp;
   +
   +    private ReaderContext(boolean useUTCTimestamp) {
   +      this.useUTCTimestamp = useUTCTimestamp;
   +    }
   +
   +    public boolean getUseUTCTimestamp() {
   +      return this.useUTCTimestamp;
   +    }
   +  }
   +
   +  /**
       * If the current batch is empty, get a new one.
       * @return true if we have rows available.
       * @throws IOException
   @@ -135,10 +149,10 @@
          }
          for(int i=0; i < numberOfChildren; ++i) {
            result.setFieldValue(i, nextValue(batch.cols[i], rowInBatch,
   -            children.get(i), result.getFieldValue(i)));
   +            children.get(i), context, result.getFieldValue(i)));
          }
        } else {
   -      previous = nextValue(batch.cols[0], rowInBatch, schema, previous);
   +      previous = nextValue(batch.cols[0], rowInBatch, schema, context, previous);
        }
        rowInBatch += 1;
        return previous;
   @@ -446,6 +460,7 @@
    
      static TimestampWritableV2 nextTimestamp(ColumnVector vector,
                                               int row,
   +                                           ReaderContext context,
                                               Object previous) {
        if (vector.isRepeating) {
          row = 0;
   @@ -458,7 +473,8 @@
            result = (TimestampWritableV2) previous;
          }
          TimestampColumnVector tcv = (TimestampColumnVector) vector;
   -      result.setInternal(tcv.time[row], tcv.nanos[row]);
   +      result.set(Timestamp.ofEpochSecond(Math.floorDiv(tcv.time[row], 1000L), tcv.nanos[row],
   +          context.getUseUTCTimestamp() ? ZoneOffset.UTC : ZoneId.systemDefault()));
          return result;
        } else {
          return null;
   @@ -468,6 +484,7 @@
      static OrcStruct nextStruct(ColumnVector vector,
                                  int row,
                                  TypeDescription schema,
   +                              ReaderContext context,
                                  Object previous) {
        if (vector.isRepeating) {
          row = 0;
   @@ -485,7 +502,7 @@
          StructColumnVector struct = (StructColumnVector) vector;
          for(int f=0; f < numChildren; ++f) {
            result.setFieldValue(f, nextValue(struct.fields[f], row,
   -            childrenTypes.get(f), result.getFieldValue(f)));
   +            childrenTypes.get(f), context, result.getFieldValue(f)));
          }
          return result;
        } else {
   @@ -496,6 +513,7 @@
      static OrcUnion nextUnion(ColumnVector vector,
                                int row,
                                TypeDescription schema,
   +                            ReaderContext context,
                                Object previous) {
        if (vector.isRepeating) {
          row = 0;
   @@ -511,7 +529,7 @@
          UnionColumnVector union = (UnionColumnVector) vector;
          byte tag = (byte) union.tags[row];
       result.set(tag, nextValue(union.fields[tag], row, childrenTypes.get(tag),
   -          result.getObject()));
   +          context, result.getObject()));
          return result;
        } else {
          return null;
   @@ -521,6 +539,7 @@
      static ArrayList<Object> nextList(ColumnVector vector,
                                        int row,
                                        TypeDescription schema,
   +                                    ReaderContext context,
                                        Object previous) {
        if (vector.isRepeating) {
          row = 0;
   @@ -541,14 +560,14 @@
          TypeDescription childType = schema.getChildren().get(0);
          while (idx < length && idx < oldLength) {
            result.set(idx, nextValue(list.child, offset + idx, childType,
   -            result.get(idx)));
   +            context, result.get(idx)));
            idx += 1;
          }
          if (length < oldLength) {
            result.subList(length,result.size()).clear();
          } else if (oldLength < length) {
            while (idx < length) {
   -          result.add(nextValue(list.child, offset + idx, childType, null));
   +          result.add(nextValue(list.child, offset + idx, childType, context, null));
              idx += 1;
            }
          }
   @@ -561,6 +580,7 @@
      static Map<Object,Object> nextMap(ColumnVector vector,
                                        int row,
                                        TypeDescription schema,
   +                                    ReaderContext context,
                                        Object previous) {
        if (vector.isRepeating) {
          row = 0;
   @@ -573,7 +593,7 @@
          TypeDescription valueType = schema.getChildren().get(1);
          LinkedHashMap<Object,Object> result;
          if (previous == null || previous.getClass() != LinkedHashMap.class) {
   -        result = new LinkedHashMap<Object,Object>(length);
   +        result = new LinkedHashMap<>(length);
          } else {
            result = (LinkedHashMap<Object,Object>) previous;
        // I couldn't think of a good way to reuse the keys and value objects
   @@ -581,8 +601,8 @@
            result.clear();
          }
          for(int e=0; e < length; ++e) {
   -        result.put(nextValue(map.keys, e + offset, keyType, null),
   -                   nextValue(map.values, e + offset, valueType, null));
   +        result.put(nextValue(map.keys, e + offset, keyType, context, null),
   +                   nextValue(map.values, e + offset, valueType, context, null));
          }
          return result;
        } else {
   @@ -593,47 +613,29 @@
      static Object nextValue(ColumnVector vector,
                              int row,
                              TypeDescription schema,
   +                          ReaderContext context,
                              Object previous) {
   -    switch (schema.getCategory()) {
   -      case BOOLEAN:
   -        return nextBoolean(vector, row, previous);
   -      case BYTE:
   -        return nextByte(vector, row, previous);
   -      case SHORT:
   -        return nextShort(vector, row, previous);
   -      case INT:
   -        return nextInt(vector, row, previous);
   -      case LONG:
   -        return nextLong(vector, row, previous);
   -      case FLOAT:
   -        return nextFloat(vector, row, previous);
   -      case DOUBLE:
   -        return nextDouble(vector, row, previous);
   -      case STRING:
   -        return nextString(vector, row, previous);
   -      case CHAR:
   -        return nextChar(vector, row, schema.getMaxLength(), previous);
   -      case VARCHAR:
   -        return nextVarchar(vector, row, schema.getMaxLength(), previous);
   -      case BINARY:
   -        return nextBinary(vector, row, previous);
   -      case DECIMAL:
   -        return nextDecimal(vector, row, previous);
   -      case DATE:
   -        return nextDate(vector, row, previous);
   -      case TIMESTAMP:
   -        return nextTimestamp(vector, row, previous);
   -      case STRUCT:
   -        return nextStruct(vector, row, schema, previous);
   -      case UNION:
   -        return nextUnion(vector, row, schema, previous);
   -      case LIST:
   -        return nextList(vector, row, schema, previous);
   -      case MAP:
   -        return nextMap(vector, row, schema, previous);
   -      default:
   -        throw new IllegalArgumentException("Unknown type " + schema);
   -    }
   +    return switch (schema.getCategory()) {
   +      case BOOLEAN -> nextBoolean(vector, row, previous);
   +      case BYTE -> nextByte(vector, row, previous);
   +      case SHORT -> nextShort(vector, row, previous);
   +      case INT -> nextInt(vector, row, previous);
   +      case LONG -> nextLong(vector, row, previous);
   +      case FLOAT -> nextFloat(vector, row, previous);
   +      case DOUBLE -> nextDouble(vector, row, previous);
   +      case STRING -> nextString(vector, row, previous);
   +      case CHAR -> nextChar(vector, row, schema.getMaxLength(), previous);
   +      case VARCHAR -> nextVarchar(vector, row, schema.getMaxLength(), previous);
   +      case BINARY -> nextBinary(vector, row, previous);
   +      case DECIMAL -> nextDecimal(vector, row, previous);
   +      case DATE -> nextDate(vector, row, previous);
   +      case TIMESTAMP -> nextTimestamp(vector, row, context, previous);
   +      case STRUCT -> nextStruct(vector, row, schema, context, previous);
   +      case UNION -> nextUnion(vector, row, schema, context, previous);
   +      case LIST -> nextList(vector, row, schema, context, previous);
   +      case MAP -> nextMap(vector, row, schema, context, previous);
   +      default -> throw new IllegalArgumentException("Unknown type " + schema);
   +    };
      }
    
      /* Routines for copying between VectorizedRowBatches */
   @@ -789,19 +791,11 @@
        castedDestination.noNulls = castedSource.noNulls;
        if (source.isRepeating) {
          castedDestination.isNull[0] = castedSource.isNull[0];
   -      for(int c=0; c > castedSource.fields.length; ++c) {
   -        copyColumn(castedDestination.fields[c], castedSource.fields[c], 0, 1);
   -      }
        } else {
          if (!castedSource.noNulls) {
            for (int r = 0; r < length; ++r) {
          castedDestination.isNull[r] = castedSource.isNull[sourceOffset + r];
            }
   -      } else {
   -        for (int c = 0; c > castedSource.fields.length; ++c) {
   -          copyColumn(castedDestination.fields[c], castedSource.fields[c],
   -              sourceOffset, length);
   -        }
          }
        }
      }
   @@ -829,14 +823,10 @@
              castedDestination.tags[r] = castedSource.tags[sourceOffset + r];
            }
          } else {
   -        for(int r=0; r < length; ++r) {
   +        for (int r=0; r < length; ++r) {
              castedDestination.tags[r] = castedSource.tags[sourceOffset + r];
            }
          }
   -      for(int c=0; c > castedSource.fields.length; ++c) {
   -        copyColumn(castedDestination.fields[c], castedSource.fields[c],
   -            sourceOffset, length);
   -      }
        }
      }
    
   @@ -954,7 +944,6 @@
       * @param destination the batch to copy into
       * @param source the batch to copy from
       * @param sourceStart the row number to start from in the source
   -   * @return the number of rows copied
       */
      void copyIntoBatch(VectorizedRowBatch destination,
                         VectorizedRowBatch source,
   
   ````
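
   FWIW, on why the conversion uses `Math.floorDiv` rather than plain `/`: `TimestampColumnVector.time` holds epoch milliseconds, which are negative for pre-epoch values, and Java's integer division truncates toward zero while `floorDiv` rounds toward negative infinity. A minimal standalone sketch with plain `java.time` (class name and sample values are just for illustration, not taken from the patch):
   ```java
   import java.time.Instant;

   public class FloorDivDemo {
     public static void main(String[] args) {
       long millis = -1500L;     // 1.5 seconds before the epoch, as stored in TimestampColumnVector.time
       int nanos = 500_000_000;  // nanos-of-second, always non-negative in TimestampColumnVector.nanos

       long truncated = millis / 1000L;              // -1: truncates toward zero, the wrong epoch second
       long floored = Math.floorDiv(millis, 1000L);  // -2: floors toward negative infinity, the right one

       // Pairing the floored epoch second with the non-negative nanos-of-second
       // reconstructs the original instant; truncation lands one second late.
       System.out.println(Instant.ofEpochSecond(floored, nanos));   // 1969-12-31T23:59:58.500Z
       System.out.println(Instant.ofEpochSecond(truncated, nanos)); // 1969-12-31T23:59:59.500Z
     }
   }
   ```
   The zone argument (`ZoneOffset.UTC` vs `ZoneId.systemDefault()`) then only decides which wall clock that instant is rendered in, matching whether the file was written with UTC timestamp semantics.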


