Repository: hive Updated Branches: refs/heads/master 05ddd21c7 -> e133ec5c2
HIVE-19985: ACID: Skip decoding the ROW__ID sections for read-only queries (Eugene Koifman, reviewed by Gopal V) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e133ec5c Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e133ec5c Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e133ec5c Branch: refs/heads/master Commit: e133ec5c28e0ed5082773f00b5bf4d55d2697db9 Parents: 05ddd21 Author: Eugene Koifman <ekoif...@apache.org> Authored: Sat Sep 29 09:56:44 2018 -0700 Committer: Eugene Koifman <ekoif...@apache.org> Committed: Sat Sep 29 09:56:44 2018 -0700 ---------------------------------------------------------------------- .../org/apache/hadoop/hive/conf/HiveConf.java | 3 + .../test/resources/testconfiguration.properties | 3 +- .../hive/llap/io/api/impl/LlapRecordReader.java | 61 +++++++-- .../org/apache/hadoop/hive/ql/io/AcidUtils.java | 47 +++++-- .../hadoop/hive/ql/io/orc/OrcRecordUpdater.java | 2 +- .../io/orc/VectorizedOrcAcidRowBatchReader.java | 129 ++++++++++++++----- .../apache/hadoop/hive/ql/TestTxnCommands3.java | 63 +++++++++ .../clientpositive/acid_meta_columns_decode.q | 24 ++++ .../llap/acid_meta_columns_decode.q.out | 128 ++++++++++++++++++ 9 files changed, 407 insertions(+), 53 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java ---------------------------------------------------------------------- diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 0cecae5..d1e6631 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2698,6 +2698,9 @@ public class HiveConf extends Configuration { MERGE_CARDINALITY_VIOLATION_CHECK("hive.merge.cardinality.check", true, "Set to true to ensure that each SQL Merge statement ensures that for each row in the target\n" + "table there is at most 1 matching row in the source table per SQL Specification."), + OPTIMIZE_ACID_META_COLUMNS("hive.optimize.acid.meta.columns", true, + "If true, don't decode Acid metadata columns from storage unless" + + " they are needed."), // For Arrow SerDe HIVE_ARROW_ROOT_ALLOCATOR_LIMIT("hive.arrow.root.allocator.limit", Long.MAX_VALUE, http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index def3561..fdd8ecc 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -426,8 +426,9 @@ minillaplocal.query.files=\ dec_str.q,\ dp_counter_non_mm.q,\ dp_counter_mm.q,\ - acid_no_buckets.q, \ acid_globallimit.q,\ + acid_meta_columns_decode.q,\ + acid_no_buckets.q, \ acid_vectorization_missing_cols.q,\ acid_vectorization_original.q,\ alter_merge_stats_orc.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java ---------------------------------------------------------------------- diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java 
b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java index 3455d16..27a5b0f 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java @@ -160,7 +160,6 @@ class LlapRecordReader TypeDescription schema = OrcInputFormat.getDesiredRowTypeDescr( job, isAcidScan, Integer.MAX_VALUE); - this.includes = new IncludesImpl(tableIncludedCols, isAcidScan, rbCtx, schema, job); int queueLimitBase = getQueueVar(ConfVars.LLAP_IO_VRB_QUEUE_LIMIT_BASE, job, daemonConf); int queueLimitMin = getQueueVar(ConfVars.LLAP_IO_VRB_QUEUE_LIMIT_MIN, job, daemonConf); @@ -184,6 +183,8 @@ class LlapRecordReader this.acidReader = new VectorizedOrcAcidRowBatchReader( (OrcSplit) split, jobConf, Reporter.NULL, null, rbCtx, true); } + this.includes = new IncludesImpl(tableIncludedCols, isAcidScan, rbCtx, + schema, job, isAcidScan && acidReader.includeAcidColumns()); // Create the consumer of encoded data; it will coordinate decoding to CVBs. feedback = rp = cvp.createReadPipeline(this, split, includes, sarg, counters, includes, @@ -341,19 +342,27 @@ class LlapRecordReader return false; } if (isAcidScan) { - vrb.selectedInUse = true; + vrb.selectedInUse = true;//why? if (isVectorized) { // TODO: relying everywhere on the magical constants and columns being together means ACID // columns are going to be super hard to change in a backward compat manner. I can // foresee someone cursing while refactoring all the magic for prefix schema changes. + /** + * Acid meta cols are always either all included or all excluded; the + * width of 'cvb' changes accordingly, so 'acidColCount' and + * 'ixInVrb' need to be adjusted. See {@link IncludesImpl} comments. + */ // Exclude the row column. - int acidColCount = OrcInputFormat.getRootColumn(false) - 1; + int acidColCount = acidReader.includeAcidColumns() ? + OrcInputFormat.getRootColumn(false) - 1 : 0; VectorizedRowBatch inputVrb = new VectorizedRowBatch( - acidColCount + 1 + vrb.getDataColumnCount() ); + //the +1 is for the OrcRecordUpdater.ROW struct + acidColCount + 1 + vrb.getDataColumnCount()); // By assumption, ACID columns are currently always in the beginning of the arrays. System.arraycopy(cvb.cols, 0, inputVrb.cols, 0, acidColCount); for (int ixInReadSet = acidColCount; ixInReadSet < cvb.cols.length; ++ixInReadSet) { - int ixInVrb = includes.getPhysicalColumnIds().get(ixInReadSet); + int ixInVrb = includes.getPhysicalColumnIds().get(ixInReadSet) - + (acidReader.includeAcidColumns() ? 0 : OrcRecordUpdater.ROW); // TODO: should we create the batch from vrbctx, and reuse the vectors, like below? Future work. inputVrb.cols[ixInVrb] = cvb.cols[ixInReadSet]; } @@ -376,7 +385,7 @@ class LlapRecordReader int ixInVrb = includes.getPhysicalColumnIds().get(ixInReadSet); cvb.swapColumnVector(ixInReadSet, vrb.cols, ixInVrb); } - vrb.selectedInUse = false; + vrb.selectedInUse = false;//why? vrb.size = cvb.size; } @@ -564,18 +573,36 @@ class LlapRecordReader /** This class encapsulates include-related logic for LLAP readers. It is not actually specific * to LLAP IO but in LLAP IO in particular, I want to encapsulate all this mess for now until * we have smth better like Schema Evolution v2. This can also hypothetically encapsulate * field pruning inside structs and stuff like that.
+ * + * There is a split-brain issue between {@link SchemaEvolution} used in + * the non-LLAP path and this class. The file schema for acid tables looks + * like <op, owid, writerId, rowid, cwid, <f1, ... fn>> and + * {@link SchemaEvolution#getFileIncluded()} respects that. So if fn=2, + * the type IDs are 0..8 and the fileIncluded[] has 9 bits that indicate + * what is read. So in particular, {@link org.apache.hadoop.hive.ql.io.orc.RecordReader} + * produces ColumnVectorS that are NULL in every row for each i where + * fileIncluded[i]==false. The fields corresponding to structs are always + * included if any child of the struct has to be included. + * + * LLAP only produces ColumnVectorS if they are needed, so the width of + * ColumnVectorBatch varies depending on what was projected. + * + * See also {@link VectorizedOrcAcidRowBatchReader#includeAcidColumns()} and + * {@link #next(NullWritable, VectorizedRowBatch)}*/ private static class IncludesImpl implements SchemaEvolutionFactory, Includes { private List<Integer> readerLogicalColumnIds; private List<Integer> filePhysicalColumnIds; private Integer acidStructColumnId = null; + private final boolean includeAcidColumns; // For current schema evolution. private TypeDescription readerSchema; private JobConf jobConf; public IncludesImpl(List<Integer> tableIncludedCols, boolean isAcidScan, - VectorizedRowBatchCtx rbCtx, TypeDescription readerSchema, JobConf jobConf) { + VectorizedRowBatchCtx rbCtx, TypeDescription readerSchema, + JobConf jobConf, boolean includeAcidColumns) { // Note: columnIds below makes additional changes for ACID. Don't use this var directly. this.readerSchema = readerSchema; this.jobConf = jobConf; @@ -604,14 +631,29 @@ class LlapRecordReader // We don't want to include the root struct in ACID case; it would cause the whole // struct to get read without projection. if (acidStructColumnId == i) continue; + if(!includeAcidColumns) { + /** + * if not including acid columns, we still want to number the + * physical columns as if acid columns are included because + * {@link #generateFileIncludes(TypeDescription)} takes the file + * schema as input + * (e.g. <op, owid, writerId, rowid, cwid, <f1, ... fn>>) + */ + continue; + } filePhysicalColumnIds.add(i); } for (int tableColumnId : readerLogicalColumnIds) { + //but make sure to generate correct ids following the type tree's + //in-order walk filePhysicalColumnIds.add(rootCol + tableColumnId); } + /*note: if filePhysicalColumnIds includes acid column ids, we end up + decoding those vectors*/ } this.filePhysicalColumnIds = filePhysicalColumnIds; + this.includeAcidColumns = includeAcidColumns; } @Override @@ -628,7 +670,8 @@ class LlapRecordReader // TODO: will this work correctly with ACID?
boolean[] readerIncludes = OrcInputFormat.genIncludedColumns( readerSchema, readerLogicalColumnIds); - Reader.Options options = new Reader.Options(jobConf).include(readerIncludes); + Reader.Options options = new Reader.Options(jobConf) + .include(readerIncludes).includeAcidColumns(includeAcidColumns); return new SchemaEvolution(fileSchema, readerSchema, options); } http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java index 4d71eb4..71e5131 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java @@ -918,6 +918,11 @@ public class AcidUtils { return results.toArray(new Path[results.size()]); } + /** + * This will look at the footer of one of the files in the delta dir to see + * if the file is in Acid format, i.e. has acid metadata columns. The + * assumption is that for any dir, either all files are acid or none are. + */ public static ParsedDelta parsedDelta(Path deltaDir, FileSystem fs) throws IOException { String deltaDirName = deltaDir.getName(); if (deltaDirName.startsWith(DELETE_DELTA_PREFIX)) { @@ -941,24 +946,42 @@ public class AcidUtils { if (filename.startsWith(deltaPrefix)) { //small optimization - delete delta can't be in raw format boolean isRawFormat = !isDeleteDelta && MetaDataFile.isRawFormat(deltaDir, fs); - String rest = filename.substring(deltaPrefix.length()); - int split = rest.indexOf('_'); - int split2 = rest.indexOf('_', split + 1);//may be -1 if no statementId - long min = Long.parseLong(rest.substring(0, split)); - long max = split2 == -1 ? - Long.parseLong(rest.substring(split + 1)) : - Long.parseLong(rest.substring(split + 1, split2)); - if(split2 == -1) { - return new ParsedDelta(min, max, null, isDeleteDelta, isRawFormat); - } - int statementId = Integer.parseInt(rest.substring(split2 + 1)); - return new ParsedDelta(min, max, null, statementId, isDeleteDelta, isRawFormat); + return parsedDelta(deltaDir, isRawFormat); } throw new IllegalArgumentException(deltaDir + " does not start with " + deltaPrefix); } /** + * This method just parses the file name. It relies on the caller to figure + * out whether the file is in Acid format (i.e. has acid metadata columns); + * see {@link #parsedDelta(Path, FileSystem)}. + */ + public static ParsedDelta parsedDelta(Path deltaDir, boolean isRawFormat) { + String filename = deltaDir.getName(); + boolean isDeleteDelta = filename.startsWith(DELETE_DELTA_PREFIX); + //make sure it's false for a delete delta no matter what was passed in - + //raw format doesn't apply to delete deltas + isRawFormat = isDeleteDelta ? false : isRawFormat; + String rest = filename.substring((isDeleteDelta ? + DELETE_DELTA_PREFIX : DELTA_PREFIX).length()); + int split = rest.indexOf('_'); + //may be -1 if no statementId + int split2 = rest.indexOf('_', split + 1); + long min = Long.parseLong(rest.substring(0, split)); + long max = split2 == -1 ? + Long.parseLong(rest.substring(split + 1)) : + Long.parseLong(rest.substring(split + 1, split2)); + if(split2 == -1) { + return new ParsedDelta(min, max, null, isDeleteDelta, isRawFormat); + } + int statementId = Integer.parseInt(rest.substring(split2 + 1)); + return new ParsedDelta(min, max, null, statementId, isDeleteDelta, + isRawFormat); + + } + + /** * Is the given directory in ACID format?
* @param directory the partition directory to check * @param conf the query configuration http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java index 98f5df1..4ebd69e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java @@ -82,7 +82,7 @@ public class OrcRecordUpdater implements RecordUpdater { final static int BUCKET = 2; final static int ROW_ID = 3; final static int CURRENT_WRITEID = 4; - final static int ROW = 5; + public static final int ROW = 5; /** * total number of fields (above) */ http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java index f16f9b4..1509bba 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java @@ -18,14 +18,7 @@ package org.apache.hadoop.hive.ql.io.orc; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.BitSet; -import java.util.List; -import java.util.Map.Entry; -import java.util.TreeMap; - +import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -58,12 +51,17 @@ import org.apache.orc.IntegerColumnStatistics; import org.apache.orc.OrcConf; import org.apache.orc.StripeInformation; import org.apache.orc.StripeStatistics; -import org.apache.orc.impl.AcidStats; import org.apache.orc.impl.OrcAcidUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.annotations.VisibleForTesting; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.List; +import java.util.Map.Entry; +import java.util.TreeMap; /** * A fast vectorized batch reader class for ACID. Insert events are read directly * from the base files/insert_only deltas in vectorized row batches. The deleted @@ -83,6 +81,10 @@ public class VectorizedOrcAcidRowBatchReader protected float progress = 0.0f; protected Object[] partitionValues; private boolean addPartitionCols = true; + /** + * true means there is no OrcRecordUpdater.ROW column + * (i.e. the struct wrapping user columns) in {@link #vectorizedRowBatchBase}. 
+ */ private final boolean isFlatPayload; private final ValidWriteIdList validWriteIdList; private final DeleteEventRegistry deleteEventRegistry; @@ -97,6 +99,12 @@ */ private final boolean rowIdProjected; /** + * if false, we don't need any acid metadata columns from the file because we + * know all data in the split is valid (wrt visible writeIDs/delete events) + * and ROW_ID is not needed higher up in the operator pipeline + */ + private final boolean includeAcidColumns; + /** * partition/table root */ private final Path rootPath; @@ -105,7 +113,8 @@ */ private final OrcSplit.OffsetAndBucketProperty syntheticProps; /** - * To have access to {@link RecordReader#getRowNumber()} in the underlying file + * To have access to {@link RecordReader#getRowNumber()} in the underlying + * file, which we need to generate synthetic ROW_IDs for original files */ private RecordReader innerReader; /** @@ -118,7 +127,7 @@ */ private SearchArgument deleteEventSarg = null; - + //OrcInputFormat c'tor VectorizedOrcAcidRowBatchReader(OrcSplit inputSplit, JobConf conf, Reporter reporter) throws IOException { this(inputSplit, conf,reporter, null); @@ -249,6 +258,47 @@ rowIdProjected = areRowIdsProjected(rbCtx); rootPath = orcSplit.getRootDir(); syntheticProps = orcSplit.getSyntheticAcidProps(); + + /** + * This could be optimized by moving the dir type/write id based checks + * to split generation (i.e. per file, not per split), while the + * DeleteEventRegistry is still checked here since some splits from the + * same file may have relevant deletes and others may not. + */ + if(conf.getBoolean(ConfVars.OPTIMIZE_ACID_META_COLUMNS.varname, true)) { + /*figure out if we can skip reading acid metadata columns: + * isOriginal - no acid meta columns present - nothing to skip + * there are no relevant delete events && ROW__ID is not needed higher up + * (e.g. this is not a delete statement)*/ + if (!isOriginal && deleteEventRegistry.isEmpty() && !rowIdProjected) { + Path parent = orcSplit.getPath().getParent(); + while (parent != null && !rootPath.equals(parent)) { + if (parent.getName().startsWith(AcidUtils.BASE_PREFIX)) { + /** + * The assumption here is that any base_x is filtered out by + * {@link AcidUtils#getAcidState(Path, Configuration, ValidWriteIdList)} + * so if we see it here it's valid. + * {@link AcidUtils#isValidBase(long, ValidWriteIdList, Path, FileSystem)} + * can check but it makes a {@link FileSystem} call.
+ */ + readerOptions.includeAcidColumns(false); + break; + } else { + AcidUtils.ParsedDelta pd = + AcidUtils.parsedDelta(parent, isOriginal); + if (validWriteIdList.isWriteIdRangeValid(pd.getMinWriteId(), + pd.getMaxWriteId()) == ValidWriteIdList.RangeResponse.ALL) { + //all write IDs in range are committed (and visible in current + // snapshot) + readerOptions.includeAcidColumns(false); + break; + } + } + parent = parent.getParent(); + } + } + } + includeAcidColumns = readerOptions.getIncludeAcidColumns();//default is true } /** @@ -295,6 +345,10 @@ } deleteEventReaderOptions.searchArgument(null, null); } + + public boolean includeAcidColumns() { + return this.includeAcidColumns; + } public void setBaseAndInnerReader( final org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> baseReader) { @@ -661,7 +715,16 @@ } catch (Exception e) { throw new IOException("error iterating", e); } - + if(!includeAcidColumns) { + //if here, we don't need to filter anything wrt acid metadata columns + //in fact, they are not even read from the file/llap + value.size = vectorizedRowBatchBase.size; + value.selected = vectorizedRowBatchBase.selected; + value.selectedInUse = vectorizedRowBatchBase.selectedInUse; + copyFromBase(value); + progress = baseReader.getProgress(); + return true; + } // Once we have read the VectorizedRowBatchBase from the file, there are two kinds of cases // for which we might have to discard rows from the batch: // Case 1- when the row is created by a transaction that is not valid, or @@ -717,22 +780,7 @@ /* Just copy the payload. {@link recordIdColumnVector} has already been populated */ System.arraycopy(vectorizedRowBatchBase.cols, 0, value.cols, 0, value.getDataColumnCount()); } else { - int payloadCol = OrcRecordUpdater.ROW; - if (isFlatPayload) { - // Ignore the struct column and just copy all the following data columns. - System.arraycopy(vectorizedRowBatchBase.cols, payloadCol + 1, value.cols, 0, - vectorizedRowBatchBase.cols.length - payloadCol - 1); - } else { - StructColumnVector payloadStruct = - (StructColumnVector) vectorizedRowBatchBase.cols[payloadCol]; - // Transfer columnVector objects from base batch to outgoing batch. - System.arraycopy(payloadStruct.fields, 0, value.cols, 0, value.getDataColumnCount()); - } - if (rowIdProjected) { - recordIdColumnVector.fields[0] = vectorizedRowBatchBase.cols[OrcRecordUpdater.ORIGINAL_WRITEID]; - recordIdColumnVector.fields[1] = vectorizedRowBatchBase.cols[OrcRecordUpdater.BUCKET]; - recordIdColumnVector.fields[2] = vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW_ID]; - } + copyFromBase(value); } if (rowIdProjected) { int ix = rbCtx.findVirtualColumnNum(VirtualColumn.ROWID); @@ -741,7 +789,28 @@ progress = baseReader.getProgress(); return true; } - + //copy the 'data' cols into 'value' as individual ColumnVectorS, then combine + //the ColumnVectorS for the acid meta cols into a single ColumnVector + //representing RecordIdentifier and (optionally) set it in 'value' + private void copyFromBase(VectorizedRowBatch value) { + assert !isOriginal; + if (isFlatPayload) { + int payloadCol = includeAcidColumns ? OrcRecordUpdater.ROW : 0; + // Ignore the struct column and just copy all the following data columns.
+ System.arraycopy(vectorizedRowBatchBase.cols, payloadCol + 1, value.cols, 0, + vectorizedRowBatchBase.cols.length - payloadCol - 1); + } else { + StructColumnVector payloadStruct = + (StructColumnVector) vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW]; + // Transfer columnVector objects from base batch to outgoing batch. + System.arraycopy(payloadStruct.fields, 0, value.cols, 0, value.getDataColumnCount()); + } + if (rowIdProjected) { + recordIdColumnVector.fields[0] = vectorizedRowBatchBase.cols[OrcRecordUpdater.ORIGINAL_WRITEID]; + recordIdColumnVector.fields[1] = vectorizedRowBatchBase.cols[OrcRecordUpdater.BUCKET]; + recordIdColumnVector.fields[2] = vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW_ID]; + } + } private ColumnVector[] handleOriginalFile( BitSet selectedBitSet, ColumnVector[] innerRecordIdColumnVector) throws IOException { /* http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands3.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands3.java b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands3.java index 9c3cb20..a25406d 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands3.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands3.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.hadoop.hive.ql; import org.apache.hadoop.hive.conf.HiveConf; @@ -159,4 +176,50 @@ public class TestTxnCommands3 extends TxnCommandsBaseForTests { "warehouse/t/base_0000002/bucket_00000"}}; checkResult(expected2, testQuery, isVectorized, "after compaction", LOG); } + /** + * HIVE-19985 + */ + @Test + public void testAcidMetaColumsDecode() throws Exception { + //this only applies in vectorized mode + hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true); + hiveConf.set(MetastoreConf.ConfVars.CREATE_TABLES_AS_ACID.getVarname(), "true"); + runStatementOnDriver("drop table if exists T"); + runStatementOnDriver("create table T (a int, b int) stored as orc"); + int[][] data1 = {{1, 2}, {3, 4}}; + runStatementOnDriver("insert into T" + makeValuesClause(data1)); + int[][] data2 = {{5, 6}, {7, 8}}; + runStatementOnDriver("insert into T" + makeValuesClause(data2)); + int[][] dataAll = {{1, 2}, {3, 4}, {5, 6}, {7, 8}}; + + + hiveConf.setBoolVar(HiveConf.ConfVars.OPTIMIZE_ACID_META_COLUMNS, true); + List<String> rs = runStatementOnDriver("select a, b from T order by a, b"); + Assert.assertEquals(stringifyValues(dataAll), rs); + + hiveConf.setBoolVar(HiveConf.ConfVars.OPTIMIZE_ACID_META_COLUMNS, false); + rs = runStatementOnDriver("select a, b from T order by a, b"); + Assert.assertEquals(stringifyValues(dataAll), rs); + + runStatementOnDriver("alter table T compact 'major'"); + TestTxnCommands2.runWorker(hiveConf); + + //check status of compaction job + TxnStore txnHandler = TxnUtils.getTxnStore(hiveConf); + ShowCompactResponse resp = txnHandler.showCompact(new ShowCompactRequest()); + Assert.assertEquals("Unexpected number of compactions in history", + 1, resp.getCompactsSize()); + Assert.assertEquals("Unexpected 0 compaction state", + TxnStore.CLEANING_RESPONSE, resp.getCompacts().get(0).getState()); + Assert.assertTrue( + resp.getCompacts().get(0).getHadoopJobId().startsWith("job_local")); + + hiveConf.setBoolVar(HiveConf.ConfVars.OPTIMIZE_ACID_META_COLUMNS, true); + rs = runStatementOnDriver("select a, b from T order by a, b"); + Assert.assertEquals(stringifyValues(dataAll), rs); + + hiveConf.setBoolVar(HiveConf.ConfVars.OPTIMIZE_ACID_META_COLUMNS, false); + rs = runStatementOnDriver("select a, b from T order by a, b"); + Assert.assertEquals(stringifyValues(dataAll), rs); + } } http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/ql/src/test/queries/clientpositive/acid_meta_columns_decode.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/acid_meta_columns_decode.q b/ql/src/test/queries/clientpositive/acid_meta_columns_decode.q new file mode 100644 index 0000000..7b02268 --- /dev/null +++ b/ql/src/test/queries/clientpositive/acid_meta_columns_decode.q @@ -0,0 +1,24 @@ +set hive.mapred.mode=nonstrict; +set hive.fetch.task.conversion=none; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; + + +CREATE transactional TABLE acidTblDefault(a int, b INT) STORED AS ORC; +INSERT INTO TABLE acidTblDefault VALUES (1,2),(2,3),(3,4); + +set hive.optimize.acid.meta.columns=true; + +select a,b from acidTblDefault; +select b from acidTblDefault; +select ROW__ID, b from acidTblDefault; +select a, ROW__ID, b from acidTblDefault; +select a, ROW__ID from acidTblDefault; + +set hive.optimize.acid.meta.columns=false; + +select a,b from acidTblDefault; +select b from acidTblDefault; +select ROW__ID, b from acidTblDefault; +select a, ROW__ID, b from 
acidTblDefault; +select a, ROW__ID from acidTblDefault; http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/ql/src/test/results/clientpositive/llap/acid_meta_columns_decode.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/acid_meta_columns_decode.q.out b/ql/src/test/results/clientpositive/llap/acid_meta_columns_decode.q.out new file mode 100644 index 0000000..87c03e2 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/acid_meta_columns_decode.q.out @@ -0,0 +1,128 @@ +PREHOOK: query: CREATE transactional TABLE acidTblDefault(a int, b INT) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acidTblDefault +POSTHOOK: query: CREATE transactional TABLE acidTblDefault(a int, b INT) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acidTblDefault +PREHOOK: query: INSERT INTO TABLE acidTblDefault VALUES (1,2),(2,3),(3,4) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@acidtbldefault +POSTHOOK: query: INSERT INTO TABLE acidTblDefault VALUES (1,2),(2,3),(3,4) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@acidtbldefault +POSTHOOK: Lineage: acidtbldefault.a SCRIPT [] +POSTHOOK: Lineage: acidtbldefault.b SCRIPT [] +PREHOOK: query: select a,b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select a,b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +1 2 +2 3 +3 4 +PREHOOK: query: select b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +2 +3 +4 +PREHOOK: query: select ROW__ID, b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select ROW__ID, b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0} 2 +{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1} 3 +{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":2} 4 +PREHOOK: query: select a, ROW__ID, b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select a, ROW__ID, b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +1 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0} 2 +2 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1} 3 +3 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":2} 4 +PREHOOK: query: select a, ROW__ID from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select a, ROW__ID from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +1 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0} +2 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1} +3 {"writeid":### Masked 
writeid ###,"bucketid":536870912,"rowid":2} +PREHOOK: query: select a,b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select a,b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +1 2 +2 3 +3 4 +PREHOOK: query: select b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +2 +3 +4 +PREHOOK: query: select ROW__ID, b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select ROW__ID, b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0} 2 +{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1} 3 +{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":2} 4 +PREHOOK: query: select a, ROW__ID, b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select a, ROW__ID, b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +1 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0} 2 +2 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1} 3 +3 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":2} 4 +PREHOOK: query: select a, ROW__ID from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select a, ROW__ID from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +1 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0} +2 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1} +3 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":2}
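----------------------------------------------------------------------
For illustration: the change above hinges on (a) parsing a delta directory name into its (minWriteId, maxWriteId, statementId) parts, and (b) deciding per split whether the acid metadata columns can be skipped because every row under that directory is known to be valid. The following self-contained Java sketch mirrors both under simplifying assumptions; the class, method, and parameter names are hypothetical stand-ins (directory names replace Hadoop Paths, and a BiPredicate replaces the ValidWriteIdList.isWriteIdRangeValid(min, max) == RangeResponse.ALL check), not the actual Hive APIs.

import java.util.Arrays;
import java.util.List;
import java.util.function.BiPredicate;

public class AcidMetaColumnsSketch {

  // Parses "delta_<min>_<max>[_<stmtId>]" or "delete_delta_..." into {min, max},
  // mirroring the string handling in AcidUtils.parsedDelta(Path, boolean).
  static long[] parseDeltaRange(String dirName) {
    String prefix = dirName.startsWith("delete_delta_") ? "delete_delta_" : "delta_";
    String rest = dirName.substring(prefix.length());
    int split = rest.indexOf('_');
    int split2 = rest.indexOf('_', split + 1); // -1 when there is no statementId
    long min = Long.parseLong(rest.substring(0, split));
    long max = split2 == -1
        ? Long.parseLong(rest.substring(split + 1))
        : Long.parseLong(rest.substring(split + 1, split2));
    return new long[] {min, max};
  }

  // Walks the directory chain from the split's parent toward the table root,
  // the way the VectorizedOrcAcidRowBatchReader constructor above does: a
  // base_x dir, or a delta whose whole write-id range is committed and
  // visible, means the acid meta columns carry nothing this reader needs.
  static boolean canSkipAcidMetaColumns(List<String> dirChain, boolean isOriginal,
      boolean noDeleteEvents, boolean rowIdProjected,
      BiPredicate<Long, Long> rangeAllCommitted) {
    if (isOriginal || !noDeleteEvents || rowIdProjected) {
      return false; // no meta columns present, or they are still needed downstream
    }
    for (String name : dirChain) {
      if (name.startsWith("base_")) {
        return true; // a surviving base_x is assumed valid (pre-filtered by getAcidState)
      }
      long[] range = parseDeltaRange(name);
      if (rangeAllCommitted.test(range[0], range[1])) {
        return true; // every write id in the delta is committed and visible
      }
    }
    return false;
  }

  public static void main(String[] args) {
    long[] r = parseDeltaRange("delta_0000005_0000009_0001");
    System.out.println("min=" + r[0] + " max=" + r[1]); // min=5 max=9

    // Toy snapshot: everything up to write id 9 is committed and visible.
    BiPredicate<Long, Long> allCommitted = (min, max) -> max <= 9L;
    System.out.println(canSkipAcidMetaColumns(
        Arrays.asList("delta_0000005_0000009_0001"),
        false /*isOriginal*/, true /*noDeleteEvents*/, false /*rowIdProjected*/,
        allCommitted)); // true -> reader may set includeAcidColumns(false)
  }
}

In terms of the patch, the true case here corresponds to the reader calling readerOptions.includeAcidColumns(false), after which LlapRecordReader sizes its batches without the five acid meta columns and next() takes the early copyFromBase() path.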