Repository: hive Updated Branches: refs/heads/master 05ddd21c7 -> e133ec5c2
HIVE-19985: ACID: Skip decoding the ROW__ID sections for read-only queries (Eugene Koifman, reviewed by Gopal V) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e133ec5c Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e133ec5c Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e133ec5c Branch: refs/heads/master Commit: e133ec5c28e0ed5082773f00b5bf4d55d2697db9 Parents: 05ddd21 Author: Eugene Koifman <ekoif...@apache.org> Authored: Sat Sep 29 09:56:44 2018 -0700 Committer: Eugene Koifman <ekoif...@apache.org> Committed: Sat Sep 29 09:56:44 2018 -0700 ---------------------------------------------------------------------- .../org/apache/hadoop/hive/conf/HiveConf.java | 3 + .../test/resources/testconfiguration.properties | 3 +- .../hive/llap/io/api/impl/LlapRecordReader.java | 61 +++++++-- .../org/apache/hadoop/hive/ql/io/AcidUtils.java | 47 +++++-- .../hadoop/hive/ql/io/orc/OrcRecordUpdater.java | 2 +- .../io/orc/VectorizedOrcAcidRowBatchReader.java | 129 ++++++++++++++----- .../apache/hadoop/hive/ql/TestTxnCommands3.java | 63 +++++++++ .../clientpositive/acid_meta_columns_decode.q | 24 ++++ .../llap/acid_meta_columns_decode.q.out | 128 ++++++++++++++++++ 9 files changed, 407 insertions(+), 53 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java ---------------------------------------------------------------------- diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 0cecae5..d1e6631 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2698,6 +2698,9 @@ public class HiveConf extends Configuration { MERGE_CARDINALITY_VIOLATION_CHECK("hive.merge.cardinality.check", true, "Set to true to ensure that each SQL Merge statement ensures that for each row in the target\n" + "table there is at most 1 matching row in the source table per SQL Specification."), + OPTIMIZE_ACID_META_COLUMNS("hive.optimize.acid.meta.columns", true, + "If true, don't decode Acid metadata columns from storage unless" + + " they are needed."), // For Arrow SerDe HIVE_ARROW_ROOT_ALLOCATOR_LIMIT("hive.arrow.root.allocator.limit", Long.MAX_VALUE, http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index def3561..fdd8ecc 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -426,8 +426,9 @@ minillaplocal.query.files=\ dec_str.q,\ dp_counter_non_mm.q,\ dp_counter_mm.q,\ - acid_no_buckets.q, \ acid_globallimit.q,\ + acid_meta_columns_decode.q,\ + acid_no_buckets.q, \ acid_vectorization_missing_cols.q,\ acid_vectorization_original.q,\ alter_merge_stats_orc.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java ---------------------------------------------------------------------- diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java 
b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java index 3455d16..27a5b0f 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java @@ -160,7 +160,6 @@ class LlapRecordReader TypeDescription schema = OrcInputFormat.getDesiredRowTypeDescr( job, isAcidScan, Integer.MAX_VALUE); - this.includes = new IncludesImpl(tableIncludedCols, isAcidScan, rbCtx, schema, job); int queueLimitBase = getQueueVar(ConfVars.LLAP_IO_VRB_QUEUE_LIMIT_BASE, job, daemonConf); int queueLimitMin = getQueueVar(ConfVars.LLAP_IO_VRB_QUEUE_LIMIT_MIN, job, daemonConf); @@ -184,6 +183,8 @@ class LlapRecordReader this.acidReader = new VectorizedOrcAcidRowBatchReader( (OrcSplit) split, jobConf, Reporter.NULL, null, rbCtx, true); } + this.includes = new IncludesImpl(tableIncludedCols, isAcidScan, rbCtx, + schema, job, isAcidScan && acidReader.includeAcidColumns()); // Create the consumer of encoded data; it will coordinate decoding to CVBs. feedback = rp = cvp.createReadPipeline(this, split, includes, sarg, counters, includes, @@ -341,19 +342,27 @@ class LlapRecordReader return false; } if (isAcidScan) { - vrb.selectedInUse = true; + vrb.selectedInUse = true;//why? if (isVectorized) { // TODO: relying everywhere on the magical constants and columns being together means ACID // columns are going to be super hard to change in a backward compat manner. I can // foresee someone cursing while refactoring all the magic for prefix schema changes. + /** + * Acid meta cols are always either all included or all excluded; the + * width of 'cvb' changes accordingly, so 'acidColCount' and + * 'ixInVrb' need to be adjusted. See {@link IncludesImpl} comments. + */ // Exclude the row column. - int acidColCount = OrcInputFormat.getRootColumn(false) - 1; + int acidColCount = acidReader.includeAcidColumns() ? + OrcInputFormat.getRootColumn(false) - 1 : 0; VectorizedRowBatch inputVrb = new VectorizedRowBatch( - acidColCount + 1 + vrb.getDataColumnCount() ); + //the +1 is for the OrcRecordUpdater.ROW struct + acidColCount + 1 + vrb.getDataColumnCount()); // By assumption, ACID columns are currently always in the beginning of the arrays. System.arraycopy(cvb.cols, 0, inputVrb.cols, 0, acidColCount); for (int ixInReadSet = acidColCount; ixInReadSet < cvb.cols.length; ++ixInReadSet) { - int ixInVrb = includes.getPhysicalColumnIds().get(ixInReadSet); + int ixInVrb = includes.getPhysicalColumnIds().get(ixInReadSet) - + (acidReader.includeAcidColumns() ? 0 : OrcRecordUpdater.ROW); // TODO: should we create the batch from vrbctx, and reuse the vectors, like below? Future work. inputVrb.cols[ixInVrb] = cvb.cols[ixInReadSet]; } @@ -376,7 +385,7 @@ class LlapRecordReader int ixInVrb = includes.getPhysicalColumnIds().get(ixInReadSet); cvb.swapColumnVector(ixInReadSet, vrb.cols, ixInVrb); } - vrb.selectedInUse = false; + vrb.selectedInUse = false;//why? vrb.size = cvb.size; } @@ -564,18 +573,36 @@ class LlapRecordReader /** This class encapsulates include-related logic for LLAP readers. It is not actually specific * to LLAP IO but in LLAP IO in particular, I want to encapsulate all this mess for now until * we have smth better like Schema Evolution v2. This can also hypothetically encapsulate * field pruning inside structs and stuff like that.
+ * + * There is a split-brain issue between {@link SchemaEvolution} used in + * the non-LLAP path and this class. The file schema for acid tables looks + * like <op, owid, writerId, rowid, cwid, <f1, ... fn>> and + * {@link SchemaEvolution#getFileIncluded()} respects that. So if fn=2, + * the type IDs are 0..8 and the fileIncluded[] has 9 bits that indicate + * what is read. So in particular, {@link org.apache.hadoop.hive.ql.io.orc.RecordReader} + * produces ColumnVectorS that are NULL in every row for each i where + * fileIncluded[i]==false. The fields corresponding to structs are always + * included if any child of the struct has to be included. + * + * LLAP only produces ColumnVectorS if they are needed, so the width of + * ColumnVectorBatch varies depending on what was projected. + * + * See also {@link VectorizedOrcAcidRowBatchReader#includeAcidColumns()} and + * {@link #next(NullWritable, VectorizedRowBatch)}*/ private static class IncludesImpl implements SchemaEvolutionFactory, Includes { private List<Integer> readerLogicalColumnIds; private List<Integer> filePhysicalColumnIds; private Integer acidStructColumnId = null; + private final boolean includeAcidColumns; // For current schema evolution. private TypeDescription readerSchema; private JobConf jobConf; public IncludesImpl(List<Integer> tableIncludedCols, boolean isAcidScan, - VectorizedRowBatchCtx rbCtx, TypeDescription readerSchema, JobConf jobConf) { + VectorizedRowBatchCtx rbCtx, TypeDescription readerSchema, + JobConf jobConf, boolean includeAcidColumns) { // Note: columnIds below makes additional changes for ACID. Don't use this var directly. this.readerSchema = readerSchema; this.jobConf = jobConf; @@ -604,14 +631,29 @@ class LlapRecordReader // We don't want to include the root struct in ACID case; it would cause the whole // struct to get read without projection. if (acidStructColumnId == i) continue; + if(!includeAcidColumns) { + /** + * if not including acid columns, we still want to number the + * physical columns as if acid columns are included because + * {@link #generateFileIncludes(TypeDescription)} takes the file + * schema as input + * (e.g. <op, owid, writerId, rowid, cwid, <f1, ... fn>>) + */ + continue; + } filePhysicalColumnIds.add(i); } for (int tableColumnId : readerLogicalColumnIds) { + //but make sure to generate correct ids following the type tree's + //in-order walk filePhysicalColumnIds.add(rootCol + tableColumnId); } + /*note: if filePhysicalColumnIds includes acid column ids, we end up + decoding those vectors*/ } this.filePhysicalColumnIds = filePhysicalColumnIds; + this.includeAcidColumns = includeAcidColumns; } @Override @@ -628,7 +670,8 @@ class LlapRecordReader // TODO: will this work correctly with ACID?
boolean[] readerIncludes = OrcInputFormat.genIncludedColumns( readerSchema, readerLogicalColumnIds); - Reader.Options options = new Reader.Options(jobConf).include(readerIncludes); + Reader.Options options = new Reader.Options(jobConf) + .include(readerIncludes).includeAcidColumns(includeAcidColumns); return new SchemaEvolution(fileSchema, readerSchema, options); } http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java index 4d71eb4..71e5131 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java @@ -918,6 +918,11 @@ public class AcidUtils { return results.toArray(new Path[results.size()]); } + /** + * This will look at the footer of one of the files in the delta dir to see + * if the file is in Acid format, i.e. has acid metadata columns. The + * assumption is that for any dir, either all files are acid or none are. + */ public static ParsedDelta parsedDelta(Path deltaDir, FileSystem fs) throws IOException { String deltaDirName = deltaDir.getName(); if (deltaDirName.startsWith(DELETE_DELTA_PREFIX)) { @@ -941,24 +946,42 @@ public class AcidUtils { if (filename.startsWith(deltaPrefix)) { //small optimization - delete delta can't be in raw format boolean isRawFormat = !isDeleteDelta && MetaDataFile.isRawFormat(deltaDir, fs); - String rest = filename.substring(deltaPrefix.length()); - int split = rest.indexOf('_'); - int split2 = rest.indexOf('_', split + 1);//may be -1 if no statementId - long min = Long.parseLong(rest.substring(0, split)); - long max = split2 == -1 ? - Long.parseLong(rest.substring(split + 1)) : - Long.parseLong(rest.substring(split + 1, split2)); - if(split2 == -1) { - return new ParsedDelta(min, max, null, isDeleteDelta, isRawFormat); - } - int statementId = Integer.parseInt(rest.substring(split2 + 1)); - return new ParsedDelta(min, max, null, statementId, isDeleteDelta, isRawFormat); + return parsedDelta(deltaDir, isRawFormat); } throw new IllegalArgumentException(deltaDir + " does not start with " + deltaPrefix); } /** + * This method just parses the file name. It relies on the caller to figure + * out whether the file is in Acid format (i.e. has acid metadata columns); + * see {@link #parsedDelta(Path, FileSystem)}. + */ + public static ParsedDelta parsedDelta(Path deltaDir, boolean isRawFormat) { + String filename = deltaDir.getName(); + boolean isDeleteDelta = filename.startsWith(DELETE_DELTA_PREFIX); + //make sure it's false for a delete delta no matter what was passed in - + //raw format doesn't apply to delete deltas + isRawFormat = isDeleteDelta ? false : isRawFormat; + String rest = filename.substring((isDeleteDelta ? + DELETE_DELTA_PREFIX : DELTA_PREFIX).length()); + int split = rest.indexOf('_'); + //may be -1 if no statementId + int split2 = rest.indexOf('_', split + 1); + long min = Long.parseLong(rest.substring(0, split)); + long max = split2 == -1 ? + Long.parseLong(rest.substring(split + 1)) : + Long.parseLong(rest.substring(split + 1, split2)); + if(split2 == -1) { + return new ParsedDelta(min, max, null, isDeleteDelta, isRawFormat); + } + int statementId = Integer.parseInt(rest.substring(split2 + 1)); + return new ParsedDelta(min, max, null, statementId, isDeleteDelta, + isRawFormat); + + } + + /** * Is the given directory in ACID format?
* @param directory the partition directory to check * @param conf the query configuration http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java index 98f5df1..4ebd69e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java @@ -82,7 +82,7 @@ public class OrcRecordUpdater implements RecordUpdater { final static int BUCKET = 2; final static int ROW_ID = 3; final static int CURRENT_WRITEID = 4; - final static int ROW = 5; + public static final int ROW = 5; /** * total number of fields (above) */ http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java index f16f9b4..1509bba 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java @@ -18,14 +18,7 @@ package org.apache.hadoop.hive.ql.io.orc; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.BitSet; -import java.util.List; -import java.util.Map.Entry; -import java.util.TreeMap; - +import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -58,12 +51,17 @@ import org.apache.orc.IntegerColumnStatistics; import org.apache.orc.OrcConf; import org.apache.orc.StripeInformation; import org.apache.orc.StripeStatistics; -import org.apache.orc.impl.AcidStats; import org.apache.orc.impl.OrcAcidUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.annotations.VisibleForTesting; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.List; +import java.util.Map.Entry; +import java.util.TreeMap; /** * A fast vectorized batch reader class for ACID. Insert events are read directly * from the base files/insert_only deltas in vectorized row batches. The deleted @@ -83,6 +81,10 @@ public class VectorizedOrcAcidRowBatchReader protected float progress = 0.0f; protected Object[] partitionValues; private boolean addPartitionCols = true; + /** + * true means there is no OrcRecordUpdater.ROW column + * (i.e. the struct wrapping user columns) in {@link #vectorizedRowBatchBase}. 
+ */ private final boolean isFlatPayload; private final ValidWriteIdList validWriteIdList; private final DeleteEventRegistry deleteEventRegistry; @@ -97,6 +99,12 @@ */ private final boolean rowIdProjected; /** + * if false, we don't need any acid metadata columns from the file because we + * know all data in the split is valid (wrt visible writeIDs/delete events) + * and ROW_ID is not needed higher up in the operator pipeline + */ + private final boolean includeAcidColumns; + /** * partition/table root */ private final Path rootPath; @@ -105,7 +113,8 @@ */ private final OrcSplit.OffsetAndBucketProperty syntheticProps; /** - * To have access to {@link RecordReader#getRowNumber()} in the underlying file + * To have access to {@link RecordReader#getRowNumber()} in the underlying + * file, which we need to generate synthetic ROW_IDs for original files */ private RecordReader innerReader; /** @@ -118,7 +127,7 @@ */ private SearchArgument deleteEventSarg = null; - + //OrcInputFormat c'tor VectorizedOrcAcidRowBatchReader(OrcSplit inputSplit, JobConf conf, Reporter reporter) throws IOException { this(inputSplit, conf,reporter, null); @@ -249,6 +258,47 @@ rowIdProjected = areRowIdsProjected(rbCtx); rootPath = orcSplit.getRootDir(); syntheticProps = orcSplit.getSyntheticAcidProps(); + + /** + * This could be optimized by moving the dir type/write id based checks + * to split generation (i.e. per file, not per split), while the + * DeleteEventRegistry is still checked here since some splits from the + * same file may have relevant deletes and others may not. + */ + if(conf.getBoolean(ConfVars.OPTIMIZE_ACID_META_COLUMNS.varname, true)) { + /*figure out if we can skip reading acid metadata columns: + * isOriginal - no acid meta columns present - nothing to skip + * there are no relevant delete events && ROW__ID is not needed higher up + * (e.g. this is not a delete statement)*/ + if (!isOriginal && deleteEventRegistry.isEmpty() && !rowIdProjected) { + Path parent = orcSplit.getPath().getParent(); + while (parent != null && !rootPath.equals(parent)) { + if (parent.getName().startsWith(AcidUtils.BASE_PREFIX)) { + /** + * The assumption here is that any base_x is filtered out by + * {@link AcidUtils#getAcidState(Path, Configuration, ValidWriteIdList)} + * so if we see it here it's valid. + * {@link AcidUtils#isValidBase(long, ValidWriteIdList, Path, FileSystem)} + * can check but it makes a {@link FileSystem} call.
+ */ + readerOptions.includeAcidColumns(false); + break; + } else { + AcidUtils.ParsedDelta pd = + AcidUtils.parsedDelta(parent, isOriginal); + if (validWriteIdList.isWriteIdRangeValid(pd.getMinWriteId(), + pd.getMaxWriteId()) == ValidWriteIdList.RangeResponse.ALL) { + //all write IDs in range are committed (and visible in current + // snapshot) + readerOptions.includeAcidColumns(false); + break; + } + } + parent = parent.getParent(); + } + } + } + includeAcidColumns = readerOptions.getIncludeAcidColumns();//default is true } /** @@ -295,6 +345,10 @@ } deleteEventReaderOptions.searchArgument(null, null); } + + public boolean includeAcidColumns() { + return this.includeAcidColumns; + } public void setBaseAndInnerReader( final org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> baseReader) { @@ -661,7 +715,16 @@ } catch (Exception e) { throw new IOException("error iterating", e); } - + if(!includeAcidColumns) { + //if here, we don't need to filter anything wrt acid metadata columns + //in fact, they are not even read from the file/llap + value.size = vectorizedRowBatchBase.size; + value.selected = vectorizedRowBatchBase.selected; + value.selectedInUse = vectorizedRowBatchBase.selectedInUse; + copyFromBase(value); + progress = baseReader.getProgress(); + return true; + } // Once we have read the VectorizedRowBatchBase from the file, there are two kinds of cases // for which we might have to discard rows from the batch: // Case 1- when the row is created by a transaction that is not valid, or @@ -717,22 +780,7 @@ /* Just copy the payload. {@link recordIdColumnVector} has already been populated */ System.arraycopy(vectorizedRowBatchBase.cols, 0, value.cols, 0, value.getDataColumnCount()); } else { - int payloadCol = OrcRecordUpdater.ROW; - if (isFlatPayload) { - // Ignore the struct column and just copy all the following data columns. - System.arraycopy(vectorizedRowBatchBase.cols, payloadCol + 1, value.cols, 0, - vectorizedRowBatchBase.cols.length - payloadCol - 1); - } else { - StructColumnVector payloadStruct = - (StructColumnVector) vectorizedRowBatchBase.cols[payloadCol]; - // Transfer columnVector objects from base batch to outgoing batch. - System.arraycopy(payloadStruct.fields, 0, value.cols, 0, value.getDataColumnCount()); - } - if (rowIdProjected) { - recordIdColumnVector.fields[0] = vectorizedRowBatchBase.cols[OrcRecordUpdater.ORIGINAL_WRITEID]; - recordIdColumnVector.fields[1] = vectorizedRowBatchBase.cols[OrcRecordUpdater.BUCKET]; - recordIdColumnVector.fields[2] = vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW_ID]; - } + copyFromBase(value); } if (rowIdProjected) { int ix = rbCtx.findVirtualColumnNum(VirtualColumn.ROWID); @@ -741,7 +789,28 @@ progress = baseReader.getProgress(); return true; } - + //copy the 'data' cols into 'value' as individual ColumnVectorS, then combine + //the ColumnVectorS for the acid meta cols into a single ColumnVector + //representing RecordIdentifier and (optionally) set it in 'value' + private void copyFromBase(VectorizedRowBatch value) { + assert !isOriginal; + if (isFlatPayload) { + int payloadCol = includeAcidColumns ? OrcRecordUpdater.ROW : 0; + // Ignore the struct column and just copy all the following data columns.
+ System.arraycopy(vectorizedRowBatchBase.cols, payloadCol + 1, value.cols, 0, + vectorizedRowBatchBase.cols.length - payloadCol - 1); + } else { + StructColumnVector payloadStruct = + (StructColumnVector) vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW]; + // Transfer columnVector objects from base batch to outgoing batch. + System.arraycopy(payloadStruct.fields, 0, value.cols, 0, value.getDataColumnCount()); + } + if (rowIdProjected) { + recordIdColumnVector.fields[0] = vectorizedRowBatchBase.cols[OrcRecordUpdater.ORIGINAL_WRITEID]; + recordIdColumnVector.fields[1] = vectorizedRowBatchBase.cols[OrcRecordUpdater.BUCKET]; + recordIdColumnVector.fields[2] = vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW_ID]; + } + } private ColumnVector[] handleOriginalFile( BitSet selectedBitSet, ColumnVector[] innerRecordIdColumnVector) throws IOException { /* http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands3.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands3.java b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands3.java index 9c3cb20..a25406d 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands3.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands3.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.apache.hadoop.hive.ql; import org.apache.hadoop.hive.conf.HiveConf; @@ -159,4 +176,50 @@ public class TestTxnCommands3 extends TxnCommandsBaseForTests { "warehouse/t/base_0000002/bucket_00000"}}; checkResult(expected2, testQuery, isVectorized, "after compaction", LOG); } + /** + * HIVE-19985 + */ + @Test + public void testAcidMetaColumsDecode() throws Exception { + //this only applies in vectorized mode + hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true); + hiveConf.set(MetastoreConf.ConfVars.CREATE_TABLES_AS_ACID.getVarname(), "true"); + runStatementOnDriver("drop table if exists T"); + runStatementOnDriver("create table T (a int, b int) stored as orc"); + int[][] data1 = {{1, 2}, {3, 4}}; + runStatementOnDriver("insert into T" + makeValuesClause(data1)); + int[][] data2 = {{5, 6}, {7, 8}}; + runStatementOnDriver("insert into T" + makeValuesClause(data2)); + int[][] dataAll = {{1, 2}, {3, 4}, {5, 6}, {7, 8}}; + + + hiveConf.setBoolVar(HiveConf.ConfVars.OPTIMIZE_ACID_META_COLUMNS, true); + List<String> rs = runStatementOnDriver("select a, b from T order by a, b"); + Assert.assertEquals(stringifyValues(dataAll), rs); + + hiveConf.setBoolVar(HiveConf.ConfVars.OPTIMIZE_ACID_META_COLUMNS, false); + rs = runStatementOnDriver("select a, b from T order by a, b"); + Assert.assertEquals(stringifyValues(dataAll), rs); + + runStatementOnDriver("alter table T compact 'major'"); + TestTxnCommands2.runWorker(hiveConf); + + //check status of compaction job + TxnStore txnHandler = TxnUtils.getTxnStore(hiveConf); + ShowCompactResponse resp = txnHandler.showCompact(new ShowCompactRequest()); + Assert.assertEquals("Unexpected number of compactions in history", + 1, resp.getCompactsSize()); + Assert.assertEquals("Unexpected 0 compaction state", + TxnStore.CLEANING_RESPONSE, resp.getCompacts().get(0).getState()); + Assert.assertTrue( + resp.getCompacts().get(0).getHadoopJobId().startsWith("job_local")); + + hiveConf.setBoolVar(HiveConf.ConfVars.OPTIMIZE_ACID_META_COLUMNS, true); + rs = runStatementOnDriver("select a, b from T order by a, b"); + Assert.assertEquals(stringifyValues(dataAll), rs); + + hiveConf.setBoolVar(HiveConf.ConfVars.OPTIMIZE_ACID_META_COLUMNS, false); + rs = runStatementOnDriver("select a, b from T order by a, b"); + Assert.assertEquals(stringifyValues(dataAll), rs); + } } http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/ql/src/test/queries/clientpositive/acid_meta_columns_decode.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/acid_meta_columns_decode.q b/ql/src/test/queries/clientpositive/acid_meta_columns_decode.q new file mode 100644 index 0000000..7b02268 --- /dev/null +++ b/ql/src/test/queries/clientpositive/acid_meta_columns_decode.q @@ -0,0 +1,24 @@ +set hive.mapred.mode=nonstrict; +set hive.fetch.task.conversion=none; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; + + +CREATE transactional TABLE acidTblDefault(a int, b INT) STORED AS ORC; +INSERT INTO TABLE acidTblDefault VALUES (1,2),(2,3),(3,4); + +set hive.optimize.acid.meta.columns=true; + +select a,b from acidTblDefault; +select b from acidTblDefault; +select ROW__ID, b from acidTblDefault; +select a, ROW__ID, b from acidTblDefault; +select a, ROW__ID from acidTblDefault; + +set hive.optimize.acid.meta.columns=false; + +select a,b from acidTblDefault; +select b from acidTblDefault; +select ROW__ID, b from acidTblDefault; +select a, ROW__ID, b from 
acidTblDefault; +select a, ROW__ID from acidTblDefault; http://git-wip-us.apache.org/repos/asf/hive/blob/e133ec5c/ql/src/test/results/clientpositive/llap/acid_meta_columns_decode.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/acid_meta_columns_decode.q.out b/ql/src/test/results/clientpositive/llap/acid_meta_columns_decode.q.out new file mode 100644 index 0000000..87c03e2 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/acid_meta_columns_decode.q.out @@ -0,0 +1,128 @@ +PREHOOK: query: CREATE transactional TABLE acidTblDefault(a int, b INT) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acidTblDefault +POSTHOOK: query: CREATE transactional TABLE acidTblDefault(a int, b INT) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acidTblDefault +PREHOOK: query: INSERT INTO TABLE acidTblDefault VALUES (1,2),(2,3),(3,4) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@acidtbldefault +POSTHOOK: query: INSERT INTO TABLE acidTblDefault VALUES (1,2),(2,3),(3,4) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@acidtbldefault +POSTHOOK: Lineage: acidtbldefault.a SCRIPT [] +POSTHOOK: Lineage: acidtbldefault.b SCRIPT [] +PREHOOK: query: select a,b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select a,b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +1 2 +2 3 +3 4 +PREHOOK: query: select b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +2 +3 +4 +PREHOOK: query: select ROW__ID, b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select ROW__ID, b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0} 2 +{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1} 3 +{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":2} 4 +PREHOOK: query: select a, ROW__ID, b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select a, ROW__ID, b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +1 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0} 2 +2 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1} 3 +3 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":2} 4 +PREHOOK: query: select a, ROW__ID from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select a, ROW__ID from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +1 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0} +2 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1} +3 {"writeid":### Masked 
writeid ###,"bucketid":536870912,"rowid":2} +PREHOOK: query: select a,b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select a,b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +1 2 +2 3 +3 4 +PREHOOK: query: select b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +2 +3 +4 +PREHOOK: query: select ROW__ID, b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select ROW__ID, b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0} 2 +{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1} 3 +{"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":2} 4 +PREHOOK: query: select a, ROW__ID, b from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select a, ROW__ID, b from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +1 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0} 2 +2 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1} 3 +3 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":2} 4 +PREHOOK: query: select a, ROW__ID from acidTblDefault +PREHOOK: type: QUERY +PREHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +POSTHOOK: query: select a, ROW__ID from acidTblDefault +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acidtbldefault +#### A masked pattern was here #### +1 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":0} +2 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":1} +3 {"writeid":### Masked writeid ###,"bucketid":536870912,"rowid":2}
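----------------------------------------------------------------------
For illustration: the change above hinges on (a) parsing a delta directory name into its (minWriteId, maxWriteId, statementId) parts, and (b) deciding per split whether the acid metadata columns can be skipped because every row under that directory is known to be valid. The following self-contained Java sketch mirrors both under simplifying assumptions; the class, method, and parameter names are hypothetical stand-ins (directory names replace Hadoop Paths, and a BiPredicate replaces the ValidWriteIdList.isWriteIdRangeValid(min, max) == RangeResponse.ALL check), not the actual Hive APIs.

import java.util.Arrays;
import java.util.List;
import java.util.function.BiPredicate;

public class AcidMetaColumnsSketch {

  // Parses "delta_<min>_<max>[_<stmtId>]" or "delete_delta_..." into {min, max},
  // mirroring the string handling in AcidUtils.parsedDelta(Path, boolean).
  static long[] parseDeltaRange(String dirName) {
    String prefix = dirName.startsWith("delete_delta_") ? "delete_delta_" : "delta_";
    String rest = dirName.substring(prefix.length());
    int split = rest.indexOf('_');
    int split2 = rest.indexOf('_', split + 1); // -1 when there is no statementId
    long min = Long.parseLong(rest.substring(0, split));
    long max = split2 == -1
        ? Long.parseLong(rest.substring(split + 1))
        : Long.parseLong(rest.substring(split + 1, split2));
    return new long[] {min, max};
  }

  // Walks the directory chain from the split's parent toward the table root,
  // the way the VectorizedOrcAcidRowBatchReader constructor above does: a
  // base_x dir, or a delta whose whole write-id range is committed and
  // visible, means the acid meta columns carry nothing this reader needs.
  static boolean canSkipAcidMetaColumns(List<String> dirChain, boolean isOriginal,
      boolean noDeleteEvents, boolean rowIdProjected,
      BiPredicate<Long, Long> rangeAllCommitted) {
    if (isOriginal || !noDeleteEvents || rowIdProjected) {
      return false; // no meta columns present, or they are still needed downstream
    }
    for (String name : dirChain) {
      if (name.startsWith("base_")) {
        return true; // a surviving base_x is assumed valid (pre-filtered by getAcidState)
      }
      long[] range = parseDeltaRange(name);
      if (rangeAllCommitted.test(range[0], range[1])) {
        return true; // every write id in the delta is committed and visible
      }
    }
    return false;
  }

  public static void main(String[] args) {
    long[] r = parseDeltaRange("delta_0000005_0000009_0001");
    System.out.println("min=" + r[0] + " max=" + r[1]); // min=5 max=9

    // Toy snapshot: everything up to write id 9 is committed and visible.
    BiPredicate<Long, Long> allCommitted = (min, max) -> max <= 9L;
    System.out.println(canSkipAcidMetaColumns(
        Arrays.asList("delta_0000005_0000009_0001"),
        false /*isOriginal*/, true /*noDeleteEvents*/, false /*rowIdProjected*/,
        allCommitted)); // true -> reader may set includeAcidColumns(false)
  }
}

In terms of the patch, the true case here corresponds to the reader calling readerOptions.includeAcidColumns(false), after which LlapRecordReader sizes its batches without the five acid meta columns and next() takes the early copyFromBase() path.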