This is an automated email from the ASF dual-hosted git repository.

sankarh pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-3 by this push:
     new 0e2d0757357 HIVE-27807: Backport of HIVE-20629, HIVE-20705, HIVE-20734 to branch-3 (#4809)
0e2d0757357 is described below

commit 0e2d07573570cb66fa9bf8af05ca79ccee55e21f
Author: Aman Raj <104416558+amanraj2...@users.noreply.github.com>
AuthorDate: Tue Oct 24 12:03:15 2023 +0530

    HIVE-27807: Backport of HIVE-20629, HIVE-20705, HIVE-20734 to branch-3 (#4809)

    * HIVE-20629: Hive incremental replication fails with events missing error if database is kept idle for more than an hour (Mahesh Kumar Behera, reviewed by Sankar Hariappan)

    * HIVE-20705: Vectorization: Native Vector MapJoin doesn't support Complex Big Table values

    * HIVE-20734: Beeline: When beeline-site.xml is present and hive CLI redirects to beeline, it should use the system username/dummy password instead of prompting for one

    ---------

    Co-authored-by: Sankar Hariappan <sank...@apache.org>
    Co-authored-by: Matt McCline <mmccl...@hortonworks.com>
    Co-authored-by: Vaibhav Gumashta <vgumas...@hortonworks.com>
    Signed-off-by: Sankar Hariappan <sank...@apache.org>

    Closes (#4809)
---
 bin/ext/beeline.sh                                 |   7 +-
 bin/hive                                           |   1 +
 .../TestReplicationScenariosAcrossInstances.java   |  40 +++
 .../test/resources/testconfiguration.properties    |   1 +
 .../hadoop/hive/ql/exec/repl/ReplLoadWork.java     |   9 +-
 .../incremental/IncrementalLoadEventsIterator.java |   4 +-
 .../incremental/IncrementalLoadTasksBuilder.java   |  20 +-
 .../hive/ql/optimizer/physical/Vectorizer.java     |  18 +-
 .../hive/ql/parse/ReplicationSemanticAnalyzer.java |  15 +-
 .../apache/hadoop/hive/ql/plan/MapJoinDesc.java    |  10 +
 .../hadoop/hive/ql/plan/VectorMapJoinDesc.java     |  14 +
 .../clientpositive/vector_mapjoin_complex_values.q |  34 ++
 .../llap/vector_mapjoin_complex_values.q.out       | 355 +++++++++++++++++++++
 13 files changed, 500 insertions(+), 28 deletions(-)

diff --git a/bin/ext/beeline.sh b/bin/ext/beeline.sh
index 8052c452bac..5bf7fe67503 100644
--- a/bin/ext/beeline.sh
+++ b/bin/ext/beeline.sh
@@ -32,7 +32,12 @@ beeline () {
   export HADOOP_CLASSPATH="${hadoopClasspath}${HIVE_CONF_DIR}:${beelineJarPath}:${superCsvJarPath}:${jlineJarPath}"
   export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS -Dlog4j.configurationFile=beeline-log4j2.properties "

-  exec $HADOOP jar ${beelineJarPath} $CLASS $HIVE_OPTS "$@"
+  # if CLIUSER is not empty, then pass it as user id / password during beeline redirect
+  if [ -z $CLIUSER ] ; then
+    exec $HADOOP jar ${beelineJarPath} $CLASS $HIVE_OPTS "$@"
+  else
+    exec $HADOOP jar ${beelineJarPath} $CLASS $HIVE_OPTS "$@" -n "${CLIUSER}" -p "${CLIUSER}"
+  fi
 }

 beeline_help () {
diff --git a/bin/hive b/bin/hive
index a7ae2f571e9..ef9ef955d23 100755
--- a/bin/hive
+++ b/bin/hive
@@ -86,6 +86,7 @@ if [ "$SERVICE" = "" ] ; then
 fi

 if [[ "$SERVICE" == "cli" && "$USE_BEELINE_FOR_HIVE_CLI" == "true" ]] ; then
+  CLIUSER=`whoami`
   SERVICE="beeline"
 fi

diff --git a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosAcrossInstances.java b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosAcrossInstances.java
index 1d0a9c8b447..12ec8e66731 100644
--- a/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosAcrossInstances.java
+++ b/itests/hive-unit/src/test/java/org/apache/hadoop/hive/ql/parse/TestReplicationScenariosAcrossInstances.java
@@ -961,6 +961,46 @@ public class TestReplicationScenariosAcrossInstances {
     assertFalse(props.containsKey(SOURCE_OF_REPLICATION));
   }

+  @Test
+  public void testIncrementalDumpEmptyDumpDirectory() throws Throwable {
+    WarehouseInstance.Tuple tuple = primary.dump(primaryDbName, null);
+
+    replica.load(replicatedDbName, tuple.dumpLocation)
+            .status(replicatedDbName)
+            .verifyResult(tuple.lastReplicationId);
+
+    tuple = primary.dump(primaryDbName, tuple.lastReplicationId);
+
+    replica.load(replicatedDbName, tuple.dumpLocation)
+            .status(replicatedDbName)
+            .verifyResult(tuple.lastReplicationId);
+
+    // create events for some other database and then dump the primaryDbName to dump an empty directory.
+    String testDbName = primaryDbName + "_test";
+    tuple = primary.run(" create database " + testDbName)
+            .run("create table " + testDbName + ".tbl (fld int)")
+            .dump(primaryDbName, tuple.lastReplicationId);
+
+    // Incremental load to existing database with empty dump directory should set the repl id to the last event at src.
+    replica.load(replicatedDbName, tuple.dumpLocation)
+            .status(replicatedDbName)
+            .verifyResult(tuple.lastReplicationId);
+
+    // Incremental load to non existing db should return database not exist error.
+    tuple = primary.dump("someJunkDB", tuple.lastReplicationId);
+    CommandProcessorResponse response = replica.runCommand("REPL LOAD someJunkDB from " + tuple.dumpLocation);
+    response.getErrorMessage().toLowerCase().contains("org.apache.hadoop.hive.ql.metadata.hiveException: " +
+            "database does not exist");
+
+    // Bootstrap load from an empty dump directory should return empty load directory error.
+    tuple = primary.dump("someJunkDB", null);
+    response = replica.runCommand("REPL LOAD someJunkDB from " + tuple.dumpLocation);
+    response.getErrorMessage().toLowerCase().contains("org.apache.hadoop.hive.ql.parse.semanticException:" +
+            " no data to load in path");
+
+    primary.run(" drop database if exists " + testDbName + " cascade");
+  }
+
   @Test
   public void testIncrementalDumpMultiIteration() throws Throwable {
     WarehouseInstance.Tuple bootstrapTuple = primary.dump(primaryDbName, null);
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 16a3e082d99..52cde10efdc 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -807,6 +807,7 @@ minillaplocal.query.files=\
   vector_like_2.q,\
   vector_llap_io_data_conversion.q,\
   vector_llap_text_1.q,\
+  vector_mapjoin_complex_values.q,\
   vector_mapjoin_reduce.q,\
   vector_null_map.q,\
   vector_number_compare_projection.q,\
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplLoadWork.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplLoadWork.java
index fdbcb15c72d..ff21b6a601d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplLoadWork.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/ReplLoadWork.java
@@ -53,7 +53,7 @@ public class ReplLoadWork implements Serializable {
   final LineageState sessionStateLineageState;

   public ReplLoadWork(HiveConf hiveConf, String dumpDirectory, String dbNameToLoadIn,
-      String tableNameToLoadIn, LineageState lineageState, boolean isIncrementalDump) throws IOException {
+      String tableNameToLoadIn, LineageState lineageState, boolean isIncrementalDump, Long eventTo) throws IOException {
     this.tableNameToLoadIn = tableNameToLoadIn;
     sessionStateLineageState = lineageState;
     this.dumpDirectory = dumpDirectory;
@@ -64,7 +64,7 @@ public class ReplLoadWork implements Serializable {
     this.bootstrapIterator = null;
     this.constraintsIterator = null;
     incrementalLoad = new IncrementalLoadTasksBuilder(dbNameToLoadIn, tableNameToLoadIn, dumpDirectory,
-        incrementalIterator, hiveConf);
+        incrementalIterator, hiveConf, eventTo);
     } else {
       this.bootstrapIterator = new BootstrapEventsIterator(dumpDirectory, dbNameToLoadIn, hiveConf);
       this.constraintsIterator = new ConstraintEventsIterator(dumpDirectory, hiveConf);
@@ -73,11 +73,6 @@ public class ReplLoadWork implements Serializable {
     }
   }

-  public ReplLoadWork(HiveConf hiveConf, String dumpDirectory, String dbNameOrPattern,
-      LineageState lineageState) throws IOException {
-    this(hiveConf, dumpDirectory, dbNameOrPattern, null, lineageState, false);
-  }
-
   public BootstrapEventsIterator iterator() {
     return bootstrapIterator;
   }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadEventsIterator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadEventsIterator.java
index 4b37c8dd989..5638ace714d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadEventsIterator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadEventsIterator.java
@@ -44,7 +44,9 @@ public class IncrementalLoadEventsIterator implements Iterator<FileStatus> {
     FileSystem fs = eventPath.getFileSystem(conf);
     eventDirs = fs.listStatus(eventPath, EximUtil.getDirectoryFilter(fs));
     if ((eventDirs == null) || (eventDirs.length == 0)) {
-      throw new IllegalArgumentException("No data to load in path " + loadPath);
+      currentIndex = 0;
+      numEvents = 0;
+      return;
     }
     // For event dump, each sub-dir is an individual event dump.
     // We need to guarantee that the directory listing we got is in order of event id.
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadTasksBuilder.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadTasksBuilder.java
index 2a9388772cf..60ab9b64a10 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadTasksBuilder.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/repl/incremental/IncrementalLoadTasksBuilder.java
@@ -64,15 +64,16 @@ import java.util.HashSet;
 public class IncrementalLoadTasksBuilder {
   private final String dbName, tableName;
   private final IncrementalLoadEventsIterator iterator;
-  private HashSet<ReadEntity> inputs;
-  private HashSet<WriteEntity> outputs;
+  private final HashSet<ReadEntity> inputs;
+  private final HashSet<WriteEntity> outputs;
   private Logger log;
   private final HiveConf conf;
   private final ReplLogger replLogger;
   private static long numIteration;
+  private final Long eventTo;

   public IncrementalLoadTasksBuilder(String dbName, String tableName, String loadPath,
-      IncrementalLoadEventsIterator iterator, HiveConf conf) {
+      IncrementalLoadEventsIterator iterator, HiveConf conf, Long eventTo) {
     this.dbName = dbName;
     this.tableName = tableName;
     this.iterator = iterator;
@@ -83,6 +84,7 @@ public class IncrementalLoadTasksBuilder {
     replLogger = new IncrementalLoadLogger(dbName, loadPath, iterator.getNumEvents());
     numIteration = 0;
     replLogger.startLog();
+    this.eventTo = eventTo;
   }

   public Task<? extends Serializable> build(DriverContext driverContext, Hive hive, Logger log,
@@ -151,6 +153,18 @@
       // add load task to start the next iteration
       taskChainTail.addDependentTask(TaskFactory.get(loadWork, conf));
     } else {
+      // if no events were replayed, then add a task to update the last repl id of the database/table to last event id.
+      if (taskChainTail == evTaskRoot) {
+        String lastEventid = eventTo.toString();
+        if (StringUtils.isEmpty(tableName)) {
+          taskChainTail = dbUpdateReplStateTask(dbName, lastEventid, taskChainTail);
+          this.log.debug("no events to replay, set last repl id of db " + dbName + " to " + lastEventid);
+        } else {
+          taskChainTail = tableUpdateReplStateTask(dbName, tableName, null, lastEventid, taskChainTail);
+          this.log.debug("no events to replay, set last repl id of table " + dbName + "." + tableName + " to " +
+                  lastEventid);
+        }
+      }
       Map<String, String> dbProps = new HashMap<>();
       dbProps.put(ReplicationSpec.KEY.CURR_STATE_ID.toString(), String.valueOf(lastReplayedEvent));
       ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, dbProps);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 2dd12ef1918..22915b50f68 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -3500,6 +3500,9 @@ public class Vectorizer implements PhysicalPlanResolver {
      * Similarly, we need a mapping since a value expression can be a calculation and the value
      * will go into a scratch column.
      */
+    boolean supportsValueTypes = true;  // Assume.
+    HashSet<String> notSupportedValueTypes = new HashSet<String>();
+
     int[] bigTableValueColumnMap = new int[allBigTableValueExpressions.length];
     String[] bigTableValueColumnNames = new String[allBigTableValueExpressions.length];
     TypeInfo[] bigTableValueTypeInfos = new TypeInfo[allBigTableValueExpressions.length];
@@ -3514,7 +3517,13 @@

       ExprNodeDesc exprNode = bigTableExprs.get(i);
       bigTableValueColumnNames[i] = exprNode.toString();
-      bigTableValueTypeInfos[i] = exprNode.getTypeInfo();
+      TypeInfo typeInfo = exprNode.getTypeInfo();
+      if (!(typeInfo instanceof PrimitiveTypeInfo)) {
+        supportsValueTypes = false;
+        Category category = typeInfo.getCategory();
+        notSupportedValueTypes.add(category.toString());
+      }
+      bigTableValueTypeInfos[i] = typeInfo;
     }
     if (bigTableValueExpressionsList.size() == 0) {
       slimmedBigTableValueExpressions = null;
@@ -3747,6 +3756,10 @@
     if (!supportsKeyTypes) {
       vectorDesc.setNotSupportedKeyTypes(new ArrayList(notSupportedKeyTypes));
    }
+    vectorDesc.setSupportsValueTypes(supportsValueTypes);
+    if (!supportsValueTypes) {
+      vectorDesc.setNotSupportedValueTypes(new ArrayList(notSupportedValueTypes));
+    }

     // Check common conditions for both Optimized and Fast Hash Tables.
     boolean result = true;    // Assume.
@@ -3756,7 +3769,8 @@
         !oneMapJoinCondition ||
         hasNullSafes ||
         !smallTableExprVectorizes ||
-        outerJoinHasNoKeys) {
+        outerJoinHasNoKeys ||
+        !supportsValueTypes) {
       result = false;
     }

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/ReplicationSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/ReplicationSemanticAnalyzer.java
index f83146125f3..fe0cec010e0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/ReplicationSemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/ReplicationSemanticAnalyzer.java
@@ -328,21 +328,8 @@ public class ReplicationSemanticAnalyzer extends BaseSemanticAnalyzer {
         LOG.debug("{} contains an bootstrap dump", loadPath);
       }

-      if ((!evDump) && (tblNameOrPattern != null) && !(tblNameOrPattern.isEmpty())) {
-        ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), dbNameOrPattern,
-            tblNameOrPattern, queryState.getLineageState(), false);
-        rootTasks.add(TaskFactory.get(replLoadWork, conf));
-        return;
-      }
-
-      FileStatus[] srcs = LoadSemanticAnalyzer.matchFilesOrDir(fs, loadPath);
-      if (srcs == null || (srcs.length == 0)) {
-        LOG.warn("Nothing to load at {}", loadPath.toUri().toString());
-        return;
-      }
-
       ReplLoadWork replLoadWork = new ReplLoadWork(conf, loadPath.toString(), dbNameOrPattern,
-          tblNameOrPattern, queryState.getLineageState(), evDump);
+          tblNameOrPattern, queryState.getLineageState(), evDump, dmd.getEventTo());
       rootTasks.add(TaskFactory.get(replLoadWork, conf));
     } catch (Exception e) {
       // TODO : simple wrap & rethrow for now, clean up with error codes
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java
index 83b34161a75..7834b182a78 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/MapJoinDesc.java
@@ -478,6 +478,16 @@ public class MapJoinDesc extends JoinDesc implements Serializable {
               vectorMapJoinDesc.getSupportsKeyTypes(),
               "Optimized Table and Supports Key Types"));
     }
+    final boolean supportsValueTypes = vectorMapJoinDesc.getSupportsValueTypes();
+    if (!supportsValueTypes) {
+
+      // Only add this condition when false to avoid mega-Q file update.
+      conditionList.add(
+          new VectorizationCondition(
+              false,
+              "Supports Value Types " +
+                  vectorMapJoinDesc.getNotSupportedValueTypes().toString()));
+    }

     VectorizationCondition[] conditions =
         conditionList.toArray(new VectorizationCondition[0]);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java
index 58032ca0572..3c7c69d5822 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java
@@ -204,6 +204,8 @@ public class VectorMapJoinDesc extends AbstractVectorDesc {
   private boolean isHybridHashJoin;
   private boolean supportsKeyTypes;
   private List<String> notSupportedKeyTypes;
+  private boolean supportsValueTypes;
+  private List<String> notSupportedValueTypes;
   private boolean smallTableExprVectorizes;
   private boolean outerJoinHasNoKeys;

@@ -249,6 +251,18 @@ public class VectorMapJoinDesc extends AbstractVectorDesc {
   public List<String> getNotSupportedKeyTypes() {
     return notSupportedKeyTypes;
   }
+  public void setSupportsValueTypes(boolean supportsValueTypes) {
+    this.supportsValueTypes = supportsValueTypes;
+  }
+  public boolean getSupportsValueTypes() {
+    return supportsValueTypes;
+  }
+  public void setNotSupportedValueTypes(List<String> notSupportedValueTypes) {
+    this.notSupportedValueTypes = notSupportedValueTypes;
+  }
+  public List<String> getNotSupportedValueTypes() {
+    return notSupportedValueTypes;
+  }
   public void setSmallTableExprVectorizes(boolean smallTableExprVectorizes) {
     this.smallTableExprVectorizes = smallTableExprVectorizes;
   }
diff --git a/ql/src/test/queries/clientpositive/vector_mapjoin_complex_values.q b/ql/src/test/queries/clientpositive/vector_mapjoin_complex_values.q
new file mode 100644
index 00000000000..1c88daaefd4
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vector_mapjoin_complex_values.q
@@ -0,0 +1,34 @@
+set hive.mapred.mode=nonstrict;
+set hive.explain.user=false;
+set hive.vectorized.execution.enabled=true;
+set hive.auto.convert.join=true;
+set hive.mapjoin.hybridgrace.hashtable=false;
+set hive.fetch.task.conversion=none;
+set hive.cli.print.header=true;
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
+
+create table census(
+ssn int,
+name string,
+city string,
+email string)
+row format delimited
+fields terminated by ',';
+
+insert into census values(100,"raj","san jose","email");
+
+create table census_clus(
+ssn int,
+name string,
+city string,
+email string)
+clustered by (ssn) into 4 buckets stored as orc TBLPROPERTIES ('transactional'='true');
+
+insert into table census_clus select * from census;
+
+EXPLAIN VECTORIZATION DETAIL
+UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn);
+
+UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn);
\ No newline at end of file
diff --git a/ql/src/test/results/clientpositive/llap/vector_mapjoin_complex_values.q.out b/ql/src/test/results/clientpositive/llap/vector_mapjoin_complex_values.q.out
new file mode 100644
index 00000000000..d7fe5f1d0dc
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/vector_mapjoin_complex_values.q.out
@@ -0,0 +1,355 @@
+PREHOOK: query: create table census(
+ssn int,
+name string,
+city string,
+email string)
+row format delimited
+fields terminated by ','
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@census
+POSTHOOK: query: create table census(
+ssn int,
+name string,
+city string,
+email string)
+row format delimited
+fields terminated by ','
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@census
+PREHOOK: query: insert into census values(100,"raj","san jose","email")
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@census
+POSTHOOK: query: insert into census values(100,"raj","san jose","email")
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@census
+POSTHOOK: Lineage: census.city SCRIPT []
+POSTHOOK: Lineage: census.email SCRIPT []
+POSTHOOK: Lineage: census.name SCRIPT []
+POSTHOOK: Lineage: census.ssn SCRIPT []
+col1	col2	col3	col4
+PREHOOK: query: create table census_clus(
+ssn int,
+name string,
+city string,
+email string)
+clustered by (ssn) into 4 buckets stored as orc TBLPROPERTIES ('transactional'='true')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@census_clus
+POSTHOOK: query: create table census_clus(
+ssn int,
+name string,
+city string,
+email string)
+clustered by (ssn) into 4 buckets stored as orc TBLPROPERTIES ('transactional'='true')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@census_clus
+PREHOOK: query: insert into table census_clus select * from census
+PREHOOK: type: QUERY
+PREHOOK: Input: default@census
+PREHOOK: Output: default@census_clus
+POSTHOOK: query: insert into table census_clus select * from census
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@census
+POSTHOOK: Output: default@census_clus
+POSTHOOK: Lineage: census_clus.city SIMPLE [(census)census.FieldSchema(name:city, type:string, comment:null), ]
+POSTHOOK: Lineage: census_clus.email SIMPLE [(census)census.FieldSchema(name:email, type:string, comment:null), ]
+POSTHOOK: Lineage: census_clus.name SIMPLE [(census)census.FieldSchema(name:name, type:string, comment:null), ]
+POSTHOOK: Lineage: census_clus.ssn SIMPLE [(census)census.FieldSchema(name:ssn, type:int, comment:null), ]
+census.ssn	census.name	census.city	census.email
+PREHOOK: query: EXPLAIN VECTORIZATION DETAIL
+UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@census
+PREHOOK: Input: default@census_clus
+PREHOOK: Output: default@census_clus
+POSTHOOK: query: EXPLAIN VECTORIZATION DETAIL
+UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@census
+POSTHOOK: Input: default@census_clus
+POSTHOOK: Output: default@census_clus
+Explain
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-0 depends on stages: Stage-2
+  Stage-3 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 1 <- Reducer 4 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+        Reducer 4 <- Map 3 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1
+            Map Operator Tree:
+                TableScan
+                  alias: census_clus
+                  Statistics: Num rows: 1 Data size: 185 Basic stats: COMPLETE Column stats: COMPLETE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:ssn:int, 1:name:string, 2:city:string, 3:email:string, 4:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>]
+                  Filter Operator
+                    Filter Vectorization:
+                        className: VectorFilterOperator
+                        native: true
+                        predicateExpression: FilterLongColEqualLongScalar(col 0:int, val 100)
+                    predicate: (ssn = 100) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 185 Basic stats: COMPLETE Column stats: COMPLETE
+                    Map Join Operator
+                      condition map:
+                           Left Semi Join 0 to 1
+                      keys:
+                        0 100 (type: int)
+                        1 100 (type: int)
+                      Map Join Vectorization:
+                          bigTableKeyExpressions: ConstantVectorExpression(val 100) -> 5:int
+                          bigTableValueExpressions: col 2:string, col 3:string, col 4:struct<writeid:bigint,bucketid:int,rowid:bigint>
+                          className: VectorMapJoinOperator
+                          native: false
+                          nativeConditionsMet: hive.mapjoin.optimized.hashtable IS true, hive.vectorized.execution.mapjoin.native.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, One MapJoin Condition IS true, No nullsafe IS true, Small table vectorizes IS true, Optimized Table and Supports Key Types IS true
+                          nativeConditionsNotMet: Supports Value Types [STRUCT] IS false
+                      outputColumnNames: _col2, _col3, _col6
+                      input vertices:
+                        1 Reducer 4
+                      Statistics: Num rows: 1 Data size: 257 Basic stats: COMPLETE Column stats: COMPLETE
+                      Select Operator
+                        expressions: _col6 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), _col2 (type: string), _col3 (type: string)
+                        outputColumnNames: _col0, _col3, _col4
+                        Select Vectorization:
+                            className: VectorSelectOperator
+                            native: true
+                            projectedOutputColumnNums: [2, 0, 1]
+                        Statistics: Num rows: 1 Data size: 357 Basic stats: COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
+                          sort order: +
+                          Map-reduce partition columns: UDFToInteger(_col0) (type: int)
+                          Reduce Sink Vectorization:
+                              className: VectorReduceSinkObjectHashOperator
+                              keyColumnNums: [2]
+                              native: true
+                              nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                              partitionColumnNums: [3]
+                              valueColumnNums: [0, 1]
+                          Statistics: Num rows: 1 Data size: 357 Basic stats: COMPLETE Column stats: COMPLETE
+                          value expressions: _col3 (type: string), _col4 (type: string)
+            Execution mode: vectorized, llap
+            LLAP IO: may be used (ACID table)
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true
+                inputFormatFeatureSupport: [DECIMAL_64]
+                featureSupportInUse: [DECIMAL_64]
+                inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 4
+                    includeColumns: [0, 2, 3]
+                    dataColumns: ssn:int, name:string, city:string, email:string
+                    neededVirtualColumns: [ROWID]
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: [bigint]
+        Map 3
+            Map Operator Tree:
+                TableScan
+                  alias: census
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+                  TableScan Vectorization:
+                      native: true
+                      vectorizationSchemaColumns: [0:ssn:int, 1:name:string, 2:city:string, 3:email:string, 4:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>]
+                  Filter Operator
+                    Filter Vectorization:
+                        className: VectorFilterOperator
+                        native: true
+                        predicateExpression: FilterLongColEqualLongScalar(col 0:int, val 100)
+                    predicate: (ssn = 100) (type: boolean)
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      Select Vectorization:
+                          className: VectorSelectOperator
+                          native: true
+                          projectedOutputColumnNums: []
+                      Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        Group By Vectorization:
+                            className: VectorGroupByOperator
+                            groupByMode: HASH
+                            keyExpressions: ConstantVectorExpression(val 100) -> 5:int
+                            native: false
+                            vectorProcessingMode: HASH
+                            projectedOutputColumnNums: []
+                        keys: 100 (type: int)
+                        mode: hash
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          key expressions: 100 (type: int)
+                          sort order: +
+                          Map-reduce partition columns: 100 (type: int)
+                          Reduce Sink Vectorization:
+                              className: VectorReduceSinkLongOperator
+                              keyColumnNums: [1]
+                              keyExpressions: ConstantVectorExpression(val 100) -> 1:int
+                              native: true
+                              nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                              valueColumnNums: []
+                          Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+            Execution mode: vectorized, llap
+            LLAP IO: no inputs
+            Map Vectorization:
+                enabled: true
+                enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true
+                inputFormatFeatureSupport: [DECIMAL_64]
+                featureSupportInUse: [DECIMAL_64]
+                inputFileFormats: org.apache.hadoop.mapred.TextInputFormat
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 4
+                    includeColumns: [0]
+                    dataColumns: ssn:int, name:string, city:string, email:string
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: [bigint]
+        Reducer 2
+            Execution mode: vectorized, llap
+            Reduce Vectorization:
+                enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true
+                reduceColumnNullOrder: z
+                reduceColumnSortOrder: +
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 3
+                    dataColumns: KEY.reducesinkkey0:struct<writeid:bigint,bucketid:int,rowid:bigint>, VALUE._col1:string, VALUE._col2:string
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: [bigint, string]
+            Reduce Operator Tree:
+              Select Operator
+                expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), 100 (type: int), 'updated name' (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string)
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                Select Vectorization:
+                    className: VectorSelectOperator
+                    native: true
+                    projectedOutputColumnNums: [0, 3, 4, 1, 2]
+                    selectExpressions: ConstantVectorExpression(val 100) -> 3:int, ConstantVectorExpression(val updated name) -> 4:string
+                Statistics: Num rows: 1 Data size: 357 Basic stats: COMPLETE Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  File Sink Vectorization:
+                      className: VectorFileSinkOperator
+                      native: false
+                  Statistics: Num rows: 1 Data size: 357 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                      serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                      name: default.census_clus
+                  Write Type: UPDATE
+        Reducer 4
+            Execution mode: vectorized, llap
+            Reduce Vectorization:
+                enabled: true
+                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true
+                reduceColumnNullOrder: a
+                reduceColumnSortOrder: +
+                allNative: false
+                usesVectorUDFAdaptor: false
+                vectorized: true
+                rowBatchContext:
+                    dataColumnCount: 1
+                    dataColumns: KEY._col0:int
+                    partitionColumnCount: 0
+                    scratchColumnTypeNames: [bigint, bigint]
+            Reduce Operator Tree:
+              Group By Operator
+                Group By Vectorization:
+                    className: VectorGroupByOperator
+                    groupByMode: MERGEPARTIAL
+                    keyExpressions: ConstantVectorExpression(val 100) -> 1:int, ConstantVectorExpression(val 100) -> 2:int
+                    native: false
+                    vectorProcessingMode: MERGE_PARTIAL
+                    projectedOutputColumnNums: []
+                keys: 100 (type: int), 100 (type: int)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                Select Operator
+                  Select Vectorization:
+                      className: VectorSelectOperator
+                      native: true
+                      projectedOutputColumnNums: []
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  Group By Operator
+                    Group By Vectorization:
+                        className: VectorGroupByOperator
+                        groupByMode: HASH
+                        keyExpressions: ConstantVectorExpression(val 100) -> 2:int
+                        native: false
+                        vectorProcessingMode: HASH
+                        projectedOutputColumnNums: []
+                    keys: 100 (type: int)
+                    mode: hash
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: 100 (type: int)
+                      sort order: +
+                      Map-reduce partition columns: 100 (type: int)
+                      Reduce Sink Vectorization:
+                          className: VectorReduceSinkLongOperator
+                          keyColumnNums: [1]
+                          keyExpressions: ConstantVectorExpression(val 100) -> 1:int
+                          native: true
+                          nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
+                          valueColumnNums: []
+                      Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE
+
+  Stage: Stage-2
+    Dependency Collection
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: false
+          table:
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+              output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+              serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              name: default.census_clus
+          Write Type: UPDATE
+
+  Stage: Stage-3
+    Stats Work
+      Basic Stats Work:
+
+PREHOOK: query: UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@census
+PREHOOK: Input: default@census_clus
+PREHOOK: Output: default@census_clus
+POSTHOOK: query: UPDATE census_clus SET name = 'updated name' where ssn=100 and EXISTS (select distinct ssn from census where ssn=census_clus.ssn)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@census
+POSTHOOK: Input: default@census_clus
+POSTHOOK: Output: default@census_clus
+row__id	ssn	_c2	city	email
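
A note for readers following the HIVE-20705 piece of this backport: the Vectorizer hunk above walks the big-table value columns, records the type category of every non-primitive column, and vetoes the native Vector MapJoin when any is found; MapJoinDesc then surfaces the reason, which is why the new q.out shows "nativeConditionsNotMet: Supports Value Types [STRUCT] IS false" for the ACID ROW__ID struct. The minimal, self-contained sketch below mirrors that screening logic; ValueTypeCheckSketch and its Category enum are hypothetical stand-ins, not the real Hive TypeInfo/ObjectInspector API:

    import java.util.Arrays;
    import java.util.LinkedHashSet;
    import java.util.List;
    import java.util.Set;

    public class ValueTypeCheckSketch {
      // Stand-in for Hive's type categories; this enum and its values are assumptions.
      enum Category { PRIMITIVE, LIST, MAP, STRUCT, UNION }

      // Collects the category of every non-primitive big-table value column.
      // An empty result corresponds to supportsValueTypes == true in the patch.
      static Set<String> findNotSupportedValueTypes(List<Category> bigTableValueTypes) {
        Set<String> notSupported = new LinkedHashSet<>();
        for (Category category : bigTableValueTypes) {
          if (category != Category.PRIMITIVE) {
            notSupported.add(category.toString());
          }
        }
        return notSupported;
      }

      public static void main(String[] args) {
        // The value columns carried through the test's map join:
        // city:string, email:string, and the ACID ROW__ID struct.
        List<Category> valueTypes = Arrays.asList(
            Category.PRIMITIVE, Category.PRIMITIVE, Category.STRUCT);
        Set<String> notSupported = findNotSupportedValueTypes(valueTypes);
        // Prints: supportsValueTypes=false notSupportedValueTypes=[STRUCT]
        System.out.println("supportsValueTypes=" + notSupported.isEmpty()
            + " notSupportedValueTypes=" + notSupported);
      }
    }

Run on those three columns, the sketch reports exactly the condition recorded in the explain output, and the plan falls back to the non-native VectorMapJoinOperator instead of failing the query.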