This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new e67eeb53fe8 HIVE-29367: Prevent Long overflows in ConvertJoinMapJoin (#6237)
e67eeb53fe8 is described below
commit e67eeb53fe86607ca23a56fc749f6d32b8e9c54f
Author: konstantinb <[email protected]>
AuthorDate: Sat Jan 31 02:23:55 2026 -0800
HIVE-29367: Prevent Long overflows in ConvertJoinMapJoin (#6237)
---
.../hive/ql/optimizer/ConvertJoinMapJoin.java | 34 +--
.../hive/ql/optimizer/TestConvertJoinMapJoin.java | 164 +++++++++++++
.../clientpositive/mapjoin_stats_overflow.q | 26 ++
.../llap/mapjoin_stats_overflow.q.out | 270 +++++++++++++++++++++
4 files changed, 479 insertions(+), 15 deletions(-)
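The patch routes every statistics accumulation through StatsUtils.safeAdd / StatsUtils.safeMult
instead of plain += and *. The new unit tests below expect those helpers to saturate at
Long.MAX_VALUE rather than wrap negative. A minimal sketch of that kind of saturating long
arithmetic, for illustration only (assumed behavior, not the Hive implementation; it relies on
the standard JDK Math.addExact / Math.multiplyExact and on the operands being non-negative,
as size and row-count statistics are):

    // Illustration: clamp long arithmetic at Long.MAX_VALUE instead of wrapping on overflow.
    // Assumes non-negative operands, as is the case for size and row-count statistics.
    public final class SaturatingLongMath {
      static long add(long a, long b) {
        try {
          return Math.addExact(a, b);      // throws ArithmeticException on overflow
        } catch (ArithmeticException e) {
          return Long.MAX_VALUE;           // saturate rather than go negative
        }
      }

      static long multiply(long a, long b) {
        try {
          return Math.multiplyExact(a, b);
        } catch (ArithmeticException e) {
          return Long.MAX_VALUE;
        }
      }
    }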
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
index a622a0a7c02..0d94dff357c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
@@ -274,30 +274,30 @@ private boolean selectJoinForLlap(OptimizeTezProcContext context, JoinOperator j
// Determine the size of small table inputs
final int mapJoinConversionPos = mapJoinConversion.getBigTablePos();
- long totalSize = 0;
+ long estimatedTotalSize = 0;
for (int pos = 0; pos < joinOp.getParentOperators().size(); pos++) {
if (pos == mapJoinConversionPos) {
continue;
}
Operator<? extends OperatorDesc> parentOp = joinOp.getParentOperators().get(pos);
- totalSize += computeOnlineDataSize(parentOp.getStatistics());
+ estimatedTotalSize = StatsUtils.safeAdd(estimatedTotalSize, computeOnlineDataSize(parentOp.getStatistics()));
}
// Size of bigtable
long bigTableSize = computeOnlineDataSize(joinOp.getParentOperators().get(mapJoinConversionPos).getStatistics());
// Network cost of DPHJ
- long networkCostDPHJ = totalSize + bigTableSize;
+ long networkCostDPHJ = StatsUtils.safeAdd(estimatedTotalSize, bigTableSize);
- LOG.info("Cost of dynamically partitioned hash join : total small table size = " + totalSize
+ LOG.info("Cost of dynamically partitioned hash join : total small table size = " + estimatedTotalSize
+ " bigTableSize = " + bigTableSize + "networkCostDPHJ = " + networkCostDPHJ);
// Network cost of map side join
- long networkCostMJ = numNodes * totalSize;
+ long networkCostMJ = StatsUtils.safeMult(numNodes, estimatedTotalSize);
LOG.info("Cost of Bucket Map Join : numNodes = " + numNodes + " total
small table size = "
- + totalSize + " networkCostMJ = " + networkCostMJ);
+ + estimatedTotalSize + " networkCostMJ = " + networkCostMJ);
- if (totalSize <= maxJoinMemory) {
+ if (estimatedTotalSize <= maxJoinMemory) {
// mapjoin is applicable; don't try the below algos..
return false;
}
@@ -363,9 +363,13 @@ public long computeOnlineDataSizeGeneric(Statistics statistics, long overHeadPer
numRows = 1;
}
long worstCaseNeededSlots = 1L << DoubleMath.log2(numRows / hashTableLoadFactor, RoundingMode.UP);
- onlineDataSize += statistics.getDataSize() - hashTableDataSizeAdjustment(numRows, statistics.getColumnStats());
- onlineDataSize += overHeadPerRow * statistics.getNumRows();
- onlineDataSize += overHeadPerSlot * worstCaseNeededSlots;
+ long adjustedDataSize = Math.max(0L,
+ statistics.getDataSize() - hashTableDataSizeAdjustment(numRows, statistics.getColumnStats()));
+ onlineDataSize = StatsUtils.safeAdd(onlineDataSize, adjustedDataSize);
+ onlineDataSize = StatsUtils.safeAdd(onlineDataSize,
+ StatsUtils.safeMult(overHeadPerRow, statistics.getNumRows()));
+ onlineDataSize = StatsUtils.safeAdd(onlineDataSize,
+ StatsUtils.safeMult(overHeadPerSlot, worstCaseNeededSlots));
return onlineDataSize;
}
@@ -384,7 +388,7 @@ private static long hashTableDataSizeAdjustment(long numRows, List<ColStatistics
for (ColStatistics cs : colStats) {
if (cs != null) {
String colTypeLowerCase = cs.getColumnType().toLowerCase();
- long nonNullCount = cs.getNumNulls() > 0 ? numRows - cs.getNumNulls() + 1 : numRows;
+ long nonNullCount = cs.getNumNulls() > 0 ? Math.max(0L, numRows - cs.getNumNulls()) + 1 : numRows;
double overhead = 0;
if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME)
|| colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
@@ -1248,7 +1252,7 @@ public MapJoinConversion getMapJoinConversion(JoinOperator joinOp, OptimizeTezPr
if (bigInputStat != null && selectedBigTable) {
// We are replacing the current big table with a new one, thus
// we need to count the current one as a map table then.
- totalSize += computeOnlineDataSize(bigInputStat);
+ totalSize = StatsUtils.safeAdd(totalSize, computeOnlineDataSize(bigInputStat));
// Check if number of distinct keys is greater than given max number of entries
// for HashMap
if (checkMapJoinThresholds && !checkNumberOfEntriesForHashTable(joinOp, bigTablePosition, context)) {
@@ -1257,7 +1261,7 @@ public MapJoinConversion getMapJoinConversion(JoinOperator joinOp, OptimizeTezPr
} else if (!selectedBigTable) {
// This is not the first table and we are not using it as big table,
// in fact, we're adding this table as a map table
- totalSize += inputSize;
+ totalSize = StatsUtils.safeAdd(totalSize, inputSize);
// Check if number of distinct keys is greater than given max number of entries
// for HashMap
if (checkMapJoinThresholds && !checkNumberOfEntriesForHashTable(joinOp, pos, context)) {
@@ -1342,7 +1346,7 @@ private static Long computeCumulativeCardinality(Operator<? extends OperatorDesc
if (inputCardinality == null) {
return null;
}
- cumulativeCardinality += inputCardinality;
+ cumulativeCardinality = StatsUtils.safeAdd(cumulativeCardinality, inputCardinality);
}
}
Statistics currInputStat = op.getStatistics();
@@ -1350,7 +1354,7 @@ private static Long computeCumulativeCardinality(Operator<? extends OperatorDesc
LOG.warn("Couldn't get statistics from: " + op);
return null;
}
- cumulativeCardinality += currInputStat.getNumRows();
+ cumulativeCardinality = StatsUtils.safeAdd(cumulativeCardinality, currInputStat.getNumRows());
return cumulativeCardinality;
}
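In the computeOnlineDataSizeGeneric hunk, the per-slot overhead is multiplied by
worstCaseNeededSlots, the smallest power of two at or above numRows / hashTableLoadFactor, so
with inflated row counts that product alone can overflow, which is why it now goes through
StatsUtils.safeMult. A small worked example of the slots term (Guava DoubleMath.log2 with
RoundingMode.UP; the sample numbers are hypothetical):

    import java.math.RoundingMode;
    import com.google.common.math.DoubleMath;

    public class WorstCaseSlotsExample {
      public static void main(String[] args) {
        long numRows = 1000L;
        float hashTableLoadFactor = 0.75f;
        // 1000 / 0.75 = 1333.3..., log2 rounded up = 11, so 1L << 11 = 2048 slots
        long worstCaseNeededSlots = 1L << DoubleMath.log2(numRows / hashTableLoadFactor, RoundingMode.UP);
        System.out.println(worstCaseNeededSlots);   // prints 2048
      }
    }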
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestConvertJoinMapJoin.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestConvertJoinMapJoin.java
new file mode 100644
index 00000000000..82005de35a2
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestConvertJoinMapJoin.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.lang.reflect.Method;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.Statistics;
+import org.junit.jupiter.api.Test;
+
+class TestConvertJoinMapJoin {
+
+ @Test
+ void testComputeOnlineDataSizeGenericLargeDataSize() {
+ ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
+ converter.hashTableLoadFactor = 0.75f;
+ Statistics stats = new Statistics(1000L, Long.MAX_VALUE, 0L, 0L);
+
+ long result = converter.computeOnlineDataSizeGeneric(stats, 10L, 8L);
+
+ assertTrue(result >= 0, "Result should not be negative due to overflow");
+ }
+
+ @Test
+ void testComputeOnlineDataSizeGenericLargeNumRowsWithOverhead() {
+ ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
+ converter.hashTableLoadFactor = 0.75f;
+ Statistics stats = new Statistics(Long.MAX_VALUE / 2, 1000L, 0L, 0L);
+
+ long result = converter.computeOnlineDataSizeGeneric(stats, Long.MAX_VALUE / 4, Long.MAX_VALUE / 4);
+
+ assertTrue(result >= 0, "Result should not be negative due to overflow");
+ assertEquals(Long.MAX_VALUE, result, "Result should saturate at Long.MAX_VALUE");
+ }
+
+ @Test
+ void testComputeOnlineDataSizeGenericNumNullsLargerThanNumRows() {
+ ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
+ converter.hashTableLoadFactor = 0.75f;
+ Statistics stats = new Statistics(100L, 10000L, 0L, 0L);
+ List<ColStatistics> colStats = new ArrayList<>();
+ ColStatistics cs = new ColStatistics("col1", "string");
+ cs.setNumNulls(Long.MAX_VALUE);
+ colStats.add(cs);
+ stats.setColumnStats(colStats);
+
+ long result = converter.computeOnlineDataSizeGeneric(stats, 10L, 8L);
+
+ assertTrue(result >= 0, "Result should not be negative due to underflow in
nonNullCount");
+ }
+
+ @Test
+ void testComputeOnlineDataSizeGenericSmallDataSizeLargeAdjustment() {
+ ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
+ converter.hashTableLoadFactor = 0.75f;
+ Statistics stats = new Statistics(1000000L, 100L, 0L, 0L);
+ List<ColStatistics> colStats = new ArrayList<>();
+ ColStatistics cs = new ColStatistics("col1", "string");
+ cs.setNumNulls(0L);
+ colStats.add(cs);
+ stats.setColumnStats(colStats);
+
+ long result = converter.computeOnlineDataSizeGeneric(stats, 10L, 8L);
+
+ assertTrue(result >= 0, "Result should not be negative when adjustment >
dataSize");
+ }
+
+ @Test
+ void testComputeOnlineDataSizeGenericAllExtremeValues() {
+ ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
+ converter.hashTableLoadFactor = 0.75f;
+ Statistics stats = new Statistics(Long.MAX_VALUE, Long.MAX_VALUE, 0L, 0L);
+ List<ColStatistics> colStats = new ArrayList<>();
+ ColStatistics cs = new ColStatistics("col1", "string");
+ cs.setNumNulls(Long.MAX_VALUE);
+ colStats.add(cs);
+ stats.setColumnStats(colStats);
+
+ long result = converter.computeOnlineDataSizeGeneric(stats, Long.MAX_VALUE, Long.MAX_VALUE);
+
+ assertTrue(result >= 0, "Result should not be negative with extreme values");
+ assertEquals(Long.MAX_VALUE, result, "Result should saturate at Long.MAX_VALUE");
+ }
+
+ @Test
+ void testComputeCumulativeCardinalityWithParentsOverflow() {
+ Operator<?> parent1 = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+ when(parent1.getParentOperators()).thenReturn(Collections.emptyList());
+ Operator<?> parent2 = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+ when(parent2.getParentOperators()).thenReturn(Collections.emptyList());
+ Operator<?> mockOp = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+ when(mockOp.getParentOperators()).thenReturn(Arrays.asList(parent1, parent2));
+
+ Long result = invokeComputeCumulativeCardinality(mockOp);
+
+ assertNotNull(result, "Result should not be null");
+ assertTrue(result >= 0, "Result should not be negative due to overflow");
+ assertEquals(Long.MAX_VALUE, result.longValue(), "Result should saturate
at Long.MAX_VALUE");
+ }
+
+ @Test
+ void testComputeCumulativeCardinalityDeepTreeOverflow() {
+ Operator<?> leaf = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+ when(leaf.getParentOperators()).thenReturn(Collections.emptyList());
+ Operator<?> mid1 = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+ when(mid1.getParentOperators()).thenReturn(Collections.singletonList(leaf));
+ Operator<?> mid2 = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+ when(mid2.getParentOperators()).thenReturn(Collections.singletonList(mid1));
+ Operator<?> root = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+ when(root.getParentOperators()).thenReturn(Collections.singletonList(mid2));
+
+ Long result = invokeComputeCumulativeCardinality(root);
+
+ assertNotNull(result, "Result should not be null");
+ assertTrue(result >= 0, "Result should not be negative due to overflow");
+ assertEquals(Long.MAX_VALUE, result.longValue(), "Result should saturate
at Long.MAX_VALUE");
+ }
+
+ @SuppressWarnings("unchecked")
+ private Operator<?> createMockOperatorWithStats(long numRows) {
+ Operator<?> mockOp = mock(Operator.class);
+ Statistics stats = new Statistics(numRows, numRows * 100, 0L, 0L);
+ when(mockOp.getStatistics()).thenReturn(stats);
+ return mockOp;
+ }
+
+ private Long invokeComputeCumulativeCardinality(Operator<?> op) {
+ try {
+ Method method = ConvertJoinMapJoin.class.getDeclaredMethod(
+ "computeCumulativeCardinality", Operator.class);
+ method.setAccessible(true);
+ return (Long) method.invoke(null, op);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+}
diff --git a/ql/src/test/queries/clientpositive/mapjoin_stats_overflow.q b/ql/src/test/queries/clientpositive/mapjoin_stats_overflow.q
new file mode 100644
index 00000000000..9867c8c4479
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/mapjoin_stats_overflow.q
@@ -0,0 +1,26 @@
+-- Test overflow handling in computeOnlineDataSize with Long.MAX_VALUE statistics
+
+SET hive.auto.convert.join=true;
+SET hive.auto.convert.join.noconditionaltask=true;
+SET hive.auto.convert.join.noconditionaltask.size=10000000;
+
+CREATE TABLE t1 (k BIGINT, v STRING);
+CREATE TABLE t2 (k BIGINT, v STRING);
+
+-- Case 1: Normal statistics - t1 fits in 10MB threshold, MapJoin expected
+ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='100000');
+ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='10000','numNulls'='0');
+ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='10000','numNulls'='0','avgColLen'='10.0','maxColLen'='20');
+
+ALTER TABLE t2 UPDATE STATISTICS SET('numRows'='1000000','rawDataSize'='10000000');
+ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000000','numNulls'='0');
+ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000000','numNulls'='0','avgColLen'='10.0','maxColLen'='20');
+
+EXPLAIN SELECT t1.k, t2.v FROM t1 JOIN t2 ON t1.k = t2.k;
+
+-- Case 2: Long.MAX_VALUE numRows - without fix, overflow causes negative size and incorrect MapJoin
+ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='9223372036854775807','rawDataSize'='9223372036854775807');
+ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000','numNulls'='0');
+ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000','numNulls'='0','avgColLen'='10.0','maxColLen'='20');
+
+EXPLAIN SELECT t1.k, t1.v, t2.v FROM t1 JOIN t2 ON t1.k = t2.k;
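The Case 2 comment above describes the failure mode the fix targets: with unchecked long
arithmetic, a size statistic near Long.MAX_VALUE plus any positive overhead wraps to a negative
total, which then trivially passes the totalSize <= maxJoinMemory check and a map join is
selected for a table that cannot possibly fit in memory. A tiny stand-alone illustration (the
threshold and overhead values are hypothetical):

    public class MapJoinOverflowDemo {
      public static void main(String[] args) {
        long smallTableSize = Long.MAX_VALUE;   // inflated statistics, as in Case 2
        long overheadBytes = 1_000_000L;        // any additional positive term
        long maxJoinMemory = 10_000_000L;       // hypothetical map-join memory threshold

        long totalSize = smallTableSize + overheadBytes;  // wraps to a large negative value
        System.out.println(totalSize);                    // -9223372036853775809
        System.out.println(totalSize <= maxJoinMemory);   // true -> map join wrongly chosen
      }
    }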
diff --git a/ql/src/test/results/clientpositive/llap/mapjoin_stats_overflow.q.out b/ql/src/test/results/clientpositive/llap/mapjoin_stats_overflow.q.out
new file mode 100644
index 00000000000..e7cdad56bde
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/mapjoin_stats_overflow.q.out
@@ -0,0 +1,270 @@
+PREHOOK: query: CREATE TABLE t1 (k BIGINT, v STRING)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1
+POSTHOOK: query: CREATE TABLE t1 (k BIGINT, v STRING)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1
+PREHOOK: query: CREATE TABLE t2 (k BIGINT, v STRING)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t2
+POSTHOOK: query: CREATE TABLE t2 (k BIGINT, v STRING)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t2
+PREHOOK: query: ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='100000')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='100000')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+PREHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='10000','numNulls'='0')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='10000','numNulls'='0')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+PREHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='10000','numNulls'='0','avgColLen'='10.0','maxColLen'='20')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='10000','numNulls'='0','avgColLen'='10.0','maxColLen'='20')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+PREHOOK: query: ALTER TABLE t2 UPDATE STATISTICS SET('numRows'='1000000','rawDataSize'='10000000')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t2
+PREHOOK: Output: default@t2
+POSTHOOK: query: ALTER TABLE t2 UPDATE STATISTICS SET('numRows'='1000000','rawDataSize'='10000000')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t2
+POSTHOOK: Output: default@t2
+PREHOOK: query: ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000000','numNulls'='0')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t2
+PREHOOK: Output: default@t2
+POSTHOOK: query: ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000000','numNulls'='0')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t2
+POSTHOOK: Output: default@t2
+PREHOOK: query: ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000000','numNulls'='0','avgColLen'='10.0','maxColLen'='20')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t2
+PREHOOK: Output: default@t2
+POSTHOOK: query: ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000000','numNulls'='0','avgColLen'='10.0','maxColLen'='20')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t2
+POSTHOOK: Output: default@t2
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t2
+POSTHOOK: Output: default@t2
+PREHOOK: query: EXPLAIN SELECT t1.k, t2.v FROM t1 JOIN t2 ON t1.k = t2.k
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN SELECT t1.k, t2.v FROM t1 JOIN t2 ON t1.k = t2.k
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Map 2 <- Map 1 (BROADCAST_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ filterExpr: k is not null (type: boolean)
+ Statistics: Num rows: 10000 Data size: 80000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: k is not null (type: boolean)
+ Statistics: Num rows: 10000 Data size: 80000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: k (type: bigint)
+ outputColumnNames: _col0
+ Statistics: Num rows: 10000 Data size: 80000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: bigint)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: bigint)
+ Statistics: Num rows: 10000 Data size: 80000 Basic stats: COMPLETE Column stats: COMPLETE
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 2
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ filterExpr: k is not null (type: boolean)
+ probeDecodeDetails: cacheKey:HASH_MAP_MAPJOIN_25_container, bigKeyColName:k, smallTablePos:0, keyRatio:0.01
+ Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: k is not null (type: boolean)
+ Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: k (type: bigint), v (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Map Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col0 (type: bigint)
+ 1 _col0 (type: bigint)
+ outputColumnNames: _col0, _col2
+ input vertices:
+ 0 Map 1
+ Statistics: Num rows: 10000 Data size: 1020000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col0 (type: bigint), _col2 (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 10000 Data size: 1020000 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 10000 Data size: 1020000 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='9223372036854775807','rawDataSize'='9223372036854775807')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='9223372036854775807','rawDataSize'='9223372036854775807')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+PREHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000','numNulls'='0')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000','numNulls'='0')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+PREHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000','numNulls'='0','avgColLen'='10.0','maxColLen'='20')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@t1
+PREHOOK: Output: default@t1
+POSTHOOK: query: ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000','numNulls'='0','avgColLen'='10.0','maxColLen'='20')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: default@t1
+PREHOOK: query: EXPLAIN SELECT t1.k, t1.v, t2.v FROM t1 JOIN t2 ON t1.k = t2.k
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN SELECT t1.k, t1.v, t2.v FROM t1 JOIN t2 ON t1.k = t2.k
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: t1
+ filterExpr: k is not null (type: boolean)
+ Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: k is not null (type: boolean)
+ Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: k (type: bigint), v (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: bigint)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: bigint)
+ Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col1 (type: string)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Map 3
+ Map Operator Tree:
+ TableScan
+ alias: t2
+ filterExpr: k is not null (type: boolean)
+ Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Filter Operator
+ predicate: k is not null (type: boolean)
+ Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: k (type: bigint), v (type: string)
+ outputColumnNames: _col0, _col1
+ Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: bigint)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col0 (type: bigint)
+ Statistics: Num rows: 1000000 Data size: 102000000 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col1 (type: string)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Merge Join Operator
+ condition map:
+ Inner Join 0 to 1
+ keys:
+ 0 _col0 (type: bigint)
+ 1 _col0 (type: bigint)
+ outputColumnNames: _col0, _col1, _col3
+ Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col0 (type: bigint), _col1 (type: string), _col3 (type: string)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 9223372036854775807 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+