[1/5] hive git commit: HIVE-16423: Add hint to enforce semi join optimization (Deepak Jaiswal, reviewed by Jason Dere)

gunther Thu, 20 Apr 2017 10:19:05 -0700

Repository: hive
Updated Branches:
  refs/heads/master fa24d4b9b -> 9d5d737db



http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinBranchInfo.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinBranchInfo.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinBranchInfo.java
new file mode 100644
index 0000000..5d7b9e5
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinBranchInfo.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.parse;
+
+
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+
+public class SemiJoinBranchInfo {
+  private TableScanOperator ts;
+  private boolean isHint;
+
+  public SemiJoinBranchInfo(TableScanOperator ts) {
+    this.ts = ts;
+    isHint = false;
+  }
+
+  public SemiJoinBranchInfo(TableScanOperator ts, boolean isHint) {
+    this.ts = ts;
+    this.isHint = isHint;
+  }
+
+  public TableScanOperator getTsOp() {
+    return ts;
+  }
+
+  public boolean getIsHint() {
+    return isHint;
+  }
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinHint.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinHint.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinHint.java
new file mode 100644
index 0000000..1f24e23
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemiJoinHint.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.parse;
+
+public class SemiJoinHint {
+  private String tabAlias;
+  private String colName;
+  private Integer numEntries;
+
+  public SemiJoinHint(String tabAlias, String colName, Integer numEntries) {
+    this.tabAlias = tabAlias;
+    this.colName = colName;
+    this.numEntries = numEntries;
+  }
+
+  public String getTabAlias() {
+    return tabAlias;
+  }
+
+  public String getColName() {
+    return colName;
+  }
+
+  public Integer getNumEntries() {
+    return numEntries != null ? numEntries : -1;
+  }
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
index 7caeb78..96525b4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TaskCompiler.java
@@ -531,7 +531,7 @@ public abstract class TaskCompiler {
     clone.setLineageInfo(pCtx.getLineageInfo());
     clone.setMapJoinOps(pCtx.getMapJoinOps());
     clone.setRsToRuntimeValuesInfoMap(pCtx.getRsToRuntimeValuesInfoMap());
-    clone.setRsOpToTsOpMap(pCtx.getRsOpToTsOpMap());
+    clone.setRsToSemiJoinBranchInfo(pCtx.getRsToSemiJoinBranchInfo());
 
     return clone;
   }

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
index eaad988..26eda04 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
@@ -178,6 +178,9 @@ public class TezCompiler extends TaskCompiler {
     TableScanOperator victimTS = null;
     ReduceSinkOperator victimRS = null;
 
+    // If there is a hint and no operator is removed then throw error
+    boolean hasHint = false;
+    boolean removed = false;
     for (Operator<?> o : component) {
       // Look for AppMasterEventOperator or ReduceSinkOperator
       if (o instanceof AppMasterEventOperator) {
@@ -185,25 +188,34 @@ public class TezCompiler extends TaskCompiler {
                 || o.getStatistics().getDataSize() < victimAM.getStatistics()
                 .getDataSize()) {
           victimAM = (AppMasterEventOperator) o;
+          removed = true;
         }
       } else if (o instanceof ReduceSinkOperator) {
-        TableScanOperator ts = context.parseContext.getRsOpToTsOpMap().get(o);
-        if (ts == null) {
+
+        SemiJoinBranchInfo sjInfo =
+                context.parseContext.getRsToSemiJoinBranchInfo().get(o);
+        if (sjInfo == null ) continue;
+        if (sjInfo.getIsHint()) {
+          // Skipping because of hint. Mark this info,
+          hasHint = true;
           continue;
         }
+
+        TableScanOperator ts = sjInfo.getTsOp();
         // Sanity check
         assert component.contains(ts);
 
         if (victimRS == null ||
                 ts.getStatistics().getDataSize() <
-                victimTS.getStatistics().getDataSize()) {
-            victimRS = (ReduceSinkOperator) o;
-            victimTS = ts;
-          }
+                        victimTS.getStatistics().getDataSize()) {
+          victimRS = (ReduceSinkOperator) o;
+          victimTS = ts;
+          removed = true;
         }
       }
+    }
 
-    // Always set the min/max optimization as victim.
+    // Always set the semijoin optimization as victim.
     Operator<?> victim = victimRS;
 
     if (victimRS == null && victimAM != null ) {
@@ -227,6 +239,11 @@ public class TezCompiler extends TaskCompiler {
       }
     }
 
+    if (hasHint && !removed) {
+      // There is hint but none of the operators removed. Throw error
+      throw new SemanticException("The user hint is causing an operator cycle. 
Please fix it and retry");
+    }
+
     if (victim == null ||
             (!context.pruningOpsRemovedByPriorOpt.isEmpty() &&
                     context.pruningOpsRemovedByPriorOpt.contains(victim))) {
@@ -287,11 +304,12 @@ public class TezCompiler extends TaskCompiler {
       LOG.debug("Adding special edge: " + o.getName() + " --> " + 
ts.toString());
       children.add(ts);
     } else if (o instanceof ReduceSinkOperator){
-      // min/max case
+      // semijoin case
       children = new ArrayList<Operator<?>>();
       children.addAll(o.getChildOperators());
-      TableScanOperator ts = parseContext.getRsOpToTsOpMap().get(o);
-      if (ts != null) {
+      SemiJoinBranchInfo sjInfo = 
parseContext.getRsToSemiJoinBranchInfo().get(o);
+      if (sjInfo != null ) {
+        TableScanOperator ts = sjInfo.getTsOp();
         LOG.debug("Adding special edge: " + o.getName() + " --> " + 
ts.toString());
         children.add(ts);
       }
@@ -460,7 +478,7 @@ public class TezCompiler extends TaskCompiler {
     if (pCtx.getRsToRuntimeValuesInfoMap().size() > 0) {
       for (ReduceSinkOperator rs : 
pCtx.getRsToRuntimeValuesInfoMap().keySet()) {
         // Process min/max
-        GenTezUtils.processDynamicMinMaxPushDownOperator(
+        GenTezUtils.processDynamicSemiJoinPushDownOperator(
                 procCtx, pCtx.getRsToRuntimeValuesInfoMap().get(rs), rs);
       }
     }
@@ -617,7 +635,7 @@ public class TezCompiler extends TaskCompiler {
   private static void removeSemijoinOptimizationFromSMBJoins(
           OptimizeTezProcContext procCtx) throws SemanticException {
     if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) ||
-            procCtx.parseContext.getRsOpToTsOpMap().size() == 0) {
+            procCtx.parseContext.getRsToSemiJoinBranchInfo().size() == 0) {
       return;
     }
 
@@ -636,9 +654,9 @@ public class TezCompiler extends TaskCompiler {
     GraphWalker ogw = new PreOrderOnceWalker(disp);
     ogw.startWalking(topNodes, null);
 
+    List<TableScanOperator> tsOps = new ArrayList<>();
     // Iterate over the map and remove semijoin optimizations if needed.
     for (CommonMergeJoinOperator joinOp : ctx.JoinOpToTsOpMap.keySet()) {
-      List<TableScanOperator> tsOps = new ArrayList<TableScanOperator>();
       // Get one top level TS Op directly from the stack
       tsOps.add(ctx.JoinOpToTsOpMap.get(joinOp));
 
@@ -651,7 +669,7 @@ public class TezCompiler extends TaskCompiler {
         }
 
         assert parent instanceof SelectOperator;
-        while(parent != null) {
+        while (parent != null) {
           if (parent instanceof TableScanOperator) {
             tsOps.add((TableScanOperator) parent);
             break;
@@ -659,20 +677,24 @@ public class TezCompiler extends TaskCompiler {
           parent = parent.getParentOperators().get(0);
         }
       }
-
-      // Now the relevant TableScanOperators are known, find if there exists
-      // a semijoin filter on any of them, if so, remove it.
-      ParseContext pctx = procCtx.parseContext;
-      for (TableScanOperator ts : tsOps) {
-        for (ReduceSinkOperator rs : pctx.getRsOpToTsOpMap().keySet()) {
-          if (ts == pctx.getRsOpToTsOpMap().get(rs)) {
-            // match!
-            if (LOG.isDebugEnabled()) {
-              LOG.debug("Semijoin optimization found going to SMB join. 
Removing semijoin "
-                  + OperatorUtils.getOpNamePretty(rs) + " - " + 
OperatorUtils.getOpNamePretty(ts));
-            }
-            GenTezUtils.removeBranch(rs);
-            GenTezUtils.removeSemiJoinOperator(pctx, rs, ts);
+    }
+    // Now the relevant TableScanOperators are known, find if there exists
+    // a semijoin filter on any of them, if so, remove it.
+
+    ParseContext pctx = procCtx.parseContext;
+    for (TableScanOperator ts : tsOps) {
+      for (ReduceSinkOperator rs : pctx.getRsToSemiJoinBranchInfo().keySet()) {
+        SemiJoinBranchInfo sjInfo = pctx.getRsToSemiJoinBranchInfo().get(rs);
+        if (ts == sjInfo.getTsOp()) {
+          // match!
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Semijoin optimization found going to SMB join. Removing 
semijoin "
+                    + OperatorUtils.getOpNamePretty(rs) + " - " + 
OperatorUtils.getOpNamePretty(ts));
+          }
+          GenTezUtils.removeBranch(rs);
+          GenTezUtils.removeSemiJoinOperator(pctx, rs, ts);
+          if (sjInfo.getIsHint()) {
+            LOG.debug("Removing hinted semijoin as it is with SMB join " + rs 
+ " : " + ts);
           }
         }
       }
@@ -699,7 +721,7 @@ public class TezCompiler extends TaskCompiler {
   private static void removeSemiJoinCyclesDueToMapsideJoins(
           OptimizeTezProcContext procCtx) throws SemanticException {
     if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION) ||
-            procCtx.parseContext.getRsOpToTsOpMap().size() == 0) {
+            procCtx.parseContext.getRsToSemiJoinBranchInfo().size() == 0) {
       return;
     }
 
@@ -752,10 +774,10 @@ public class TezCompiler extends TaskCompiler {
         }
 
         ReduceSinkOperator rs = ((ReduceSinkOperator) child);
-        TableScanOperator ts = pCtx.getRsOpToTsOpMap().get(rs);
-        if (ts == null) {
-          continue;
-        }
+        SemiJoinBranchInfo sjInfo = pCtx.getRsToSemiJoinBranchInfo().get(rs);
+        if (sjInfo == null) continue;
+
+        TableScanOperator ts = sjInfo.getTsOp();
         // This is a semijoin branch. Find if this is creating a potential
         // cycle with childJoin.
         for (Operator<?> parent : childJoin.getParentOperators()) {
@@ -776,6 +798,9 @@ public class TezCompiler extends TaskCompiler {
             }
             GenTezUtils.removeBranch(rs);
             GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
+            if (sjInfo.getIsHint()) {
+              LOG.debug("Removing hinted semijoin as it is creating cycles 
with mapside joins " + rs + " : " + ts);
+            }
           }
         }
       }
@@ -790,8 +815,8 @@ public class TezCompiler extends TaskCompiler {
       assert nd instanceof ReduceSinkOperator;
       ReduceSinkOperator rs = (ReduceSinkOperator) nd;
       ParseContext pCtx = ((OptimizeTezProcContext) procCtx).parseContext;
-      TableScanOperator ts = pCtx.getRsOpToTsOpMap().get(rs);
-      if (ts == null) {
+      SemiJoinBranchInfo sjInfo = pCtx.getRsToSemiJoinBranchInfo().get(rs);
+      if (sjInfo == null) {
         // nothing to do here.
         return null;
       }
@@ -802,6 +827,7 @@ public class TezCompiler extends TaskCompiler {
       GroupByDesc gbDesc = gbOp.getConf();
       ArrayList<AggregationDesc> aggregationDescs = gbDesc.getAggregators();
       boolean removeSemiJoin = false;
+      TableScanOperator ts = sjInfo.getTsOp();
       for (AggregationDesc agg : aggregationDescs) {
         if (agg.getGenericUDAFName() != "bloom_filter") {
           continue;
@@ -809,20 +835,24 @@ public class TezCompiler extends TaskCompiler {
 
         GenericUDAFBloomFilterEvaluator udafBloomFilterEvaluator =
                 (GenericUDAFBloomFilterEvaluator) 
agg.getGenericUDAFEvaluator();
+        if (udafBloomFilterEvaluator.hasHintEntries())
+          return null; // Created using hint, skip it
+
         long expectedEntries = udafBloomFilterEvaluator.getExpectedEntries();
         if (expectedEntries == -1 || expectedEntries >
                 
pCtx.getConf().getLongVar(ConfVars.TEZ_MAX_BLOOM_FILTER_ENTRIES)) {
           removeSemiJoin = true;
           if (LOG.isDebugEnabled()) {
             LOG.debug("expectedEntries=" + expectedEntries + ". "
-                + "Either stats unavailable or expectedEntries exceeded max 
allowable bloomfilter size. "
-                + "Removing semijoin "
-                + OperatorUtils.getOpNamePretty(rs) + " - " + 
OperatorUtils.getOpNamePretty(ts));
+                    + "Either stats unavailable or expectedEntries exceeded 
max allowable bloomfilter size. "
+                    + "Removing semijoin "
+                    + OperatorUtils.getOpNamePretty(rs) + " - " + 
OperatorUtils.getOpNamePretty(ts));
           }
           break;
         }
       }
 
+      // At this point, hinted semijoin case has been handled already
       // Check if big table is big enough that runtime filtering is
       // worth it.
       if (ts.getStatistics() != null) {
@@ -831,16 +861,16 @@ public class TezCompiler extends TaskCompiler {
           removeSemiJoin = true;
           if (LOG.isDebugEnabled()) {
             LOG.debug("Insufficient rows (" + numRows + ") to justify semijoin 
optimization. Removing semijoin "
-                + OperatorUtils.getOpNamePretty(rs) + " - " + 
OperatorUtils.getOpNamePretty(ts));
+                    + OperatorUtils.getOpNamePretty(rs) + " - " + 
OperatorUtils.getOpNamePretty(ts));
           }
         }
       }
-
       if (removeSemiJoin) {
         // The stats are not annotated, remove the semijoin operator
         GenTezUtils.removeBranch(rs);
         GenTezUtils.removeSemiJoinOperator(pCtx, rs, ts);
       }
+
       return null;
     }
   }
@@ -905,15 +935,23 @@ public class TezCompiler extends TaskCompiler {
           }
 
           ReduceSinkOperator rs = (ReduceSinkOperator) child;
-          TableScanOperator ts = parseContext.getRsOpToTsOpMap().get(rs);
-          if (ts == null || ts != bigTableTS) {
-            // skip, no semijoin or not the one we are looking for.
+          SemiJoinBranchInfo sjInfo = 
parseContext.getRsToSemiJoinBranchInfo().get(rs);
+          if (sjInfo == null) continue;
+
+          TableScanOperator ts = sjInfo.getTsOp();
+          if (ts != bigTableTS) {
+            // skip, not the one we are looking for.
             continue;
           }
 
+          parallelEdges = true;
+
+          if (sjInfo.getIsHint()) {
+            // Created by hint, skip it
+            continue;
+          }
           // Add the semijoin branch to the map
           semijoins.put(rs, ts);
-          parallelEdges = true;
         }
       }
     }
@@ -1141,10 +1179,15 @@ public class TezCompiler extends TaskCompiler {
     }
 
     List<ReduceSinkOperator> semijoinRsToRemove = new 
ArrayList<ReduceSinkOperator>();
-    Map<ReduceSinkOperator, TableScanOperator> map = 
procCtx.parseContext.getRsOpToTsOpMap();
+    Map<ReduceSinkOperator, SemiJoinBranchInfo> map = 
procCtx.parseContext.getRsToSemiJoinBranchInfo();
     double semijoinReductionThreshold = procCtx.conf.getFloatVar(
         HiveConf.ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_THRESHOLD);
     for (ReduceSinkOperator rs : map.keySet()) {
+      SemiJoinBranchInfo sjInfo = map.get(rs);
+      if (sjInfo.getIsHint()) {
+        // Semijoin created using hint, skip it
+        continue;
+      }
       // rs is semijoin optimization branch, which should look like <Parent>-SEL-GB1-RS1-GB2-RS2
       // Get to the SelectOperator ancestor
       SelectOperator sel = null;
@@ -1159,7 +1202,7 @@ public class TezCompiler extends TaskCompiler {
       }
 
       // Check the ndv/rows from the SEL vs the destination tablescan the semijoin opt is going to.
-      TableScanOperator ts = map.get(rs);
+      TableScanOperator ts = sjInfo.getTsOp();
       RuntimeValuesInfo rti = 
procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
       ExprNodeDesc tsExpr = rti.getTsColExpr();
       // In the SEL operator of the semijoin branch, there should be only one column in the operator
@@ -1179,7 +1222,7 @@ public class TezCompiler extends TaskCompiler {
     }
 
     for (ReduceSinkOperator rs : semijoinRsToRemove) {
-      TableScanOperator ts = map.get(rs);
+      TableScanOperator ts = map.get(rs).getTsOp();
       if (LOG.isDebugEnabled()) {
         LOG.debug("Reduction factor not satisfied for " + 
OperatorUtils.getOpNamePretty(rs)
             + "-" + OperatorUtils.getOpNamePretty(ts) + ". Removing semijoin 
optimization.");

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java
index 18e4fbd..3143554 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDynamicListDesc.java
@@ -18,7 +18,10 @@
 
 package org.apache.hadoop.hive.ql.plan;
 
+import java.util.Map;
+
 import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.parse.SemiJoinHint;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
 
 /**
@@ -29,14 +32,17 @@ public class ExprNodeDynamicListDesc extends ExprNodeDesc {
 
   Operator<? extends OperatorDesc> source;
   int keyIndex;
+  Map<String, SemiJoinHint> hints;
 
   public ExprNodeDynamicListDesc() {
   }
 
-  public ExprNodeDynamicListDesc(TypeInfo typeInfo, Operator<? extends 
OperatorDesc> source, int keyIndex) {
+  public ExprNodeDynamicListDesc(TypeInfo typeInfo, Operator<? extends 
OperatorDesc> source,
+      int keyIndex, Map<String, SemiJoinHint> hints) {
     super(typeInfo);
     this.source = source;
     this.keyIndex = keyIndex;
+    this.hints = hints;
   }
 
   public void setSource(Operator<? extends OperatorDesc> source) {
@@ -57,8 +63,7 @@ public class ExprNodeDynamicListDesc extends ExprNodeDesc {
 
   @Override
   public ExprNodeDesc clone() {
-    ExprNodeDynamicListDesc clone = new ExprNodeDynamicListDesc(typeInfo, 
source, keyIndex);
-    return clone;
+    return new ExprNodeDynamicListDesc(typeInfo, source, keyIndex, hints);
   }
 
   @Override
@@ -78,4 +83,8 @@ public class ExprNodeDynamicListDesc extends ExprNodeDesc {
   public String toString() {
     return source.toString();
   }
+
+  public Map<String, SemiJoinHint> getHints() {
+    return hints;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
index bcf3691..032c7bb 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/JoinDesc.java
@@ -29,6 +29,7 @@ import java.util.Map;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.Operator;
 import org.apache.hadoop.hive.ql.parse.QBJoinTree;
+import org.apache.hadoop.hive.ql.parse.SemiJoinHint;
 import org.apache.hadoop.hive.ql.plan.Explain.Level;
 
 
@@ -106,6 +107,10 @@ public class JoinDesc extends AbstractOperatorDesc {
   private transient Map<String, Operator<? extends OperatorDesc>> 
aliasToOpInfo;
   private transient boolean leftInputJoin;
   private transient List<String> streamAliases;
+  // Note: there are two things in Hive called semi-joins - the left semi join construct,
+  //       and also a bloom-filter based optimization that came later. This is for the latter.
+  //       Everything else in this desc that says "semi-join" is for the former.
+  private transient Map<String, SemiJoinHint> semiJoinHints;
 
   public JoinDesc() {
   }
@@ -197,6 +202,7 @@ public class JoinDesc extends AbstractOperatorDesc {
     this.filterMap = clone.filterMap;
     this.residualFilterExprs = clone.residualFilterExprs;
     this.statistics = clone.statistics;
+    this.semiJoinHints = clone.semiJoinHints;
   }
 
   public Map<Byte, List<ExprNodeDesc>> getExprs() {
@@ -682,4 +688,16 @@ public class JoinDesc extends AbstractOperatorDesc {
     streamAliases = joinDesc.streamAliases == null ? null : new 
ArrayList<String>(joinDesc.streamAliases);
   }
 
+  private static final org.slf4j.Logger LOG = 
org.slf4j.LoggerFactory.getLogger(JoinDesc.class);
+  public void setSemiJoinHints(Map<String, SemiJoinHint> semiJoinHints) {
+    if (semiJoinHints != null || this.semiJoinHints != null) {
+      LOG.debug("Setting semi-join hints to " + semiJoinHints);
+    }
+    this.semiJoinHints = semiJoinHints;
+  }
+
+  public Map<String, SemiJoinHint> getSemiJoinHints() {
+    return semiJoinHints;
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java b/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java
index 71c7310..f45daa8 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java
@@ -26,6 +26,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.Stack;
 
+import org.apache.hadoop.hive.ql.parse.SemiJoinHint;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
@@ -134,13 +135,12 @@ public class SyntheticJoinPredicate extends Transform {
     public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
         Object... nodeOutputs) throws SemanticException {
 
-      ParseContext pCtx = ((SyntheticContext) procCtx).getParseContext();
-
       @SuppressWarnings("unchecked")
       CommonJoinOperator<JoinDesc> join = (CommonJoinOperator<JoinDesc>) nd;
 
       ReduceSinkOperator source = (ReduceSinkOperator) stack.get(stack.size() 
- 2);
       int srcPos = join.getParentOperators().indexOf(source);
+      Map<String, SemiJoinHint> hints = join.getConf().getSemiJoinHints();
 
       List<Operator<? extends OperatorDesc>> parents = 
join.getParentOperators();
 
@@ -181,7 +181,7 @@ public class SyntheticJoinPredicate extends Transform {
           inArgs.add(sourceKeys.get(i));
 
           ExprNodeDynamicListDesc dynamicExpr =
-              new ExprNodeDynamicListDesc(targetKeys.get(i).getTypeInfo(), 
target, i);
+              new ExprNodeDynamicListDesc(targetKeys.get(i).getTypeInfo(), 
target, i, hints);
 
           inArgs.add(dynamicExpr);
 

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
index 2b84beb..2413ae6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java
@@ -72,6 +72,7 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
   public static class GenericUDAFBloomFilterEvaluator extends 
GenericUDAFEvaluator {
     // Source operator to get the number of entries
     private SelectOperator sourceOperator;
+    private long hintEntries = -1;
     private long maxEntries = 0;
     private long minEntries = 0;
     private float factor = 1;
@@ -254,6 +255,10 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
     }
 
     public long getExpectedEntries() {
+      // If hint is provided use that size.
+      if (hintEntries > 0 )
+        return hintEntries;
+
       long expectedEntries = -1;
       if (sourceOperator != null && sourceOperator.getStatistics() != null) {
         Statistics stats = sourceOperator.getStatistics();
@@ -294,6 +299,14 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 {
       this.sourceOperator = sourceOperator;
     }
 
+    public void setHintEntries(long hintEntries) {
+      this.hintEntries = hintEntries;
+    }
+
+    public boolean hasHintEntries() {
+      return hintEntries != -1;
+    }
+
     public void setMaxEntries(long maxEntries) {
       this.maxEntries = maxEntries;
     }

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/test/queries/clientpositive/semijoin_hint.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/semijoin_hint.q b/ql/src/test/queries/clientpositive/semijoin_hint.q
new file mode 100644
index 0000000..5de0c8c
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/semijoin_hint.q
@@ -0,0 +1,54 @@
+set hive.mapred.mode=nonstrict;
+set hive.explain.user=false;
+set hive.cbo.enable=true;
+set hive.compute.query.using.stats=false;
+set hive.mapred.mode=nonstrict;
+set hive.optimize.ppd=true;
+set hive.ppd.remove.duplicatefilters=true;
+set hive.tez.dynamic.partition.pruning=true;
+set hive.tez.dynamic.semijoin.reduction=true;
+set hive.optimize.metadataonly=false;
+set hive.optimize.index.filter=true;
+set hive.stats.autogather=true;
+set hive.tez.bigtable.minsize.semijoin.reduction=1;
+set hive.tez.min.bloom.filter.entries=1;
+set hive.tez.dynamic.semijoin.reduction.threshold=-999999999999;
+
+-- Create Tables
+create table alltypesorc_int ( cint int, cstring string ) stored as ORC;
+create table srcpart_date (str string, value string) partitioned by (ds string 
) stored as ORC;
+CREATE TABLE srcpart_small(key1 STRING, value1 STRING) partitioned by (ds 
string) STORED as ORC;
+
+-- Add Partitions
+alter table srcpart_date add partition (ds = "2008-04-08");
+alter table srcpart_date add partition (ds = "2008-04-09");
+
+alter table srcpart_small add partition (ds = "2008-04-08");
+alter table srcpart_small add partition (ds = "2008-04-09");
+
+-- Load
+insert overwrite table alltypesorc_int select cint, cstring1 from alltypesorc;
+insert overwrite table srcpart_date partition (ds = "2008-04-08" ) select key, 
value from srcpart where ds = "2008-04-08";
+insert overwrite table srcpart_date partition (ds = "2008-04-09") select key, 
value from srcpart where ds = "2008-04-09";
+insert overwrite table srcpart_small partition (ds = "2008-04-09") select key, 
value from srcpart where ds = "2008-04-09";
+analyze table alltypesorc_int compute statistics for columns;
+analyze table srcpart_date compute statistics for columns;
+analyze table srcpart_small compute statistics for columns;
+
+set hive.cbo.returnpath.hiveop=true;
+
+create table srccc as select * from src;
+
+EXPLAIN select  /*+ semi(k, str, 5000)*/ count(*) from srcpart_date k join 
srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (k.value = 
i.cstring);
+EXPLAIN select  /*+ semi(i, 3000)*/ count(*) from srcpart_date k join 
srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (v.key1 = 
i.cstring);
+
+explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join 
srcpart_small v on (k.str = v.key1);
+
+set hive.cbo.returnpath.hiveop=false;
+
+explain select /*+ semi(k, 1000)*/ count(*) from srcpart_date k join 
srcpart_small v on (k.str = v.key1);
+
+set hive.cbo.enable=false;
+
+explain select /*+ semi(k, str, 1000)*/ count(*) from srcpart_date k join 
srcpart_small v on (k.str = v.key1);
+

http://git-wip-us.apache.org/repos/asf/hive/blob/9d5d737d/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out b/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
new file mode 100644
index 0000000..bac9240
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/semijoin_hint.q.out
@@ -0,0 +1,899 @@
+PREHOOK: query: create table alltypesorc_int ( cint int, cstring string ) 
stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@alltypesorc_int
+POSTHOOK: query: create table alltypesorc_int ( cint int, cstring string ) 
stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@alltypesorc_int
+PREHOOK: query: create table srcpart_date (str string, value string) 
partitioned by (ds string ) stored as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@srcpart_date
+POSTHOOK: query: create table srcpart_date (str string, value string) 
partitioned by (ds string ) stored as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@srcpart_date
+PREHOOK: query: CREATE TABLE srcpart_small(key1 STRING, value1 STRING) 
partitioned by (ds string) STORED as ORC
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@srcpart_small
+POSTHOOK: query: CREATE TABLE srcpart_small(key1 STRING, value1 STRING) 
partitioned by (ds string) STORED as ORC
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@srcpart_small
+PREHOOK: query: alter table srcpart_date add partition (ds = "2008-04-08")
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@srcpart_date
+POSTHOOK: query: alter table srcpart_date add partition (ds = "2008-04-08")
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@srcpart_date
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-08
+PREHOOK: query: alter table srcpart_date add partition (ds = "2008-04-09")
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@srcpart_date
+POSTHOOK: query: alter table srcpart_date add partition (ds = "2008-04-09")
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@srcpart_date
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-09
+PREHOOK: query: alter table srcpart_small add partition (ds = "2008-04-08")
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@srcpart_small
+POSTHOOK: query: alter table srcpart_small add partition (ds = "2008-04-08")
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@srcpart_small
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-08
+PREHOOK: query: alter table srcpart_small add partition (ds = "2008-04-09")
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@srcpart_small
+POSTHOOK: query: alter table srcpart_small add partition (ds = "2008-04-09")
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@srcpart_small
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-09
+PREHOOK: query: insert overwrite table alltypesorc_int select cint, cstring1 
from alltypesorc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+PREHOOK: Output: default@alltypesorc_int
+POSTHOOK: query: insert overwrite table alltypesorc_int select cint, cstring1 
from alltypesorc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+POSTHOOK: Output: default@alltypesorc_int
+POSTHOOK: Lineage: alltypesorc_int.cint SIMPLE 
[(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ]
+POSTHOOK: Lineage: alltypesorc_int.cstring SIMPLE 
[(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, 
comment:null), ]
+PREHOOK: query: insert overwrite table srcpart_date partition (ds = 
"2008-04-08" ) select key, value from srcpart where ds = "2008-04-08"
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+PREHOOK: Output: default@srcpart_date@ds=2008-04-08
+POSTHOOK: query: insert overwrite table srcpart_date partition (ds = 
"2008-04-08" ) select key, value from srcpart where ds = "2008-04-08"
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-08
+POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-08).str SIMPLE 
[(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-08).value SIMPLE 
[(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table srcpart_date partition (ds = 
"2008-04-09") select key, value from srcpart where ds = "2008-04-09"
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+PREHOOK: Output: default@srcpart_date@ds=2008-04-09
+POSTHOOK: query: insert overwrite table srcpart_date partition (ds = 
"2008-04-09") select key, value from srcpart where ds = "2008-04-09"
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-09
+POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-09).str SIMPLE 
[(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: srcpart_date PARTITION(ds=2008-04-09).value SIMPLE 
[(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table srcpart_small partition (ds = 
"2008-04-09") select key, value from srcpart where ds = "2008-04-09"
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+PREHOOK: Output: default@srcpart_small@ds=2008-04-09
+POSTHOOK: query: insert overwrite table srcpart_small partition (ds = 
"2008-04-09") select key, value from srcpart where ds = "2008-04-09"
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11
+POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-09
+POSTHOOK: Lineage: srcpart_small PARTITION(ds=2008-04-09).key1 SIMPLE 
[(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: srcpart_small PARTITION(ds=2008-04-09).value1 SIMPLE 
[(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: analyze table alltypesorc_int compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc_int
+PREHOOK: Output: default@alltypesorc_int
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table alltypesorc_int compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc_int
+POSTHOOK: Output: default@alltypesorc_int
+#### A masked pattern was here ####
+PREHOOK: query: analyze table srcpart_date compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart_date
+PREHOOK: Input: default@srcpart_date@ds=2008-04-08
+PREHOOK: Input: default@srcpart_date@ds=2008-04-09
+PREHOOK: Output: default@srcpart_date
+PREHOOK: Output: default@srcpart_date@ds=2008-04-08
+PREHOOK: Output: default@srcpart_date@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table srcpart_date compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart_date
+POSTHOOK: Input: default@srcpart_date@ds=2008-04-08
+POSTHOOK: Input: default@srcpart_date@ds=2008-04-09
+POSTHOOK: Output: default@srcpart_date
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-08
+POSTHOOK: Output: default@srcpart_date@ds=2008-04-09
+#### A masked pattern was here ####
+PREHOOK: query: analyze table srcpart_small compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@srcpart_small
+PREHOOK: Input: default@srcpart_small@ds=2008-04-08
+PREHOOK: Input: default@srcpart_small@ds=2008-04-09
+PREHOOK: Output: default@srcpart_small
+PREHOOK: Output: default@srcpart_small@ds=2008-04-08
+PREHOOK: Output: default@srcpart_small@ds=2008-04-09
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table srcpart_small compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@srcpart_small
+POSTHOOK: Input: default@srcpart_small@ds=2008-04-08
+POSTHOOK: Input: default@srcpart_small@ds=2008-04-09
+POSTHOOK: Output: default@srcpart_small
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-08
+POSTHOOK: Output: default@srcpart_small@ds=2008-04-09
+#### A masked pattern was here ####
+PREHOOK: query: create table srccc as select * from src
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@srccc
+POSTHOOK: query: create table srccc as select * from src
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@srccc
+POSTHOOK: Lineage: srccc.key SIMPLE [(src)src.FieldSchema(name:key, 
type:string, comment:default), ]
+POSTHOOK: Lineage: srccc.value SIMPLE [(src)src.FieldSchema(name:value, 
type:string, comment:default), ]
+PREHOOK: query: EXPLAIN select  /*+ semi(k, str, 5000)*/ count(*) from 
srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i 
on (k.value = i.cstring)
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select  /*+ semi(k, str, 5000)*/ count(*) from 
srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i 
on (k.value = i.cstring)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 1 <- Reducer 7 (BROADCAST_EDGE)
+        Map 8 <- Reducer 5 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE)
+        Reducer 3 <- Map 8 (SIMPLE_EDGE), Reducer 2 (SIMPLE_EDGE)
+        Reducer 4 <- Reducer 3 (CUSTOM_SIMPLE_EDGE)
+        Reducer 5 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+        Reducer 7 <- Map 6 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: i
+                  filterExpr: (cstring is not null and (cstring BETWEEN 
DynamicValue(RS_7_k_cstring_min) AND DynamicValue(RS_7_k_cstring_max) and 
in_bloom_filter(cstring, DynamicValue(RS_7_k_cstring_bloom_filter)))) (type: 
boolean)
+                  Statistics: Num rows: 12288 Data size: 862450 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (cstring is not null and (cstring BETWEEN 
DynamicValue(RS_7_k_cstring_min) AND DynamicValue(RS_7_k_cstring_max) and 
in_bloom_filter(cstring, DynamicValue(RS_7_k_cstring_bloom_filter)))) (type: 
boolean)
+                    Statistics: Num rows: 9174 Data size: 643900 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: cstring (type: string)
+                      outputColumnNames: cstring
+                      Statistics: Num rows: 9174 Data size: 643900 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: cstring (type: string)
+                        sort order: +
+                        Map-reduce partition columns: cstring (type: string)
+                        Statistics: Num rows: 9174 Data size: 643900 Basic 
stats: COMPLETE Column stats: COMPLETE
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 6 
+            Map Operator Tree:
+                TableScan
+                  alias: k
+                  filterExpr: (str is not null and value is not null) (type: 
boolean)
+                  Statistics: Num rows: 2000 Data size: 356000 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (str is not null and value is not null) (type: 
boolean)
+                    Statistics: Num rows: 2000 Data size: 356000 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str (type: string), value (type: string)
+                      outputColumnNames: str, value
+                      Statistics: Num rows: 2000 Data size: 356000 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: value (type: string)
+                        sort order: +
+                        Map-reduce partition columns: value (type: string)
+                        Statistics: Num rows: 2000 Data size: 356000 Basic 
stats: COMPLETE Column stats: COMPLETE
+                        value expressions: str (type: string)
+                      Select Operator
+                        expressions: value (type: string)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 2000 Data size: 182000 Basic 
stats: COMPLETE Column stats: COMPLETE
+                        Group By Operator
+                          aggregations: min(_col0), max(_col0), 
bloom_filter(_col0, expectedEntries=5000)
+                          mode: hash
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 1 Data size: 552 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          Reduce Output Operator
+                            sort order: 
+                            Statistics: Num rows: 1 Data size: 552 Basic 
stats: COMPLETE Column stats: COMPLETE
+                            value expressions: _col0 (type: string), _col1 
(type: string), _col2 (type: binary)
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 8 
+            Map Operator Tree:
+                TableScan
+                  alias: v
+                  filterExpr: (key1 is not null and (key1 BETWEEN 
DynamicValue(RS_9_i_key1_min) AND DynamicValue(RS_9_i_key1_max) and 
in_bloom_filter(key1, DynamicValue(RS_9_i_key1_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                  Filter Operator
+                    predicate: (key1 is not null and (key1 BETWEEN 
DynamicValue(RS_9_i_key1_min) AND DynamicValue(RS_9_i_key1_max) and 
in_bloom_filter(key1, DynamicValue(RS_9_i_key1_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                    Select Operator
+                      expressions: key1 (type: string)
+                      outputColumnNames: key1
+                      Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                      Reduce Output Operator
+                        key expressions: key1 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: key1 (type: string)
+                        Statistics: Num rows: 1000 Data size: 87000 Basic 
stats: COMPLETE Column stats: PARTIAL
+            Execution mode: llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 cstring (type: string)
+                  1 value (type: string)
+                outputColumnNames: str
+                Statistics: Num rows: 3281 Data size: 285447 Basic stats: 
COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  key expressions: str (type: string)
+                  sort order: +
+                  Map-reduce partition columns: str (type: string)
+                  Statistics: Num rows: 3281 Data size: 285447 Basic stats: 
COMPLETE Column stats: COMPLETE
+                Select Operator
+                  expressions: str (type: string)
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 3281 Data size: 285447 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Group By Operator
+                    aggregations: min(_col0), max(_col0), bloom_filter(_col0, 
expectedEntries=410)
+                    mode: hash
+                    outputColumnNames: _col0, _col1, _col2
+                    Statistics: Num rows: 1 Data size: 552 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 552 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      value expressions: _col0 (type: string), _col1 (type: 
string), _col2 (type: binary)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 str (type: string)
+                  1 key1 (type: string)
+                Statistics: Num rows: 16004 Data size: 128032 Basic stats: 
COMPLETE Column stats: PARTIAL
+                Select Operator
+                  Statistics: Num rows: 16004 Data size: 64016 Basic stats: 
COMPLETE Column stats: PARTIAL
+                  Group By Operator
+                    aggregations: count()
+                    mode: hash
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                    Reduce Output Operator
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 8 Basic stats: 
COMPLETE Column stats: PARTIAL
+                      value expressions: _col0 (type: bigint)
+        Reducer 4 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: $f0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 5 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), 
bloom_filter(VALUE._col2, expectedEntries=410)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: 
string), _col2 (type: binary)
+        Reducer 7 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), 
bloom_filter(VALUE._col2, expectedEntries=5000)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: 
string), _col2 (type: binary)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: EXPLAIN select  /*+ semi(i, 3000)*/ count(*) from srcpart_date 
k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i on (v.key1 = 
i.cstring)
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN select  /*+ semi(i, 3000)*/ count(*) from 
srcpart_date k join srcpart_small v on (k.str = v.key1) join alltypesorc_int i 
on (v.key1 = i.cstring)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 6 <- Reducer 4 (BROADCAST_EDGE)
+        Map 7 <- Reducer 5 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE), Map 7 
(SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+        Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+        Reducer 5 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: i
+                  filterExpr: cstring is not null (type: boolean)
+                  Statistics: Num rows: 12288 Data size: 862450 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: cstring is not null (type: boolean)
+                    Statistics: Num rows: 9174 Data size: 643900 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: cstring (type: string)
+                      outputColumnNames: cstring
+                      Statistics: Num rows: 9174 Data size: 643900 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: cstring (type: string)
+                        sort order: +
+                        Map-reduce partition columns: cstring (type: string)
+                        Statistics: Num rows: 9174 Data size: 643900 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      Select Operator
+                        expressions: cstring (type: string)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 9174 Data size: 643900 Basic 
stats: COMPLETE Column stats: COMPLETE
+                        Group By Operator
+                          aggregations: min(_col0), max(_col0), 
bloom_filter(_col0, expectedEntries=3000)
+                          mode: hash
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 1 Data size: 552 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          Reduce Output Operator
+                            sort order: 
+                            Statistics: Num rows: 1 Data size: 552 Basic 
stats: COMPLETE Column stats: COMPLETE
+                            value expressions: _col0 (type: string), _col1 
(type: string), _col2 (type: binary)
+                      Select Operator
+                        expressions: cstring (type: string)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 9174 Data size: 643900 Basic 
stats: COMPLETE Column stats: COMPLETE
+                        Group By Operator
+                          aggregations: min(_col0), max(_col0), 
bloom_filter(_col0, expectedEntries=3000)
+                          mode: hash
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 1 Data size: 552 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          Reduce Output Operator
+                            sort order: 
+                            Statistics: Num rows: 1 Data size: 552 Basic 
stats: COMPLETE Column stats: COMPLETE
+                            value expressions: _col0 (type: string), _col1 
(type: string), _col2 (type: binary)
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 6 
+            Map Operator Tree:
+                TableScan
+                  alias: v
+                  filterExpr: (key1 is not null and (key1 BETWEEN 
DynamicValue(RS_3_i_key1_min) AND DynamicValue(RS_3_i_key1_max) and 
in_bloom_filter(key1, DynamicValue(RS_3_i_key1_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                  Filter Operator
+                    predicate: (key1 is not null and (key1 BETWEEN 
DynamicValue(RS_3_i_key1_min) AND DynamicValue(RS_3_i_key1_max) and 
in_bloom_filter(key1, DynamicValue(RS_3_i_key1_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                    Select Operator
+                      expressions: key1 (type: string)
+                      outputColumnNames: key1
+                      Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                      Reduce Output Operator
+                        key expressions: key1 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: key1 (type: string)
+                        Statistics: Num rows: 1000 Data size: 87000 Basic 
stats: COMPLETE Column stats: PARTIAL
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 7 
+            Map Operator Tree:
+                TableScan
+                  alias: k
+                  filterExpr: (str is not null and (str BETWEEN 
DynamicValue(RS_3_i_str_min) AND DynamicValue(RS_3_i_str_max) and 
in_bloom_filter(str, DynamicValue(RS_3_i_str_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 2000 Data size: 174000 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: (str is not null and (str BETWEEN 
DynamicValue(RS_3_i_str_min) AND DynamicValue(RS_3_i_str_max) and 
in_bloom_filter(str, DynamicValue(RS_3_i_str_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 2000 Data size: 174000 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str (type: string)
+                      outputColumnNames: str
+                      Statistics: Num rows: 2000 Data size: 174000 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: str (type: string)
+                        sort order: +
+                        Map-reduce partition columns: str (type: string)
+                        Statistics: Num rows: 2000 Data size: 174000 Basic 
stats: COMPLETE Column stats: COMPLETE
+            Execution mode: llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                     Inner Join 1 to 2
+                keys:
+                  0 cstring (type: string)
+                  1 key1 (type: string)
+                  2 str (type: string)
+                Statistics: Num rows: 16008 Data size: 128064 Basic stats: 
COMPLETE Column stats: PARTIAL
+                Select Operator
+                  Statistics: Num rows: 16008 Data size: 64032 Basic stats: 
COMPLETE Column stats: PARTIAL
+                  Group By Operator
+                    aggregations: count()
+                    mode: hash
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                    Reduce Output Operator
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 8 Basic stats: 
COMPLETE Column stats: PARTIAL
+                      value expressions: _col0 (type: bigint)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: $f0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), 
bloom_filter(VALUE._col2, expectedEntries=3000)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: 
string), _col2 (type: binary)
+        Reducer 5 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), 
bloom_filter(VALUE._col2, expectedEntries=3000)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: 
string), _col2 (type: binary)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain select /*+ semi(k, str, 1000)*/ count(*) from 
srcpart_date k join srcpart_small v on (k.str = v.key1)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select /*+ semi(k, str, 1000)*/ count(*) from 
srcpart_date k join srcpart_small v on (k.str = v.key1)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 5 <- Reducer 4 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+        Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: k
+                  filterExpr: str is not null (type: boolean)
+                  Statistics: Num rows: 2000 Data size: 174000 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: str is not null (type: boolean)
+                    Statistics: Num rows: 2000 Data size: 174000 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str (type: string)
+                      outputColumnNames: str
+                      Statistics: Num rows: 2000 Data size: 174000 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: str (type: string)
+                        sort order: +
+                        Map-reduce partition columns: str (type: string)
+                        Statistics: Num rows: 2000 Data size: 174000 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      Select Operator
+                        expressions: str (type: string)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 2000 Data size: 174000 Basic 
stats: COMPLETE Column stats: COMPLETE
+                        Group By Operator
+                          aggregations: min(_col0), max(_col0), 
bloom_filter(_col0, expectedEntries=1000)
+                          mode: hash
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 1 Data size: 552 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          Reduce Output Operator
+                            sort order: 
+                            Statistics: Num rows: 1 Data size: 552 Basic 
stats: COMPLETE Column stats: COMPLETE
+                            value expressions: _col0 (type: string), _col1 
(type: string), _col2 (type: binary)
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 5 
+            Map Operator Tree:
+                TableScan
+                  alias: v
+                  filterExpr: (key1 is not null and (key1 BETWEEN 
DynamicValue(RS_3_k_key1_min) AND DynamicValue(RS_3_k_key1_max) and 
in_bloom_filter(key1, DynamicValue(RS_3_k_key1_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                  Filter Operator
+                    predicate: (key1 is not null and (key1 BETWEEN 
DynamicValue(RS_3_k_key1_min) AND DynamicValue(RS_3_k_key1_max) and 
in_bloom_filter(key1, DynamicValue(RS_3_k_key1_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                    Select Operator
+                      expressions: key1 (type: string)
+                      outputColumnNames: key1
+                      Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                      Reduce Output Operator
+                        key expressions: key1 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: key1 (type: string)
+                        Statistics: Num rows: 1000 Data size: 87000 Basic 
stats: COMPLETE Column stats: PARTIAL
+            Execution mode: llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 str (type: string)
+                  1 key1 (type: string)
+                Statistics: Num rows: 9756 Data size: 78048 Basic stats: 
COMPLETE Column stats: PARTIAL
+                Select Operator
+                  Statistics: Num rows: 9756 Data size: 39024 Basic stats: 
COMPLETE Column stats: PARTIAL
+                  Group By Operator
+                    aggregations: count()
+                    mode: hash
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                    Reduce Output Operator
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 8 Basic stats: 
COMPLETE Column stats: PARTIAL
+                      value expressions: _col0 (type: bigint)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: $f0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), 
bloom_filter(VALUE._col2, expectedEntries=1000)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: 
string), _col2 (type: binary)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain select /*+ semi(k, 1000)*/ count(*) from srcpart_date 
k join srcpart_small v on (k.str = v.key1)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select /*+ semi(k, 1000)*/ count(*) from srcpart_date 
k join srcpart_small v on (k.str = v.key1)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 5 <- Reducer 4 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+        Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: k
+                  filterExpr: str is not null (type: boolean)
+                  Statistics: Num rows: 2000 Data size: 174000 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: str is not null (type: boolean)
+                    Statistics: Num rows: 2000 Data size: 174000 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 2000 Data size: 174000 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 2000 Data size: 174000 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      Select Operator
+                        expressions: _col0 (type: string)
+                        outputColumnNames: _col0
+                        Statistics: Num rows: 2000 Data size: 174000 Basic 
stats: COMPLETE Column stats: COMPLETE
+                        Group By Operator
+                          aggregations: min(_col0), max(_col0), 
bloom_filter(_col0, expectedEntries=1000)
+                          mode: hash
+                          outputColumnNames: _col0, _col1, _col2
+                          Statistics: Num rows: 1 Data size: 552 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          Reduce Output Operator
+                            sort order: 
+                            Statistics: Num rows: 1 Data size: 552 Basic 
stats: COMPLETE Column stats: COMPLETE
+                            value expressions: _col0 (type: string), _col1 
(type: string), _col2 (type: binary)
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 5 
+            Map Operator Tree:
+                TableScan
+                  alias: v
+                  filterExpr: (key1 is not null and (key1 BETWEEN 
DynamicValue(RS_6_k_key1_min) AND DynamicValue(RS_6_k_key1_max) and 
in_bloom_filter(key1, DynamicValue(RS_6_k_key1_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                  Filter Operator
+                    predicate: (key1 is not null and (key1 BETWEEN 
DynamicValue(RS_6_k_key1_min) AND DynamicValue(RS_6_k_key1_max) and 
in_bloom_filter(key1, DynamicValue(RS_6_k_key1_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                    Select Operator
+                      expressions: key1 (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string)
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: string)
+                        Statistics: Num rows: 1000 Data size: 87000 Basic 
stats: COMPLETE Column stats: PARTIAL
+            Execution mode: llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 _col0 (type: string)
+                  1 _col0 (type: string)
+                Statistics: Num rows: 9756 Data size: 78048 Basic stats: 
COMPLETE Column stats: PARTIAL
+                Group By Operator
+                  aggregations: count()
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                  Reduce Output Operator
+                    sort order: 
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                    value expressions: _col0 (type: bigint)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), 
bloom_filter(VALUE._col2, expectedEntries=1000)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: 
string), _col2 (type: binary)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: explain select /*+ semi(k, str, 1000)*/ count(*) from 
srcpart_date k join srcpart_small v on (k.str = v.key1)
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select /*+ semi(k, str, 1000)*/ count(*) from 
srcpart_date k join srcpart_small v on (k.str = v.key1)
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 5 <- Reducer 4 (BROADCAST_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 5 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+        Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: k
+                  filterExpr: str is not null (type: boolean)
+                  Statistics: Num rows: 2000 Data size: 174000 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Filter Operator
+                    predicate: str is not null (type: boolean)
+                    Statistics: Num rows: 2000 Data size: 174000 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: str (type: string)
+                      sort order: +
+                      Map-reduce partition columns: str (type: string)
+                      Statistics: Num rows: 2000 Data size: 174000 Basic 
stats: COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: str (type: string)
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 2000 Data size: 174000 Basic 
stats: COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        aggregations: min(_col0), max(_col0), 
bloom_filter(_col0, expectedEntries=1000)
+                        mode: hash
+                        outputColumnNames: _col0, _col1, _col2
+                        Statistics: Num rows: 1 Data size: 552 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          sort order: 
+                          Statistics: Num rows: 1 Data size: 552 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          value expressions: _col0 (type: string), _col1 
(type: string), _col2 (type: binary)
+            Execution mode: llap
+            LLAP IO: all inputs
+        Map 5 
+            Map Operator Tree:
+                TableScan
+                  alias: v
+                  filterExpr: (key1 is not null and (key1 BETWEEN 
DynamicValue(RS_3_k_key1_min) AND DynamicValue(RS_3_k_key1_max) and 
in_bloom_filter(key1, DynamicValue(RS_3_k_key1_bloom_filter)))) (type: boolean)
+                  Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                  Filter Operator
+                    predicate: (key1 is not null and (key1 BETWEEN 
DynamicValue(RS_3_k_key1_min) AND DynamicValue(RS_3_k_key1_max) and 
in_bloom_filter(key1, DynamicValue(RS_3_k_key1_bloom_filter)))) (type: boolean)
+                    Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+                    Reduce Output Operator
+                      key expressions: key1 (type: string)
+                      sort order: +
+                      Map-reduce partition columns: key1 (type: string)
+                      Statistics: Num rows: 1000 Data size: 87000 Basic stats: 
COMPLETE Column stats: PARTIAL
+            Execution mode: llap
+            LLAP IO: all inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 str (type: string)
+                  1 key1 (type: string)
+                Statistics: Num rows: 9756 Data size: 78048 Basic stats: 
COMPLETE Column stats: PARTIAL
+                Group By Operator
+                  aggregations: count()
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                  Reduce Output Operator
+                    sort order: 
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                    value expressions: _col0 (type: bigint)
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(VALUE._col0)
+                mode: mergepartial
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: PARTIAL
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 4 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), 
bloom_filter(VALUE._col2, expectedEntries=1000)
+                mode: final
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  value expressions: _col0 (type: string), _col1 (type: 
string), _col2 (type: binary)
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+

[1/5] hive git commit: HIVE-16423: Add hint to enforce semi join optimization (Deepak Jaiswal, reviewed by Jason Dere)

Reply via email to