http://git-wip-us.apache.org/repos/asf/hive/blob/831bd7d8/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java.orig ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java.orig b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java.orig new file mode 100644 index 0000000..699bb11 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java.orig @@ -0,0 +1,13038 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.parse; + +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVESTATSDBCLASS; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.Serializable; +import java.security.AccessControlException; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Deque; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Queue; +import java.util.Set; +import java.util.TreeSet; +import java.util.UUID; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import org.antlr.runtime.ClassicToken; +import org.antlr.runtime.CommonToken; +import org.antlr.runtime.Token; +import org.antlr.runtime.tree.Tree; +import org.antlr.runtime.tree.TreeWizard; +import org.antlr.runtime.tree.TreeWizard.ContextVisitor; +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.hive.common.BlobStorageUtils; +import org.apache.hadoop.hive.ql.plan.AlterTableDesc; +import org.apache.hadoop.hive.common.FileUtils; +import org.apache.hadoop.hive.common.ObjectPair; +import org.apache.hadoop.hive.common.StatsSetupConst; +import org.apache.hadoop.hive.common.StatsSetupConst.StatDB; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.conf.HiveConf.StrictChecks; +import org.apache.hadoop.hive.metastore.MetaStoreUtils; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.metastore.Warehouse; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.FieldSchema; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.Order; +import 
org.apache.hadoop.hive.metastore.api.SQLForeignKey; +import org.apache.hadoop.hive.metastore.api.SQLPrimaryKey; +import org.apache.hadoop.hive.ql.CompilationOpContext; +import org.apache.hadoop.hive.ql.ErrorMsg; +import org.apache.hadoop.hive.ql.QueryProperties; +import org.apache.hadoop.hive.ql.QueryState; +import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator; +import org.apache.hadoop.hive.ql.exec.ArchiveUtils; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory; +import org.apache.hadoop.hive.ql.exec.FetchTask; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.exec.FilterOperator; +import org.apache.hadoop.hive.ql.exec.FunctionInfo; +import org.apache.hadoop.hive.ql.exec.FunctionRegistry; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.JoinOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.OperatorFactory; +import org.apache.hadoop.hive.ql.exec.RecordReader; +import org.apache.hadoop.hive.ql.exec.RecordWriter; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.RowSchema; +import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.Task; +import org.apache.hadoop.hive.ql.exec.TaskFactory; +import org.apache.hadoop.hive.ql.exec.UnionOperator; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.hooks.ReadEntity; +import org.apache.hadoop.hive.ql.hooks.WriteEntity; +import org.apache.hadoop.hive.ql.io.AcidOutputFormat; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.io.AcidUtils.Operation; +import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; +import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat; +import org.apache.hadoop.hive.ql.io.HiveOutputFormat; +import org.apache.hadoop.hive.ql.io.NullRowsInputFormat; +import org.apache.hadoop.hive.ql.io.RCFileInputFormat; +import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.metadata.DummyPartition; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.HiveUtils; +import org.apache.hadoop.hive.ql.metadata.InvalidTableException; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.metadata.VirtualColumn; +import org.apache.hadoop.hive.ql.optimizer.Optimizer; +import org.apache.hadoop.hive.ql.optimizer.Transform; +import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException; +import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException.UnsupportedFeature; +import org.apache.hadoop.hive.ql.optimizer.calcite.translator.HiveOpConverterPostProc; +import org.apache.hadoop.hive.ql.optimizer.lineage.Generator; +import org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext; +import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec.SpecType; 
+import org.apache.hadoop.hive.ql.parse.CalcitePlanner.ASTSearcher; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderExpression; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.OrderSpec; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PTFInputSpec; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PTFQueryInputSpec; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PTFQueryInputType; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionExpression; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionSpec; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitionedTableFunctionSpec; +import org.apache.hadoop.hive.ql.parse.PTFInvocationSpec.PartitioningSpec; +import org.apache.hadoop.hive.ql.parse.QBSubQuery.SubQueryType; +import org.apache.hadoop.hive.ql.parse.SubQueryUtils.ISubQueryJoinInfo; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.BoundarySpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.CurrentRowSpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.Direction; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.RangeBoundarySpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.ValueBoundarySpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowExpressionSpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowFrameSpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowFunctionSpec; +import org.apache.hadoop.hive.ql.parse.WindowingSpec.WindowSpec; +import org.apache.hadoop.hive.ql.plan.AggregationDesc; +import org.apache.hadoop.hive.ql.plan.AlterTableDesc.AlterTableTypes; +import org.apache.hadoop.hive.ql.plan.CreateTableDesc; +import org.apache.hadoop.hive.ql.plan.CreateTableLikeDesc; +import org.apache.hadoop.hive.ql.plan.CreateViewDesc; +import org.apache.hadoop.hive.ql.plan.DDLWork; +import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; +import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.FileSinkDesc; +import org.apache.hadoop.hive.ql.plan.FilterDesc; +import org.apache.hadoop.hive.ql.plan.FilterDesc.SampleDesc; +import org.apache.hadoop.hive.ql.plan.ForwardDesc; +import org.apache.hadoop.hive.ql.plan.GroupByDesc; +import org.apache.hadoop.hive.ql.plan.HiveOperation; +import org.apache.hadoop.hive.ql.plan.JoinCondDesc; +import org.apache.hadoop.hive.ql.plan.JoinDesc; +import org.apache.hadoop.hive.ql.plan.LateralViewForwardDesc; +import org.apache.hadoop.hive.ql.plan.LateralViewJoinDesc; +import org.apache.hadoop.hive.ql.plan.LimitDesc; +import org.apache.hadoop.hive.ql.plan.ListBucketingCtx; +import org.apache.hadoop.hive.ql.plan.LoadFileDesc; +import org.apache.hadoop.hive.ql.plan.LoadTableDesc; +import org.apache.hadoop.hive.ql.plan.MapJoinDesc; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.PTFDesc; +import org.apache.hadoop.hive.ql.plan.PlanUtils; +import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc; +import org.apache.hadoop.hive.ql.plan.ScriptDesc; +import org.apache.hadoop.hive.ql.plan.SelectDesc; +import org.apache.hadoop.hive.ql.plan.TableDesc; +import 
org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.hive.ql.plan.UDTFDesc; +import org.apache.hadoop.hive.ql.plan.UnionDesc; +import org.apache.hadoop.hive.ql.plan.ptf.OrderExpressionDef; +import org.apache.hadoop.hive.ql.plan.ptf.PTFExpressionDef; +import org.apache.hadoop.hive.ql.plan.ptf.PartitionedTableFunctionDef; +import org.apache.hadoop.hive.ql.security.authorization.plugin.HivePrivilegeObject; +import org.apache.hadoop.hive.ql.session.SessionState; +import org.apache.hadoop.hive.ql.session.SessionState.ResourceType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFHash; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; +import org.apache.hadoop.hive.ql.util.ResourceDownloader; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.Deserializer; +import org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe; +import org.apache.hadoop.hive.serde2.NoOpFetchFormatter; +import org.apache.hadoop.hive.serde2.NullStructSerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeUtils; +import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; +import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.thrift.ThriftJDBCBinarySerDe; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.shims.HadoopShims; +import org.apache.hadoop.hive.shims.Utils; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.OutputFormat; +import org.apache.hadoop.security.UserGroupInformation; + +import com.google.common.collect.Sets; + +/** + * Implementation of the semantic analyzer. It generates the query plan. + * There are other specific semantic analyzers for some hive operations such as + * DDLSemanticAnalyzer for ddl operations. 
+ */ + +public class SemanticAnalyzer extends BaseSemanticAnalyzer { + + public static final String DUMMY_DATABASE = "_dummy_database"; + public static final String DUMMY_TABLE = "_dummy_table"; + public static final String SUBQUERY_TAG_1 = "-subquery1"; + public static final String SUBQUERY_TAG_2 = "-subquery2"; + + // Max number of characters when auto-generating a column name from the function name + private static final int AUTOGEN_COLALIAS_PRFX_MAXLENGTH = 20; + + private static final String VALUES_TMP_TABLE_NAME_PREFIX = "Values__Tmp__Table__"; + + static final String MATERIALIZATION_MARKER = "$MATERIALIZATION"; + + private HashMap<TableScanOperator, ExprNodeDesc> opToPartPruner; + private HashMap<TableScanOperator, PrunedPartitionList> opToPartList; + protected HashMap<String, TableScanOperator> topOps; + protected LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext> opParseCtx; + private List<LoadTableDesc> loadTableWork; + private List<LoadFileDesc> loadFileWork; + private List<ColumnStatsAutoGatherContext> columnStatsAutoGatherContexts; + private final Map<JoinOperator, QBJoinTree> joinContext; + private final Map<SMBMapJoinOperator, QBJoinTree> smbMapJoinContext; + private final HashMap<TableScanOperator, Table> topToTable; + private final Map<FileSinkOperator, Table> fsopToTable; + private final List<ReduceSinkOperator> reduceSinkOperatorsAddedByEnforceBucketingSorting; + private final HashMap<TableScanOperator, Map<String, String>> topToTableProps; + private QB qb; + private ASTNode ast; + private int destTableId; + private UnionProcContext uCtx; + List<AbstractMapJoinOperator<? extends MapJoinDesc>> listMapJoinOpsNoReducer; + private HashMap<TableScanOperator, SampleDesc> opToSamplePruner; + private final Map<TableScanOperator, Map<String, ExprNodeDesc>> opToPartToSkewedPruner; + private Map<SelectOperator, Table> viewProjectToTableSchema; + /** + * a map for the split sampling, from alias to an instance of SplitSample + * that describes percentage and number. + */ + private final HashMap<String, SplitSample> nameToSplitSample; + Map<GroupByOperator, Set<String>> groupOpToInputTables; + Map<String, PrunedPartitionList> prunedPartitions; + protected List<FieldSchema> resultSchema; + private CreateViewDesc createVwDesc; + private ArrayList<String> viewsExpanded; + private ASTNode viewSelect; + protected final UnparseTranslator unparseTranslator; + private final GlobalLimitCtx globalLimitCtx; + + // prefix for column names auto-generated by Hive + private final String autogenColAliasPrfxLbl; + private final boolean autogenColAliasPrfxIncludeFuncName; + + // Maps a view alias to the ReadEntity corresponding to the view. + // E.g. for a query like 'select * from V3', where V3 -> V2, V2 -> V1, V1 -> T, + // this keeps track of the aliases V3, V3:V2 and V3:V2:V1. + // When T is added as an input for the query, the parents of T are + // derived from the alias V3:V2:V1:T. + private final Map<String, ReadEntity> viewAliasToInput; + + // the isDirect flag must be merged into an input even if the new input does not have a parent + private boolean mergeIsDirect; + + // flag for no scan during analyze ... compute statistics + protected boolean noscan; + + // flag for partial scan during analyze ... compute statistics + protected boolean partialscan; + + protected volatile boolean disableJoinMerge = false; + + /* + * Capture the CTE definitions in a Query. + */ + final Map<String, CTEClause> aliasToCTEs; + + /* + * Used to check recursive CTE invocations.
Similar to viewsExpanded. + */ + ArrayList<String> ctesExpanded; + + /* + * Whether root tasks after materialized CTE linkage have been resolved + */ + boolean rootTasksResolved; + + protected TableMask tableMask; + + CreateTableDesc tableDesc; + + /** Not thread-safe. */ + final ASTSearcher astSearcher = new ASTSearcher(); + + protected AnalyzeRewriteContext analyzeRewrite; + + // A mapping from a table name to its table object in the metastore. + Map<String, Table> tabNameToTabObject; + + // The tokens we should ignore when we are trying to do table masking. + private final Set<Integer> ignoredTokens = Sets.newHashSet(HiveParser.TOK_GROUPBY, + HiveParser.TOK_ORDERBY, HiveParser.TOK_WINDOWSPEC, HiveParser.TOK_CLUSTERBY, + HiveParser.TOK_DISTRIBUTEBY, HiveParser.TOK_SORTBY); + + static class Phase1Ctx { + String dest; + int nextNum; + } + + public SemanticAnalyzer(QueryState queryState) throws SemanticException { + super(queryState); + opToPartPruner = new HashMap<TableScanOperator, ExprNodeDesc>(); + opToPartList = new HashMap<TableScanOperator, PrunedPartitionList>(); + opToSamplePruner = new HashMap<TableScanOperator, SampleDesc>(); + nameToSplitSample = new HashMap<String, SplitSample>(); + // Must be deterministic order maps - see HIVE-8707 + topOps = new LinkedHashMap<String, TableScanOperator>(); + loadTableWork = new ArrayList<LoadTableDesc>(); + loadFileWork = new ArrayList<LoadFileDesc>(); + columnStatsAutoGatherContexts = new ArrayList<ColumnStatsAutoGatherContext>(); + opParseCtx = new LinkedHashMap<Operator<? extends OperatorDesc>, OpParseContext>(); + joinContext = new HashMap<JoinOperator, QBJoinTree>(); + smbMapJoinContext = new HashMap<SMBMapJoinOperator, QBJoinTree>(); + // Must be deterministic order map for consistent q-test output across Java versions + topToTable = new LinkedHashMap<TableScanOperator, Table>(); + fsopToTable = new HashMap<FileSinkOperator, Table>(); + reduceSinkOperatorsAddedByEnforceBucketingSorting = new ArrayList<ReduceSinkOperator>(); + topToTableProps = new HashMap<TableScanOperator, Map<String, String>>(); + destTableId = 1; + uCtx = null; + listMapJoinOpsNoReducer = new ArrayList<AbstractMapJoinOperator<? extends MapJoinDesc>>(); + groupOpToInputTables = new HashMap<GroupByOperator, Set<String>>(); + prunedPartitions = new HashMap<String, PrunedPartitionList>(); + tabNameToTabObject = new HashMap<String, Table>(); + unparseTranslator = new UnparseTranslator(conf); + autogenColAliasPrfxLbl = HiveConf.getVar(conf, + HiveConf.ConfVars.HIVE_AUTOGEN_COLUMNALIAS_PREFIX_LABEL); + autogenColAliasPrfxIncludeFuncName = HiveConf.getBoolVar(conf, + HiveConf.ConfVars.HIVE_AUTOGEN_COLUMNALIAS_PREFIX_INCLUDEFUNCNAME); + queryProperties = new QueryProperties(); + opToPartToSkewedPruner = new HashMap<TableScanOperator, Map<String, ExprNodeDesc>>(); + aliasToCTEs = new HashMap<String, CTEClause>(); + globalLimitCtx = new GlobalLimitCtx(); + viewAliasToInput = new HashMap<String, ReadEntity>(); + mergeIsDirect = true; + noscan = partialscan = false; + } + + @Override + protected void reset(boolean clearPartsCache) { + super.reset(true); + if(clearPartsCache) { + prunedPartitions.clear(); + + //When init(true) is combined with genResolvedParseTree, the resolved parse tree is regenerated from the syntax tree. + //ReadEntity objects created under these conditions are all relevant to the syntax tree, even the ones without parents, + //so set mergeIsDirect to true here.
+ mergeIsDirect = true; + } else { + mergeIsDirect = false; + } + tabNameToTabObject.clear(); + loadTableWork.clear(); + loadFileWork.clear(); + columnStatsAutoGatherContexts.clear(); + topOps.clear(); + destTableId = 1; + idToTableNameMap.clear(); + qb = null; + ast = null; + uCtx = null; + joinContext.clear(); + smbMapJoinContext.clear(); + opParseCtx.clear(); + groupOpToInputTables.clear(); + disableJoinMerge = false; + aliasToCTEs.clear(); + topToTable.clear(); + opToPartPruner.clear(); + opToPartList.clear(); + opToPartToSkewedPruner.clear(); + opToSamplePruner.clear(); + nameToSplitSample.clear(); + fsopToTable.clear(); + resultSchema = null; + createVwDesc = null; + viewsExpanded = null; + viewSelect = null; + ctesExpanded = null; + globalLimitCtx.disableOpt(); + viewAliasToInput.clear(); + reduceSinkOperatorsAddedByEnforceBucketingSorting.clear(); + topToTableProps.clear(); + listMapJoinOpsNoReducer.clear(); + unparseTranslator.clear(); + queryProperties.clear(); + outputs.clear(); + } + + public void initParseCtx(ParseContext pctx) { + opToPartPruner = pctx.getOpToPartPruner(); + opToPartList = pctx.getOpToPartList(); + opToSamplePruner = pctx.getOpToSamplePruner(); + topOps = pctx.getTopOps(); + loadTableWork = pctx.getLoadTableWork(); + loadFileWork = pctx.getLoadFileWork(); + ctx = pctx.getContext(); + destTableId = pctx.getDestTableId(); + idToTableNameMap = pctx.getIdToTableNameMap(); + uCtx = pctx.getUCtx(); + listMapJoinOpsNoReducer = pctx.getListMapJoinOpsNoReducer(); + prunedPartitions = pctx.getPrunedPartitions(); + tabNameToTabObject = pctx.getTabNameToTabObject(); + fetchTask = pctx.getFetchTask(); + setLineageInfo(pctx.getLineageInfo()); + } + + public ParseContext getParseContext() { + // Make sure the basic query properties are initialized + copyInfoToQueryProperties(queryProperties); + return new ParseContext(queryState, opToPartPruner, opToPartList, topOps, + new HashSet<JoinOperator>(joinContext.keySet()), + new HashSet<SMBMapJoinOperator>(smbMapJoinContext.keySet()), + loadTableWork, loadFileWork, columnStatsAutoGatherContexts, ctx, idToTableNameMap, destTableId, uCtx, + listMapJoinOpsNoReducer, prunedPartitions, tabNameToTabObject, + opToSamplePruner, globalLimitCtx, nameToSplitSample, inputs, rootTasks, + opToPartToSkewedPruner, viewAliasToInput, reduceSinkOperatorsAddedByEnforceBucketingSorting, + analyzeRewrite, tableDesc, queryProperties, viewProjectToTableSchema, acidFileSinks); + } + + public CompilationOpContext getOpContext() { + return ctx.getOpContext(); + } + + public void doPhase1QBExpr(ASTNode ast, QBExpr qbexpr, String id, String alias) + throws SemanticException { + doPhase1QBExpr(ast, qbexpr, id, alias, false); + } + @SuppressWarnings("nls") + public void doPhase1QBExpr(ASTNode ast, QBExpr qbexpr, String id, String alias, boolean insideView) + throws SemanticException { + + assert (ast.getToken() != null); + switch (ast.getToken().getType()) { + case HiveParser.TOK_QUERY: { + QB qb = new QB(id, alias, true); + qb.setInsideView(insideView); + Phase1Ctx ctx_1 = initPhase1Ctx(); + doPhase1(ast, qb, ctx_1, null); + + qbexpr.setOpcode(QBExpr.Opcode.NULLOP); + qbexpr.setQB(qb); + } + break; + case HiveParser.TOK_UNIONALL: { + qbexpr.setOpcode(QBExpr.Opcode.UNION); + // query 1 + assert (ast.getChild(0) != null); + QBExpr qbexpr1 = new QBExpr(alias + SUBQUERY_TAG_1); + doPhase1QBExpr((ASTNode) ast.getChild(0), qbexpr1, id + SUBQUERY_TAG_1, + alias + SUBQUERY_TAG_1, insideView); + qbexpr.setQBExpr1(qbexpr1); + + // query 2 + assert (ast.getChild(1) != 
null); + QBExpr qbexpr2 = new QBExpr(alias + SUBQUERY_TAG_2); + doPhase1QBExpr((ASTNode) ast.getChild(1), qbexpr2, id + SUBQUERY_TAG_2, + alias + SUBQUERY_TAG_2, insideView); + qbexpr.setQBExpr2(qbexpr2); + } + break; + } + } + + private LinkedHashMap<String, ASTNode> doPhase1GetAggregationsFromSelect( + ASTNode selExpr, QB qb, String dest) throws SemanticException { + + // Iterate over the select expressions, searching for aggregation trees. + // Use Strings as keys to eliminate duplicate trees. + LinkedHashMap<String, ASTNode> aggregationTrees = new LinkedHashMap<String, ASTNode>(); + List<ASTNode> wdwFns = new ArrayList<ASTNode>(); + for (int i = 0; i < selExpr.getChildCount(); ++i) { + ASTNode function = (ASTNode) selExpr.getChild(i); + if (function.getType() == HiveParser.TOK_SELEXPR || + function.getType() == HiveParser.TOK_SUBQUERY_EXPR) { + function = (ASTNode)function.getChild(0); + } + doPhase1GetAllAggregations(function, aggregationTrees, wdwFns); + } + + // window-based aggregations are handled differently + for (ASTNode wdwFn : wdwFns) { + WindowingSpec spec = qb.getWindowingSpec(dest); + if(spec == null) { + queryProperties.setHasWindowing(true); + spec = new WindowingSpec(); + qb.addDestToWindowingSpec(dest, spec); + } + HashMap<String, ASTNode> wExprsInDest = qb.getParseInfo().getWindowingExprsForClause(dest); + int wColIdx = spec.getWindowExpressions() == null ? 0 : spec.getWindowExpressions().size(); + WindowFunctionSpec wFnSpec = processWindowFunction(wdwFn, + (ASTNode)wdwFn.getChild(wdwFn.getChildCount()-1)); + // If this is a duplicate invocation of a function, don't add it to the WindowingSpec. + if ( wExprsInDest != null && + wExprsInDest.containsKey(wFnSpec.getExpression().toStringTree())) { + continue; + } + wFnSpec.setAlias(wFnSpec.getName() + "_window_" + wColIdx); + spec.addWindowFunction(wFnSpec); + qb.getParseInfo().addWindowingExprToClause(dest, wFnSpec.getExpression()); + } + + return aggregationTrees; + } + + private void doPhase1GetColumnAliasesFromSelect( + ASTNode selectExpr, QBParseInfo qbp) { + for (int i = 0; i < selectExpr.getChildCount(); ++i) { + ASTNode selExpr = (ASTNode) selectExpr.getChild(i); + if ((selExpr.getToken().getType() == HiveParser.TOK_SELEXPR) + && (selExpr.getChildCount() == 2)) { + String columnAlias = unescapeIdentifier(selExpr.getChild(1).getText()); + qbp.setExprToColumnAlias((ASTNode) selExpr.getChild(0), columnAlias); + } + } + } + + /** + * DFS-scan the expressionTree to find all aggregation subtrees and put them + * in aggregations. + * + * @param expressionTree + * @param aggregations + * the key to the map is the toStringTree() representation of + * the aggregation subtree.
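 + * For example, if "sum(c)" appears twice in a select list, both subtrees produce the same toStringTree() key and only one entry is kept.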
+ * @throws SemanticException + */ + private void doPhase1GetAllAggregations(ASTNode expressionTree, + HashMap<String, ASTNode> aggregations, List<ASTNode> wdwFns) throws SemanticException { + int exprTokenType = expressionTree.getToken().getType(); + if (exprTokenType == HiveParser.TOK_FUNCTION + || exprTokenType == HiveParser.TOK_FUNCTIONDI + || exprTokenType == HiveParser.TOK_FUNCTIONSTAR) { + assert (expressionTree.getChildCount() != 0); + if (expressionTree.getChild(expressionTree.getChildCount()-1).getType() + == HiveParser.TOK_WINDOWSPEC) { + // If it is a windowing spec, we include it in the list + // Further, we will examine its children AST nodes to check whether + // there are aggregation functions within + wdwFns.add(expressionTree); + doPhase1GetAllAggregations((ASTNode) expressionTree.getChild(expressionTree.getChildCount()-1), + aggregations, wdwFns); + return; + } + if (expressionTree.getChild(0).getType() == HiveParser.Identifier) { + String functionName = unescapeIdentifier(expressionTree.getChild(0) + .getText()); + // Validate the function name + if (FunctionRegistry.getFunctionInfo(functionName) == null) { + throw new SemanticException(ErrorMsg.INVALID_FUNCTION.getMsg(functionName)); + } + if(FunctionRegistry.impliesOrder(functionName)) { + throw new SemanticException(ErrorMsg.MISSING_OVER_CLAUSE.getMsg(functionName)); + } + if (FunctionRegistry.getGenericUDAFResolver(functionName) != null) { + if(containsLeadLagUDF(expressionTree)) { + throw new SemanticException(ErrorMsg.MISSING_OVER_CLAUSE.getMsg(functionName)); + } + aggregations.put(expressionTree.toStringTree(), expressionTree); + FunctionInfo fi = FunctionRegistry.getFunctionInfo(functionName); + if (!fi.isNative()) { + unparseTranslator.addIdentifierTranslation((ASTNode) expressionTree + .getChild(0)); + } + return; + } + } + } + for (int i = 0; i < expressionTree.getChildCount(); i++) { + doPhase1GetAllAggregations((ASTNode) expressionTree.getChild(i), + aggregations, wdwFns); + } + } + + private List<ASTNode> doPhase1GetDistinctFuncExprs( + HashMap<String, ASTNode> aggregationTrees) throws SemanticException { + List<ASTNode> exprs = new ArrayList<ASTNode>(); + for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) { + ASTNode value = entry.getValue(); + assert (value != null); + if (value.getToken().getType() == HiveParser.TOK_FUNCTIONDI) { + exprs.add(value); + } + } + return exprs; + } + + public static String generateErrorMessage(ASTNode ast, String message) { + StringBuilder sb = new StringBuilder(); + if (ast == null) { + sb.append(message).append(". Cannot tell the position of null AST."); + return sb.toString(); + } + sb.append(ast.getLine()); + sb.append(":"); + sb.append(ast.getCharPositionInLine()); + sb.append(" "); + sb.append(message); + sb.append(". Error encountered near token '"); + sb.append(ErrorMsg.getText(ast)); + sb.append("'"); + return sb.toString(); + } + + ASTNode getAST() { + return this.ast; + } + + protected void setAST(ASTNode newAST) { + this.ast = newAST; + } + + /** + * Goes through the tabref tree and finds the alias for the table. Once found, + * it records the table name -> alias association in aliasToTabs. It also makes + * an association from the alias to the table AST in parse info.
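 + * For example, given "FROM db1.t1 src", the name db1.t1 is recorded under the alias src.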
+ * + * @return the alias of the table + */ + private String processTable(QB qb, ASTNode tabref) throws SemanticException { + // For each table reference get the table name + // and the alias (if alias is not present, the table name + // is used as an alias) + int aliasIndex = 0; + int propsIndex = -1; + int tsampleIndex = -1; + int ssampleIndex = -1; + for (int index = 1; index < tabref.getChildCount(); index++) { + ASTNode ct = (ASTNode) tabref.getChild(index); + if (ct.getToken().getType() == HiveParser.TOK_TABLEBUCKETSAMPLE) { + tsampleIndex = index; + } else if (ct.getToken().getType() == HiveParser.TOK_TABLESPLITSAMPLE) { + ssampleIndex = index; + } else if (ct.getToken().getType() == HiveParser.TOK_TABLEPROPERTIES) { + propsIndex = index; + } else { + aliasIndex = index; + } + } + + ASTNode tableTree = (ASTNode) (tabref.getChild(0)); + + String tabIdName = getUnescapedName(tableTree).toLowerCase(); + + String alias; + if (aliasIndex != 0) { + alias = unescapeIdentifier(tabref.getChild(aliasIndex).getText()); + } + else { + alias = getUnescapedUnqualifiedTableName(tableTree); + } + + if (propsIndex >= 0) { + Tree propsAST = tabref.getChild(propsIndex); + Map<String, String> props = DDLSemanticAnalyzer.getProps((ASTNode) propsAST.getChild(0)); + // We get the information from Calcite. + if ("TRUE".equals(props.get("insideView"))) { + qb.getAliasInsideView().add(alias.toLowerCase()); + } + qb.setTabProps(alias, props); + } + + // If the alias is already there then we have a conflict + if (qb.exists(alias)) { + throw new SemanticException(ErrorMsg.AMBIGUOUS_TABLE_ALIAS.getMsg(tabref + .getChild(aliasIndex))); + } + if (tsampleIndex >= 0) { + ASTNode sampleClause = (ASTNode) tabref.getChild(tsampleIndex); + ArrayList<ASTNode> sampleCols = new ArrayList<ASTNode>(); + if (sampleClause.getChildCount() > 2) { + for (int i = 2; i < sampleClause.getChildCount(); i++) { + sampleCols.add((ASTNode) sampleClause.getChild(i)); + } + } + // TODO: For now only support sampling on up to two columns + // Need to change it to list of columns + if (sampleCols.size() > 2) { + throw new SemanticException(generateErrorMessage( + (ASTNode) tabref.getChild(0), + ErrorMsg.SAMPLE_RESTRICTION.getMsg())); + } + TableSample tabSample = new TableSample( + unescapeIdentifier(sampleClause.getChild(0).getText()), + unescapeIdentifier(sampleClause.getChild(1).getText()), + sampleCols); + qb.getParseInfo().setTabSample(alias, tabSample); + if (unparseTranslator.isEnabled()) { + for (ASTNode sampleCol : sampleCols) { + unparseTranslator.addIdentifierTranslation((ASTNode) sampleCol + .getChild(0)); + } + } + } else if (ssampleIndex >= 0) { + ASTNode sampleClause = (ASTNode) tabref.getChild(ssampleIndex); + + Tree type = sampleClause.getChild(0); + Tree numerator = sampleClause.getChild(1); + String value = unescapeIdentifier(numerator.getText()); + + + SplitSample sample; + if (type.getType() == HiveParser.TOK_PERCENT) { + assertCombineInputFormat(numerator, "Percentage"); + Double percent = Double.valueOf(value).doubleValue(); + if (percent < 0 || percent > 100) { + throw new SemanticException(generateErrorMessage((ASTNode) numerator, + "Sampling percentage should be between 0 and 100")); + } + int seedNum = conf.getIntVar(ConfVars.HIVESAMPLERANDOMNUM); + sample = new SplitSample(percent, seedNum); + } else if (type.getType() == HiveParser.TOK_ROWCOUNT) { + sample = new SplitSample(Integer.parseInt(value)); + } else { + assert type.getType() == HiveParser.TOK_LENGTH; + assertCombineInputFormat(numerator, "Total Length"); 
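 + // e.g. the length literal "100M" is parsed as 100 shifted left by 20 bits (bytes); k/K, m/M and g/G scale by 2^10, 2^20 and 2^30 below.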
+ long length = Integer.parseInt(value.substring(0, value.length() - 1)); + char last = value.charAt(value.length() - 1); + if (last == 'k' || last == 'K') { + length <<= 10; + } else if (last == 'm' || last == 'M') { + length <<= 20; + } else if (last == 'g' || last == 'G') { + length <<= 30; + } + int seedNum = conf.getIntVar(ConfVars.HIVESAMPLERANDOMNUM); + sample = new SplitSample(length, seedNum); + } + String alias_id = getAliasId(alias, qb); + nameToSplitSample.put(alias_id, sample); + } + // Insert this map into the stats + qb.setTabAlias(alias, tabIdName); + if (qb.isInsideView()) { + qb.getAliasInsideView().add(alias.toLowerCase()); + } + qb.addAlias(alias); + + qb.getParseInfo().setSrcForAlias(alias, tableTree); + + // if alias to CTE contains the alias, we do not do the translation because + // cte is actually a subquery. + if (!this.aliasToCTEs.containsKey(alias)) { + unparseTranslator.addTableNameTranslation(tableTree, SessionState.get().getCurrentDatabase()); + if (aliasIndex != 0) { + unparseTranslator.addIdentifierTranslation((ASTNode) tabref.getChild(aliasIndex)); + } + } + + return alias; + } + + Map<String, SplitSample> getNameToSplitSampleMap() { + return this.nameToSplitSample; + } + + /** + * Convert a string to Text format and write its bytes in the same way TextOutputFormat would do. + * This is needed to properly encode non-ascii characters. + */ + private static void writeAsText(String text, FSDataOutputStream out) throws IOException { + Text to = new Text(text); + out.write(to.getBytes(), 0, to.getLength()); + } + + /** + * Generate a temp table out of a value clause + * See also {@link #preProcessForInsert(ASTNode, QB)} + */ + private ASTNode genValuesTempTable(ASTNode originalFrom, QB qb) throws SemanticException { + Path dataDir = null; + if(!qb.getEncryptedTargetTablePaths().isEmpty()) { + //currently only Insert into T values(...) is supported thus only 1 values clause + //and only 1 target table are possible. If/when support for + //select ... from values(...) is added an insert statement may have multiple + //encrypted target tables. + dataDir = ctx.getMRTmpPath(qb.getEncryptedTargetTablePaths().get(0).toUri()); + } + // Pick a name for the table + SessionState ss = SessionState.get(); + String tableName = VALUES_TMP_TABLE_NAME_PREFIX + ss.getNextValuesTempTableSuffix(); + + // Step 1, parse the values clause we were handed + List<? extends Node> fromChildren = originalFrom.getChildren(); + // First child should be the virtual table ref + ASTNode virtualTableRef = (ASTNode)fromChildren.get(0); + assert virtualTableRef.getToken().getType() == HiveParser.TOK_VIRTUAL_TABREF : + "Expected first child of TOK_VIRTUAL_TABLE to be TOK_VIRTUAL_TABREF but was " + + virtualTableRef.getName(); + + List<? extends Node> virtualTableRefChildren = virtualTableRef.getChildren(); + // First child of this should be the table name. If it's anonymous, + // then we don't have a table name. + ASTNode tabName = (ASTNode)virtualTableRefChildren.get(0); + if (tabName.getToken().getType() != HiveParser.TOK_ANONYMOUS) { + // TODO, if you want to make select ... from (values(...) as foo(...) 
work, + // you need to parse this list of column names and build it into the table + throw new SemanticException(ErrorMsg.VALUES_TABLE_CONSTRUCTOR_NOT_SUPPORTED.getMsg()); + } + + // The second child of the TOK_VIRTUAL_TABLE should be TOK_VALUES_TABLE + ASTNode valuesTable = (ASTNode)fromChildren.get(1); + assert valuesTable.getToken().getType() == HiveParser.TOK_VALUES_TABLE : + "Expected second child of TOK_VIRTUAL_TABLE to be TOK_VALUES_TABLE but was " + + valuesTable.getName(); + // Each of the children of TOK_VALUES_TABLE will be a TOK_VALUE_ROW + List<? extends Node> valuesTableChildren = valuesTable.getChildren(); + + // Now that we're going to start reading through the rows, open a file to write the rows to. + // If we leave this method before creating the temporary table we need to be sure to clean up + // this file. + Path tablePath = null; + FileSystem fs = null; + FSDataOutputStream out = null; + try { + if(dataDir == null) { + tablePath = Warehouse.getDnsPath(new Path(ss.getTempTableSpace(), tableName), conf); + } + else { + //if target table of insert is encrypted, make sure temporary table data is stored + //similarly encrypted + tablePath = Warehouse.getDnsPath(new Path(dataDir, tableName), conf); + } + fs = tablePath.getFileSystem(conf); + fs.mkdirs(tablePath); + Path dataFile = new Path(tablePath, "data_file"); + out = fs.create(dataFile); + List<FieldSchema> fields = new ArrayList<FieldSchema>(); + + boolean firstRow = true; + for (Node n : valuesTableChildren) { + ASTNode valuesRow = (ASTNode) n; + assert valuesRow.getToken().getType() == HiveParser.TOK_VALUE_ROW : + "Expected child of TOK_VALUES_TABLE to be TOK_VALUE_ROW but was " + valuesRow.getName(); + // Each of the children of this should be a literal + List<? extends Node> valuesRowChildren = valuesRow.getChildren(); + boolean isFirst = true; + int nextColNum = 1; + for (Node n1 : valuesRowChildren) { + ASTNode value = (ASTNode) n1; + if (firstRow) { + fields.add(new FieldSchema("tmp_values_col" + nextColNum++, "string", "")); + } + if (isFirst) isFirst = false; + else writeAsText("\u0001", out); + writeAsText(unparseExprForValuesClause(value), out); + } + writeAsText("\n", out); + firstRow = false; + } + + // Step 2, create a temp table, using the created file as the data + StorageFormat format = new StorageFormat(conf); + format.processStorageFormat("TextFile"); + Table table = db.newTable(tableName); + table.setSerializationLib(format.getSerde()); + table.setFields(fields); + table.setDataLocation(tablePath); + table.getTTable().setTemporary(true); + table.setStoredAsSubDirectories(false); + table.setInputFormatClass(format.getInputFormat()); + table.setOutputFormatClass(format.getOutputFormat()); + db.createTable(table, false); + } catch (Exception e) { + String errMsg = ErrorMsg.INSERT_CANNOT_CREATE_TEMP_FILE.getMsg() + e.getMessage(); + LOG.error(errMsg); + // Try to delete the file + if (fs != null && tablePath != null) { + try { + fs.delete(tablePath, false); + } catch (IOException swallowIt) {} + } + throw new SemanticException(errMsg, e); + } finally { + IOUtils.closeStream(out); + } + + // Step 3, return a new subtree with a from clause built around that temp table + // The form of the tree is TOK_TABREF->TOK_TABNAME->identifier(tablename) + Token t = new ClassicToken(HiveParser.TOK_TABREF); + ASTNode tabRef = new ASTNode(t); + t = new ClassicToken(HiveParser.TOK_TABNAME); + ASTNode tabNameNode = new ASTNode(t); + tabRef.addChild(tabNameNode); + t = new ClassicToken(HiveParser.Identifier, tableName); +
ASTNode identifier = new ASTNode(t); + tabNameNode.addChild(identifier); + return tabRef; + } + + // Take an expression in the values clause and turn it back into a string. This is far from + // comprehensive. At the moment it only supports: + // * literals (all types) + // * unary negatives + // * true/false + private String unparseExprForValuesClause(ASTNode expr) throws SemanticException { + switch (expr.getToken().getType()) { + case HiveParser.Number: + return expr.getText(); + + case HiveParser.StringLiteral: + return BaseSemanticAnalyzer.unescapeSQLString(expr.getText()); + + case HiveParser.KW_FALSE: + // UDFToBoolean casts any non-empty string to true, so set this to false + return ""; + + case HiveParser.KW_TRUE: + return "TRUE"; + + case HiveParser.MINUS: + return "-" + unparseExprForValuesClause((ASTNode)expr.getChildren().get(0)); + + case HiveParser.TOK_NULL: + // Hive's text input will translate this as a null + return "\\N"; + + default: + throw new SemanticException("Expression of type " + expr.getText() + + " not supported in insert/values"); + } + + } + + private void assertCombineInputFormat(Tree numerator, String message) throws SemanticException { + String inputFormat = conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") ? + HiveConf.getVar(conf, HiveConf.ConfVars.HIVETEZINPUTFORMAT): + HiveConf.getVar(conf, HiveConf.ConfVars.HIVEINPUTFORMAT); + if (!inputFormat.equals(CombineHiveInputFormat.class.getName())) { + throw new SemanticException(generateErrorMessage((ASTNode) numerator, + message + " sampling is not supported in " + inputFormat)); + } + } + + private String processSubQuery(QB qb, ASTNode subq) throws SemanticException { + + // This is a subquery and must have an alias + if (subq.getChildCount() != 2) { + throw new SemanticException(ErrorMsg.NO_SUBQUERY_ALIAS.getMsg(subq)); + } + ASTNode subqref = (ASTNode) subq.getChild(0); + String alias = unescapeIdentifier(subq.getChild(1).getText()); + + // Recursively do the first phase of semantic analysis for the subquery + QBExpr qbexpr = new QBExpr(alias); + + doPhase1QBExpr(subqref, qbexpr, qb.getId(), alias); + + // If the alias is already there then we have a conflict + if (qb.exists(alias)) { + throw new SemanticException(ErrorMsg.AMBIGUOUS_TABLE_ALIAS.getMsg(subq + .getChild(1))); + } + // Insert this map into the stats + qb.setSubqAlias(alias, qbexpr); + qb.addAlias(alias); + + unparseTranslator.addIdentifierTranslation((ASTNode) subq.getChild(1)); + + return alias; + } + + /* + * Phase1: hold onto any CTE definitions in aliasToCTE. + * CTE definitions are global to the Query. + */ + private void processCTE(QB qb, ASTNode ctes) throws SemanticException { + + int numCTEs = ctes.getChildCount(); + + for(int i=0; i <numCTEs; i++) { + ASTNode cte = (ASTNode) ctes.getChild(i); + ASTNode cteQry = (ASTNode) cte.getChild(0); + String alias = unescapeIdentifier(cte.getChild(1).getText()); + + String qName = qb.getId() == null ? "" : qb.getId() + ":"; + qName += alias.toLowerCase(); + + if ( aliasToCTEs.containsKey(qName)) { + throw new SemanticException(ErrorMsg.AMBIGUOUS_TABLE_ALIAS.getMsg(cte.getChild(1))); + } + aliasToCTEs.put(qName, new CTEClause(qName, cteQry)); + } + } + + /* + * We allow CTE definitions in views. So we can end up with a hierarchy of CTE definitions: + * - at the top level of a query statement + * - where a view is referenced. + * - views may refer to other views. + * + * The scoping rules we use are: to search for a CTE from the current QB outwards. 
In order to + disambiguate between CTEs at different levels, we qualify (prefix) them with the id of the QB + they appear in when adding them to the <code>aliasToCTEs</code> map. + * For example, a CTE named "src" defined in a QB with id "v1" is keyed as "v1:src". + */ + private CTEClause findCTEFromName(QB qb, String cteName) { + StringBuilder qId = new StringBuilder(); + if (qb.getId() != null) { + qId.append(qb.getId()); + } + + while (qId.length() > 0) { + String nm = qId + ":" + cteName; + CTEClause cte = aliasToCTEs.get(nm); + if (cte != null) { + return cte; + } + int lastIndex = qId.lastIndexOf(":"); + lastIndex = lastIndex < 0 ? 0 : lastIndex; + qId.setLength(lastIndex); + } + return aliasToCTEs.get(cteName); + } + + /* + * If a CTE is referenced in a QueryBlock: + * - add it as a SubQuery for now. + * - SQ.alias is the alias used in the QB. (if no alias is specified, + * it uses the CTE name. Works just like table references) + * - Adding the SQ is done by: + * - copying the AST of the CTE + * - setting ASTOrigin on the cloned AST. + * - trigger phase 1 on the new QBExpr. + * - update QB data structs: remove this as a table reference, move it to a SQ invocation. + */ + private void addCTEAsSubQuery(QB qb, String cteName, String cteAlias) + throws SemanticException { + cteAlias = cteAlias == null ? cteName : cteAlias; + CTEClause cte = findCTEFromName(qb, cteName); + ASTNode cteQryNode = cte.cteNode; + QBExpr cteQBExpr = new QBExpr(cteAlias); + doPhase1QBExpr(cteQryNode, cteQBExpr, qb.getId(), cteAlias); + qb.rewriteCTEToSubq(cteAlias, cteName, cteQBExpr); + } + + private final CTEClause rootClause = new CTEClause(null, null); + + @Override + public List<Task<? extends Serializable>> getAllRootTasks() { + if (!rootTasksResolved) { + rootTasks = toRealRootTasks(rootClause.asExecutionOrder()); + rootTasksResolved = true; + } + return rootTasks; + } + + @Override + public HashSet<ReadEntity> getAllInputs() { + HashSet<ReadEntity> readEntities = new HashSet<ReadEntity>(getInputs()); + for (CTEClause cte : rootClause.asExecutionOrder()) { + if (cte.source != null) { + readEntities.addAll(cte.source.getInputs()); + } + } + return readEntities; + } + + @Override + public HashSet<WriteEntity> getAllOutputs() { + HashSet<WriteEntity> writeEntities = new HashSet<WriteEntity>(getOutputs()); + for (CTEClause cte : rootClause.asExecutionOrder()) { + if (cte.source != null) { + writeEntities.addAll(cte.source.getOutputs()); + } + } + return writeEntities; + } + + class CTEClause { + CTEClause(String alias, ASTNode cteNode) { + this.alias = alias; + this.cteNode = cteNode; + } + String alias; + ASTNode cteNode; + boolean materialize; + int reference; + QBExpr qbExpr; + List<CTEClause> parents = new ArrayList<CTEClause>(); + + // materialized + Table table; + SemanticAnalyzer source; + + List<Task<? extends Serializable>> getTasks() { + return source == null ? null : source.rootTasks; + } + + List<CTEClause> asExecutionOrder() { + List<CTEClause> execution = new ArrayList<CTEClause>(); + asExecutionOrder(new HashSet<CTEClause>(), execution); + return execution; + } + + void asExecutionOrder(Set<CTEClause> visited, List<CTEClause> execution) { + for (CTEClause parent : parents) { + if (visited.add(parent)) { + parent.asExecutionOrder(visited, execution); + } + } + execution.add(this); + } + + @Override + public String toString() { + return alias == null ? "<root>" : alias; + } + } + + private List<Task<? extends Serializable>> toRealRootTasks(List<CTEClause> execution) { + List<Task<? extends Serializable>> cteRoots = new ArrayList<>(); + List<Task<?
extends Serializable>> cteLeafs = new ArrayList<>(); + List<Task<? extends Serializable>> curTopRoots = null; + List<Task<? extends Serializable>> curBottomLeafs = null; + for (int i = 0; i < execution.size(); i++) { + CTEClause current = execution.get(i); + if (current.parents.isEmpty() && curTopRoots != null) { + cteRoots.addAll(curTopRoots); + cteLeafs.addAll(curBottomLeafs); + curTopRoots = curBottomLeafs = null; + } + List<Task<? extends Serializable>> curTasks = current.getTasks(); + if (curTasks == null) { + continue; + } + if (curTopRoots == null) { + curTopRoots = curTasks; + } + if (curBottomLeafs != null) { + for (Task<?> topLeafTask : curBottomLeafs) { + for (Task<?> currentRootTask : curTasks) { + topLeafTask.addDependentTask(currentRootTask); + } + } + } + curBottomLeafs = Task.findLeafs(curTasks); + } + if (curTopRoots != null) { + cteRoots.addAll(curTopRoots); + cteLeafs.addAll(curBottomLeafs); + } + + if (cteRoots.isEmpty()) { + return rootTasks; + } + for (Task<?> cteLeafTask : cteLeafs) { + for (Task<?> mainRootTask : rootTasks) { + cteLeafTask.addDependentTask(mainRootTask); + } + } + return cteRoots; + } + + Table materializeCTE(String cteName, CTEClause cte) throws HiveException { + + ASTNode createTable = new ASTNode(new ClassicToken(HiveParser.TOK_CREATETABLE)); + + ASTNode tableName = new ASTNode(new ClassicToken(HiveParser.TOK_TABNAME)); + tableName.addChild(new ASTNode(new ClassicToken(HiveParser.Identifier, cteName))); + + ASTNode temporary = new ASTNode(new ClassicToken(HiveParser.KW_TEMPORARY, MATERIALIZATION_MARKER)); + + createTable.addChild(tableName); + createTable.addChild(temporary); + createTable.addChild(cte.cteNode); + + SemanticAnalyzer analyzer = new SemanticAnalyzer(queryState); + analyzer.initCtx(ctx); + analyzer.init(false); + + // should share cte contexts + analyzer.aliasToCTEs.putAll(aliasToCTEs); + + HiveOperation operation = queryState.getHiveOperation(); + try { + analyzer.analyzeInternal(createTable); + } finally { + queryState.setCommandType(operation); + } + + Table table = analyzer.tableDesc.toTable(conf); + Path location = table.getDataLocation(); + try { + location.getFileSystem(conf).mkdirs(location); + } catch (IOException e) { + throw new HiveException(e); + } + table.setMaterializedTable(true); + + LOG.info(cteName + " will be materialized into " + location); + cte.table = table; + cte.source = analyzer; + + ctx.addMaterializedTable(cteName, table); + + return table; + } + + + static boolean isJoinToken(ASTNode node) { + if ((node.getToken().getType() == HiveParser.TOK_JOIN) + || (node.getToken().getType() == HiveParser.TOK_CROSSJOIN) + || isOuterJoinToken(node) + || (node.getToken().getType() == HiveParser.TOK_LEFTSEMIJOIN) + || (node.getToken().getType() == HiveParser.TOK_UNIQUEJOIN)) { + return true; + } + + return false; + } + + static private boolean isOuterJoinToken(ASTNode node) { + return (node.getToken().getType() == HiveParser.TOK_LEFTOUTERJOIN) + || (node.getToken().getType() == HiveParser.TOK_RIGHTOUTERJOIN) + || (node.getToken().getType() == HiveParser.TOK_FULLOUTERJOIN); + } + + /** + * Given the AST with TOK_JOIN as the root, get all the aliases for the tables + * or subqueries in the join. 
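 + * For example, "a JOIN b JOIN c" arrives as a nested TOK_JOIN tree; each TOK_TABREF child is registered via processTable, and nested join tokens recurse back into processJoin.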
+ * + * @param qb + * @param join + * @throws SemanticException + */ + @SuppressWarnings("nls") + private void processJoin(QB qb, ASTNode join) throws SemanticException { + int numChildren = join.getChildCount(); + if ((numChildren != 2) && (numChildren != 3) + && join.getToken().getType() != HiveParser.TOK_UNIQUEJOIN) { + throw new SemanticException(generateErrorMessage(join, + "Join with multiple children")); + } + + queryProperties.incrementJoinCount(isOuterJoinToken(join)); + for (int num = 0; num < numChildren; num++) { + ASTNode child = (ASTNode) join.getChild(num); + if (child.getToken().getType() == HiveParser.TOK_TABREF) { + processTable(qb, child); + } else if (child.getToken().getType() == HiveParser.TOK_SUBQUERY) { + processSubQuery(qb, child); + } else if (child.getToken().getType() == HiveParser.TOK_PTBLFUNCTION) { + queryProperties.setHasPTF(true); + processPTF(qb, child); + PTFInvocationSpec ptfInvocationSpec = qb.getPTFInvocationSpec(child); + String inputAlias = ptfInvocationSpec == null ? null : + ptfInvocationSpec.getFunction().getAlias(); + if ( inputAlias == null ) { + throw new SemanticException(generateErrorMessage(child, + "PTF invocation in a Join must have an alias")); + } + + } else if (child.getToken().getType() == HiveParser.TOK_LATERAL_VIEW || + child.getToken().getType() == HiveParser.TOK_LATERAL_VIEW_OUTER) { + // SELECT * FROM src1 LATERAL VIEW udtf() AS myTable JOIN src2 ... + // is not supported. Instead, the lateral view must be in a subquery + // SELECT * FROM (SELECT * FROM src1 LATERAL VIEW udtf() AS myTable) a + // JOIN src2 ... + throw new SemanticException(ErrorMsg.LATERAL_VIEW_WITH_JOIN + .getMsg(join)); + } else if (isJoinToken(child)) { + processJoin(qb, child); + } + } + } + + /** + * Given the AST with TOK_LATERAL_VIEW as the root, get the alias for the + * table or subquery in the lateral view and also make a mapping from the + * alias to all the lateral view AST's. + * + * @param qb + * @param lateralView + * @return the alias for the table/subquery + * @throws SemanticException + */ + + private String processLateralView(QB qb, ASTNode lateralView) + throws SemanticException { + int numChildren = lateralView.getChildCount(); + + assert (numChildren == 2); + ASTNode next = (ASTNode) lateralView.getChild(1); + + String alias = null; + + switch (next.getToken().getType()) { + case HiveParser.TOK_TABREF: + alias = processTable(qb, next); + break; + case HiveParser.TOK_SUBQUERY: + alias = processSubQuery(qb, next); + break; + case HiveParser.TOK_LATERAL_VIEW: + case HiveParser.TOK_LATERAL_VIEW_OUTER: + alias = processLateralView(qb, next); + break; + default: + throw new SemanticException(ErrorMsg.LATERAL_VIEW_INVALID_CHILD + .getMsg(lateralView)); + } + alias = alias.toLowerCase(); + qb.getParseInfo().addLateralViewForAlias(alias, lateralView); + qb.addAlias(alias); + return alias; + } + + /** + * Phase 1: (including, but not limited to): + * + * 1. Gets all the aliases for all the tables / subqueries and makes the appropriate mapping in aliasToTabs, aliasToSubq. + * 2. Gets the location of the destination and names the clause "insclause-" + i. + * 3. Creates a map from a string representation of an aggregation tree to the actual aggregation AST. + * 4. Creates a mapping from the clause name to the select expression AST in destToSelExpr. + * 5.
Creates a mapping from a table alias to the lateral view + * AST's in aliasToLateralViews + * + * @param ast + * @param qb + * @param ctx_1 + * @throws SemanticException + */ + @SuppressWarnings({"fallthrough", "nls"}) + public boolean doPhase1(ASTNode ast, QB qb, Phase1Ctx ctx_1, PlannerContext plannerCtx) + throws SemanticException { + + boolean phase1Result = true; + QBParseInfo qbp = qb.getParseInfo(); + boolean skipRecursion = false; + + if (ast.getToken() != null) { + skipRecursion = true; + switch (ast.getToken().getType()) { + case HiveParser.TOK_SELECTDI: + qb.countSelDi(); + // fall through + case HiveParser.TOK_SELECT: + qb.countSel(); + qbp.setSelExprForClause(ctx_1.dest, ast); + + int posn = 0; + if (((ASTNode) ast.getChild(0)).getToken().getType() == HiveParser.TOK_HINTLIST) { + qbp.setHints((ASTNode) ast.getChild(0)); + posn++; + } + + if ((ast.getChild(posn).getChild(0).getType() == HiveParser.TOK_TRANSFORM)) + queryProperties.setUsesScript(true); + + LinkedHashMap<String, ASTNode> aggregations = doPhase1GetAggregationsFromSelect(ast, + qb, ctx_1.dest); + doPhase1GetColumnAliasesFromSelect(ast, qbp); + qbp.setAggregationExprsForClause(ctx_1.dest, aggregations); + qbp.setDistinctFuncExprsForClause(ctx_1.dest, + doPhase1GetDistinctFuncExprs(aggregations)); + break; + + case HiveParser.TOK_WHERE: + qbp.setWhrExprForClause(ctx_1.dest, ast); + if (!SubQueryUtils.findSubQueries((ASTNode) ast.getChild(0)).isEmpty()) + queryProperties.setFilterWithSubQuery(true); + break; + + case HiveParser.TOK_INSERT_INTO: + String currentDatabase = SessionState.get().getCurrentDatabase(); + String tab_name = getUnescapedName((ASTNode) ast.getChild(0).getChild(0), currentDatabase); + qbp.addInsertIntoTable(tab_name, ast); + // fall through to TOK_DESTINATION + + case HiveParser.TOK_DESTINATION: + ctx_1.dest = "insclause-" + ctx_1.nextNum; + ctx_1.nextNum++; + boolean isTmpFileDest = false; + if (ast.getChildCount() > 0 && ast.getChild(0) instanceof ASTNode) { + ASTNode ch = (ASTNode) ast.getChild(0); + if (ch.getToken().getType() == HiveParser.TOK_DIR && ch.getChildCount() > 0 + && ch.getChild(0) instanceof ASTNode) { + ch = (ASTNode) ch.getChild(0); + isTmpFileDest = ch.getToken().getType() == HiveParser.TOK_TMP_FILE; + } else { + if (ast.getToken().getType() == HiveParser.TOK_DESTINATION + && ast.getChild(0).getType() == HiveParser.TOK_TAB) { + String fullTableName = getUnescapedName((ASTNode) ast.getChild(0).getChild(0), + SessionState.get().getCurrentDatabase()); + qbp.getInsertOverwriteTables().put(fullTableName, ast); + } + } + } + + // is there an insert in the subquery? + if (qbp.getIsSubQ() && !isTmpFileDest) { + throw new SemanticException(ErrorMsg.NO_INSERT_INSUBQUERY.getMsg(ast)); + } + + if (plannerCtx != null) { + plannerCtx.setInsertToken(ast, isTmpFileDest); + } + + qbp.setDestForClause(ctx_1.dest, (ASTNode) ast.getChild(0)); + handleInsertStatementSpecPhase1(ast, qbp, ctx_1); + if (qbp.getClauseNamesForDest().size() > 1) { + queryProperties.setMultiDestQuery(true); + } + break; + + case HiveParser.TOK_FROM: + int child_count = ast.getChildCount(); + if (child_count != 1) { + throw new SemanticException(generateErrorMessage(ast, + "Multiple Children " + child_count)); + } + + // Check if this is a subquery / lateral view + ASTNode frm = (ASTNode) ast.getChild(0); + if (frm.getToken().getType() == HiveParser.TOK_TABREF) { + processTable(qb, frm); + } else if (frm.getToken().getType() == HiveParser.TOK_VIRTUAL_TABLE) { + // Create a temp table with the passed values in it, then rewrite this portion of the + // tree to be
+      case HiveParser.TOK_FROM:
+        int child_count = ast.getChildCount();
+        if (child_count != 1) {
+          throw new SemanticException(generateErrorMessage(ast,
+              "Multiple Children " + child_count));
+        }
+
+        // Check if this is a subquery / lateral view
+        ASTNode frm = (ASTNode) ast.getChild(0);
+        if (frm.getToken().getType() == HiveParser.TOK_TABREF) {
+          processTable(qb, frm);
+        } else if (frm.getToken().getType() == HiveParser.TOK_VIRTUAL_TABLE) {
+          // Create a temp table with the passed values in it then rewrite this portion of the
+          // tree to be from that table.
+          ASTNode newFrom = genValuesTempTable(frm, qb);
+          ast.setChild(0, newFrom);
+          processTable(qb, newFrom);
+        } else if (frm.getToken().getType() == HiveParser.TOK_SUBQUERY) {
+          processSubQuery(qb, frm);
+        } else if (frm.getToken().getType() == HiveParser.TOK_LATERAL_VIEW ||
+            frm.getToken().getType() == HiveParser.TOK_LATERAL_VIEW_OUTER) {
+          queryProperties.setHasLateralViews(true);
+          processLateralView(qb, frm);
+        } else if (isJoinToken(frm)) {
+          processJoin(qb, frm);
+          qbp.setJoinExpr(frm);
+        } else if (frm.getToken().getType() == HiveParser.TOK_PTBLFUNCTION) {
+          queryProperties.setHasPTF(true);
+          processPTF(qb, frm);
+        }
+        break;
+
+      case HiveParser.TOK_CLUSTERBY:
+        // Get the clusterby aliases - these are aliased to the entries in the
+        // select list
+        queryProperties.setHasClusterBy(true);
+        qbp.setClusterByExprForClause(ctx_1.dest, ast);
+        break;
+
+      case HiveParser.TOK_DISTRIBUTEBY:
+        // Get the distribute by aliases - these are aliased to the entries in
+        // the select list
+        queryProperties.setHasDistributeBy(true);
+        qbp.setDistributeByExprForClause(ctx_1.dest, ast);
+        if (qbp.getClusterByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.CLUSTERBY_DISTRIBUTEBY_CONFLICT.getMsg()));
+        } else if (qbp.getOrderByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.ORDERBY_DISTRIBUTEBY_CONFLICT.getMsg()));
+        }
+        break;
+
+      case HiveParser.TOK_SORTBY:
+        // Get the sort by aliases - these are aliased to the entries in the
+        // select list
+        queryProperties.setHasSortBy(true);
+        qbp.setSortByExprForClause(ctx_1.dest, ast);
+        if (qbp.getClusterByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.CLUSTERBY_SORTBY_CONFLICT.getMsg()));
+        } else if (qbp.getOrderByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.ORDERBY_SORTBY_CONFLICT.getMsg()));
+        }
+
+        break;
+
+      case HiveParser.TOK_ORDERBY:
+        // Get the order by aliases - these are aliased to the entries in the
+        // select list
+        queryProperties.setHasOrderBy(true);
+        qbp.setOrderByExprForClause(ctx_1.dest, ast);
+        if (qbp.getClusterByForClause(ctx_1.dest) != null) {
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.CLUSTERBY_ORDERBY_CONFLICT.getMsg()));
+        }
+        break;
+
+      case HiveParser.TOK_GROUPBY:
+      case HiveParser.TOK_ROLLUP_GROUPBY:
+      case HiveParser.TOK_CUBE_GROUPBY:
+      case HiveParser.TOK_GROUPING_SETS:
+        // Get the groupby aliases - these are aliased to the entries in the
+        // select list
+        queryProperties.setHasGroupBy(true);
+        if (qbp.getJoinExpr() != null) {
+          queryProperties.setHasJoinFollowedByGroupBy(true);
+        }
+        if (qbp.getSelForClause(ctx_1.dest).getToken().getType() == HiveParser.TOK_SELECTDI) {
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.SELECT_DISTINCT_WITH_GROUPBY.getMsg()));
+        }
+        qbp.setGroupByExprForClause(ctx_1.dest, ast);
+        skipRecursion = true;
+
+        // Rollup and Cubes are syntactic sugar on top of grouping sets
+        if (ast.getToken().getType() == HiveParser.TOK_ROLLUP_GROUPBY) {
+          qbp.getDestRollups().add(ctx_1.dest);
+        } else if (ast.getToken().getType() == HiveParser.TOK_CUBE_GROUPBY) {
+          qbp.getDestCubes().add(ctx_1.dest);
+        } else if (ast.getToken().getType() == HiveParser.TOK_GROUPING_SETS) {
+          qbp.getDestGroupingSets().add(ctx_1.dest);
+        }
+        break;
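The grouping arm above records rollups, cubes, and grouping sets per destination; as its comment notes, ROLLUP and CUBE are just shorthand for explicit grouping sets. A standalone sketch of that expansion (hypothetical class, not the Hive implementation):

import java.util.ArrayList;
import java.util.List;

public class GroupingSetsSugar {
  // ROLLUP(a,b,c) = grouping sets {(a,b,c), (a,b), (a), ()}: every prefix.
  static List<List<String>> rollup(List<String> cols) {
    List<List<String>> sets = new ArrayList<>();
    for (int n = cols.size(); n >= 0; n--) {
      sets.add(new ArrayList<>(cols.subList(0, n)));
    }
    return sets;
  }

  // CUBE(a,b) = grouping sets over every subset of the columns.
  static List<List<String>> cube(List<String> cols) {
    List<List<String>> sets = new ArrayList<>();
    for (int mask = (1 << cols.size()) - 1; mask >= 0; mask--) {
      List<String> set = new ArrayList<>();
      for (int i = 0; i < cols.size(); i++) {
        if ((mask & (1 << i)) != 0) {
          set.add(cols.get(i));
        }
      }
      sets.add(set);
    }
    return sets;
  }

  public static void main(String[] args) {
    System.out.println(rollup(List.of("a", "b", "c"))); // [[a, b, c], [a, b], [a], []]
    System.out.println(cube(List.of("a", "b")));        // [[a, b], [b], [a], []]
  }
}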
+      case HiveParser.TOK_HAVING:
+        qbp.setHavingExprForClause(ctx_1.dest, ast);
+        qbp.addAggregationExprsForClause(ctx_1.dest,
+            doPhase1GetAggregationsFromSelect(ast, qb, ctx_1.dest));
+        break;
+
+      case HiveParser.KW_WINDOW:
+        if (!qb.hasWindowingSpec(ctx_1.dest)) {
+          throw new SemanticException(generateErrorMessage(ast,
+              "Query has no Cluster/Distribute By; but has a Window definition"));
+        }
+        handleQueryWindowClauses(qb, ctx_1, ast);
+        break;
+
+      case HiveParser.TOK_LIMIT:
+        if (ast.getChildCount() == 2) {
+          qbp.setDestLimit(ctx_1.dest,
+              new Integer(ast.getChild(0).getText()),
+              new Integer(ast.getChild(1).getText()));
+        } else {
+          qbp.setDestLimit(ctx_1.dest, new Integer(0),
+              new Integer(ast.getChild(0).getText()));
+        }
+        break;
+
+      case HiveParser.TOK_ANALYZE:
+        // Case of analyze command
+        String table_name = getUnescapedName((ASTNode) ast.getChild(0).getChild(0)).toLowerCase();
+
+        qb.setTabAlias(table_name, table_name);
+        qb.addAlias(table_name);
+        qb.getParseInfo().setIsAnalyzeCommand(true);
+        qb.getParseInfo().setNoScanAnalyzeCommand(this.noscan);
+        qb.getParseInfo().setPartialScanAnalyzeCommand(this.partialscan);
+        // Allow analyzing the whole table and dynamic partitions
+        HiveConf.setVar(conf, HiveConf.ConfVars.DYNAMICPARTITIONINGMODE, "nonstrict");
+        HiveConf.setVar(conf, HiveConf.ConfVars.HIVEMAPREDMODE, "nonstrict");
+
+        break;
+
+      case HiveParser.TOK_UNIONALL:
+        if (!qbp.getIsSubQ()) {
+          // this shouldn't happen. The parser should have converted the union to be
+          // contained in a subquery. Just in case, we keep the error as a fallback.
+          throw new SemanticException(generateErrorMessage(ast,
+              ErrorMsg.UNION_NOTIN_SUBQ.getMsg()));
+        }
+        skipRecursion = false;
+        break;
+
+      case HiveParser.TOK_INSERT:
+        ASTNode destination = (ASTNode) ast.getChild(0);
+        Tree tab = destination.getChild(0);
+
+        // Proceed if the AST contains a partition spec and IF NOT EXISTS
+        if (destination.getChildCount() == 2 &&
+            tab.getChildCount() == 2 &&
+            destination.getChild(1).getType() == HiveParser.TOK_IFNOTEXISTS) {
+          String tableName = tab.getChild(0).getChild(0).getText();
+
+          Tree partitions = tab.getChild(1);
+          int childCount = partitions.getChildCount();
+          HashMap<String, String> partition = new HashMap<String, String>();
+          for (int i = 0; i < childCount; i++) {
+            String partitionName = partitions.getChild(i).getChild(0).getText();
+            Tree pvalue = partitions.getChild(i).getChild(1);
+            if (pvalue == null) {
+              break;
+            }
+            String partitionVal = stripQuotes(pvalue.getText());
+            partition.put(partitionName, partitionVal);
+          }
+          // if it is a dynamic partition, throw the exception
+          if (childCount != partition.size()) {
+            throw new SemanticException(ErrorMsg.INSERT_INTO_DYNAMICPARTITION_IFNOTEXISTS
+                .getMsg(partition.toString()));
+          }
+          Table table = null;
+          try {
+            table = this.getTableObjectByName(tableName);
+          } catch (HiveException ex) {
+            throw new SemanticException(ex);
+          }
+          try {
+            Partition parMetaData = db.getPartition(table, partition, false);
+            // Check if the partition exists; if it does, skip the overwrite
+            if (parMetaData != null) {
+              phase1Result = false;
+              skipRecursion = true;
+              LOG.info("Partition already exists so insert into overwrite " +
+                  "skipped for partition : " + parMetaData.toString());
+              break;
+            }
+          } catch (HiveException e) {
+            LOG.info("Error while getting metadata : ", e);
+          }
+          validatePartSpec(table, partition, (ASTNode) tab, conf, false);
+        }
+        skipRecursion = false;
+        break;
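The TOK_INSERT arm above only tolerates fully static partition specs for IF NOT EXISTS: a TOK_PARTVAL without a value marks the spec as dynamic, and the size mismatch triggers the error. A standalone sketch of that rule (hypothetical class, not Hive's ASTNode API):

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class IfNotExistsPartCheck {
  // Each entry is {name, value}; a null value marks a dynamic partition column.
  static Map<String, String> staticSpec(List<String[]> partVals) {
    Map<String, String> spec = new LinkedHashMap<>();
    for (String[] pv : partVals) {
      if (pv[1] == null) {
        break; // mirrors the "pvalue == null" break above
      }
      spec.put(pv[0], pv[1]);
    }
    if (spec.size() != partVals.size()) {
      // mirrors INSERT_INTO_DYNAMICPARTITION_IFNOTEXISTS
      throw new IllegalArgumentException(
          "IF NOT EXISTS requires a fully static partition spec, got: " + spec);
    }
    return spec;
  }

  public static void main(String[] args) {
    System.out.println(staticSpec(List.of(new String[]{"ds", "2016-01-01"}))); // {ds=2016-01-01}
    staticSpec(List.of(new String[]{"ds", null}));                             // throws
  }
}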
+      case HiveParser.TOK_LATERAL_VIEW:
+      case HiveParser.TOK_LATERAL_VIEW_OUTER:
+        // todo: nested LV
+        assert ast.getChildCount() == 1;
+        qb.getParseInfo().getDestToLateralView().put(ctx_1.dest, ast);
+        break;
+
+      case HiveParser.TOK_CTE:
+        processCTE(qb, ast);
+        break;
+
+      default:
+        skipRecursion = false;
+        break;
+      }
+    }
+
+    if (!skipRecursion) {
+      // Iterate over the rest of the children
+      int child_count = ast.getChildCount();
+      for (int child_pos = 0; child_pos < child_count && phase1Result; ++child_pos) {
+        // Recurse
+        phase1Result = phase1Result && doPhase1(
+            (ASTNode) ast.getChild(child_pos), qb, ctx_1, plannerCtx);
+      }
+    }
+    return phase1Result;
+  }
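Structurally, doPhase1 is a preorder walk: a recognized token consumes its whole subtree (skipRecursion stays true), unrecognized tokens fall through to the generic child recursion, and a false result from any child stops the descent. A minimal sketch of that control flow (hypothetical Node type, not org.antlr.runtime.tree.Tree):

import java.util.List;

public class PreorderWalk {
  interface Node {
    int tokenType();
    List<Node> children();
  }

  static final int HANDLED_TOKEN = 1; // stands in for TOK_WHERE, TOK_SELECT, ...

  static boolean walk(Node node) {
    boolean result = true;
    boolean skipRecursion = false;
    switch (node.tokenType()) {
    case HANDLED_TOKEN:
      // a clause handled entirely here; do not descend again
      skipRecursion = true;
      break;
    default:
      skipRecursion = false;
      break;
    }
    if (!skipRecursion) {
      for (Node child : node.children()) {
        result = result && walk(child);
        if (!result) {
          break; // mirrors the "&& phase1Result" loop condition above
        }
      }
    }
    return result;
  }
}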
+  /**
+   * This is phase 1 of supporting a column schema in an insert statement:
+   * insert into foo(z,y) select a,b from bar;
+   * @see #handleInsertStatementSpec(java.util.List, String, RowResolver, RowResolver, QB, ASTNode)
+   * @throws SemanticException
+   */
+  private void handleInsertStatementSpecPhase1(ASTNode ast, QBParseInfo qbp, Phase1Ctx ctx_1) throws SemanticException {
+    ASTNode tabColName = (ASTNode) ast.getChild(1);
+    if (ast.getType() == HiveParser.TOK_INSERT_INTO && tabColName != null && tabColName.getType() == HiveParser.TOK_TABCOLNAME) {
+      // we have "insert into foo(a,b)..."; the parser enforces that 1+ columns are listed if TOK_TABCOLNAME is present
+      List<String> targetColNames = new ArrayList<String>();
+      for (Node col : tabColName.getChildren()) {
+        assert ((ASTNode) col).getType() == HiveParser.Identifier :
+            "expected token " + HiveParser.Identifier + " found " + ((ASTNode) col).getType();
+        targetColNames.add(((ASTNode) col).getText());
+      }
+      String fullTableName = getUnescapedName((ASTNode) ast.getChild(0).getChild(0),
+          SessionState.get().getCurrentDatabase());
+      qbp.setDestSchemaForClause(ctx_1.dest, targetColNames);
+      Set<String> targetColumns = new HashSet<String>();
+      targetColumns.addAll(targetColNames);
+      if (targetColNames.size() != targetColumns.size()) {
+        throw new SemanticException(generateErrorMessage(tabColName,
+            "Duplicate column name detected in " + fullTableName + " table schema specification"));
+      }
+      Table targetTable = null;
+      try {
+        targetTable = db.getTable(fullTableName, false);
+      } catch (HiveException ex) {
+        LOG.error("Error processing HiveParser.TOK_DESTINATION: " + ex.getMessage(), ex);
+        throw new SemanticException(ex);
+      }
+      if (targetTable == null) {
+        throw new SemanticException(generateErrorMessage(ast,
+            "Unable to access metadata for table " + fullTableName));
+      }
+      for (FieldSchema f : targetTable.getCols()) {
+        // parser only allows foo(a,b), not foo(foo.a, foo.b)
+        targetColumns.remove(f.getName());
+      }
+      if (!targetColumns.isEmpty()) {
+        // here we need to see if the remaining columns are dynamic partition columns
+        /* We just checked the user specified schema columns among the regular table columns and found
+           some which are not 'regular'. Now check if they are dynamic partition columns.
+           For dynamic partitioning,
+           given "create table multipart(a int, b int) partitioned by (c int, d int);"
+           for "insert into multipart partition(c='1',d)(d,a) values(2,3);" we expect the parse tree to look like this
+           (TOK_INSERT_INTO
+             (TOK_TAB
+               (TOK_TABNAME multipart)
+               (TOK_PARTSPEC
+                 (TOK_PARTVAL c '1')
+                 (TOK_PARTVAL d)
+               )
+             )
+             (TOK_TABCOLNAME d a)
+           ) */
+        List<String> dynamicPartitionColumns = new ArrayList<String>();
+        if (ast.getChild(0) != null && ast.getChild(0).getType() == HiveParser.TOK_TAB) {
+          ASTNode tokTab = (ASTNode) ast.getChild(0);
+          ASTNode tokPartSpec = (ASTNode) tokTab.getFirstChildWithType(HiveParser.TOK_PARTSPEC);
+          if (tokPartSpec != null) {
+            for (Node n : tokPartSpec.getChildren()) {
+              ASTNode tokPartVal = null;
+              if (n instanceof ASTNode) {
+                tokPartVal = (ASTNode) n;
+              }
+              if (tokPartVal != null && tokPartVal.getType() == HiveParser.TOK_PARTVAL && tokPartVal.getChildCount() == 1) {
+                assert tokPartVal.getChild(0).getType() == HiveParser.Identifier :
+                    "Expected column name; found tokType=" + tokPartVal.getType();
+                dynamicPartitionColumns.add(tokPartVal.getChild(0).getText());
+              }
+            }
+          }
+        }
+        for (String colName : dynamicPartitionColumns) {
+          targetColumns.remove(colName);
+        }
+        if (!targetColumns.isEmpty()) {
+          // Found some columns in the user specified schema which are neither regular nor dynamic partition columns
+          throw new SemanticException(generateErrorMessage(tabColName,
+              "'" + (targetColumns.size() == 1 ? targetColumns.iterator().next() : targetColumns) +
+              "' in insert schema specification " + (targetColumns.size() == 1 ? "is" : "are") +
+              " not found among regular columns of " +
+              fullTableName + " nor dynamic partition columns."));
+        }
+      }
+    }
+  }
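To make the validation above concrete: every name in "insert into t(c1, c2, ...)" must be either a regular column or a dynamic partition column of the target, with duplicates rejected first. A standalone sketch under those rules (hypothetical class, not Hive's implementation):

import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class InsertSchemaCheck {
  static void validate(List<String> specified, Set<String> regularCols, Set<String> dynPartCols) {
    Set<String> remaining = new HashSet<>(specified);
    if (remaining.size() != specified.size()) {
      throw new IllegalArgumentException("Duplicate column name in insert schema: " + specified);
    }
    remaining.removeAll(regularCols);
    remaining.removeAll(dynPartCols);
    if (!remaining.isEmpty()) {
      throw new IllegalArgumentException(remaining
          + " not found among regular columns or dynamic partition columns");
    }
  }

  public static void main(String[] args) {
    // create table multipart(a int, b int) partitioned by (c int, d int);
    // insert into multipart partition(c='1', d)(d, a) values(2, 3);
    validate(List.of("d", "a"), Set.of("a", "b"), Set.of("d")); // passes
    validate(List.of("x"), Set.of("a", "b"), Set.of("d"));      // throws
  }
}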
+  public void getMaterializationMetadata(QB qb) throws SemanticException {
+    try {
+      gatherCTEReferences(qb, rootClause);
+      int threshold = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_CTE_MATERIALIZE_THRESHOLD);
+      for (CTEClause cte : Sets.newHashSet(aliasToCTEs.values())) {
+        if (threshold >= 0 && cte.reference >= threshold) {
+          cte.materialize = true;
+        }
+      }
+    } catch (HiveException e) {
+      // Has to use full name to make sure it does not conflict with
+      // org.apache.commons.lang.StringUtils
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      if (e instanceof SemanticException) {
+        throw (SemanticException) e;
+      }
+      throw new SemanticException(e.getMessage(), e);
+    }
+  }
+
+  private void gatherCTEReferences(QBExpr qbexpr, CTEClause parent) throws HiveException {
+    if (qbexpr.getOpcode() == QBExpr.Opcode.NULLOP) {
+      gatherCTEReferences(qbexpr.getQB(), parent);
+    } else {
+      gatherCTEReferences(qbexpr.getQBExpr1(), parent);
+      gatherCTEReferences(qbexpr.getQBExpr2(), parent);
+    }
+  }
+
+  // TODO: check view references, too
+  private void gatherCTEReferences(QB qb, CTEClause current) throws HiveException {
+    for (String alias : qb.getTabAliases()) {
+      String tabName = qb.getTabNameForAlias(alias);
+      String cteName = tabName.toLowerCase();
+
+      CTEClause cte = findCTEFromName(qb, cteName);
+      if (cte != null) {
+        if (ctesExpanded.contains(cteName)) {
+          throw new SemanticException("Recursive cte " + cteName +
+              " detected (cycle: " + StringUtils.join(ctesExpanded, " -> ") +
+              " -> " + cteName + ").");
+        }
+        cte.reference++;
+        current.parents.add(cte);
+        if (cte.qbExpr != null) {
+          continue;
+        }
+        cte.qbExpr = new QBExpr(cteName);
+        doPhase1QBExpr(cte.cteNode, cte.qbExpr, qb.getId(), cteName);
+
+        ctesExpanded.add(cteName);
+        gatherCTEReferences(cte.qbExpr, cte);
+        ctesExpanded.remove(ctesExpanded.size() - 1);
+      }
+    }
+    for (String alias : qb.getSubqAliases()) {
+      gatherCTEReferences(qb.getSubqForAlias(alias), current);
+    }
+  }
+
+  public void getMetaData(QB qb) throws SemanticException {
+    getMetaData(qb, false);
+  }
+
+  public void getMetaData(QB qb, boolean enableMaterialization) throws SemanticException {
+    try {
+      if (enableMaterialization) {
+        getMaterializationMetadata(qb);
+      }
+      getMetaData(qb, null);
+    } catch (HiveException e) {
+      // Has to use full name to make sure it does not conflict with
+      // org.apache.commons.lang.StringUtils
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      if (e instanceof SemanticException) {
+        throw (SemanticException) e;
+      }
+      throw new SemanticException(e.getMessage(), e);
+    }
+  }
+
+  private void getMetaData(QBExpr qbexpr, ReadEntity parentInput)
+      throws HiveException {
+    if (qbexpr.getOpcode() == QBExpr.Opcode.NULLOP) {
+      getMetaData(qbexpr.getQB(), parentInput);
+    } else {
+      getMetaData(qbexpr.getQBExpr1(), parentInput);
+      getMetaData(qbexpr.getQBExpr2(), parentInput);
+    }
+  }
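Both the CTE path above and the view path below detect self-reference the same way: an expansion stack of names (ctesExpanded / viewsExpanded) is pushed before recursing and popped after, and meeting a name already on the stack means a cycle. A minimal sketch of the pattern (hypothetical types, not Hive's):

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;
import java.util.Map;

public class CycleCheck {
  static void expand(String name, Map<String, List<String>> refs, Deque<String> stack) {
    if (stack.contains(name)) {
      throw new IllegalStateException("Recursive reference " + name
          + " detected (cycle: " + String.join(" -> ", stack) + " -> " + name + ").");
    }
    stack.addLast(name);
    for (String referenced : refs.getOrDefault(name, List.of())) {
      expand(referenced, refs, stack);
    }
    stack.removeLast();
  }

  public static void main(String[] args) {
    Map<String, List<String>> refs =
        Map.of("v3", List.of("v2"), "v2", List.of("v1"), "v1", List.of("v3"));
    expand("v3", refs, new ArrayDeque<>()); // throws: v3 -> v2 -> v1 -> v3
  }
}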
+  @SuppressWarnings("nls")
+  private void getMetaData(QB qb, ReadEntity parentInput)
+      throws HiveException {
+    LOG.info("Get metadata for source tables");
+
+    // Go over the tables and populate the related structures.
+    // We have to materialize the table alias list since we might
+    // modify it in the middle for view rewrite.
+    List<String> tabAliases = new ArrayList<String>(qb.getTabAliases());
+
+    // Keep track of view alias to view name and read entity
+    // For eg: for a query like 'select * from V3', where V3 -> V2, V2 -> V1, V1 -> T
+    // keeps track of full view name and read entity corresponding to alias V3, V3:V2, V3:V2:V1.
+    // This is needed for tracking the dependencies for inputs, along with their parents.
+    Map<String, ObjectPair<String, ReadEntity>> aliasToViewInfo =
+        new HashMap<String, ObjectPair<String, ReadEntity>>();
+
+    /*
+     * used to capture view to SQ conversions. This is used to check for
+     * recursive CTE invocations.
+     */
+    Map<String, String> sqAliasToCTEName = new HashMap<String, String>();
+
+    for (String alias : tabAliases) {
+      String tabName = qb.getTabNameForAlias(alias);
+      String cteName = tabName.toLowerCase();
+
+      Table tab = db.getTable(tabName, false);
+      if (tab == null ||
+          tab.getDbName().equals(SessionState.get().getCurrentDatabase())) {
+        Table materializedTab = ctx.getMaterializedTable(cteName);
+        if (materializedTab == null) {
+          // we first look for this alias from CTE, and then from catalog.
+          CTEClause cte = findCTEFromName(qb, cteName);
+          if (cte != null) {
+            if (!cte.materialize) {
+              addCTEAsSubQuery(qb, cteName, alias);
+              sqAliasToCTEName.put(alias, cteName);
+              continue;
+            }
+            tab = materializeCTE(cteName, cte);
+          }
+        } else {
+          tab = materializedTab;
+        }
+      }
+
+      if (tab == null) {
+        ASTNode src = qb.getParseInfo().getSrcForAlias(alias);
+        if (null != src) {
+          throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(src));
+        } else {
+          throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(alias));
+        }
+      }
+
+      // Disallow INSERT INTO on bucketized tables
+      boolean isAcid = AcidUtils.isAcidTable(tab);
+      boolean isTableWrittenTo = qb.getParseInfo().isInsertIntoTable(tab.getDbName(), tab.getTableName());
+      if (isTableWrittenTo &&
+          tab.getNumBuckets() > 0 && !isAcid) {
+        throw new SemanticException(ErrorMsg.INSERT_INTO_BUCKETIZED_TABLE.
+            getMsg("Table: " + tabName));
+      }
+      // Disallow update and delete on non-acid tables
+      if ((updating() || deleting()) && !isAcid && isTableWrittenTo) {
+        // isTableWrittenTo: delete from acidTbl where a in (select id from nonAcidTable)
+        // so only assert this if we are actually writing to this table
+        // Whether we are using an acid compliant transaction manager has already been caught in
+        // UpdateDeleteSemanticAnalyzer, so if we are updating or deleting and getting nonAcid
+        // here, it means the table itself doesn't support it.
+        throw new SemanticException(ErrorMsg.ACID_OP_ON_NONACID_TABLE, tabName);
+      }
+
+      if (tab.isView()) {
+        if (qb.getParseInfo().isAnalyzeCommand()) {
+          throw new SemanticException(ErrorMsg.ANALYZE_VIEW.getMsg());
+        }
+        String fullViewName = tab.getDbName() + "." + tab.getTableName();
+        // Prevent view cycles
+        if (viewsExpanded.contains(fullViewName)) {
+          throw new SemanticException("Recursive view " + fullViewName +
+              " detected (cycle: " + StringUtils.join(viewsExpanded, " -> ") +
+              " -> " + fullViewName + ").");
+        }
+        replaceViewReferenceWithDefinition(qb, tab, tabName, alias);
+        // This is the last time we'll see the Table objects for views, so add it to the inputs
+        // now. isInsideView will tell if this view is embedded in another view.
+        ReadEntity viewInput = new ReadEntity(tab, parentInput, !qb.isInsideView());
+        viewInput = PlanUtils.addInput(inputs, viewInput);
+        aliasToViewInfo.put(alias, new ObjectPair<String, ReadEntity>(fullViewName, viewInput));
+        String aliasId = getAliasId(alias, qb);
+        if (aliasId != null) {
+          aliasId = aliasId.replace(SemanticAnalyzer.SUBQUERY_TAG_1, "")
+              .replace(SemanticAnalyzer.SUBQUERY_TAG_2, "");
+        }
+        viewAliasToInput.put(aliasId, viewInput);
+        continue;
+      }
+
+      if (!InputFormat.class.isAssignableFrom(tab.getInputFormatClass())) {
+        throw new SemanticException(generateErrorMessage(
+            qb.getParseInfo().getSrcForAlias(alias),
+            ErrorMsg.INVALID_INPUT_FORMAT_TYPE.getMsg()));
+      }
+
+      qb.getMetaData().setSrcForAlias(alias, tab);
+
+      if (qb.getParseInfo().isAnalyzeCommand()) {
+        // allow partial partition specification for nonscan since noscan is fast.
+        TableSpec ts = new TableSpec(db, conf, (ASTNode) ast.getChild(0), true, this.noscan);
+        if (ts.specType == SpecType.DYNAMIC_PARTITION) { // dynamic partitions
+          try {
+            ts.partitions = db.getPartitionsByNames(ts.tableHandle, ts.partSpec);
+          } catch (HiveException e) {
+            throw new SemanticException(generateErrorMessage(
+                qb.getParseInfo().getSrcForAlias(alias),
+                "Cannot get partitions for " + ts.partSpec), e);
+          }
+        }
+        // validate partial scan command
+        QBParseInfo qbpi = qb.getParseInfo();
+        if (qbpi.isPartialScanAnalyzeCommand()) {
+          Class<? extends InputFormat> inputFormatClass = null;
+          switch (ts.specType) {
+          case TABLE_ONLY:
+          case DYNAMIC_PARTITION:
+            inputFormatClass = ts.tableHandle.getInputFormatClass();
+            break;
+          case STATIC_PARTITION:
+            inputFormatClass = ts.partHandle.getInputFormatClass();
+            break;
+          default:
+            assert false;
+          }
+          // throw a HiveException for formats other than rcfile or orcfile.
+          if (!(inputFormatClass.equals(RCFileInputFormat.class) || inputFormatClass
+              .equals(OrcInputFormat.class))) {
+            throw new SemanticException(ErrorMsg.ANALYZE_TABLE_PARTIALSCAN_NON_RCFILE.getMsg());
+          }
+        }
+
+        tab.setTableSpec(ts);
+        qb.getParseInfo().addTableSpec(alias, ts);
+      }
+
+      ReadEntity parentViewInfo = PlanUtils.getParentViewInfo(getAliasId(alias, qb), viewAliasToInput);
+      PlanUtils.addInput(inputs,
+          new ReadEntity(tab, parentViewInfo, parentViewInfo == null), mergeIsDirect);
+    }
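The head of the loop above resolves an unqualified name in roughly this order: a CTE already materialized in this query, then a CTE definition (inlined as a subquery, or materialized once past the threshold), and only then the metastore table. A simplified sketch of that precedence (hypothetical lookup maps; it ignores the current-database check the real code applies):

import java.util.Map;
import java.util.Optional;

public class TableResolution {
  static Optional<String> resolve(String name,
                                  Map<String, String> materializedCtes,
                                  Map<String, String> cteDefs,
                                  Map<String, String> metastore) {
    String key = name.toLowerCase();
    if (materializedCtes.containsKey(key)) {
      return Optional.of(materializedCtes.get(key)); // reuse the materialized CTE
    }
    if (cteDefs.containsKey(key)) {
      return Optional.of(cteDefs.get(key));          // inline or materialize the CTE
    }
    return Optional.ofNullable(metastore.get(name)); // fall back to the catalog
  }
}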
+    LOG.info("Get metadata for subqueries");
+    // Go over the subqueries and getMetaData for these
+    for (String alias : qb.getSubqAliases()) {
+      boolean wasView = aliasToViewInfo.containsKey(alias);
+      boolean wasCTE = sqAliasToCTEName.containsKey(alias);
+      ReadEntity newParentInput = null;
+      if (wasView) {
+        viewsExpanded.add(aliasToViewInfo.get(alias).getFirst());
+        newParentInput = aliasToViewInfo.get(alias).getSecond();
+      } else if (wasCTE) {
+        ctesExpanded.add(sqAliasToCTEName.get(alias));
+      }
+      QBExpr qbexpr = qb.getSubqForAlias(alias);
+      getMetaData(qbexpr, newParentInput);
+      if (wasView) {
+        viewsExpanded.remove(viewsExpanded.size() - 1);
+      } else if (wasCTE) {
+        ctesExpanded.remove(ctesExpanded.size() - 1);
+      }
+    }
+
+    RowFormatParams rowFormatParams = new RowFormatParams();
+    StorageFormat storageFormat = new StorageFormat(conf);
+
+    LOG.info("Get metadata for destination tables");
+    // Go over all the destination structures and populate the related
+    // metadata
+    QBParseInfo qbp = qb.getParseInfo();
+
+    for (String name : qbp.getClauseNamesForDest()) {
+      ASTNode ast = qbp.getDestForClause(name);
+      switch (ast.getToken().getType()) {
+      case HiveParser.TOK_TAB: {
+        TableSpec ts = new TableSpec(db, conf, ast);
+        if (ts.tableHandle.isView()) {
+          throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
+        }
+
+        Class<?> outputFormatClass = ts.tableHandle.getOutputFormatClass();
+        if (!ts.tableHandle.isNonNative() &&
+            !HiveOutputFormat.class.isAssignableFrom(outputFormatClass)) {
+          throw new SemanticException(ErrorMsg.INVALID_OUTPUT_FORMAT_TYPE
+              .getMsg(ast, "The class is " + outputFormatClass.toString()));
+        }
+
+        // TableSpec ts comes from the query (user specified), which means
+        // the user didn't specify partitions in their query, but whether the
+        // table itself is partitioned is not known.
+        if (ts.specType != SpecType.STATIC_PARTITION) {
+          // This is a table or dynamic partition
+          qb.getMetaData().setDestForAlias(name, ts.tableHandle);
+          // has dynamic as well as static partitions
+          if (ts.partSpec != null && ts.partSpec.size() > 0) {
+            qb.getMetaData().setPartSpecForAlias(name, ts.partSpec);
+          }
+        } else {
+          // This is a partition
+          qb.getMetaData().setDestForAlias(name, ts.partHandle);
+        }
+        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
+          // Add the table spec for the destination table.
+          qb.getParseInfo().addTableSpec(ts.tableName.toLowerCase(), ts);
+        }
+        break;
+      }
+
+      case HiveParser.TOK_DIR: {
+        // This is a dfs file
+        String fname = stripQuotes(ast.getChild(0).getText());
+        if ((!qb.getParseInfo().getIsSubQ())
+            && (((ASTNode) ast.getChild(0)).getToken().getType() == HiveParser.TOK_TMP_FILE)) {
+
+          if (qb.isCTAS()) {
+            qb.setIsQuery(false);
+            ctx.setResDir(null);
+            ctx.setResFile(null);
+
+            // allocate a t
<TRUNCATED>