This is an automated email from the ASF dual-hosted git repository. jcamacho pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new fc81b89 HIVE-22464: Implement support for NULLS FIRST/LAST in TopNKeyOperator (Krisztian Kasa, reviewed by Jesus Camacho Rodriguez) fc81b89 is described below commit fc81b8909b1a0e6aa15900387a98bccf38ae2247 Author: Krisztian Kasa <kk...@cloudera.com> AuthorDate: Sun Nov 17 20:09:10 2019 -0800 HIVE-22464: Implement support for NULLS FIRST/LAST in TopNKeyOperator (Krisztian Kasa, reviewed by Jesus Camacho Rodriguez) --- .../hadoop/hive/ql/exec/TopNKeyOperator.java | 43 +- .../hadoop/hive/ql/optimizer/TopNKeyProcessor.java | 10 +- .../apache/hadoop/hive/ql/plan/TopNKeyDesc.java | 14 + .../apache/hadoop/hive/ql/util/NullOrdering.java | 21 +- ql/src/test/queries/clientpositive/topnkey.q | 55 ++- .../test/results/clientpositive/llap/topnkey.q.out | 472 ++++++++++++++++++- .../test/results/clientpositive/tez/topnkey.q.out | 482 ++++++++++++++++++-- ql/src/test/results/clientpositive/topnkey.q.out | 499 ++++++++++++++++++++- .../objectinspector/ObjectInspectorUtils.java | 51 ++- 9 files changed, 1571 insertions(+), 76 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java index 4734824..d16500e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java @@ -24,6 +24,7 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.TopNKeyDesc; import org.apache.hadoop.hive.ql.plan.api.OperatorType; +import org.apache.hadoop.hive.ql.util.NullOrdering; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; @@ -58,21 +59,26 @@ public class TopNKeyOperator extends Operator<TopNKeyDesc> implements Serializab } public static class KeyWrapperComparator implements Comparator<KeyWrapper> { - private ObjectInspector[] objectInspectors1; - private ObjectInspector[] objectInspectors2; - private boolean[] columnSortOrderIsDesc; - - public KeyWrapperComparator(ObjectInspector[] objectInspectors1, ObjectInspector[] - objectInspectors2, boolean[] columnSortOrderIsDesc) { - this.objectInspectors1 = objectInspectors1; - this.objectInspectors2 = objectInspectors2; - this.columnSortOrderIsDesc = columnSortOrderIsDesc; + + private final ObjectInspector[] keyObjectInspectors; + private final boolean[] columnSortOrderIsDesc; + private final ObjectInspectorUtils.NullValueOption[] nullSortOrder; + + KeyWrapperComparator(ObjectInspector[] keyObjectInspectors, String columnSortOrder, String nullSortOrder) { + this.keyObjectInspectors = keyObjectInspectors; + this.columnSortOrderIsDesc = new boolean[columnSortOrder.length()]; + this.nullSortOrder = new ObjectInspectorUtils.NullValueOption[nullSortOrder.length()]; + for (int i = 0; i < columnSortOrder.length(); ++i) { + this.columnSortOrderIsDesc[i] = columnSortOrder.charAt(i) == '-'; + this.nullSortOrder[i] = NullOrdering.fromSign(nullSortOrder.charAt(i)).getNullValueOption(); + } } @Override public int compare(KeyWrapper key1, KeyWrapper key2) { - return ObjectInspectorUtils.compare(key1.getKeyArray(), objectInspectors1, - key2.getKeyArray(), objectInspectors2, columnSortOrderIsDesc); + return ObjectInspectorUtils.compare( + key1.getKeyArray(), keyObjectInspectors, key2.getKeyArray(), keyObjectInspectors, + columnSortOrderIsDesc, nullSortOrder); } } @@ -82,12 +88,6 @@ public class TopNKeyOperator extends Operator<TopNKeyDesc> implements Serializab this.topN = conf.getTopN(); - String columnSortOrder = conf.getColumnSortOrder(); - boolean[] columnSortOrderIsDesc = new boolean[columnSortOrder.length()]; - for (int i = 0; i < columnSortOrderIsDesc.length; i++) { - columnSortOrderIsDesc[i] = (columnSortOrder.charAt(i) == '-'); - } - ObjectInspector rowInspector = inputObjInspectors[0]; ObjectInspector standardObjInspector = ObjectInspectorUtils.getStandardObjectInspector(rowInspector); outputObjInspector = rowInspector; @@ -107,8 +107,13 @@ public class TopNKeyOperator extends Operator<TopNKeyDesc> implements Serializab standardKeyObjectInspectors[i] = standardKeyFields[i].initialize(standardObjInspector); } - priorityQueue = new PriorityQueue<>(topN + 1, new TopNKeyOperator.KeyWrapperComparator( - standardKeyObjectInspectors, standardKeyObjectInspectors, columnSortOrderIsDesc)); + String columnSortOrder = conf.getColumnSortOrder(); + String nullSortOrder = conf.getNullOrder(); + + // We need a reversed comparator because the PriorityQueue.poll() method is used for filtering out keys. + // Ex.: When ORDER BY key1 ASC then call of poll() should remove the largest key. + priorityQueue = new PriorityQueue<>(topN + 1, + new KeyWrapperComparator(standardKeyObjectInspectors, columnSortOrder, nullSortOrder).reversed()); KeyWrapperFactory keyWrapperFactory = new KeyWrapperFactory(keyFields, keyObjectInspectors, standardKeyObjectInspectors); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyProcessor.java index 721a9b9..4b4cf99 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyProcessor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/TopNKeyProcessor.java @@ -53,7 +53,7 @@ public class TopNKeyProcessor implements NodeProcessor { @Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, - Object... nodeOutputs) throws SemanticException { + Object... nodeOutputs) throws SemanticException { // Get ReduceSinkOperator ReduceSinkOperator reduceSinkOperator = (ReduceSinkOperator) nd; @@ -95,11 +95,11 @@ public class TopNKeyProcessor implements NodeProcessor { } // Insert a new top n key operator between the group by operator and its parent - TopNKeyDesc topNKeyDesc = new TopNKeyDesc(reduceSinkDesc.getTopN(), reduceSinkDesc.getOrder(), - groupByKeyColumns); + TopNKeyDesc topNKeyDesc = new TopNKeyDesc( + reduceSinkDesc.getTopN(), reduceSinkDesc.getOrder(), reduceSinkDesc.getNullOrder(), groupByKeyColumns); Operator<? extends OperatorDesc> newOperator = OperatorFactory.getAndMakeChild( - groupByOperator.getCompilationOpContext(), (OperatorDesc) topNKeyDesc, - new RowSchema(groupByOperator.getSchema()), groupByOperator.getParentOperators()); + groupByOperator.getCompilationOpContext(), (OperatorDesc) topNKeyDesc, + new RowSchema(groupByOperator.getSchema()), groupByOperator.getParentOperators()); newOperator.getChildOperators().add(groupByOperator); groupByOperator.getParentOperators().add(newOperator); parentOperator.removeChild(groupByOperator); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java index c62c4a9..84e6149 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java @@ -33,6 +33,7 @@ public class TopNKeyDesc extends AbstractOperatorDesc { private int topN; private String columnSortOrder; + private String nullOrder; private List<ExprNodeDesc> keyColumns; public TopNKeyDesc() { @@ -41,10 +42,12 @@ public class TopNKeyDesc extends AbstractOperatorDesc { public TopNKeyDesc( final int topN, final String columnSortOrder, + final String nullOrder, final List<ExprNodeDesc> keyColumns) { this.topN = topN; this.columnSortOrder = columnSortOrder; + this.nullOrder = nullOrder; this.keyColumns = keyColumns; } @@ -66,6 +69,15 @@ public class TopNKeyDesc extends AbstractOperatorDesc { this.columnSortOrder = columnSortOrder; } + @Explain(displayName = "null sort order", explainLevels = { Level.EXTENDED }) + public String getNullOrder() { + return nullOrder; + } + + public void setNullOrder(String nullOrder) { + this.nullOrder = nullOrder; + } + @Explain(displayName = "keys") public String getKeyString() { return PlanUtils.getExprListString(keyColumns); @@ -98,6 +110,7 @@ public class TopNKeyDesc extends AbstractOperatorDesc { TopNKeyDesc otherDesc = (TopNKeyDesc) other; return getTopN() == otherDesc.getTopN() && Objects.equals(columnSortOrder, otherDesc.columnSortOrder) && + Objects.equals(nullOrder, otherDesc.nullOrder) && ExprNodeDescUtils.isSame(keyColumns, otherDesc.keyColumns); } return false; @@ -108,6 +121,7 @@ public class TopNKeyDesc extends AbstractOperatorDesc { TopNKeyDesc ret = new TopNKeyDesc(); ret.setTopN(topN); ret.setColumnSortOrder(columnSortOrder); + ret.setNullOrder(nullOrder); ret.setKeyColumns(getKeyColumns() == null ? null : new ArrayList<>(getKeyColumns())); return ret; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/util/NullOrdering.java b/ql/src/java/org/apache/hadoop/hive/ql/util/NullOrdering.java index 6bf1db2..46ff329 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/util/NullOrdering.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/util/NullOrdering.java @@ -24,18 +24,20 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.NullVa * Enum for converting different Null ordering description types. */ public enum NullOrdering { - NULLS_FIRST(1, HiveParser.TOK_NULLS_FIRST, NullValueOption.MAXVALUE), - NULLS_LAST(0, HiveParser.TOK_NULLS_LAST, NullValueOption.MINVALUE); + NULLS_FIRST(1, HiveParser.TOK_NULLS_FIRST, NullValueOption.MAXVALUE, 'a'), + NULLS_LAST(0, HiveParser.TOK_NULLS_LAST, NullValueOption.MINVALUE, 'z'); - NullOrdering(int code, int token, NullValueOption nullValueOption) { + NullOrdering(int code, int token, NullValueOption nullValueOption, char sign) { this.code = code; this.token = token; this.nullValueOption = nullValueOption; + this.sign = sign; } private final int code; private final int token; private final NullValueOption nullValueOption; + private final char sign; public static NullOrdering fromToken(int token) { for (NullOrdering nullOrdering : NullOrdering.values()) { @@ -55,6 +57,15 @@ public enum NullOrdering { throw new EnumConstantNotPresentException(NullOrdering.class, "No enum constant present with code " + code); } + public static NullOrdering fromSign(char sign) { + for (NullOrdering nullOrdering : NullOrdering.values()) { + if (nullOrdering.sign == sign) { + return nullOrdering; + } + } + throw new EnumConstantNotPresentException(NullOrdering.class, "No enum constant present with sign " + sign); + } + public int getCode() { return code; } @@ -66,4 +77,8 @@ public enum NullOrdering { public NullValueOption getNullValueOption() { return nullValueOption; } + + public char getSign() { + return sign; + } } diff --git a/ql/src/test/queries/clientpositive/topnkey.q b/ql/src/test/queries/clientpositive/topnkey.q index e02a41d..283f426 100644 --- a/ql/src/test/queries/clientpositive/topnkey.q +++ b/ql/src/test/queries/clientpositive/topnkey.q @@ -14,7 +14,7 @@ set hive.tez.dynamic.partition.pruning=true; set hive.stats.fetch.column.stats=true; set hive.cbo.enable=true; -EXPLAIN +EXPLAIN EXTENDED SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5; SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5; @@ -29,3 +29,56 @@ SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) SELECT src1.key, src2.value FROM src src1 JOIN src src2 ON (src1.key = src2.key) ORDER BY src1.key LIMIT 5; +CREATE TABLE t_test( + a int, + b int, + c int +); + +INSERT INTO t_test VALUES +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL); + +EXPLAIN +SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3; +SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3; + + +EXPLAIN +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2; + +DROP TABLE IF EXISTS t_test; + +CREATE TABLE t_test( + a int, + b int, + c int +); + +INSERT INTO t_test VALUES +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL); + +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2; +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2; + +DROP TABLE IF EXISTS t_test; diff --git a/ql/src/test/results/clientpositive/llap/topnkey.q.out b/ql/src/test/results/clientpositive/llap/topnkey.q.out index e3c0bc1..cd47e9d 100644 --- a/ql/src/test/results/clientpositive/llap/topnkey.q.out +++ b/ql/src/test/results/clientpositive/llap/topnkey.q.out @@ -1,13 +1,18 @@ -PREHOOK: query: EXPLAIN +PREHOOK: query: EXPLAIN EXTENDED SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 PREHOOK: type: QUERY PREHOOK: Input: default@src #### A masked pattern was here #### -POSTHOOK: query: EXPLAIN +POSTHOOK: query: EXPLAIN EXTENDED SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### +OPTIMIZED SQL: SELECT `key` AS `$f0`, SUM(CAST(SUBSTR(`value`, 5) AS INTEGER)) AS `$f1` +FROM `default`.`src` +GROUP BY `key` +ORDER BY `key` +LIMIT 5 STAGE DEPENDENCIES: Stage-1 is a root stage Stage-0 depends on stages: Stage-1 @@ -26,6 +31,7 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + GatherStats: false Select Operator expressions: key (type: string), UDFToInteger(substr(value, 5)) (type: int) outputColumnNames: _col0, _col1 @@ -33,6 +39,7 @@ STAGE PLANS: Top N Key Operator sort order: + keys: _col0 (type: string) + null sort order: z Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE top n: 5 Group By Operator @@ -44,15 +51,73 @@ STAGE PLANS: Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string) + null sort order: z sort order: + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + TopN: 5 TopN Hash Memory Usage: 0.1 value expressions: _col1 (type: bigint) + auto parallelism: true Execution mode: llap LLAP IO: no inputs + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: src + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}} + bucket_count -1 + bucketing_version 2 + column.name.delimiter , + columns key,value + columns.comments 'default','default' + columns.types string:string +#### A masked pattern was here #### + name default.src + numFiles 1 + numRows 500 + rawDataSize 5312 + serialization.ddl struct src { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}} + bucket_count -1 + bucketing_version 2 + column.name.delimiter , + columns key,value + columns.comments 'default','default' + columns.types string:string +#### A masked pattern was here #### + name default.src + numFiles 1 + numRows 500 + rawDataSize 5312 + serialization.ddl struct src { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.src + name: default.src + Truncated Path -> Alias: + /src [src] Reducer 2 Execution mode: llap + Needs Tagging: false Reduce Operator Tree: Group By Operator aggregations: sum(VALUE._col0) @@ -62,12 +127,17 @@ STAGE PLANS: Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string) + null sort order: z sort order: + Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + TopN: 5 TopN Hash Memory Usage: 0.1 value expressions: _col1 (type: bigint) + auto parallelism: false Reducer 3 Execution mode: llap + Needs Tagging: false Reduce Operator Tree: Select Operator expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint) @@ -78,11 +148,26 @@ STAGE PLANS: Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE +#### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.escape.crlf true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false Stage: Stage-0 Fetch Operator @@ -330,3 +415,386 @@ POSTHOOK: Input: default@src 0 val_0 0 val_0 0 val_0 +PREHOOK: query: CREATE TABLE t_test( + a int, + b int, + c int +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t_test +POSTHOOK: query: CREATE TABLE t_test( + a int, + b int, + c int +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t_test +PREHOOK: query: INSERT INTO t_test VALUES +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t_test +POSTHOOK: query: INSERT INTO t_test VALUES +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t_test +POSTHOOK: Lineage: t_test.a SCRIPT [] +POSTHOOK: Lineage: t_test.b SCRIPT [] +POSTHOOK: Lineage: t_test.c SCRIPT [] +PREHOOK: query: EXPLAIN +SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN +SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t_test + Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: a, b + Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE + Top N Key Operator + sort order: ++ + keys: a (type: int), b (type: int) + Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE + top n: 3 + Group By Operator + keys: a (type: int), b (type: int) + minReductionHashAggr: 0.3333333 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: int), _col1 (type: int) + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: int), KEY._col1 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: int) + sort order: ++ + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 3 + Statistics: Num rows: 3 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 16 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 3 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +5 1 +5 2 +6 2 +PREHOOK: query: EXPLAIN +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t_test + Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: a, b + Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE + Top N Key Operator + sort order: + + keys: a (type: int) + Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE + top n: 2 + Group By Operator + aggregations: count(b) + keys: a (type: int) + minReductionHashAggr: 0.6666666 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: bigint) + Execution mode: llap + LLAP IO: no inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: bigint) + Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), VALUE._col0 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 2 + Statistics: Num rows: 2 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 20 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 2 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +5 4 +6 1 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +NULL 0 +5 4 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +5 4 +6 1 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +5 4 +6 1 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +NULL 0 +5 4 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +5 4 +6 1 +PREHOOK: query: DROP TABLE IF EXISTS t_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t_test +PREHOOK: Output: default@t_test +POSTHOOK: query: DROP TABLE IF EXISTS t_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t_test +POSTHOOK: Output: default@t_test +PREHOOK: query: CREATE TABLE t_test( + a int, + b int, + c int +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t_test +POSTHOOK: query: CREATE TABLE t_test( + a int, + b int, + c int +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t_test +PREHOOK: query: INSERT INTO t_test VALUES +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t_test +POSTHOOK: query: INSERT INTO t_test VALUES +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t_test +POSTHOOK: Lineage: t_test.a SCRIPT [] +POSTHOOK: Lineage: t_test.b SCRIPT [] +POSTHOOK: Lineage: t_test.c SCRIPT [] +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +7 3 +6 1 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +NULL 0 +7 3 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +7 3 +6 1 +PREHOOK: query: DROP TABLE IF EXISTS t_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t_test +PREHOOK: Output: default@t_test +POSTHOOK: query: DROP TABLE IF EXISTS t_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t_test +POSTHOOK: Output: default@t_test diff --git a/ql/src/test/results/clientpositive/tez/topnkey.q.out b/ql/src/test/results/clientpositive/tez/topnkey.q.out index 3267f79..206c0c8 100644 --- a/ql/src/test/results/clientpositive/tez/topnkey.q.out +++ b/ql/src/test/results/clientpositive/tez/topnkey.q.out @@ -1,44 +1,177 @@ -PREHOOK: query: EXPLAIN +PREHOOK: query: EXPLAIN EXTENDED SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 PREHOOK: type: QUERY PREHOOK: Input: default@src PREHOOK: Output: hdfs://### HDFS PATH ### -POSTHOOK: query: EXPLAIN +POSTHOOK: query: EXPLAIN EXTENDED SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 POSTHOOK: type: QUERY POSTHOOK: Input: default@src POSTHOOK: Output: hdfs://### HDFS PATH ### -Plan optimized by CBO. +OPTIMIZED SQL: SELECT `key` AS `$f0`, SUM(CAST(SUBSTR(`value`, 5) AS INTEGER)) AS `$f1` +FROM `default`.`src` +GROUP BY `key` +ORDER BY `key` +LIMIT 5 +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 -Vertex dependency in root stage -Reducer 2 <- Map 1 (SIMPLE_EDGE) -Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + GatherStats: false + Select Operator + expressions: key (type: string), UDFToInteger(substr(value, 5)) (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + Top N Key Operator + sort order: + + keys: _col0 (type: string) + null sort order: z + Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + top n: 5 + Group By Operator + aggregations: sum(_col1) + keys: _col0 (type: string) + minReductionHashAggr: 0.5 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + TopN: 5 + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: bigint) + auto parallelism: true + Path -> Alias: + hdfs://### HDFS PATH ### [src] + Path -> Partition: + hdfs://### HDFS PATH ### + Partition + base file name: src + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}} + bucket_count -1 + bucketing_version 2 + column.name.delimiter , + columns key,value + columns.comments 'default','default' + columns.types string:string +#### A masked pattern was here #### + location hdfs://### HDFS PATH ### + name default.src + numFiles 1 + numRows 500 + rawDataSize 5312 + serialization.ddl struct src { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}} + bucket_count -1 + bucketing_version 2 + column.name.delimiter , + columns key,value + columns.comments 'default','default' + columns.types string:string +#### A masked pattern was here #### + location hdfs://### HDFS PATH ### + name default.src + numFiles 1 + numRows 500 + rawDataSize 5312 + serialization.ddl struct src { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.src + name: default.src + Truncated Path -> Alias: + /src [src] + Reducer 2 + Needs Tagging: false + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + TopN: 5 + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: bigint) + auto parallelism: false + Reducer 3 + Needs Tagging: false + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 5 + Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + GlobalTableId: 0 + directory: hdfs://### HDFS PATH ### + NumFilesPerFileSink: 1 + Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE + Stats Publishing Key Prefix: hdfs://### HDFS PATH ### + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.escape.crlf true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false -Stage-0 - Fetch Operator - limit:5 - Stage-1 - Reducer 3 - File Output Operator [FS_10] - Limit [LIM_9] (rows=5 width=95) - Number of rows:5 - Select Operator [SEL_8] (rows=250 width=95) - Output:["_col0","_col1"] - <-Reducer 2 [SIMPLE_EDGE] - SHUFFLE [RS_7] - Group By Operator [GBY_5] (rows=250 width=95) - Output:["_col0","_col1"],aggregations:["sum(VALUE._col0)"],keys:KEY._col0 - <-Map 1 [SIMPLE_EDGE] - SHUFFLE [RS_4] - PartitionCols:_col0 - Group By Operator [GBY_3] (rows=250 width=95) - Output:["_col0","_col1"],aggregations:["sum(_col1)"],keys:_col0 - Top N Key Operator [TNK_11] (rows=500 width=178) - keys:_col0,sort order:+,top n:5 - Select Operator [SEL_1] (rows=500 width=178) - Output:["_col0","_col1"] - TableScan [TS_0] (rows=500 width=178) - default@src,src,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] + Stage: Stage-0 + Fetch Operator + limit: 5 + Processor Tree: + ListSink PREHOOK: query: SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 PREHOOK: type: QUERY @@ -228,3 +361,290 @@ POSTHOOK: Output: hdfs://### HDFS PATH ### 0 val_0 0 val_0 0 val_0 +PREHOOK: query: CREATE TABLE t_test( + a int, + b int, + c int +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t_test +POSTHOOK: query: CREATE TABLE t_test( + a int, + b int, + c int +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t_test +PREHOOK: query: INSERT INTO t_test VALUES +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t_test +POSTHOOK: query: INSERT INTO t_test VALUES +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t_test +POSTHOOK: Lineage: t_test.a SCRIPT [] +POSTHOOK: Lineage: t_test.b SCRIPT [] +POSTHOOK: Lineage: t_test.c SCRIPT [] +PREHOOK: query: EXPLAIN +SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: EXPLAIN +SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +Plan optimized by CBO. + +Vertex dependency in root stage +Reducer 2 <- Map 1 (SIMPLE_EDGE) +Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:3 + Stage-1 + Reducer 3 + File Output Operator [FS_9] + Limit [LIM_8] (rows=3 width=5) + Number of rows:3 + Select Operator [SEL_7] (rows=6 width=4) + Output:["_col0","_col1"] + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_6] + Group By Operator [GBY_4] (rows=6 width=4) + Output:["_col0","_col1"],keys:KEY._col0, KEY._col1 + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_3] + PartitionCols:_col0, _col1 + Group By Operator [GBY_2] (rows=6 width=4) + Output:["_col0","_col1"],keys:a, b + Top N Key Operator [TNK_10] (rows=12 width=6) + keys:a, b,sort order:++,top n:3 + Select Operator [SEL_1] (rows=12 width=6) + Output:["a","b"] + TableScan [TS_0] (rows=12 width=6) + default@t_test,t_test,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"] + +PREHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +5 1 +5 2 +6 2 +PREHOOK: query: EXPLAIN +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: EXPLAIN +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +Plan optimized by CBO. + +Vertex dependency in root stage +Reducer 2 <- Map 1 (SIMPLE_EDGE) +Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:2 + Stage-1 + Reducer 3 + File Output Operator [FS_9] + Limit [LIM_8] (rows=2 width=10) + Number of rows:2 + Select Operator [SEL_7] (rows=4 width=10) + Output:["_col0","_col1"] + <-Reducer 2 [SIMPLE_EDGE] + SHUFFLE [RS_6] + Group By Operator [GBY_4] (rows=4 width=10) + Output:["_col0","_col1"],aggregations:["count(VALUE._col0)"],keys:KEY._col0 + <-Map 1 [SIMPLE_EDGE] + SHUFFLE [RS_3] + PartitionCols:_col0 + Group By Operator [GBY_2] (rows=4 width=10) + Output:["_col0","_col1"],aggregations:["count(b)"],keys:a + Top N Key Operator [TNK_10] (rows=12 width=6) + keys:a,sort order:+,top n:2 + Select Operator [SEL_1] (rows=12 width=6) + Output:["a","b"] + TableScan [TS_0] (rows=12 width=6) + default@t_test,t_test,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"] + +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +5 4 +6 1 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +NULL 0 +5 4 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +5 4 +6 1 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +5 4 +6 1 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +NULL 0 +5 4 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +5 4 +6 1 +PREHOOK: query: DROP TABLE IF EXISTS t_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t_test +PREHOOK: Output: default@t_test +POSTHOOK: query: DROP TABLE IF EXISTS t_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t_test +POSTHOOK: Output: default@t_test +PREHOOK: query: CREATE TABLE t_test( + a int, + b int, + c int +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t_test +POSTHOOK: query: CREATE TABLE t_test( + a int, + b int, + c int +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t_test +PREHOOK: query: INSERT INTO t_test VALUES +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t_test +POSTHOOK: query: INSERT INTO t_test VALUES +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t_test +POSTHOOK: Lineage: t_test.a SCRIPT [] +POSTHOOK: Lineage: t_test.b SCRIPT [] +POSTHOOK: Lineage: t_test.c SCRIPT [] +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +7 3 +6 1 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +NULL 0 +7 3 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +POSTHOOK: Output: hdfs://### HDFS PATH ### +7 3 +6 1 +PREHOOK: query: DROP TABLE IF EXISTS t_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t_test +PREHOOK: Output: default@t_test +POSTHOOK: query: DROP TABLE IF EXISTS t_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t_test +POSTHOOK: Output: default@t_test diff --git a/ql/src/test/results/clientpositive/topnkey.q.out b/ql/src/test/results/clientpositive/topnkey.q.out index 7ea1a7a..66d0eca 100644 --- a/ql/src/test/results/clientpositive/topnkey.q.out +++ b/ql/src/test/results/clientpositive/topnkey.q.out @@ -1,13 +1,18 @@ -PREHOOK: query: EXPLAIN +PREHOOK: query: EXPLAIN EXTENDED SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 PREHOOK: type: QUERY PREHOOK: Input: default@src #### A masked pattern was here #### -POSTHOOK: query: EXPLAIN +POSTHOOK: query: EXPLAIN EXTENDED SELECT key, SUM(CAST(SUBSTR(value,5) AS INT)) FROM src GROUP BY key ORDER BY key LIMIT 5 POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### +OPTIMIZED SQL: SELECT `key` AS `$f0`, SUM(CAST(SUBSTR(`value`, 5) AS INTEGER)) AS `$f1` +FROM `default`.`src` +GROUP BY `key` +ORDER BY `key` +LIMIT 5 STAGE DEPENDENCIES: Stage-1 is a root stage Stage-2 depends on stages: Stage-1 @@ -20,6 +25,7 @@ STAGE PLANS: TableScan alias: src Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE + GatherStats: false Select Operator expressions: key (type: string), UDFToInteger(substr(value, 5)) (type: int) outputColumnNames: _col0, _col1 @@ -33,11 +39,69 @@ STAGE PLANS: Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string) + null sort order: z sort order: + Map-reduce partition columns: _col0 (type: string) Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + TopN: 5 TopN Hash Memory Usage: 0.1 value expressions: _col1 (type: bigint) + auto parallelism: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: src + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}} + bucket_count -1 + bucketing_version 2 + column.name.delimiter , + columns key,value + columns.comments 'default','default' + columns.types string:string +#### A masked pattern was here #### + name default.src + numFiles 1 + numRows 500 + rawDataSize 5312 + serialization.ddl struct src { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}} + bucket_count -1 + bucketing_version 2 + column.name.delimiter , + columns key,value + columns.comments 'default','default' + columns.types string:string +#### A masked pattern was here #### + name default.src + numFiles 1 + numRows 500 + rawDataSize 5312 + serialization.ddl struct src { string key, string value} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + totalSize 5812 +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.src + name: default.src + Truncated Path -> Alias: + /src [$hdt$_0:src] + Needs Tagging: false Reduce Operator Tree: Group By Operator aggregations: sum(VALUE._col0) @@ -47,21 +111,66 @@ STAGE PLANS: Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + column.name.delimiter , + columns _col0,_col1 + columns.types string,bigint + escape.delim \ + serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false Stage: Stage-2 Map Reduce Map Operator Tree: TableScan + GatherStats: false Reduce Output Operator key expressions: _col0 (type: string) + null sort order: z sort order: + Statistics: Num rows: 250 Data size: 23750 Basic stats: COMPLETE Column stats: COMPLETE + tag: -1 + TopN: 5 TopN Hash Memory Usage: 0.1 value expressions: _col1 (type: bigint) + auto parallelism: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: -mr-10004 + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + column.name.delimiter , + columns _col0,_col1 + columns.types string,bigint + escape.delim \ + serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + column.name.delimiter , + columns _col0,_col1 + columns.types string,bigint + escape.delim \ + serialization.lib org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + Truncated Path -> Alias: +#### A masked pattern was here #### + Needs Tagging: false Reduce Operator Tree: Select Operator expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: bigint) @@ -72,11 +181,26 @@ STAGE PLANS: Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 Statistics: Num rows: 5 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE +#### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0,_col1 + columns.types string:bigint + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.escape.crlf true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false Stage: Stage-0 Fetch Operator @@ -313,3 +437,374 @@ POSTHOOK: Input: default@src 0 val_0 0 val_0 0 val_0 +PREHOOK: query: CREATE TABLE t_test( + a int, + b int, + c int +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t_test +POSTHOOK: query: CREATE TABLE t_test( + a int, + b int, + c int +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t_test +PREHOOK: query: INSERT INTO t_test VALUES +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t_test +POSTHOOK: query: INSERT INTO t_test VALUES +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t_test +POSTHOOK: Lineage: t_test.a SCRIPT [] +POSTHOOK: Lineage: t_test.b SCRIPT [] +POSTHOOK: Lineage: t_test.c SCRIPT [] +PREHOOK: query: EXPLAIN +SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN +SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: t_test + Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: a, b + Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: a (type: int), b (type: int) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: int) + sort order: ++ + Map-reduce partition columns: _col0 (type: int), _col1 (type: int) + Statistics: Num rows: 6 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: int), KEY._col1 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: int), _col1 (type: int) + sort order: ++ + Statistics: Num rows: 6 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), KEY.reducesinkkey1 (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 3 + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 3 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, b FROM t_test GROUP BY a, b ORDER BY a, b LIMIT 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +5 1 +5 2 +6 2 +PREHOOK: query: EXPLAIN +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN +SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: t_test + Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: a (type: int), b (type: int) + outputColumnNames: a, b + Statistics: Num rows: 12 Data size: 72 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count(b) + keys: a (type: int) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 4 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Statistics: Num rows: 4 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), VALUE._col0 (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 4 Data size: 48 Basic stats: COMPLETE Column stats: COMPLETE + Limit + Number of rows: 2 + Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 2 + Processor Tree: + ListSink + +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +5 4 +6 1 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS FIRST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +NULL 0 +5 4 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a NULLS LAST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +5 4 +6 1 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +5 4 +6 1 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS FIRST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +NULL 0 +5 4 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a ASC NULLS LAST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +5 4 +6 1 +PREHOOK: query: DROP TABLE IF EXISTS t_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t_test +PREHOOK: Output: default@t_test +POSTHOOK: query: DROP TABLE IF EXISTS t_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t_test +POSTHOOK: Output: default@t_test +PREHOOK: query: CREATE TABLE t_test( + a int, + b int, + c int +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t_test +POSTHOOK: query: CREATE TABLE t_test( + a int, + b int, + c int +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t_test +PREHOOK: query: INSERT INTO t_test VALUES +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@t_test +POSTHOOK: query: INSERT INTO t_test VALUES +(7, 8, 4), (7, 8, 4), (7, 8, 4), +(NULL, NULL, NULL), +(5, 2, 3), +(NULL, NULL, NULL), +(NULL, NULL, NULL), +(6, 2, 1), +(5, 1, 2), (5, 1, 2), (5, 1, 2), +(NULL, NULL, NULL) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@t_test +POSTHOOK: Lineage: t_test.a SCRIPT [] +POSTHOOK: Lineage: t_test.b SCRIPT [] +POSTHOOK: Lineage: t_test.c SCRIPT [] +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +7 3 +6 1 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS FIRST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +NULL 0 +7 3 +PREHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@t_test +#### A masked pattern was here #### +POSTHOOK: query: SELECT a, count(b) FROM t_test GROUP BY a ORDER BY a DESC NULLS LAST LIMIT 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_test +#### A masked pattern was here #### +7 3 +6 1 +PREHOOK: query: DROP TABLE IF EXISTS t_test +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@t_test +PREHOOK: Output: default@t_test +POSTHOOK: query: DROP TABLE IF EXISTS t_test +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@t_test +POSTHOOK: Output: default@t_test diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java index df6e788..ab1cfdc 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorUtils.java @@ -18,12 +18,16 @@ package org.apache.hadoop.hive.serde2.objectinspector; +import static java.util.Comparator.nullsFirst; +import static java.util.Comparator.nullsLast; + import java.lang.reflect.Field; import java.lang.reflect.Modifier; import java.lang.reflect.Type; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; +import java.util.Comparator; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; @@ -944,20 +948,41 @@ public final class ObjectInspectorUtils { return 0; } - public static int compare(Object[] o1, ObjectInspector[] oi1, Object[] o2, - ObjectInspector[] oi2, boolean[] columnSortOrderIsDesc) { - assert (o1.length == oi1.length); - assert (o2.length == oi2.length); - assert (o1.length == o2.length); + public static int compare( + Object[] objectArray1, ObjectInspector[] oi1, Object[] objectArray2, ObjectInspector[] oi2, + boolean[] columnSortOrderIsDesc, NullValueOption[] nullSortOrder) { + assert (objectArray1.length == objectArray2.length); + assert (objectArray1.length == oi1.length); + assert (objectArray2.length == oi2.length); + assert (columnSortOrderIsDesc.length == objectArray1.length); + assert (nullSortOrder.length == objectArray1.length); - for (int i = 0; i < o1.length; i++) { - int r = compare(o1[i], oi1[i], o2[i], oi2[i]); - if (r != 0) { - if (columnSortOrderIsDesc[i]) { - return r; - } else { - return -r; - } + List<Comparator<Object>> comparators = new ArrayList<>(oi1.length); + for (int i = 0; i < oi1.length; i++) { + final int keyIndex = i; + + Comparator<Object> comparator = (o1, o2) -> compare( + o1, oi1[keyIndex], o2, oi2[keyIndex]); + + if (columnSortOrderIsDesc[i]) { + comparator = comparator.reversed(); + } + + if (nullSortOrder[i] == NullValueOption.MAXVALUE) { + comparators.add(nullsFirst(comparator)); + } else { + comparators.add(nullsLast(comparator)); + } + } + + return compare(comparators, objectArray1, objectArray2); + } + + public static <T> int compare(List<Comparator<T>> comparatorList, T[] o1, T[] o2) { + for (int i = 0; i < comparatorList.size(); ++i) { + int c = comparatorList.get(i).compare(o1[i], o2[i]); + if (c != 0) { + return c; } } return 0;