This is an automated email from the ASF dual-hosted git repository.

jcamacho pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
commit d7ed17e27eda65dc0ea641887b6389bf0aec7e96 Author: Jesus Camacho Rodriguez <jcama...@apache.org> AuthorDate: Mon Mar 25 12:59:14 2019 -0700 HIVE-21496: Automatic sizing of unordered buffer can overflow (Jesus Camacho Rodriguez, reviewed by Prasanth Jayachandran) --- .../java/org/apache/hadoop/hive/ql/stats/StatsUtils.java | 5 ++--- .../clientpositive/llap/constraints_optimization.q.out | 6 +++--- .../clientpositive/llap/results_cache_temptable.q.out | 8 ++++---- .../clientpositive/perf/tez/constraints/query14.q.out | 14 +++++++------- ql/src/test/results/clientpositive/perf/tez/query14.q.out | 14 +++++++------- 5 files changed, 23 insertions(+), 24 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 6149880..46048cd 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1967,10 +1967,7 @@ public class StatsUtils { if (useColStats) { List<ColStatistics> colStats = stats.getColumnStats(); for (ColStatistics cs : colStats) { - long oldNumNulls = cs.getNumNulls(); long oldDV = cs.getCountDistint(); - long newNumNulls = Math.round(ratio * oldNumNulls); - cs.setNumNulls(newNumNulls); if (affectedColumns.contains(cs.getColumnName())) { long newDV = oldDV; @@ -1987,6 +1984,8 @@ public class StatsUtils { if (oldDV > newNumRows) { cs.setCountDistint(newNumRows); } + long newNumNulls = Math.round(ratio * cs.getNumNulls()); + cs.setNumNulls(newNumNulls > newNumRows ? 
newNumRows: newNumNulls); } stats.setColumnStats(colStats); long newDataSize = StatsUtils.getDataSizeFromColumnStats(newNumRows, colStats); diff --git a/ql/src/test/results/clientpositive/llap/constraints_optimization.q.out b/ql/src/test/results/clientpositive/llap/constraints_optimization.q.out index fbdc702..afcf53f 100644 --- a/ql/src/test/results/clientpositive/llap/constraints_optimization.q.out +++ b/ql/src/test/results/clientpositive/llap/constraints_optimization.q.out @@ -601,12 +601,12 @@ STAGE PLANS: minReductionHashAggr: 0.0 mode: hash outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: bigint), _col1 (type: string) sort order: ++ Map-reduce partition columns: _col0 (type: bigint), _col1 (type: string) - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE TopN Hash Memory Usage: 0.1 Execution mode: vectorized, llap LLAP IO: no inputs @@ -617,7 +617,7 @@ STAGE PLANS: keys: KEY._col0 (type: bigint), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: bigint) outputColumnNames: _col0 diff --git a/ql/src/test/results/clientpositive/llap/results_cache_temptable.q.out b/ql/src/test/results/clientpositive/llap/results_cache_temptable.q.out index 4f1e3a7..d6eb82d 100644 --- a/ql/src/test/results/clientpositive/llap/results_cache_temptable.q.out +++ b/ql/src/test/results/clientpositive/llap/results_cache_temptable.q.out @@ -316,10 +316,10 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0 - 
Statistics: Num rows: 1 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator sort order: - Statistics: Num rows: 1 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: bigint) Execution mode: vectorized, llap LLAP IO: no inputs @@ -330,10 +330,10 @@ STAGE PLANS: aggregations: count(VALUE._col0) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 9223372036854775807 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/query14.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/query14.q.out index 1a3aefe..0f48872 100644 --- a/ql/src/test/results/clientpositive/perf/tez/constraints/query14.q.out +++ b/ql/src/test/results/clientpositive/perf/tez/constraints/query14.q.out @@ -297,21 +297,21 @@ Stage-0 Stage-1 Reducer 9 vectorized File Output Operator [FS_1350] - Limit [LIM_1349] (rows=7 width=192) + Limit [LIM_1349] (rows=7 width=212) Number of rows:100 - Select Operator [SEL_1348] (rows=7 width=192) + Select Operator [SEL_1348] (rows=7 width=212) Output:["_col0","_col1","_col2","_col3","_col4","_col5"] <-Reducer 8 [SIMPLE_EDGE] vectorized SHUFFLE [RS_1347] - Select Operator [SEL_1346] (rows=7 width=192) + Select Operator [SEL_1346] (rows=7 width=212) 
Output:["_col0","_col1","_col2","_col3","_col4","_col5"] - Group By Operator [GBY_1345] (rows=7 width=200) + Group By Operator [GBY_1345] (rows=7 width=220) Output:["_col0","_col1","_col2","_col3","_col5","_col6"],aggregations:["sum(VALUE._col0)","sum(VALUE._col1)"],keys:KEY._col0, KEY._col1, KEY._col2, KEY._col3, KEY._col4 <-Union 7 [SIMPLE_EDGE] <-Reducer 16 [CONTAINS] Reduce Output Operator [RS_1195] PartitionCols:_col0, _col1, _col2, _col3, _col4 - Group By Operator [GBY_1194] (rows=7 width=200) + Group By Operator [GBY_1194] (rows=7 width=220) Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col0, _col1, _col2, _col3, 0L Top N Key Operator [TNK_1193] (rows=3 width=221) keys:_col0, _col1, _col2, _col3, 0L,sort order:+++++,top n:100 @@ -683,7 +683,7 @@ Stage-0 <-Reducer 22 [CONTAINS] Reduce Output Operator [RS_1202] PartitionCols:_col0, _col1, _col2, _col3, _col4 - Group By Operator [GBY_1201] (rows=7 width=200) + Group By Operator [GBY_1201] (rows=7 width=220) Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col0, _col1, _col2, _col3, 0L Top N Key Operator [TNK_1200] (rows=3 width=221) keys:_col0, _col1, _col2, _col3, 0L,sort order:+++++,top n:100 @@ -866,7 +866,7 @@ Stage-0 <-Reducer 6 [CONTAINS] Reduce Output Operator [RS_1188] PartitionCols:_col0, _col1, _col2, _col3, _col4 - Group By Operator [GBY_1187] (rows=7 width=200) + Group By Operator [GBY_1187] (rows=7 width=220) Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col0, _col1, _col2, _col3, 0L Top N Key Operator [TNK_1186] (rows=3 width=221) keys:_col0, _col1, _col2, _col3, 0L,sort order:+++++,top n:100 diff --git a/ql/src/test/results/clientpositive/perf/tez/query14.q.out b/ql/src/test/results/clientpositive/perf/tez/query14.q.out index fd8eb9b..66c4f39 100644 --- 
a/ql/src/test/results/clientpositive/perf/tez/query14.q.out +++ b/ql/src/test/results/clientpositive/perf/tez/query14.q.out @@ -297,21 +297,21 @@ Stage-0 Stage-1 Reducer 9 vectorized File Output Operator [FS_1335] - Limit [LIM_1334] (rows=7 width=192) + Limit [LIM_1334] (rows=7 width=212) Number of rows:100 - Select Operator [SEL_1333] (rows=7 width=192) + Select Operator [SEL_1333] (rows=7 width=212) Output:["_col0","_col1","_col2","_col3","_col4","_col5"] <-Reducer 8 [SIMPLE_EDGE] vectorized SHUFFLE [RS_1332] - Select Operator [SEL_1331] (rows=7 width=192) + Select Operator [SEL_1331] (rows=7 width=212) Output:["_col0","_col1","_col2","_col3","_col4","_col5"] - Group By Operator [GBY_1330] (rows=7 width=200) + Group By Operator [GBY_1330] (rows=7 width=220) Output:["_col0","_col1","_col2","_col3","_col5","_col6"],aggregations:["sum(VALUE._col0)","sum(VALUE._col1)"],keys:KEY._col0, KEY._col1, KEY._col2, KEY._col3, KEY._col4 <-Union 7 [SIMPLE_EDGE] <-Reducer 16 [CONTAINS] Reduce Output Operator [RS_1177] PartitionCols:_col0, _col1, _col2, _col3, _col4 - Group By Operator [GBY_1176] (rows=7 width=200) + Group By Operator [GBY_1176] (rows=7 width=220) Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col0, _col1, _col2, _col3, 0L Top N Key Operator [TNK_1175] (rows=3 width=221) keys:_col0, _col1, _col2, _col3, 0L,sort order:+++++,top n:100 @@ -687,7 +687,7 @@ Stage-0 <-Reducer 22 [CONTAINS] Reduce Output Operator [RS_1184] PartitionCols:_col0, _col1, _col2, _col3, _col4 - Group By Operator [GBY_1183] (rows=7 width=200) + Group By Operator [GBY_1183] (rows=7 width=220) Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col0, _col1, _col2, _col3, 0L Top N Key Operator [TNK_1182] (rows=3 width=221) keys:_col0, _col1, _col2, _col3, 0L,sort order:+++++,top n:100 @@ -874,7 +874,7 @@ Stage-0 <-Reducer 6 [CONTAINS] Reduce Output Operator [RS_1170] 
PartitionCols:_col0, _col1, _col2, _col3, _col4 - Group By Operator [GBY_1169] (rows=7 width=200) + Group By Operator [GBY_1169] (rows=7 width=220) Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col0, _col1, _col2, _col3, 0L Top N Key Operator [TNK_1168] (rows=3 width=221) keys:_col0, _col1, _col2, _col3, 0L,sort order:+++++,top n:100