This is an automated email from the ASF dual-hosted git repository.

jcamacho pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git

commit d7ed17e27eda65dc0ea641887b6389bf0aec7e96
Author: Jesus Camacho Rodriguez <jcama...@apache.org>
AuthorDate: Mon Mar 25 12:59:14 2019 -0700

    HIVE-21496: Automatic sizing of unordered buffer can overflow (Jesus Camacho Rodriguez, reviewed by Prasanth Jayachandran)
---
 .../java/org/apache/hadoop/hive/ql/stats/StatsUtils.java   |  5 ++---
 .../clientpositive/llap/constraints_optimization.q.out     |  6 +++---
 .../clientpositive/llap/results_cache_temptable.q.out      |  8 ++++----
 .../clientpositive/perf/tez/constraints/query14.q.out      | 14 +++++++-------
 ql/src/test/results/clientpositive/perf/tez/query14.q.out  | 14 +++++++-------
 5 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 6149880..46048cd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -1967,10 +1967,7 @@ public class StatsUtils {
     if (useColStats) {
       List<ColStatistics> colStats = stats.getColumnStats();
       for (ColStatistics cs : colStats) {
-        long oldNumNulls = cs.getNumNulls();
         long oldDV = cs.getCountDistint();
-        long newNumNulls = Math.round(ratio * oldNumNulls);
-        cs.setNumNulls(newNumNulls);
         if (affectedColumns.contains(cs.getColumnName())) {
           long newDV = oldDV;
 
@@ -1987,6 +1984,8 @@ public class StatsUtils {
         if (oldDV > newNumRows) {
           cs.setCountDistint(newNumRows);
         }
+        long newNumNulls = Math.round(ratio * cs.getNumNulls());
+        cs.setNumNulls(newNumNulls > newNumRows ? newNumRows: newNumNulls);
       }
       stats.setColumnStats(colStats);
       long newDataSize = StatsUtils.getDataSizeFromColumnStats(newNumRows, 
colStats);
diff --git a/ql/src/test/results/clientpositive/llap/constraints_optimization.q.out b/ql/src/test/results/clientpositive/llap/constraints_optimization.q.out
index fbdc702..afcf53f 100644
--- a/ql/src/test/results/clientpositive/llap/constraints_optimization.q.out
+++ b/ql/src/test/results/clientpositive/llap/constraints_optimization.q.out
@@ -601,12 +601,12 @@ STAGE PLANS:
                           minReductionHashAggr: 0.0
                           mode: hash
                           outputColumnNames: _col0, _col1
-                          Statistics: Num rows: 1 Data size: 8 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          Statistics: Num rows: 1 Data size: 92 Basic stats: 
COMPLETE Column stats: COMPLETE
                           Reduce Output Operator
                             key expressions: _col0 (type: bigint), _col1 
(type: string)
                             sort order: ++
                             Map-reduce partition columns: _col0 (type: 
bigint), _col1 (type: string)
-                            Statistics: Num rows: 1 Data size: 8 Basic stats: 
COMPLETE Column stats: COMPLETE
+                            Statistics: Num rows: 1 Data size: 92 Basic stats: 
COMPLETE Column stats: COMPLETE
                             TopN Hash Memory Usage: 0.1
             Execution mode: vectorized, llap
             LLAP IO: no inputs
@@ -617,7 +617,7 @@ STAGE PLANS:
                 keys: KEY._col0 (type: bigint), KEY._col1 (type: string)
                 mode: mergepartial
                 outputColumnNames: _col0, _col1
-                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Statistics: Num rows: 1 Data size: 92 Basic stats: COMPLETE 
Column stats: COMPLETE
                 Select Operator
                   expressions: _col0 (type: bigint)
                   outputColumnNames: _col0
diff --git a/ql/src/test/results/clientpositive/llap/results_cache_temptable.q.out b/ql/src/test/results/clientpositive/llap/results_cache_temptable.q.out
index 4f1e3a7..d6eb82d 100644
--- a/ql/src/test/results/clientpositive/llap/results_cache_temptable.q.out
+++ b/ql/src/test/results/clientpositive/llap/results_cache_temptable.q.out
@@ -316,10 +316,10 @@ STAGE PLANS:
                         minReductionHashAggr: 0.99
                         mode: hash
                         outputColumnNames: _col0
-                        Statistics: Num rows: 1 Data size: 9223372036854775807 
Basic stats: COMPLETE Column stats: NONE
+                        Statistics: Num rows: 1 Data size: 376 Basic stats: 
COMPLETE Column stats: NONE
                         Reduce Output Operator
                           sort order: 
-                          Statistics: Num rows: 1 Data size: 
9223372036854775807 Basic stats: COMPLETE Column stats: NONE
+                          Statistics: Num rows: 1 Data size: 376 Basic stats: 
COMPLETE Column stats: NONE
                           value expressions: _col0 (type: bigint)
             Execution mode: vectorized, llap
             LLAP IO: no inputs
@@ -330,10 +330,10 @@ STAGE PLANS:
                 aggregations: count(VALUE._col0)
                 mode: mergepartial
                 outputColumnNames: _col0
-                Statistics: Num rows: 1 Data size: 9223372036854775807 Basic 
stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE 
Column stats: NONE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 1 Data size: 9223372036854775807 Basic 
stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 1 Data size: 376 Basic stats: COMPLETE 
Column stats: NONE
                   table:
                       input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/ql/src/test/results/clientpositive/perf/tez/constraints/query14.q.out b/ql/src/test/results/clientpositive/perf/tez/constraints/query14.q.out
index 1a3aefe..0f48872 100644
--- a/ql/src/test/results/clientpositive/perf/tez/constraints/query14.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/constraints/query14.q.out
@@ -297,21 +297,21 @@ Stage-0
     Stage-1
       Reducer 9 vectorized
       File Output Operator [FS_1350]
-        Limit [LIM_1349] (rows=7 width=192)
+        Limit [LIM_1349] (rows=7 width=212)
           Number of rows:100
-          Select Operator [SEL_1348] (rows=7 width=192)
+          Select Operator [SEL_1348] (rows=7 width=212)
             Output:["_col0","_col1","_col2","_col3","_col4","_col5"]
           <-Reducer 8 [SIMPLE_EDGE] vectorized
             SHUFFLE [RS_1347]
-              Select Operator [SEL_1346] (rows=7 width=192)
+              Select Operator [SEL_1346] (rows=7 width=212)
                 Output:["_col0","_col1","_col2","_col3","_col4","_col5"]
-                Group By Operator [GBY_1345] (rows=7 width=200)
+                Group By Operator [GBY_1345] (rows=7 width=220)
                   
Output:["_col0","_col1","_col2","_col3","_col5","_col6"],aggregations:["sum(VALUE._col0)","sum(VALUE._col1)"],keys:KEY._col0,
 KEY._col1, KEY._col2, KEY._col3, KEY._col4
                 <-Union 7 [SIMPLE_EDGE]
                   <-Reducer 16 [CONTAINS]
                     Reduce Output Operator [RS_1195]
                       PartitionCols:_col0, _col1, _col2, _col3, _col4
-                      Group By Operator [GBY_1194] (rows=7 width=200)
+                      Group By Operator [GBY_1194] (rows=7 width=220)
                         
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col0,
 _col1, _col2, _col3, 0L
                         Top N Key Operator [TNK_1193] (rows=3 width=221)
                           keys:_col0, _col1, _col2, _col3, 0L,sort 
order:+++++,top n:100
@@ -683,7 +683,7 @@ Stage-0
                   <-Reducer 22 [CONTAINS]
                     Reduce Output Operator [RS_1202]
                       PartitionCols:_col0, _col1, _col2, _col3, _col4
-                      Group By Operator [GBY_1201] (rows=7 width=200)
+                      Group By Operator [GBY_1201] (rows=7 width=220)
                         
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col0,
 _col1, _col2, _col3, 0L
                         Top N Key Operator [TNK_1200] (rows=3 width=221)
                           keys:_col0, _col1, _col2, _col3, 0L,sort 
order:+++++,top n:100
@@ -866,7 +866,7 @@ Stage-0
                   <-Reducer 6 [CONTAINS]
                     Reduce Output Operator [RS_1188]
                       PartitionCols:_col0, _col1, _col2, _col3, _col4
-                      Group By Operator [GBY_1187] (rows=7 width=200)
+                      Group By Operator [GBY_1187] (rows=7 width=220)
                         
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col0,
 _col1, _col2, _col3, 0L
                         Top N Key Operator [TNK_1186] (rows=3 width=221)
                           keys:_col0, _col1, _col2, _col3, 0L,sort 
order:+++++,top n:100
diff --git a/ql/src/test/results/clientpositive/perf/tez/query14.q.out b/ql/src/test/results/clientpositive/perf/tez/query14.q.out
index fd8eb9b..66c4f39 100644
--- a/ql/src/test/results/clientpositive/perf/tez/query14.q.out
+++ b/ql/src/test/results/clientpositive/perf/tez/query14.q.out
@@ -297,21 +297,21 @@ Stage-0
     Stage-1
       Reducer 9 vectorized
       File Output Operator [FS_1335]
-        Limit [LIM_1334] (rows=7 width=192)
+        Limit [LIM_1334] (rows=7 width=212)
           Number of rows:100
-          Select Operator [SEL_1333] (rows=7 width=192)
+          Select Operator [SEL_1333] (rows=7 width=212)
             Output:["_col0","_col1","_col2","_col3","_col4","_col5"]
           <-Reducer 8 [SIMPLE_EDGE] vectorized
             SHUFFLE [RS_1332]
-              Select Operator [SEL_1331] (rows=7 width=192)
+              Select Operator [SEL_1331] (rows=7 width=212)
                 Output:["_col0","_col1","_col2","_col3","_col4","_col5"]
-                Group By Operator [GBY_1330] (rows=7 width=200)
+                Group By Operator [GBY_1330] (rows=7 width=220)
                   
Output:["_col0","_col1","_col2","_col3","_col5","_col6"],aggregations:["sum(VALUE._col0)","sum(VALUE._col1)"],keys:KEY._col0,
 KEY._col1, KEY._col2, KEY._col3, KEY._col4
                 <-Union 7 [SIMPLE_EDGE]
                   <-Reducer 16 [CONTAINS]
                     Reduce Output Operator [RS_1177]
                       PartitionCols:_col0, _col1, _col2, _col3, _col4
-                      Group By Operator [GBY_1176] (rows=7 width=200)
+                      Group By Operator [GBY_1176] (rows=7 width=220)
                         
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col0,
 _col1, _col2, _col3, 0L
                         Top N Key Operator [TNK_1175] (rows=3 width=221)
                           keys:_col0, _col1, _col2, _col3, 0L,sort 
order:+++++,top n:100
@@ -687,7 +687,7 @@ Stage-0
                   <-Reducer 22 [CONTAINS]
                     Reduce Output Operator [RS_1184]
                       PartitionCols:_col0, _col1, _col2, _col3, _col4
-                      Group By Operator [GBY_1183] (rows=7 width=200)
+                      Group By Operator [GBY_1183] (rows=7 width=220)
                         
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col0,
 _col1, _col2, _col3, 0L
                         Top N Key Operator [TNK_1182] (rows=3 width=221)
                           keys:_col0, _col1, _col2, _col3, 0L,sort 
order:+++++,top n:100
@@ -874,7 +874,7 @@ Stage-0
                   <-Reducer 6 [CONTAINS]
                     Reduce Output Operator [RS_1170]
                       PartitionCols:_col0, _col1, _col2, _col3, _col4
-                      Group By Operator [GBY_1169] (rows=7 width=200)
+                      Group By Operator [GBY_1169] (rows=7 width=220)
                         
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6"],aggregations:["sum(_col4)","sum(_col5)"],keys:_col0,
 _col1, _col2, _col3, 0L
                         Top N Key Operator [TNK_1168] (rows=3 width=221)
                           keys:_col0, _col1, _col2, _col3, 0L,sort 
order:+++++,top n:100

Reply via email to