Author: rhbutani
Date: Tue Dec  3 01:35:25 2013
New Revision: 1547258

URL: http://svn.apache.org/r1547258
Log:
HIVE-5898 Make fetching of column statistics configurable (Prasanth 
Jayachandran via Harish Butani)

Modified:
    hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
    hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q
    hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q
    hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q
    hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q
    hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q
    hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q
    hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q
    hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q

Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1547258&r1=1547257&r2=1547258&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java 
(original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Tue 
Dec  3 01:35:25 2013
@@ -641,6 +641,9 @@ public class HiveConf extends Configurat
     HIVE_STATS_MAP_NUM_ENTRIES("hive.stats.map.num.entries", 10),
     // to accurately compute statistics for GROUPBY map side parallelism needs to be known
     HIVE_STATS_MAP_SIDE_PARALLELISM("hive.stats.map.parallelism", 1),
+    // statistics annotation fetches column statistics for all required columns and for all
+    // required partitions which can be very expensive sometimes
+    HIVE_STATS_FETCH_COLUMN_STATS("hive.stats.fetch.column.stats", false),
 
     // Concurrency
     HIVE_SUPPORT_CONCURRENCY("hive.support.concurrency", false),

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java?rev=1547258&r1=1547257&r2=1547258&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java 
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java Tue 
Dec  3 01:35:25 2013
@@ -86,6 +86,7 @@ public class StatsUtils {
     List<String> neededColumns = tableScanOperator.getNeededColumns();
     String dbName = table.getDbName();
     String tabName = table.getTableName();
+    boolean fetchColStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
 
     if (!table.isPartitioned()) {
       long nr = getNumRows(dbName, tabName);
@@ -106,7 +107,10 @@ public class StatsUtils {
       stats.setNumRows(nr);
       stats.setDataSize(rds);
 
-      List<ColStatistics> colStats = getTableColumnStats(table, schema, neededColumns);
+      List<ColStatistics> colStats = Lists.newArrayList();
+      if (fetchColStats) {
+        colStats = getTableColumnStats(table, schema, neededColumns);
+      }
 
       // if column stats available and if atleast one column doesn't have stats
       // then mark it as partial
@@ -128,11 +132,8 @@ public class StatsUtils {
         } else {
           stats.setColumnStatsState(Statistics.State.COMPLETE);
         }
-        stats.addToColumnStats(null);
-      } else {
-        // set col stats and mark it as table level col stats
-        stats.addToColumnStats(colStats);
       }
+      stats.addToColumnStats(colStats);
     } else {
 
       // For partitioned tables, get the size of all the partitions after pruning
@@ -176,7 +177,10 @@ public class StatsUtils {
 
         // column stats
         for (Partition part : partList.getNotDeniedPartns()) {
-          List<ColStatistics> colStats = getPartitionColumnStats(table, part, schema, neededColumns);
+          List<ColStatistics> colStats = Lists.newArrayList();
+          if (fetchColStats) {
+            colStats = getPartitionColumnStats(table, part, schema, neededColumns);
+          }
           if (checkIfColStatsAvailable(colStats) && colStats.contains(null)) {
             stats.updateColumnStatsState(Statistics.State.PARTIAL);
           } else if (checkIfColStatsAvailable(colStats) && !colStats.contains(null)) {

Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q?rev=1547258&r1=1547257&r2=1547258&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q 
(original)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q Tue 
Dec  3 01:35:25 2013
@@ -1,3 +1,5 @@
+set hive.stats.fetch.column.stats=true;
+
 create table if not exists loc_staging (
   state string,
   locid int,

Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q?rev=1547258&r1=1547257&r2=1547258&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q 
(original)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q Tue 
Dec  3 01:35:25 2013
@@ -1,3 +1,5 @@
+set hive.stats.fetch.column.stats=true;
+
 create table if not exists loc_staging (
   state string,
   locid int,

Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q?rev=1547258&r1=1547257&r2=1547258&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q 
(original)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q Tue Dec 
 3 01:35:25 2013
@@ -1,3 +1,5 @@
+set hive.stats.fetch.column.stats=true;
+
 create table if not exists emp_staging (
   lastname string,
   deptid int
@@ -28,7 +30,6 @@ LOAD DATA LOCAL INPATH '../../data/files
 LOAD DATA LOCAL INPATH '../../data/files/dept.txt' OVERWRITE INTO TABLE dept_staging;
 LOAD DATA LOCAL INPATH '../../data/files/loc.txt' OVERWRITE INTO TABLE loc_staging;
 
-
 insert overwrite table emp_orc select * from emp_staging;
 insert overwrite table dept_orc select * from dept_staging;
 insert overwrite table loc_orc select * from loc_staging;

Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q?rev=1547258&r1=1547257&r2=1547258&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q 
(original)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q Tue 
Dec  3 01:35:25 2013
@@ -1,3 +1,5 @@
+set hive.stats.fetch.column.stats=true;
+
 create table if not exists loc_staging (
   state string,
   locid int,

Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q?rev=1547258&r1=1547257&r2=1547258&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q 
(original)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q Tue Dec 
 3 01:35:25 2013
@@ -1,3 +1,8 @@
+set hive.stats.fetch.column.stats=true;
+set hive.stats.autogather=false;
+set hive.exec.dynamic.partition=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+
 create table if not exists loc_staging (
   state string,
   locid int,
@@ -16,10 +21,6 @@ create table if not exists loc_orc (
 -- basicStatState: NONE colStatState: NONE
 explain extended select * from loc_orc;
 
-set hive.stats.autogather=false;
-set hive.exec.dynamic.partition=true;
-set hive.exec.dynamic.partition.mode=nonstrict;
-
 insert overwrite table loc_orc partition(year) select * from loc_staging;
 
 -- stats are disabled. basic stats will report the file size but not raw data size. so initial statistics will be PARTIAL

Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q?rev=1547258&r1=1547257&r2=1547258&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q 
(original)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q Tue 
Dec  3 01:35:25 2013
@@ -1,3 +1,5 @@
+set hive.stats.fetch.column.stats=true;
+
 create table if not exists alltypes (
  bo1 boolean,
  ti1 tinyint,

Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q?rev=1547258&r1=1547257&r2=1547258&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q 
(original)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q Tue 
Dec  3 01:35:25 2013
@@ -1,3 +1,6 @@
+set hive.stats.fetch.column.stats=true;
+set hive.stats.autogather=false;
+
 create table if not exists emp_staging (
   lastname string,
   deptid int
@@ -11,8 +14,6 @@ explain extended select * from emp_orc;
 
 LOAD DATA LOCAL INPATH '../../data/files/emp.txt' OVERWRITE INTO TABLE emp_staging;
 
-set hive.stats.autogather=false;
-
 insert overwrite table emp_orc select * from emp_staging;
 
 -- stats are disabled. basic stats will report the file size but not raw data size. so initial statistics will be PARTIAL

Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q?rev=1547258&r1=1547257&r2=1547258&view=diff
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q 
(original)
+++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q Tue 
Dec  3 01:35:25 2013
@@ -1,3 +1,5 @@
+set hive.stats.fetch.column.stats=true;
+
 create table if not exists loc_staging (
   state string,
   locid int,


Reply via email to