Author: rhbutani Date: Tue Dec 3 01:35:25 2013 New Revision: 1547258 URL: http://svn.apache.org/r1547258 Log: HIVE-5898 Make fetching of column statistics configurable (Prasanth Jayachandran via Harish Butani)
Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1547258&r1=1547257&r2=1547258&view=diff ============================================================================== --- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original) +++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Tue Dec 3 01:35:25 2013 @@ -641,6 +641,9 @@ public class HiveConf extends Configurat HIVE_STATS_MAP_NUM_ENTRIES("hive.stats.map.num.entries", 10), // to accurately compute statistics for GROUPBY map side parallelism needs to be known HIVE_STATS_MAP_SIDE_PARALLELISM("hive.stats.map.parallelism", 1), + // statistics annotation fetches column statistics for all required columns and for all + // required partitions which can be very expensive sometimes + HIVE_STATS_FETCH_COLUMN_STATS("hive.stats.fetch.column.stats", false), // Concurrency HIVE_SUPPORT_CONCURRENCY("hive.support.concurrency", false), Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java?rev=1547258&r1=1547257&r2=1547258&view=diff ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java Tue Dec 3 01:35:25 2013 @@ -86,6 +86,7 @@ public class StatsUtils { List<String> neededColumns = tableScanOperator.getNeededColumns(); String dbName = table.getDbName(); String tabName = table.getTableName(); + boolean fetchColStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS); if (!table.isPartitioned()) { long nr = getNumRows(dbName, tabName); @@ -106,7 +107,10 @@ public class StatsUtils { stats.setNumRows(nr); stats.setDataSize(rds); - List<ColStatistics> colStats = getTableColumnStats(table, schema, neededColumns); + List<ColStatistics> colStats = Lists.newArrayList(); + if (fetchColStats) { + colStats = getTableColumnStats(table, schema, neededColumns); + } // if column stats available and if atleast one column doesn't have stats // then mark it as partial @@ -128,11 +132,8 @@ public class StatsUtils { } else { stats.setColumnStatsState(Statistics.State.COMPLETE); } - stats.addToColumnStats(null); - } else { - // set col stats and mark it as table level col stats - stats.addToColumnStats(colStats); } + stats.addToColumnStats(colStats); } else { // For partitioned tables, get the size of all the partitions after pruning @@ -176,7 +177,10 @@ public class StatsUtils { // column stats for (Partition part : partList.getNotDeniedPartns()) { - List<ColStatistics> colStats = getPartitionColumnStats(table, part, schema, neededColumns); + List<ColStatistics> colStats = Lists.newArrayList(); + if (fetchColStats) { + colStats = getPartitionColumnStats(table, part, schema, neededColumns); + } if (checkIfColStatsAvailable(colStats) && colStats.contains(null)) { stats.updateColumnStatsState(Statistics.State.PARTIAL); } else if (checkIfColStatsAvailable(colStats) && !colStats.contains(null)) { Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q?rev=1547258&r1=1547257&r2=1547258&view=diff ============================================================================== --- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q (original) +++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_filter.q Tue Dec 3 01:35:25 2013 @@ -1,3 +1,5 @@ +set hive.stats.fetch.column.stats=true; + create table if not exists loc_staging ( state string, locid int, Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q?rev=1547258&r1=1547257&r2=1547258&view=diff ============================================================================== --- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q (original) +++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_groupby.q Tue Dec 3 01:35:25 2013 @@ -1,3 +1,5 @@ +set hive.stats.fetch.column.stats=true; + create table if not exists loc_staging ( state string, locid int, Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q?rev=1547258&r1=1547257&r2=1547258&view=diff ============================================================================== --- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q (original) +++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_join.q Tue Dec 3 01:35:25 2013 @@ -1,3 +1,5 @@ +set hive.stats.fetch.column.stats=true; + create table if not exists emp_staging ( lastname string, deptid int @@ -28,7 +30,6 @@ LOAD DATA LOCAL INPATH '../../data/files LOAD DATA LOCAL INPATH '../../data/files/dept.txt' OVERWRITE INTO TABLE dept_staging; LOAD DATA LOCAL INPATH '../../data/files/loc.txt' OVERWRITE INTO TABLE loc_staging; - insert overwrite table emp_orc select * from emp_staging; insert overwrite table dept_orc select * from dept_staging; insert overwrite table loc_orc select * from loc_staging; Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q?rev=1547258&r1=1547257&r2=1547258&view=diff ============================================================================== --- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q (original) +++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_limit.q Tue Dec 3 01:35:25 2013 @@ -1,3 +1,5 @@ +set hive.stats.fetch.column.stats=true; + create table if not exists loc_staging ( state string, locid int, Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q?rev=1547258&r1=1547257&r2=1547258&view=diff ============================================================================== --- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q (original) +++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_part.q Tue Dec 3 01:35:25 2013 @@ -1,3 +1,8 @@ +set hive.stats.fetch.column.stats=true; +set hive.stats.autogather=false; +set hive.exec.dynamic.partition=true; +set hive.exec.dynamic.partition.mode=nonstrict; + create table if not exists loc_staging ( state string, locid int, @@ -16,10 +21,6 @@ create table if not exists loc_orc ( -- basicStatState: NONE colStatState: NONE explain extended select * from loc_orc; -set hive.stats.autogather=false; -set hive.exec.dynamic.partition=true; -set hive.exec.dynamic.partition.mode=nonstrict; - insert overwrite table loc_orc partition(year) select * from loc_staging; -- stats are disabled. basic stats will report the file size but not raw data size. so initial statistics will be PARTIAL Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q?rev=1547258&r1=1547257&r2=1547258&view=diff ============================================================================== --- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q (original) +++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_select.q Tue Dec 3 01:35:25 2013 @@ -1,3 +1,5 @@ +set hive.stats.fetch.column.stats=true; + create table if not exists alltypes ( bo1 boolean, ti1 tinyint, Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q?rev=1547258&r1=1547257&r2=1547258&view=diff ============================================================================== --- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q (original) +++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_table.q Tue Dec 3 01:35:25 2013 @@ -1,3 +1,6 @@ +set hive.stats.fetch.column.stats=true; +set hive.stats.autogather=false; + create table if not exists emp_staging ( lastname string, deptid int @@ -11,8 +14,6 @@ explain extended select * from emp_orc; LOAD DATA LOCAL INPATH '../../data/files/emp.txt' OVERWRITE INTO TABLE emp_staging; -set hive.stats.autogather=false; - insert overwrite table emp_orc select * from emp_staging; -- stats are disabled. basic stats will report the file size but not raw data size. so initial statistics will be PARTIAL Modified: hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q?rev=1547258&r1=1547257&r2=1547258&view=diff ============================================================================== --- hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q (original) +++ hive/trunk/ql/src/test/queries/clientpositive/annotate_stats_union.q Tue Dec 3 01:35:25 2013 @@ -1,3 +1,5 @@ +set hive.stats.fetch.column.stats=true; + create table if not exists loc_staging ( state string, locid int,