[jira] [Commented] (HIVE-16811) Estimate statistics in absence of stats

Vineet Garg (JIRA) Mon, 17 Jul 2017 17:14:08 -0700

    [ 
https://issues.apache.org/jira/browse/HIVE-16811?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16090839#comment-16090839
 ]


Vineet Garg commented on HIVE-16811:
------------------------------------

It turns out "Estimating stats will prevent join ordering algorithm to bail out 
and come up with join at least better than cross join" statement is not true. 
Join ordering algorithm could come up with a join which has cross-join and has 
less cost than plan with non-cross joins
e.g.
{code:sql}
CREATE TABLE src (key STRING COMMENT 'default', value STRING COMMENT 'default') 
STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH "${hiveconf:test.data.dir}/kv1.txt" INTO TABLE src;

ANALYZE TABLE src COMPUTE STATISTICS;
ANALYZE TABLE src COMPUTE STATISTICS FOR COLUMNS key,value;

CREATE TABLE smalltable(key string, value string) stored as textfile;
LOAD DATA local inpath '../../data/files/T1.txt' into table smalltable;
ANALYZE TABLE smalltable COMPUTE STATISTICS;
ANALYZE TABLE smalltable COMPUTE STATISTICS FOR COLUMNS key,value;

explain select src1.key, src2.key, smalltable.key from src src1 JOIN src src2 
ON (src1.key = src2.key) JOIN smalltable ON (src1.key + src2.key = 
smalltable.key);
{code}

{noformat}
Stage-0
  Fetch Operator
    limit:-1
    Stage-1
      Reducer 2
      File Output Operator [FS_17]
        Select Operator [SEL_16] (rows=275 width=18)
          Output:["_col0","_col1","_col2"]
          Filter Operator [FIL_15] (rows=275 width=18)
            predicate:((UDFToDouble(_col2) + UDFToDouble(_col0)) = 
UDFToDouble(_col1))
            Merge Join Operator [MERGEJOIN_25] (rows=550 width=18)
              
Conds:RS_12._col0=RS_13._col0(Inner),Output:["_col0","_col1","_col2"]
            <-Map 1 [SIMPLE_EDGE]
              SHUFFLE [RS_12]
                PartitionCols:_col0
                Map Join Operator [MAPJOIN_24] (rows=500 width=18)
                  Conds:(Inner),Output:["_col0","_col1"]
                <-Map 3 [BROADCAST_EDGE]
                  BROADCAST [RS_10]
                    Select Operator [SEL_5] (rows=1 width=7)
                      Output:["_col0"]
                      Filter Operator [FIL_22] (rows=1 width=7)
                        predicate:key is not null
                        TableScan [TS_3] (rows=1 width=7)
                          
default@smalltable,smalltable,Tbl:COMPLETE,Col:NONE,Output:["key"]
                <-Select Operator [SEL_2] (rows=500 width=10)
                    Output:["_col0"]
                    Filter Operator [FIL_21] (rows=500 width=10)
                      predicate:key is not null
                      TableScan [TS_0] (rows=500 width=10)
                        default@src,src2,Tbl:COMPLETE,Col:NONE,Output:["key"]
            <-Map 4 [SIMPLE_EDGE]
              SHUFFLE [RS_13]
                PartitionCols:_col0
                Select Operator [SEL_8] (rows=500 width=10)
                  Output:["_col0"]
                  Filter Operator [FIL_23] (rows=500 width=10)
                    predicate:key is not null
                    TableScan [TS_6] (rows=500 width=10)
                      default@src,src1,Tbl:COMPLETE,Col:NONE,Output:["key"]
{noformat}

Therefore even with estimating statistics and forcing join-reordering we could 
end up with cross-join

> Estimate statistics in absence of stats
> ---------------------------------------
>
>                 Key: HIVE-16811
>                 URL: https://issues.apache.org/jira/browse/HIVE-16811
>             Project: Hive
>          Issue Type: Improvement
>            Reporter: Vineet Garg
>            Assignee: Vineet Garg
>         Attachments: HIVE-16811.1.patch
>
>
> Currently Join ordering completely bails out in absence of statistics and 
> this could lead to bad joins such as cross joins.
> e.g. following select query will produce cross join.
> {code:sql}
> create table supplier (S_SUPPKEY INT, S_NAME STRING, S_ADDRESS STRING, 
> S_NATIONKEY INT, 
> S_PHONE STRING, S_ACCTBAL DOUBLE, S_COMMENT STRING)
> CREATE TABLE lineitem (L_ORDERKEY      INT,
>                                 L_PARTKEY       INT,
>                                 L_SUPPKEY       INT,
>                                 L_LINENUMBER    INT,
>                                 L_QUANTITY      DOUBLE,
>                                 L_EXTENDEDPRICE DOUBLE,
>                                 L_DISCOUNT      DOUBLE,
>                                 L_TAX           DOUBLE,
>                                 L_RETURNFLAG    STRING,
>                                 L_LINESTATUS    STRING,
>                                 l_shipdate      STRING,
>                                 L_COMMITDATE    STRING,
>                                 L_RECEIPTDATE   STRING,
>                                 L_SHIPINSTRUCT  STRING,
>                                 L_SHIPMODE      STRING,
>                                 L_COMMENT       STRING) partitioned by (dl 
> int)
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY '|';
> CREATE TABLE part(
>     p_partkey INT,
>     p_name STRING,
>     p_mfgr STRING,
>     p_brand STRING,
>     p_type STRING,
>     p_size INT,
>     p_container STRING,
>     p_retailprice DOUBLE,
>     p_comment STRING
> );
> explain select count(1) from part,supplier,lineitem where p_partkey = 
> l_partkey and s_suppkey = l_suppkey;
> {code}
> Estimating stats will prevent join ordering algorithm to bail out and come up 
> with join at least better than cross join 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

[jira] [Commented] (HIVE-16811) Estimate statistics in absence of stats

Reply via email to