Janaki Lahorani created HIVE-19940:
--------------------------------------

             Summary: Push predicates with deterministic UDFs with RBO
                 Key: HIVE-19940
                 URL: https://issues.apache.org/jira/browse/HIVE-19940
             Project: Hive
          Issue Type: Bug
            Reporter: Janaki Lahorani
            Assignee: Janaki Lahorani


With RBO, predicates with any UDF doesn't get pushed down.  It makes sense to 
not pushdown the predicates with non-deterministic function as the meaning of 
the query changes after the predicate is resolved to use the function.  But 
pushing a deterministic function is beneficial.

Test Case:
{code}
set hive.cbo.enable=false;

CREATE TABLE `testb`(
   `cola` string COMMENT '',
   `colb` string COMMENT '',
   `colc` string COMMENT '')
PARTITIONED BY (
   `part1` string,
   `part2` string,
   `part3` string)

STORED AS AVRO;

CREATE TABLE `testa`(
   `col1` string COMMENT '',
   `col2` string COMMENT '',
   `col3` string COMMENT '',
   `col4` string COMMENT '',
   `col5` string COMMENT '')
PARTITIONED BY (
   `part1` string,
   `part2` string,
   `part3` string)
STORED AS AVRO;

insert into testA partition (part1='US', part2='ABC', part3='123')
values ('12.34', '100', '200', '300', 'abc'),
('12.341', '1001', '2001', '3001', 'abcd');

insert into testA partition (part1='UK', part2='DEF', part3='123')
values ('12.34', '100', '200', '300', 'abc'),
('12.341', '1001', '2001', '3001', 'abcd');

insert into testA partition (part1='US', part2='DEF', part3='200')
values ('12.34', '100', '200', '300', 'abc'),
('12.341', '1001', '2001', '3001', 'abcd');

insert into testA partition (part1='CA', part2='ABC', part3='300')
values ('12.34', '100', '200', '300', 'abc'),
('12.341', '1001', '2001', '3001', 'abcd');

insert into testB partition (part1='CA', part2='ABC', part3='300')
values ('600', '700', 'abc'), ('601', '701', 'abcd');

insert into testB partition (part1='CA', part2='ABC', part3='400')
values ( '600', '700', 'abc'), ( '601', '701', 'abcd');

insert into testB partition (part1='UK', part2='PQR', part3='500')
values ('600', '700', 'abc'), ('601', '701', 'abcd');

insert into testB partition (part1='US', part2='DEF', part3='200')
values ( '600', '700', 'abc'), ('601', '701', 'abcd');

insert into testB partition (part1='US', part2='PQR', part3='123')
values ( '600', '700', 'abc'), ('601', '701', 'abcd');

-- views with deterministic functions
create view viewDeterministicUDFA partitioned on (vpart1, vpart2, vpart3) as 
select
 cast(col1 as decimal(38,18)) as vcol1,
 cast(col2 as decimal(38,18)) as vcol2,
 cast(col3 as decimal(38,18)) as vcol3,
 cast(col4 as decimal(38,18)) as vcol4,
 cast(col5 as char(10)) as vcol5,
 cast(part1 as char(2)) as vpart1,
 cast(part2 as char(3)) as vpart2,
 cast(part3 as char(3)) as vpart3
 from testa
where part1 in ('US', 'CA');

create view viewDeterministicUDFB partitioned on (vpart1, vpart2, vpart3) as 
select
 cast(cola as decimal(38,18)) as vcolA,
 cast(colb as decimal(38,18)) as vcolB,
 cast(colc as char(10)) as vcolC,
 cast(part1 as char(2)) as vpart1,
 cast(part2 as char(3)) as vpart2,
 cast(part3 as char(3)) as vpart3
 from testb
where part1 in ('US', 'CA');

explain
select vcol1, vcol2, vcol3, vcola, vcolb
from viewDeterministicUDFA a inner join viewDeterministicUDFB b
on a.vpart1 = b.vpart1
and a.vpart2 = b.vpart2
and a.vpart3 = b.vpart3
and a.vpart1 = 'US'
and a.vpart2 = 'DEF'
and a.vpart3 = '200';
{code}

Plan where the CAST is not pushed down.
{code}
STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: testa
            filterExpr: (part1) IN ('US', 'CA') (type: boolean)
            Statistics: Num rows: 6 Data size: 13740 Basic stats: COMPLETE 
Column stats: NONE
            Select Operator
              expressions: CAST( col1 AS decimal(38,18)) (type: 
decimal(38,18)), CAST( col2 AS decimal(38,18)) (type: decimal(38,18)), CAST( 
col3 AS decimal(38,18)) (type: decimal(38,18)), CAST( part1 AS CHAR(2)) (type: 
char(2)), CAST( part2 AS CHAR(3)) (type: char(3)), CAST( part3 AS CHAR(3)) 
(type: char(3))
              outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col7
              Statistics: Num rows: 6 Data size: 13740 Basic stats: COMPLETE 
Column stats: NONE
              Filter Operator
                predicate: ((_col5 = 'US') and (_col6 = 'DEF') and (_col7 = 
'200')) (type: boolean)
                Statistics: Num rows: 1 Data size: 2290 Basic stats: COMPLETE 
Column stats: NONE
                Reduce Output Operator
                  key expressions: 'US' (type: char(2)), 'DEF' (type: char(3)), 
'200' (type: char(3))
                  sort order: +++
                  Map-reduce partition columns: 'US' (type: char(2)), 'DEF' 
(type: char(3)), '200' (type: char(3))
                  Statistics: Num rows: 1 Data size: 2290 Basic stats: COMPLETE 
Column stats: NONE
                  value expressions: _col0 (type: decimal(38,18)), _col1 (type: 
decimal(38,18)), _col2 (type: decimal(38,18))
          TableScan
            alias: testb
            filterExpr: (part1) IN ('US', 'CA') (type: boolean)
            Statistics: Num rows: 8 Data size: 12720 Basic stats: COMPLETE 
Column stats: NONE
            Select Operator
              expressions: CAST( cola AS decimal(38,18)) (type: 
decimal(38,18)), CAST( colb AS decimal(38,18)) (type: decimal(38,18)), CAST( 
part1 AS CHAR(2)) (type: char(2)), CAST( part2 AS CHAR(3)) (type: char(3)), 
CAST( part3 AS CHAR(3)) (type: char(3))
              outputColumnNames: _col0, _col1, _col3, _col4, _col5
              Statistics: Num rows: 8 Data size: 12720 Basic stats: COMPLETE 
Column stats: NONE
              Filter Operator
                predicate: ((_col5 = '200') and _col3 is not null and _col4 is 
not null) (type: boolean)
                Statistics: Num rows: 4 Data size: 6360 Basic stats: COMPLETE 
Column stats: NONE
                Reduce Output Operator
                  key expressions: _col3 (type: char(2)), _col4 (type: 
char(3)), '200' (type: char(3))
                  sort order: +++
                  Map-reduce partition columns: _col3 (type: char(2)), _col4 
(type: char(3)), '200' (type: char(3))
                  Statistics: Num rows: 4 Data size: 6360 Basic stats: COMPLETE 
Column stats: NONE
                  value expressions: _col0 (type: decimal(38,18)), _col1 (type: 
decimal(38,18))
      Reduce Operator Tree:
        Join Operator
          condition map:
               Inner Join 0 to 1
          keys:
            0 _col5 (type: char(2)), _col6 (type: char(3)), _col7 (type: 
char(3))
            1 _col3 (type: char(2)), _col4 (type: char(3)), _col5 (type: 
char(3))
          outputColumnNames: _col0, _col1, _col2, _col8, _col9
          Statistics: Num rows: 4 Data size: 6996 Basic stats: COMPLETE Column 
stats: NONE
          Select Operator
            expressions: _col0 (type: decimal(38,18)), _col1 (type: 
decimal(38,18)), _col2 (type: decimal(38,18)), _col8 (type: decimal(38,18)), 
_col9 (type: decimal(38,18))
            outputColumnNames: _col0, _col1, _col2, _col3, _col4
            Statistics: Num rows: 4 Data size: 6996 Basic stats: COMPLETE 
Column stats: NONE
            File Output Operator
              compressed: false
              Statistics: Num rows: 4 Data size: 6996 Basic stats: COMPLETE 
Column stats: NONE
              table:
                  input format: org.apache.hadoop.mapred.SequenceFileInputFormat
                  output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink
{code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to