[ 
https://issues.apache.org/jira/browse/SPARK-33910?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Yuming Wang updated SPARK-33910:
--------------------------------
    Description: 
Simplify/Optimize conditional expressions. We can improve these cases:
1. Reduce read datasource.
2. Simple CaseWhen/If to support filter push down.
{code:sql}
create table t1 using parquet as select * from range(100);
create table t2 using parquet as select * from range(200);

create temp view v1 as                                                          
   
select 'a' as event_type, * from t1                                             
   
union all                                                                       
   
select CASE WHEN id = 1 THEN 'b' WHEN id = 3 THEN 'c' end as event_type, * from 
t2 

explain select * from v1 where event_type = 'a';
{code}

Before this PR:

{noformat}
== Physical Plan ==
Union
:- *(1) Project [a AS event_type#30533, id#30535L]
:  +- *(1) ColumnarToRow
:     +- FileScan parquet default.t1[id#30535L] Batched: true, DataFilters: [], 
Format: Parquet
+- *(2) Project [CASE WHEN (id#30536L = 1) THEN b WHEN (id#30536L = 3) THEN c 
END AS event_type#30534, id#30536L]
   +- *(2) Filter (CASE WHEN (id#30536L = 1) THEN b WHEN (id#30536L = 3) THEN c 
END = a)
      +- *(2) ColumnarToRow
         +- FileScan parquet default.t2[id#30536L] Batched: true, DataFilters: 
[(CASE WHEN (id#30536L = 1) THEN b WHEN (id#30536L = 3) THEN c END = a)], 
Format: Parquet
{noformat}

After this PR:


{noformat}
== Physical Plan ==
*(1) Project [a AS event_type#8, id#4L]
+- *(1) ColumnarToRow
   +- FileScan parquet default.t1[id#4L] Batched: true, DataFilters: [], 
Format: Parquet
{noformat}


  was:
Simplify CaseWhen/If conditionals. We can improve these cases:
1. Reduce read datasource.
2. Simple CaseWhen/If to support filter push down.
{code:sql}
create table t1 using parquet as select * from range(100);
create table t2 using parquet as select * from range(200);

create temp view v1 as                                                          
   
select 'a' as event_type, * from t1                                             
   
union all                                                                       
   
select CASE WHEN id = 1 THEN 'b' WHEN id = 3 THEN 'c' end as event_type, * from 
t2 

explain select * from v1 where event_type = 'a';
{code}

Before this PR:

{noformat}
== Physical Plan ==
Union
:- *(1) Project [a AS event_type#30533, id#30535L]
:  +- *(1) ColumnarToRow
:     +- FileScan parquet default.t1[id#30535L] Batched: true, DataFilters: [], 
Format: Parquet
+- *(2) Project [CASE WHEN (id#30536L = 1) THEN b WHEN (id#30536L = 3) THEN c 
END AS event_type#30534, id#30536L]
   +- *(2) Filter (CASE WHEN (id#30536L = 1) THEN b WHEN (id#30536L = 3) THEN c 
END = a)
      +- *(2) ColumnarToRow
         +- FileScan parquet default.t2[id#30536L] Batched: true, DataFilters: 
[(CASE WHEN (id#30536L = 1) THEN b WHEN (id#30536L = 3) THEN c END = a)], 
Format: Parquet
{noformat}

After this PR:


{noformat}
== Physical Plan ==
*(1) Project [a AS event_type#8, id#4L]
+- *(1) ColumnarToRow
   +- FileScan parquet default.t1[id#4L] Batched: true, DataFilters: [], 
Format: Parquet
{noformat}



>  Simplify/Optimize conditional expressions
> ------------------------------------------
>
>                 Key: SPARK-33910
>                 URL: https://issues.apache.org/jira/browse/SPARK-33910
>             Project: Spark
>          Issue Type: Umbrella
>          Components: SQL
>    Affects Versions: 3.2.0
>            Reporter: Yuming Wang
>            Priority: Major
>
> Simplify/Optimize conditional expressions. We can improve these cases:
> 1. Reduce read datasource.
> 2. Simple CaseWhen/If to support filter push down.
> {code:sql}
> create table t1 using parquet as select * from range(100);
> create table t2 using parquet as select * from range(200);
> create temp view v1 as                                                        
>      
> select 'a' as event_type, * from t1                                           
>      
> union all                                                                     
>      
> select CASE WHEN id = 1 THEN 'b' WHEN id = 3 THEN 'c' end as event_type, * 
> from t2 
> explain select * from v1 where event_type = 'a';
> {code}
> Before this PR:
> {noformat}
> == Physical Plan ==
> Union
> :- *(1) Project [a AS event_type#30533, id#30535L]
> :  +- *(1) ColumnarToRow
> :     +- FileScan parquet default.t1[id#30535L] Batched: true, DataFilters: 
> [], Format: Parquet
> +- *(2) Project [CASE WHEN (id#30536L = 1) THEN b WHEN (id#30536L = 3) THEN c 
> END AS event_type#30534, id#30536L]
>    +- *(2) Filter (CASE WHEN (id#30536L = 1) THEN b WHEN (id#30536L = 3) THEN 
> c END = a)
>       +- *(2) ColumnarToRow
>          +- FileScan parquet default.t2[id#30536L] Batched: true, 
> DataFilters: [(CASE WHEN (id#30536L = 1) THEN b WHEN (id#30536L = 3) THEN c 
> END = a)], Format: Parquet
> {noformat}
> After this PR:
> {noformat}
> == Physical Plan ==
> *(1) Project [a AS event_type#8, id#4L]
> +- *(1) ColumnarToRow
>    +- FileScan parquet default.t1[id#4L] Batched: true, DataFilters: [], 
> Format: Parquet
> {noformat}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to