Liu Shuo created SPARK-43182:
--------------------------------

             Summary: 3 tables join with limit when AE is enabled and one table 
is skewed
                 Key: SPARK-43182
                 URL: https://issues.apache.org/jira/browse/SPARK-43182
             Project: Spark
          Issue Type: Bug
          Components: SQL
    Affects Versions: 3.4.0
            Reporter: Liu Shuo


When we test AE in Spark3.4.0 with the following case, we find If we disable AE 
or enable Ae but disable skewJoin, the sql will finish in 20s, but if we enable 
AE and enable skewJoin,it will take very long time.

The test case:
{code:java}
create table source_aqe(c1 int,c18 string) using csv options(path 
'file:///tmp/spark-warehouse/data/');
create table hive_snappy_aqe_table1(c1 int)stored as PARQUET partitioned by(c18 
string); 
insert into table hive_snappy_aqe_table1 partition(c18=1)select c1 from 
source_aqe;
insert into table hive_snappy_aqe_table1 partition(c18=2)select c1 from 
source_aqe limit 120000;
insert into table hive_snappy_aqe_table1 partition(c18=3)select c1 from 
source_aqe limit 150000;create table hive_snappy_aqe_table2(c1 int)stored as 
PARQUET partitioned by(c18 string); 
insert into table hive_snappy_aqe_table2 partition(c18=1)select c1 from 
source_aqe limit 160000;
insert into table hive_snappy_aqe_table2 partition(c18=2)select c1 from 
source_aqe limit 120000;create table hive_snappy_aqe_table3(c1 int)stored as 
PARQUET partitioned by(c18 string); 
insert into table hive_snappy_aqe_table3 partition(c18=1)select c1 from 
source_aqe limit 160000;
insert into table hive_snappy_aqe_table3 partition(c18=2)select c1 from 
source_aqe limit 120000;
set spark.sql.adaptive.enabled=false;
set spark.sql.adaptive.forceOptimizeSkewedJoin = false;
set spark.sql.adaptive.skewJoin.skewedPartitionFactor=1;
set spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes=10KB;
set spark.sql.adaptive.advisoryPartitionSizeInBytes=100KB;
set spark.sql.autoBroadcastJoinThreshold = 51200;
 
###it will finish in 20s 
select * from hive_snappy_aqe_table1 join hive_snappy_aqe_table2 on 
hive_snappy_aqe_table1.c18=hive_snappy_aqe_table2.c18 join 
hive_snappy_aqe_table3 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table3.c18 
limit 10;
set spark.sql.adaptive.enabled=true;
set spark.sql.adaptive.forceOptimizeSkewedJoin = true;
set spark.sql.adaptive.skewJoin.skewedPartitionFactor=1;
set spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes=10KB;
set spark.sql.adaptive.advisoryPartitionSizeInBytes=100KB;
set spark.sql.autoBroadcastJoinThreshold = 51200;
###it will take very long time 
select * from hive_snappy_aqe_table1 join hive_snappy_aqe_table2 on 
hive_snappy_aqe_table1.c18=hive_snappy_aqe_table2.c18 join 
hive_snappy_aqe_table3 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table3.c18 
limit 10;{code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to