[ https://issues.apache.org/jira/browse/SPARK-43182?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Liu Shuo updated SPARK-43182: ----------------------------- Description: [^data.zip] When we test AE in Spark 3.4.0 with the following case, we find that if we disable AE, or enable AE but disable skewJoin, the SQL will finish in 20s, but if we enable both AE and skewJoin, it will take a very long time. The test case: {code:java} ### create table source_aqe(c1 int,c18 string) using csv options(path 'file:///tmp/spark-warehouse/data/'); create table hive_snappy_aqe_table1(c1 int)stored as PARQUET partitioned by(c18 string); insert into table hive_snappy_aqe_table1 partition(c18=1)select c1 from source_aqe; insert into table hive_snappy_aqe_table1 partition(c18=2)select c1 from source_aqe limit 120000; insert into table hive_snappy_aqe_table1 partition(c18=3)select c1 from source_aqe limit 150000;create table hive_snappy_aqe_table2(c1 int)stored as PARQUET partitioned by(c18 string); insert into table hive_snappy_aqe_table2 partition(c18=1)select c1 from source_aqe limit 160000; insert into table hive_snappy_aqe_table2 partition(c18=2)select c1 from source_aqe limit 120000;create table hive_snappy_aqe_table3(c1 int)stored as PARQUET partitioned by(c18 string); insert into table hive_snappy_aqe_table3 partition(c18=1)select c1 from source_aqe limit 160000; insert into table hive_snappy_aqe_table3 partition(c18=2)select c1 from source_aqe limit 120000; set spark.sql.adaptive.enabled=false; set spark.sql.adaptive.forceOptimizeSkewedJoin = false; set spark.sql.adaptive.skewJoin.skewedPartitionFactor=1; set spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes=10KB; set spark.sql.adaptive.advisoryPartitionSizeInBytes=100KB; set spark.sql.autoBroadcastJoinThreshold = 51200; ###it will finish in 20s select * from hive_snappy_aqe_table1 join hive_snappy_aqe_table2 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table2.c18 join hive_snappy_aqe_table3 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table3.c18 limit 10; set spark.sql.adaptive.enabled=true; set 
spark.sql.adaptive.forceOptimizeSkewedJoin = true; set spark.sql.adaptive.skewJoin.skewedPartitionFactor=1; set spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes=10KB; set spark.sql.adaptive.advisoryPartitionSizeInBytes=100KB; set spark.sql.autoBroadcastJoinThreshold = 51200; ###it will take very long time select * from hive_snappy_aqe_table1 join hive_snappy_aqe_table2 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table2.c18 join hive_snappy_aqe_table3 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table3.c18 limit 10;{code} was: When we test AE in Spark3.4.0 with the following case, we find If we disable AE or enable Ae but disable skewJoin, the sql will finish in 20s, but if we enable AE and enable skewJoin,it will take very long time. The test case: {code:java} create table source_aqe(c1 int,c18 string) using csv options(path 'file:///tmp/spark-warehouse/data/'); create table hive_snappy_aqe_table1(c1 int)stored as PARQUET partitioned by(c18 string); insert into table hive_snappy_aqe_table1 partition(c18=1)select c1 from source_aqe; insert into table hive_snappy_aqe_table1 partition(c18=2)select c1 from source_aqe limit 120000; insert into table hive_snappy_aqe_table1 partition(c18=3)select c1 from source_aqe limit 150000;create table hive_snappy_aqe_table2(c1 int)stored as PARQUET partitioned by(c18 string); insert into table hive_snappy_aqe_table2 partition(c18=1)select c1 from source_aqe limit 160000; insert into table hive_snappy_aqe_table2 partition(c18=2)select c1 from source_aqe limit 120000;create table hive_snappy_aqe_table3(c1 int)stored as PARQUET partitioned by(c18 string); insert into table hive_snappy_aqe_table3 partition(c18=1)select c1 from source_aqe limit 160000; insert into table hive_snappy_aqe_table3 partition(c18=2)select c1 from source_aqe limit 120000; set spark.sql.adaptive.enabled=false; set spark.sql.adaptive.forceOptimizeSkewedJoin = false; set spark.sql.adaptive.skewJoin.skewedPartitionFactor=1; set 
spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes=10KB; set spark.sql.adaptive.advisoryPartitionSizeInBytes=100KB; set spark.sql.autoBroadcastJoinThreshold = 51200; ###it will finish in 20s select * from hive_snappy_aqe_table1 join hive_snappy_aqe_table2 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table2.c18 join hive_snappy_aqe_table3 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table3.c18 limit 10; set spark.sql.adaptive.enabled=true; set spark.sql.adaptive.forceOptimizeSkewedJoin = true; set spark.sql.adaptive.skewJoin.skewedPartitionFactor=1; set spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes=10KB; set spark.sql.adaptive.advisoryPartitionSizeInBytes=100KB; set spark.sql.autoBroadcastJoinThreshold = 51200; ###it will take very long time select * from hive_snappy_aqe_table1 join hive_snappy_aqe_table2 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table2.c18 join hive_snappy_aqe_table3 on hive_snappy_aqe_table1.c18=hive_snappy_aqe_table3.c18 limit 10;{code} > Multiple tables join with limit when AE is enabled and one table is skewed > -------------------------------------------------------------------------- > > Key: SPARK-43182 > URL: https://issues.apache.org/jira/browse/SPARK-43182 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 3.4.0 > Reporter: Liu Shuo > Priority: Critical > > [^data.zip] When we test AE in Spark 3.4.0 with the following case, we find that if > we disable AE, or enable AE but disable skewJoin, the SQL will finish in 20s, > but if we enable both AE and skewJoin, it will take a very long time. 
> The test case: > {code:java} > ### > create table source_aqe(c1 int,c18 string) using csv options(path > 'file:///tmp/spark-warehouse/data/'); > create table hive_snappy_aqe_table1(c1 int)stored as PARQUET partitioned > by(c18 string); > insert into table hive_snappy_aqe_table1 partition(c18=1)select c1 from > source_aqe; > insert into table hive_snappy_aqe_table1 partition(c18=2)select c1 from > source_aqe limit 120000; > insert into table hive_snappy_aqe_table1 partition(c18=3)select c1 from > source_aqe limit 150000;create table hive_snappy_aqe_table2(c1 int)stored as > PARQUET partitioned by(c18 string); > insert into table hive_snappy_aqe_table2 partition(c18=1)select c1 from > source_aqe limit 160000; > insert into table hive_snappy_aqe_table2 partition(c18=2)select c1 from > source_aqe limit 120000;create table hive_snappy_aqe_table3(c1 int)stored as > PARQUET partitioned by(c18 string); > insert into table hive_snappy_aqe_table3 partition(c18=1)select c1 from > source_aqe limit 160000; > insert into table hive_snappy_aqe_table3 partition(c18=2)select c1 from > source_aqe limit 120000; > set spark.sql.adaptive.enabled=false; > set spark.sql.adaptive.forceOptimizeSkewedJoin = false; > set spark.sql.adaptive.skewJoin.skewedPartitionFactor=1; > set spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes=10KB; > set spark.sql.adaptive.advisoryPartitionSizeInBytes=100KB; > set spark.sql.autoBroadcastJoinThreshold = 51200; > > ###it will finish in 20s > select * from hive_snappy_aqe_table1 join hive_snappy_aqe_table2 on > hive_snappy_aqe_table1.c18=hive_snappy_aqe_table2.c18 join > hive_snappy_aqe_table3 on > hive_snappy_aqe_table1.c18=hive_snappy_aqe_table3.c18 limit 10; > set spark.sql.adaptive.enabled=true; > set spark.sql.adaptive.forceOptimizeSkewedJoin = true; > set spark.sql.adaptive.skewJoin.skewedPartitionFactor=1; > set spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes=10KB; > set spark.sql.adaptive.advisoryPartitionSizeInBytes=100KB; > 
set spark.sql.autoBroadcastJoinThreshold = 51200; > ###it will take very long time > select * from hive_snappy_aqe_table1 join hive_snappy_aqe_table2 on > hive_snappy_aqe_table1.c18=hive_snappy_aqe_table2.c18 join > hive_snappy_aqe_table3 on > hive_snappy_aqe_table1.c18=hive_snappy_aqe_table3.c18 limit 10;{code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org