Victoria Markman created DRILL-2369: ---------------------------------------
Summary: It takes too long to plan inner join query with 10 join predicates Key: DRILL-2369 URL: https://issues.apache.org/jira/browse/DRILL-2369 Project: Apache Drill Issue Type: Bug Components: Query Planning & Optimization Reporter: Victoria Markman Assignee: Jinfeng Ni Inner join with 10 join predicates takes 11 seconds to plan. If number of predicates goes up to 14, planning time increases to 41 seconds. (1) ON clause with 10 columns in join condition -- Total elapsed time : 11.407 seconds -- Planning time : 11.03 seconds {code} select count(*) from alltypes_with_nulls a INNER JOIN alltypes_with_nulls b ON ( a.c_boolean = b.c_boolean AND a.c_timestamp = b.c_timestamp AND a.c_time = b.c_time AND a.c_date = b.c_date AND a.c_float = b.c_float AND a.c_bigdecimal = b.c_bigdecimal AND a.c_smalldecimal = b.c_smalldecimal AND a.c_bigint = b.c_bigint AND a.c_integer = b.c_integer AND a.c_varchar = b.c_varchar ) ; {code} Explain plan: {code} 00-01 StreamAgg(group=[{}], EXPR$0=[COUNT($0)]) 00-02 HashAgg(group=[{0}]) 00-03 Project(c_date=[$3]) 00-04 HashJoin(condition=[AND(=($0, $10), =($1, $11), =($2, $12), =($3, $13), =($4, $14), =($5, $15), =($6, $16), =($7, $17), =($8, $18), =($9, $19))], joinType=[inner]) 00-06 Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4], c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3]) 00-08 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`, `c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`, `c_integer`, `c_varchar`]]]) 00-05 Project(c_boolean0=[$0], c_timestamp0=[$1], c_time0=[$2], c_date0=[$3], c_float0=[$4], c_bigdecimal0=[$5], c_smalldecimal0=[$6], c_bigint0=[$7], c_integer0=[$8], c_varchar0=[$9]) 00-07 Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4], c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3]) 00-09 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`, `c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`, `c_integer`, `c_varchar`]]]) {code} (2) 10 columns in JOIN condition, in the WHERE clause (just to check that nothing funny is going on with ON clause) -- Total elapsed time : 11.139 seconds -- Planning time : 11.416 seconds {code} select count(*) from alltypes_with_nulls a, alltypes_with_nulls b where a.c_boolean = b.c_boolean AND a.c_timestamp = b.c_timestamp AND a.c_time = b.c_time AND a.c_date = b.c_date AND a.c_float = b.c_float AND a.c_bigdecimal = b.c_bigdecimal AND a.c_smalldecimal = b.c_smalldecimal AND a.c_bigint = b.c_bigint AND a.c_integer = b.c_integer AND a.c_varchar = b.c_varchar ; {code} Explain plan: {code} 00-01 StreamAgg(group=[{}], EXPR$0=[COUNT()]) 00-02 Project($f0=[0]) 00-03 HashJoin(condition=[AND(=($0, $10), =($1, $11), =($2, $12), =($3, $13), =($4, $14), =($5, $15), =($6, $16), =($7, $17), =($8, $18), =($9, $19))], joinType=[inner]) 00-05 Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4], c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3]) 00-07 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`, `c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`, `c_integer`, `c_varchar`]]]) 00-04 Project(c_boolean0=[$0], c_timestamp0=[$1], c_time0=[$2], c_date0=[$3], c_float0=[$4], c_bigdecimal0=[$5], c_smalldecimal0=[$6], c_bigint0=[$7], c_integer0=[$8], c_varchar0=[$9]) 00-06 Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4], c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3]) 00-08 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`, `c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`, `c_integer`, `c_varchar`]]]) {code} (3) 14 columns in join condition -- Total elapsed time : 41.101 seconds -- Planning time : 39.771 seconds {code} select count(distinct a.c_date) from alltypes_with_nulls a INNER JOIN alltypes_with_nulls b ON ( a.c_varchar = b.c_varchar AND a.c_integer = b.c_integer AND a.c_bigint = b.c_bigint AND a.c_smalldecimal = b.c_smalldecimal AND a.c_bigdecimal = b.c_bigdecimal AND a.c_float = b.c_float AND a.c_date = b.c_date AND a.c_time = b.c_time AND a.c_timestamp = b.c_timestamp AND a.c_boolean = b.c_boolean AND a.d9 = b.d9 AND a.d18 = b.d18 AND a.d28 = b.d28 AND a.d38 = b.d38 ) ; {code} Explain plan: {code} 00-01 StreamAgg(group=[{}], EXPR$0=[COUNT($0)]) 00-02 HashAgg(group=[{0}]) 00-03 Project(c_date=[$6]) 00-04 HashJoin(condition=[AND(=($0, $14), =($1, $15), =($2, $16), =($3, $17), =($4, $18), =($5, $19), =($6, $20), =($7, $21), =($8, $22), =($9, $23), =($10, $24), =($11, $25), =($12, $26), =($13, $27))], joinType=[inner]) 00-06 Project(c_varchar=[$5], c_integer=[$11], c_bigint=[$2], c_smalldecimal=[$6], c_bigdecimal=[$4], c_float=[$1], c_date=[$10], c_time=[$3], c_timestamp=[$9], c_boolean=[$7], d9=[$13], d18=[$8], d28=[$0], d38=[$12]) 00-08 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_varchar`, `c_integer`, `c_bigint`, `c_smalldecimal`, `c_bigdecimal`, `c_float`, `c_date`, `c_time`, `c_timestamp`, `c_boolean`, `d9`, `d18`, `d28`, `d38`]]]) 00-05 Project(c_varchar0=[$0], c_integer0=[$1], c_bigint0=[$2], c_smalldecimal0=[$3], c_bigdecimal0=[$4], c_float0=[$5], c_date0=[$6], c_time0=[$7], c_timestamp0=[$8], c_boolean0=[$9], d90=[$10], d180=[$11], d280=[$12], d380=[$13]) 00-07 Project(c_varchar=[$5], c_integer=[$11], c_bigint=[$2], c_smalldecimal=[$6], c_bigdecimal=[$4], c_float=[$1], c_date=[$10], c_time=[$3], c_timestamp=[$9], c_boolean=[$7], d9=[$13], d18=[$8], d28=[$0], d38=[$12]) 00-09 Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]], selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_varchar`, `c_integer`, `c_bigint`, `c_smalldecimal`, `c_bigdecimal`, `c_float`, `c_date`, `c_time`, `c_timestamp`, `c_boolean`, `d9`, `d18`, `d28`, `d38`]]]) {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332)