[jira] [Updated] (SPARK-30876) Optimizer cannot infer from inferred constraints with join
[ https://issues.apache.org/jira/browse/SPARK-30876?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Dongjoon Hyun updated SPARK-30876: -- Affects Version/s: (was: 2.4.5) (was: 2.3.4) (was: 3.0.0) 3.1.0 > Optimizer cannot infer from inferred constraints with join > -- > > Key: SPARK-30876 > URL: https://issues.apache.org/jira/browse/SPARK-30876 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 3.1.0 >Reporter: Yuming Wang >Priority: Major > > How to reproduce this issue: > {code:sql} > create table t1(a int, b int, c int); > create table t2(a int, b int, c int); > create table t3(a int, b int, c int); > select count(*) from t1 join t2 join t3 on (t1.a = t2.b and t2.b = t3.c and > t3.c = 1); > {code} > Spark 2.3+: > {noformat} > == Physical Plan == > *(4) HashAggregate(keys=[], functions=[count(1)]) > +- Exchange SinglePartition, true, [id=#102] >+- *(3) HashAggregate(keys=[], functions=[partial_count(1)]) > +- *(3) Project > +- *(3) BroadcastHashJoin [b#10], [c#14], Inner, BuildRight > :- *(3) Project [b#10] > : +- *(3) BroadcastHashJoin [a#6], [b#10], Inner, BuildRight > : :- *(3) Project [a#6] > : : +- *(3) Filter isnotnull(a#6) > : : +- *(3) ColumnarToRow > : :+- FileScan parquet default.t1[a#6] Batched: true, > DataFilters: [isnotnull(a#6)], Format: Parquet, Location: > InMemoryFileIndex[file:/Users/yumwang/Downloads/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehous..., > PartitionFilters: [], PushedFilters: [IsNotNull(a)], ReadSchema: > struct > : +- BroadcastExchange > HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), > [id=#87] > :+- *(1) Project [b#10] > : +- *(1) Filter (isnotnull(b#10) AND (b#10 = 1)) > : +- *(1) ColumnarToRow > : +- FileScan parquet default.t2[b#10] Batched: > true, DataFilters: [isnotnull(b#10), (b#10 = 1)], Format: Parquet, Location: > InMemoryFileIndex[file:/Users/yumwang/Downloads/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehous..., > PartitionFilters: [], 
PushedFilters: [IsNotNull(b), EqualTo(b,1)], > ReadSchema: struct > +- BroadcastExchange > HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), > [id=#96] >+- *(2) Project [c#14] > +- *(2) Filter (isnotnull(c#14) AND (c#14 = 1)) > +- *(2) ColumnarToRow > +- FileScan parquet default.t3[c#14] Batched: true, > DataFilters: [isnotnull(c#14), (c#14 = 1)], Format: Parquet, Location: > InMemoryFileIndex[file:/Users/yumwang/Downloads/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehous..., > PartitionFilters: [], PushedFilters: [IsNotNull(c), EqualTo(c,1)], > ReadSchema: struct > Time taken: 3.785 seconds, Fetched 1 row(s) > {noformat} > Spark 2.2.x: > {noformat} > == Physical Plan == > *HashAggregate(keys=[], functions=[count(1)]) > +- Exchange SinglePartition >+- *HashAggregate(keys=[], functions=[partial_count(1)]) > +- *Project > +- *SortMergeJoin [b#19], [c#23], Inner > :- *Project [b#19] > : +- *SortMergeJoin [a#15], [b#19], Inner > : :- *Sort [a#15 ASC NULLS FIRST], false, 0 > : : +- Exchange hashpartitioning(a#15, 200) > : : +- *Filter (isnotnull(a#15) && (a#15 = 1)) > : :+- HiveTableScan [a#15], HiveTableRelation > `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#15, > b#16, c#17] > : +- *Sort [b#19 ASC NULLS FIRST], false, 0 > :+- Exchange hashpartitioning(b#19, 200) > : +- *Filter (isnotnull(b#19) && (b#19 = 1)) > : +- HiveTableScan [b#19], HiveTableRelation > `default`.`t2`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#18, > b#19, c#20] > +- *Sort [c#23 ASC NULLS FIRST], false, 0 >+- Exchange hashpartitioning(c#23, 200) > +- *Filter (isnotnull(c#23) && (c#23 = 1)) > +- HiveTableScan [c#23], HiveTableRelation > `default`.`t3`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#21, > b#22, c#23] > Time taken: 0.728 seconds, Fetched 1 row(s) > {noformat} > Spark 2.2 can infer {{(a#15 = 1)}}, but Spark 2.3+ can't. 
-- This message was sent by Atlassian Jira (v8.3.4#803005) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org
[jira] [Updated] (SPARK-30876) Optimizer cannot infer from inferred constraints with join
[ https://issues.apache.org/jira/browse/SPARK-30876?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Yuming Wang updated SPARK-30876: Description: How to reproduce this issue: {code:sql} create table t1(a int, b int, c int); create table t2(a int, b int, c int); create table t3(a int, b int, c int); select count(*) from t1 join t2 join t3 on (t1.a = t2.b and t2.b = t3.c and t3.c = 1); {code} Spark 2.3+: {noformat} == Physical Plan == *(4) HashAggregate(keys=[], functions=[count(1)]) +- Exchange SinglePartition, true, [id=#102] +- *(3) HashAggregate(keys=[], functions=[partial_count(1)]) +- *(3) Project +- *(3) BroadcastHashJoin [b#10], [c#14], Inner, BuildRight :- *(3) Project [b#10] : +- *(3) BroadcastHashJoin [a#6], [b#10], Inner, BuildRight : :- *(3) Project [a#6] : : +- *(3) Filter isnotnull(a#6) : : +- *(3) ColumnarToRow : :+- FileScan parquet default.t1[a#6] Batched: true, DataFilters: [isnotnull(a#6)], Format: Parquet, Location: InMemoryFileIndex[file:/Users/yumwang/Downloads/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehous..., PartitionFilters: [], PushedFilters: [IsNotNull(a)], ReadSchema: struct : +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#87] :+- *(1) Project [b#10] : +- *(1) Filter (isnotnull(b#10) AND (b#10 = 1)) : +- *(1) ColumnarToRow : +- FileScan parquet default.t2[b#10] Batched: true, DataFilters: [isnotnull(b#10), (b#10 = 1)], Format: Parquet, Location: InMemoryFileIndex[file:/Users/yumwang/Downloads/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehous..., PartitionFilters: [], PushedFilters: [IsNotNull(b), EqualTo(b,1)], ReadSchema: struct +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#96] +- *(2) Project [c#14] +- *(2) Filter (isnotnull(c#14) AND (c#14 = 1)) +- *(2) ColumnarToRow +- FileScan parquet default.t3[c#14] Batched: true, DataFilters: [isnotnull(c#14), (c#14 = 1)], Format: Parquet, Location: 
InMemoryFileIndex[file:/Users/yumwang/Downloads/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehous..., PartitionFilters: [], PushedFilters: [IsNotNull(c), EqualTo(c,1)], ReadSchema: struct Time taken: 3.785 seconds, Fetched 1 row(s) {noformat} Spark 2.2.x: {noformat} == Physical Plan == *HashAggregate(keys=[], functions=[count(1)]) +- Exchange SinglePartition +- *HashAggregate(keys=[], functions=[partial_count(1)]) +- *Project +- *SortMergeJoin [b#19], [c#23], Inner :- *Project [b#19] : +- *SortMergeJoin [a#15], [b#19], Inner : :- *Sort [a#15 ASC NULLS FIRST], false, 0 : : +- Exchange hashpartitioning(a#15, 200) : : +- *Filter (isnotnull(a#15) && (a#15 = 1)) : :+- HiveTableScan [a#15], HiveTableRelation `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#15, b#16, c#17] : +- *Sort [b#19 ASC NULLS FIRST], false, 0 :+- Exchange hashpartitioning(b#19, 200) : +- *Filter (isnotnull(b#19) && (b#19 = 1)) : +- HiveTableScan [b#19], HiveTableRelation `default`.`t2`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#18, b#19, c#20] +- *Sort [c#23 ASC NULLS FIRST], false, 0 +- Exchange hashpartitioning(c#23, 200) +- *Filter (isnotnull(c#23) && (c#23 = 1)) +- HiveTableScan [c#23], HiveTableRelation `default`.`t3`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#21, b#22, c#23] Time taken: 0.728 seconds, Fetched 1 row(s) {noformat} Spark 2.2 can infer {{(a#15 = 1)}}, but Spark 2.3+ can't. 
was: How to reproduce this issue: {code:sql} create table t1(a int, b int, c int); create table t2(a int, b int, c int); create table t3(a int, b int, c int); select count(*) from t1 join t2 join t3 on (t1.a = t2.b and t2.b = t3.c and t3.c = 1) {code} Spark 2.3+: {noformat} == Physical Plan == *(4) HashAggregate(keys=[], functions=[count(1)]) +- Exchange SinglePartition, true, [id=#102] +- *(3) HashAggregate(keys=[], functions=[partial_count(1)]) +- *(3) Project +- *(3) BroadcastHashJoin [b#10], [c#14], Inner, BuildRight :- *(3) Project [b#10] : +- *(3) BroadcastHashJoin [a#6], [b#10], Inner, BuildRight : :- *(3) Project [a#6] : : +- *(3) Filter isnotnull(a#6) : : +- *(3) ColumnarToRow : :+- FileScan parquet default.t1[a#6] Batched: true, DataFilters: [isnotnull(a#6)], Format: Parquet, Location:
[jira] [Updated] (SPARK-30876) Optimizer cannot infer from inferred constraints with join
[ https://issues.apache.org/jira/browse/SPARK-30876?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Yuming Wang updated SPARK-30876: Description: How to reproduce this issue: {code:sql} create table t1(a int, b int, c int); create table t2(a int, b int, c int); create table t3(a int, b int, c int); select count(*) from t1 join t2 join t3 on (t1.a = t2.b and t2.b = t3.c and t3.c = 1) {code} Spark 2.3+: {noformat} == Physical Plan == *(4) HashAggregate(keys=[], functions=[count(1)]) +- Exchange SinglePartition, true, [id=#102] +- *(3) HashAggregate(keys=[], functions=[partial_count(1)]) +- *(3) Project +- *(3) BroadcastHashJoin [b#10], [c#14], Inner, BuildRight :- *(3) Project [b#10] : +- *(3) BroadcastHashJoin [a#6], [b#10], Inner, BuildRight : :- *(3) Project [a#6] : : +- *(3) Filter isnotnull(a#6) : : +- *(3) ColumnarToRow : :+- FileScan parquet default.t1[a#6] Batched: true, DataFilters: [isnotnull(a#6)], Format: Parquet, Location: InMemoryFileIndex[file:/Users/yumwang/Downloads/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehous..., PartitionFilters: [], PushedFilters: [IsNotNull(a)], ReadSchema: struct : +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#87] :+- *(1) Project [b#10] : +- *(1) Filter (isnotnull(b#10) AND (b#10 = 1)) : +- *(1) ColumnarToRow : +- FileScan parquet default.t2[b#10] Batched: true, DataFilters: [isnotnull(b#10), (b#10 = 1)], Format: Parquet, Location: InMemoryFileIndex[file:/Users/yumwang/Downloads/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehous..., PartitionFilters: [], PushedFilters: [IsNotNull(b), EqualTo(b,1)], ReadSchema: struct +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), [id=#96] +- *(2) Project [c#14] +- *(2) Filter (isnotnull(c#14) AND (c#14 = 1)) +- *(2) ColumnarToRow +- FileScan parquet default.t3[c#14] Batched: true, DataFilters: [isnotnull(c#14), (c#14 = 1)], Format: Parquet, Location: 
InMemoryFileIndex[file:/Users/yumwang/Downloads/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehous..., PartitionFilters: [], PushedFilters: [IsNotNull(c), EqualTo(c,1)], ReadSchema: struct Time taken: 3.785 seconds, Fetched 1 row(s) {noformat} Spark 2.2.x: {noformat} == Physical Plan == *HashAggregate(keys=[], functions=[count(1)]) +- Exchange SinglePartition +- *HashAggregate(keys=[], functions=[partial_count(1)]) +- *Project +- *SortMergeJoin [b#19], [c#23], Inner :- *Project [b#19] : +- *SortMergeJoin [a#15], [b#19], Inner : :- *Sort [a#15 ASC NULLS FIRST], false, 0 : : +- Exchange hashpartitioning(a#15, 200) : : +- *Filter (isnotnull(a#15) && (a#15 = 1)) : :+- HiveTableScan [a#15], HiveTableRelation `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#15, b#16, c#17] : +- *Sort [b#19 ASC NULLS FIRST], false, 0 :+- Exchange hashpartitioning(b#19, 200) : +- *Filter (isnotnull(b#19) && (b#19 = 1)) : +- HiveTableScan [b#19], HiveTableRelation `default`.`t2`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#18, b#19, c#20] +- *Sort [c#23 ASC NULLS FIRST], false, 0 +- Exchange hashpartitioning(c#23, 200) +- *Filter (isnotnull(c#23) && (c#23 = 1)) +- HiveTableScan [c#23], HiveTableRelation `default`.`t3`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#21, b#22, c#23] Time taken: 0.728 seconds, Fetched 1 row(s) {noformat} Spark 2.2 can infer {{(a#15 = 1)}}, but Spark 2.3+ can't. 
was: How to reproduce this issue: {code:sql} create table t1(a int, b int, c int); create table t2(a int, b int, c int); create table t3(a int, b int, c int); {code} Spark 2.3+: {noformat} == Physical Plan == *(4) HashAggregate(keys=[], functions=[count(1)]) +- Exchange SinglePartition, true, [id=#102] +- *(3) HashAggregate(keys=[], functions=[partial_count(1)]) +- *(3) Project +- *(3) BroadcastHashJoin [b#10], [c#14], Inner, BuildRight :- *(3) Project [b#10] : +- *(3) BroadcastHashJoin [a#6], [b#10], Inner, BuildRight : :- *(3) Project [a#6] : : +- *(3) Filter isnotnull(a#6) : : +- *(3) ColumnarToRow : :+- FileScan parquet default.t1[a#6] Batched: true, DataFilters: [isnotnull(a#6)], Format: Parquet, Location:
[jira] [Updated] (SPARK-30876) Optimizer cannot infer from inferred constraints with join
[ https://issues.apache.org/jira/browse/SPARK-30876?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] Yuming Wang updated SPARK-30876: Summary: Optimizer cannot infer from inferred constraints with join (was: Optimizer cannot infer more constraint) > Optimizer cannot infer from inferred constraints with join > -- > > Key: SPARK-30876 > URL: https://issues.apache.org/jira/browse/SPARK-30876 > Project: Spark > Issue Type: Improvement > Components: SQL >Affects Versions: 2.3.4, 2.4.5, 3.0.0 >Reporter: Yuming Wang >Priority: Major > > How to reproduce this issue: > {code:sql} > create table t1(a int, b int, c int); > create table t2(a int, b int, c int); > create table t3(a int, b int, c int); > {code} > Spark 2.3+: > {noformat} > == Physical Plan == > *(4) HashAggregate(keys=[], functions=[count(1)]) > +- Exchange SinglePartition, true, [id=#102] >+- *(3) HashAggregate(keys=[], functions=[partial_count(1)]) > +- *(3) Project > +- *(3) BroadcastHashJoin [b#10], [c#14], Inner, BuildRight > :- *(3) Project [b#10] > : +- *(3) BroadcastHashJoin [a#6], [b#10], Inner, BuildRight > : :- *(3) Project [a#6] > : : +- *(3) Filter isnotnull(a#6) > : : +- *(3) ColumnarToRow > : :+- FileScan parquet default.t1[a#6] Batched: true, > DataFilters: [isnotnull(a#6)], Format: Parquet, Location: > InMemoryFileIndex[file:/Users/yumwang/Downloads/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehous..., > PartitionFilters: [], PushedFilters: [IsNotNull(a)], ReadSchema: > struct > : +- BroadcastExchange > HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), > [id=#87] > :+- *(1) Project [b#10] > : +- *(1) Filter (isnotnull(b#10) AND (b#10 = 1)) > : +- *(1) ColumnarToRow > : +- FileScan parquet default.t2[b#10] Batched: > true, DataFilters: [isnotnull(b#10), (b#10 = 1)], Format: Parquet, Location: > InMemoryFileIndex[file:/Users/yumwang/Downloads/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehous..., > PartitionFilters: [], PushedFilters: [IsNotNull(b), 
EqualTo(b,1)], > ReadSchema: struct > +- BroadcastExchange > HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))), > [id=#96] >+- *(2) Project [c#14] > +- *(2) Filter (isnotnull(c#14) AND (c#14 = 1)) > +- *(2) ColumnarToRow > +- FileScan parquet default.t3[c#14] Batched: true, > DataFilters: [isnotnull(c#14), (c#14 = 1)], Format: Parquet, Location: > InMemoryFileIndex[file:/Users/yumwang/Downloads/spark-3.0.0-preview2-bin-hadoop2.7/spark-warehous..., > PartitionFilters: [], PushedFilters: [IsNotNull(c), EqualTo(c,1)], > ReadSchema: struct > Time taken: 3.785 seconds, Fetched 1 row(s) > {noformat} > Spark 2.2.x: > {noformat} > == Physical Plan == > *HashAggregate(keys=[], functions=[count(1)]) > +- Exchange SinglePartition >+- *HashAggregate(keys=[], functions=[partial_count(1)]) > +- *Project > +- *SortMergeJoin [b#19], [c#23], Inner > :- *Project [b#19] > : +- *SortMergeJoin [a#15], [b#19], Inner > : :- *Sort [a#15 ASC NULLS FIRST], false, 0 > : : +- Exchange hashpartitioning(a#15, 200) > : : +- *Filter (isnotnull(a#15) && (a#15 = 1)) > : :+- HiveTableScan [a#15], HiveTableRelation > `default`.`t1`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#15, > b#16, c#17] > : +- *Sort [b#19 ASC NULLS FIRST], false, 0 > :+- Exchange hashpartitioning(b#19, 200) > : +- *Filter (isnotnull(b#19) && (b#19 = 1)) > : +- HiveTableScan [b#19], HiveTableRelation > `default`.`t2`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#18, > b#19, c#20] > +- *Sort [c#23 ASC NULLS FIRST], false, 0 >+- Exchange hashpartitioning(c#23, 200) > +- *Filter (isnotnull(c#23) && (c#23 = 1)) > +- HiveTableScan [c#23], HiveTableRelation > `default`.`t3`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#21, > b#22, c#23] > Time taken: 0.728 seconds, Fetched 1 row(s) > {noformat} > Spark 2.2 can infer {{(a#15 = 1)}}, but Spark 2.3+ can't. 
-- This message was sent by Atlassian Jira (v8.3.4#803005) - To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org