[ https://issues.apache.org/jira/browse/SPARK-27318?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17162641#comment-17162641 ]

Shay Elbaz edited comment on SPARK-27318 at 7/22/20, 10:14 AM:
---------------------------------------------------------------

 

Was able to reproduce on 2.4.3, without external files or data.

Executed via spark-shell on an HDP 2.6.5 cluster:

 
{code:java}
// disable broadcast joins so the join plans as a sort-merge join:
scala> spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

// create bucketed table:
scala> spark.range(1000).repartition(4, $"id").write.bucketBy(4, "id").sortBy("id").mode("overwrite").saveAsTable("buckets")
20/07/22 02:26:25 WARN HiveExternalCatalog: Persisting bucketed data source table `shay_test`.`buckets` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.

// join with pre-partitioned df:
scala> spark.range(100).repartition(4, $"id").join(spark.table("buckets"), Seq("id")).explain
java.lang.IllegalArgumentException: requirement failed: PartitioningCollection requires all of its partitionings have the same numPartitions.
  at scala.Predef$.require(Predef.scala:224)
  at org.apache.spark.sql.catalyst.plans.physical.PartitioningCollection.<init>(partitioning.scala:291)
  at org.apache.spark.sql.execution.joins.SortMergeJoinExec.outputPartitioning(SortMergeJoinExec.scala:69)
  at org.apache.spark.sql.execution.exchange.EnsureRequirements$$anonfun$org$apache$spark$sql$execution$exchange$EnsureRequirements$$ensureDistributionAndOrdering$1.apply(EnsureRequirements.scala:150)
  ...
{code}
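
For context, the trace shows SortMergeJoinExec.outputPartitioning building a PartitioningCollection from both children's partitionings. A minimal sketch of the invariant that fires (my paraphrase of the require in partitioning.scala, not Spark's actual class) can be pasted into any Scala REPL:

{code:java}
// Simplified stand-in types (hypothetical, for illustration only):
case class SimplePartitioning(numPartitions: Int)

case class SimplePartitioningCollection(partitionings: Seq[SimplePartitioning]) {
  // the same requirement that fails in the stack trace above
  require(
    partitionings.map(_.numPartitions).distinct.length == 1,
    "PartitioningCollection requires all of its partitionings have the same numPartitions.")
}

// fine: both join children agree on the partition count
SimplePartitioningCollection(Seq(SimplePartitioning(4), SimplePartitioning(4)))

// throws IllegalArgumentException: requirement failed: PartitioningCollection requires ...
SimplePartitioningCollection(Seq(SimplePartitioning(4), SimplePartitioning(200)))
{code}

So whenever the two sides of the sort-merge join end up reporting different numPartitions, constructing the join's output partitioning itself blows up before the plan can even be explained.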
 

 



> Join operation on bucketed table fails with adaptive execution enabled
> ----------------------------------------------------------------------
>
>                 Key: SPARK-27318
>                 URL: https://issues.apache.org/jira/browse/SPARK-27318
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.4.0
>            Reporter: Supritha
>            Priority: Major
>
> Join operation on a bucketed table fails.
> Steps to reproduce the issue:
> {code}
> spark.sql("set spark.sql.adaptive.enabled=true")
> {code}
> 1. Create tables bucket3 and bucket4 as below and load the data.
> {code}
> sql("create table bucket3(id3 int,country3 String, sports3 String) row format 
> delimited fields terminated by ','").show()
> sql("create table bucket4(id4 int,country4 String) row format delimited 
> fields terminated by ','").show()
> sql("load data local inpath '/opt/abhidata/bucket2.txt' into table 
> bucket3").show()
> sql("load data local inpath '/opt/abhidata/bucket3.txt' into table 
> bucket4").show()
> {code}
> 2. Create the bucketed tables as below.
> {code}
> spark.sqlContext.table("bucket3").write.bucketBy(3, 
> "id3").saveAsTable("bucketed_table_3");
> spark.sqlContext.table("bucket4").write.bucketBy(4, 
> "id4").saveAsTable("bucketed_table_4");
> {code}
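> 
> As a sanity check (this step is not part of the original report), the mismatched bucket counts can be read back from the table metadata; "Num Buckets" should show 3 and 4 respectively:
> {code}
> sql("describe extended bucketed_table_3").show(100, false)
> sql("describe extended bucketed_table_4").show(100, false)
> {code}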
> 3. Execute the join query on the bucketed tables.
> {code}
> sql("select * from bucketed_table_3 join bucketed_table_4 on 
> bucketed_table_3.id3 = bucketed_table_4.id4").show()
> {code}
>  
> {code:java}
> java.lang.IllegalArgumentException: requirement failed: PartitioningCollection requires all of its partitionings have the same numPartitions.
>   at scala.Predef$.require(Predef.scala:224)
>   at org.apache.spark.sql.catalyst.plans.physical.PartitioningCollection.<init>(partitioning.scala:291)
>   at org.apache.spark.sql.execution.joins.SortMergeJoinExec.outputPartitioning(SortMergeJoinExec.scala:69)
>   at org.apache.spark.sql.execution.exchange.EnsureRequirements$$anonfun$org$apache$spark$sql$execution$exchange$EnsureRequirements$$ensureDistributionAndOrdering$1.apply(EnsureRequirements.scala:150)
>   at org.apache.spark.sql.execution.exchange.EnsureRequirements$$anonfun$org$apache$spark$sql$execution$exchange$EnsureRequirements$$ensureDistributionAndOrdering$1.apply(EnsureRequirements.scala:149)
>   at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
>   at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
>   at scala.collection.immutable.List.foreach(List.scala:392)
>   at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
>   at scala.collection.immutable.List.map(List.scala:296)
>   at org.apache.spark.sql.execution.exchange.EnsureRequirements.org$apache$spark$sql$execution$exchange$EnsureRequirements$$ensureDistributionAndOrdering(EnsureRequirements.scala:149)
>   at org.apache.spark.sql.execution.exchange.EnsureRequirements$$anonfun$apply$1.applyOrElse(EnsureRequirements.scala:304)
>   at org.apache.spark.sql.execution.exchange.EnsureRequirements$$anonfun$apply$1.applyOrElse(EnsureRequirements.scala:296)
>   at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$2.apply(TreeNode.scala:282)
>   at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$2.apply(TreeNode.scala:282)
>   at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
>   at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:281)
>   at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:275)
>   at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:275)
>   at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:326)
>   at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)
>   at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:324)
>   at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:275)
>   at org.apache.spark.sql.execution.exchange.EnsureRequirements.apply(EnsureRequirements.scala:296)
>   at org.apache.spark.sql.execution.exchange.EnsureRequirements.apply(EnsureRequirements.scala:38)
>   at org.apache.spark.sql.execution.QueryExecution$$anonfun$prepareForExecution$1.apply(QueryExecution.scala:87)
>   at org.apache.spark.sql.execution.QueryExecution$$anonfun$prepareForExecution$1.apply(QueryExecution.scala:87)
>   at scala.collection.LinearSeqOptimized$class.foldLeft(LinearSeqOptimized.scala:124)
>   at scala.collection.immutable.List.foldLeft(List.scala:84)
>   at org.apache.spark.sql.execution.QueryExecution.prepareForExecution(QueryExecution.scala:87)
>   at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:77)
>   at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:77)
>   at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3360)
>   at org.apache.spark.sql.Dataset.head(Dataset.scala:2545)
>   at org.apache.spark.sql.Dataset.take(Dataset.scala:2759)
>   at org.apache.spark.sql.Dataset.getRows(Dataset.scala:255)
>   at org.apache.spark.sql.Dataset.showString(Dataset.scala:292)
>   at org.apache.spark.sql.Dataset.show(Dataset.scala:746)
>   at org.apache.spark.sql.Dataset.show(Dataset.scala:705)
>   at org.apache.spark.sql.Dataset.show(Dataset.scala:714)
>   ... 49 elided
> {code}
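> 
> Workaround sketch (an assumption inferred from the trigger named in the title, not a fix proposed in this ticket): the failure is tied to running the join with adaptive execution on, so either turn it back off for this query, or bucket both tables with the same count so both join children report the same numPartitions:
> {code}
> // option 1: disable adaptive execution before the join
> spark.sql("set spark.sql.adaptive.enabled=false")
> 
> // option 2 (hypothetical table name): rebuild one side so the bucket counts match
> spark.sqlContext.table("bucket4").write.bucketBy(3, "id4").saveAsTable("bucketed_table_4_3buckets")
> {code}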


