Github user chrysan commented on a diff in the pull request: https://github.com/apache/spark/pull/19001#discussion_r175640958 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala --- @@ -184,6 +189,43 @@ case class InsertIntoHadoopFsRelationCommand( Seq.empty[Row] } + private def getBucketIdExpression(dataColumns: Seq[Attribute]): Option[Expression] = { + bucketSpec.map { spec => + val bucketColumns = spec.bucketColumnNames.map(c => dataColumns.find(_.name == c).get) + // Use `HashPartitioning.partitionIdExpression` as our bucket id expression, so that we can + // guarantee the data distribution is same between shuffle and bucketed data source, which + // enables us to only shuffle one side when join a bucketed table and a normal one. + HashPartitioning( + bucketColumns, + spec.numBuckets, + classOf[Murmur3Hash] + ).partitionIdExpression + } + } + + /** + * How is `requiredOrdering` determined ? --- End diff -- Why does the definition of `requiredOrdering` here differ from that in `InsertIntoHiveTable`?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org