[ https://issues.apache.org/jira/browse/SPARK-41386?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17644141#comment-17644141 ]
Zhe Dong edited comment on SPARK-41386 at 12/7/22 8:29 AM: ----------------------------------------------------------- OptimizeSkewInRebalancePartitions.scala {noformat} /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package org.apache.spark.sql.execution.adaptiveimport org.apache.spark.sql.execution.{CoalescedPartitionSpec, ShufflePartitionSpec, SparkPlan} import org.apache.spark.sql.execution.exchange.{REBALANCE_PARTITIONS_BY_COL, REBALANCE_PARTITIONS_BY_NONE, ShuffleOrigin} import org.apache.spark.sql.internal.SQLConf/** * A rule to optimize the skewed shuffle partitions in [[RebalancePartitions]] based on the map * output statistics, which can avoid data skew that hurt performance. * * We use ADVISORY_PARTITION_SIZE_IN_BYTES size to decide if a partition should be optimized. * Let's say we have 3 maps with 3 shuffle partitions, and assuming r1 has data skew issue. 
* the map side looks like: * m0:[b0, b1, b2], m1:[b0, b1, b2], m2:[b0, b1, b2] * and the reduce side looks like: * (without this rule) r1[m0-b1, m1-b1, m2-b1] * / \ * r0:[m0-b0, m1-b0, m2-b0], r1-0:[m0-b1], r1-1:[m1-b1], r1-2:[m2-b1], r2[m0-b2, m1-b2, m2-b2] */ object OptimizeSkewInRebalancePartitions extends AQEShuffleReadRule { override val supportedShuffleOrigins: Seq[ShuffleOrigin] = Seq(REBALANCE_PARTITIONS_BY_NONE, REBALANCE_PARTITIONS_BY_COL) /** * Splits the skewed partition based on the map size and the target partition size * after split. Create a list of `PartialReducerPartitionSpec` for skewed partition and * create `CoalescedPartition` for normal partition. */ private def optimizeSkewedPartitions( shuffleId: Int, bytesByPartitionId: Array[Long], targetSize: Long, smallPartitionFactor: Double): Seq[ShufflePartitionSpec] = { bytesByPartitionId.indices.flatMap { reduceIndex => val bytes = bytesByPartitionId(reduceIndex) if (bytes > targetSize) { val newPartitionSpec = ShufflePartitionsUtil.createSkewPartitionSpecs( shuffleId, reduceIndex, targetSize, smallPartitionFactor) if (newPartitionSpec.isEmpty) { CoalescedPartitionSpec(reduceIndex, reduceIndex + 1, bytes) :: Nil } else { logDebug(s"For shuffle $shuffleId, partition $reduceIndex is skew, " + s"split it into ${newPartitionSpec.get.size} parts.") newPartitionSpec.get } } else if (bytes < targetSize * smallPartitionFactor) { CoalescedPartitionSpec(reduceIndex, reduceIndex + 1, bytes) :: Nil } else { CoalescedPartitionSpec(reduceIndex, reduceIndex, bytes) :: Nil } } } private def tryOptimizeSkewedPartitions(shuffle: ShuffleQueryStageExec): SparkPlan = { val advisorySize = conf.getConf(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES) val smallPartitionFactor = conf.getConf(SQLConf.ADAPTIVE_REBALANCE_PARTITIONS_SMALL_PARTITION_FACTOR) val mapStats = shuffle.mapStats if (mapStats.isEmpty || mapStats.get.bytesByPartitionId.forall( r => r <= advisorySize && r >= advisorySize * smallPartitionFactor)) { return 
shuffle } val newPartitionsSpec = optimizeSkewedPartitions( mapStats.get.shuffleId, mapStats.get.bytesByPartitionId, advisorySize, smallPartitionFactor) // return origin plan if we can not optimize partitions if (newPartitionsSpec.length == mapStats.get.bytesByPartitionId.length) { shuffle } else { AQEShuffleReadExec(shuffle, newPartitionsSpec) } } override def apply(plan: SparkPlan): SparkPlan = { if (!conf.getConf(SQLConf.ADAPTIVE_OPTIMIZE_SKEWS_IN_REBALANCE_PARTITIONS_ENABLED)) { return plan } plan transformUp { case stage: ShuffleQueryStageExec if isSupported(stage.shuffle) => tryOptimizeSkewedPartitions(stage) } } } {noformat} was (Author: JIRAUSER298432): {noformat} if (mapStats.isEmpty || mapStats.get.bytesByPartitionId.forall(_ <= advisorySize && _ >= advisorySize * smallPartitionFactor )) { return shuffle } -------------------------------------------------------------------------------- if (bytes > targetSize) { ... } else if ( bytes < targetSize * smallPartitionFactor ){ CoalescedPartitionSpec(reduceIndex, reduceIndex + 1, bytes) :: Nil }else { return shuffle // dummy }{noformat} > There are some small files when using rebalance(column) > ------------------------------------------------------- > > Key: SPARK-41386 > URL: https://issues.apache.org/jira/browse/SPARK-41386 > Project: Spark > Issue Type: Improvement > Components: SQL > Affects Versions: 3.3.1 > Reporter: Zhe Dong > Priority: Minor > > *Problem (REBALANCE(column))*: > SparkSession config: > {noformat} > config("spark.sql.adaptive.optimizeSkewsInRebalancePartitions.enabled", > "true") > config("spark.sql.adaptive.advisoryPartitionSizeInBytes", "20m") > config("spark.sql.adaptive.rebalancePartitionsSmallPartitionFactor", > "0.5"){noformat} > so, we expect that file sizes should be at least 20m*0.5=10m. 
> but in fact, we got some small files like the following: > {noformat} > -rw-r--r-- 1 jp28948 staff 12.1 M 2022-12-07 13:13 > .../part-00000-1ece1aae-f4f6-47ac-abe2-170ccb61f60e.c000.snappy.parquet > -rw-r--r-- 1 jp28948 staff 12.1 M 2022-12-07 13:13 > .../part-00001-1ece1aae-f4f6-47ac-abe2-170ccb61f60e.c000.snappy.parquet > -rw-r--r-- 1 jp28948 staff 12.1 M 2022-12-07 13:13 > .../part-00002-1ece1aae-f4f6-47ac-abe2-170ccb61f60e.c000.snappy.parquet > -rw-r--r-- 1 jp28948 staff 12.1 M 2022-12-07 13:13 > .../part-00003-1ece1aae-f4f6-47ac-abe2-170ccb61f60e.c000.snappy.parquet > -rw-r--r-- 1 jp28948 staff 9.1 M 2022-12-07 13:13 > .../part-00004-1ece1aae-f4f6-47ac-abe2-170ccb61f60e.c000.snappy.parquet > -rw-r--r-- 1 jp28948 staff 3.0 M 2022-12-07 13:13 > .../part-00005-1ece1aae-f4f6-47ac-abe2-170ccb61f60e.c000.snappy.parquet{noformat} > 9.1 M and 3.0 M are smaller than 10 M. We have to handle these small files in > another way. -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org