[GitHub] [spark] cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group
cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group URL: https://github.com/apache/spark/pull/27786#discussion_r387717872 ## File path: core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala ## @@ -341,32 +341,86 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT assert(blockManager.hostLocalDirManager.get.getCachedHostLocalDirs().size === 1) } - test("fetch continuous blocks in batch respects maxSize and maxBlocks") { + test("fetch continuous blocks in batch should respect maxBytesInFlight") { val blockManager = mock(classOf[BlockManager]) val localBmId = BlockManagerId("test-client", "test-local-host", 1) doReturn(localBmId).when(blockManager).blockManagerId // Make sure remote blocks would return the merged block -val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2) -val remoteBlocks = Seq[BlockId]( +val remoteBmId1 = BlockManagerId("test-client-1", "test-client-1", 1) +val remoteBmId2 = BlockManagerId("test-client-2", "test-client-2", 2) +val remoteBlocks1 = (0 until 15).map(ShuffleBlockId(0, 3, _)) +val remoteBlocks2 = Seq[BlockId](ShuffleBlockId(0, 4, 0), ShuffleBlockId(0, 4, 1)) +val mergedRemoteBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockBatchId(0, 3, 0, 3) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 3, 6) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 6, 9) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 9, 12) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 12, 15) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 4, 0, 2) -> createMockManagedBuffer()) +val transfer = createMockTransfer(mergedRemoteBlocks) + +val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId1, remoteBlocks1.map(blockId => (blockId, 100L, 1))), + (remoteBmId2, remoteBlocks2.map(blockId => (blockId, 100L, 1.toIterator + 
+val taskContext = TaskContext.empty() +val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics() +val iterator = new ShuffleBlockFetcherIterator( + taskContext, + transfer, + blockManager, + blocksByAddress, + (_, in) => in, + 1500, + Int.MaxValue, + Int.MaxValue, + Int.MaxValue, + true, + false, + metrics, + true) + +var numResults = 0 +// After initialize(), there will be 6 FetchRequests. And each of the first 5 requests +// includes 1 merged block which is merged from 3 shuffle blocks. The last request has 1 merged +// block which merged from 2 shuffle blocks. So, only the first 5 requests(5 * 3 * 100 >= 1500) +// can be sent. The second FetchRequest will hit maxBlocksInFlightPerAddress so it won't Review comment: `The second` -> `The 6th`? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group
cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group URL: https://github.com/apache/spark/pull/27786#discussion_r387570436 ## File path: core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala ## @@ -341,32 +341,84 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT assert(blockManager.hostLocalDirManager.get.getCachedHostLocalDirs().size === 1) } - test("fetch continuous blocks in batch respects maxSize and maxBlocks") { + test("fetch continuous blocks in batch should respect maxBytesInFlight") { val blockManager = mock(classOf[BlockManager]) val localBmId = BlockManagerId("test-client", "test-local-host", 1) doReturn(localBmId).when(blockManager).blockManagerId // Make sure remote blocks would return the merged block -val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2) -val remoteBlocks = Seq[BlockId]( +val remoteBmId1 = BlockManagerId("test-client-1", "test-client-1", 1) +val remoteBmId2 = BlockManagerId("test-client-2", "test-client-2", 2) +val remoteBlocks1 = (0 until 15).map(ShuffleBlockId(0, 3, _)) +val remoteBlocks2 = Seq[BlockId](ShuffleBlockId(0, 4, 0), ShuffleBlockId(0, 4, 1)) +val mergedRemoteBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockBatchId(0, 3, 0, 3) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 3, 6) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 6, 9) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 9, 12) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 12, 15) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 4, 0, 2) -> createMockManagedBuffer()) +val transfer = createMockTransfer(mergedRemoteBlocks) + +val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId1, remoteBlocks1.map(blockId => (blockId, 100L, 1))), + (remoteBmId2, remoteBlocks2.map(blockId => (blockId, 100L, 1.toIterator + 
+val taskContext = TaskContext.empty() +val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics() +val iterator = new ShuffleBlockFetcherIterator( + taskContext, + transfer, + blockManager, + blocksByAddress, + (_, in) => in, + 1500, + Int.MaxValue, + Int.MaxValue, + Int.MaxValue, + true, + false, + metrics, + true) + +var numResults = 0 +// After initialize(), there will be 6 FetchRequests, and the each of the first 5 +// includes 3 merged blocks and the last one has 1 merged block. So, only the Review comment: ok let's update This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group
cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group URL: https://github.com/apache/spark/pull/27786#discussion_r387524638 ## File path: core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala ## @@ -341,32 +341,84 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT assert(blockManager.hostLocalDirManager.get.getCachedHostLocalDirs().size === 1) } - test("fetch continuous blocks in batch respects maxSize and maxBlocks") { + test("fetch continuous blocks in batch should respect maxBytesInFlight") { val blockManager = mock(classOf[BlockManager]) val localBmId = BlockManagerId("test-client", "test-local-host", 1) doReturn(localBmId).when(blockManager).blockManagerId // Make sure remote blocks would return the merged block -val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2) -val remoteBlocks = Seq[BlockId]( +val remoteBmId1 = BlockManagerId("test-client-1", "test-client-1", 1) +val remoteBmId2 = BlockManagerId("test-client-2", "test-client-2", 2) +val remoteBlocks1 = (0 until 15).map(ShuffleBlockId(0, 3, _)) +val remoteBlocks2 = Seq[BlockId](ShuffleBlockId(0, 4, 0), ShuffleBlockId(0, 4, 1)) +val mergedRemoteBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockBatchId(0, 3, 0, 3) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 3, 6) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 6, 9) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 9, 12) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 12, 15) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 4, 0, 2) -> createMockManagedBuffer()) +val transfer = createMockTransfer(mergedRemoteBlocks) + +val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId1, remoteBlocks1.map(blockId => (blockId, 100L, 1))), + (remoteBmId2, remoteBlocks2.map(blockId => (blockId, 100L, 1.toIterator + 
+val taskContext = TaskContext.empty() +val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics() +val iterator = new ShuffleBlockFetcherIterator( + taskContext, + transfer, + blockManager, + blocksByAddress, + (_, in) => in, + 1500, + Int.MaxValue, + Int.MaxValue, + Int.MaxValue, + true, + false, + metrics, + true) + +var numResults = 0 +// After initialize(), there will be 6 FetchRequests, and the each of the first 5 +// includes 3 merged blocks and the last one has 1 merged block. So, only the Review comment: or do you mean shuffle blocks? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group
cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group URL: https://github.com/apache/spark/pull/27786#discussion_r387524362 ## File path: core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala ## @@ -341,32 +341,84 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT assert(blockManager.hostLocalDirManager.get.getCachedHostLocalDirs().size === 1) } - test("fetch continuous blocks in batch respects maxSize and maxBlocks") { + test("fetch continuous blocks in batch should respect maxBytesInFlight") { val blockManager = mock(classOf[BlockManager]) val localBmId = BlockManagerId("test-client", "test-local-host", 1) doReturn(localBmId).when(blockManager).blockManagerId // Make sure remote blocks would return the merged block -val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2) -val remoteBlocks = Seq[BlockId]( +val remoteBmId1 = BlockManagerId("test-client-1", "test-client-1", 1) +val remoteBmId2 = BlockManagerId("test-client-2", "test-client-2", 2) +val remoteBlocks1 = (0 until 15).map(ShuffleBlockId(0, 3, _)) +val remoteBlocks2 = Seq[BlockId](ShuffleBlockId(0, 4, 0), ShuffleBlockId(0, 4, 1)) +val mergedRemoteBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockBatchId(0, 3, 0, 3) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 3, 6) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 6, 9) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 9, 12) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 3, 12, 15) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 4, 0, 2) -> createMockManagedBuffer()) +val transfer = createMockTransfer(mergedRemoteBlocks) + +val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId1, remoteBlocks1.map(blockId => (blockId, 100L, 1))), + (remoteBmId2, remoteBlocks2.map(blockId => (blockId, 100L, 1.toIterator + 
+val taskContext = TaskContext.empty() +val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics() +val iterator = new ShuffleBlockFetcherIterator( + taskContext, + transfer, + blockManager, + blocksByAddress, + (_, in) => in, + 1500, + Int.MaxValue, + Int.MaxValue, + Int.MaxValue, + true, + false, + metrics, + true) + +var numResults = 0 +// After initialize(), there will be 6 FetchRequests, and the each of the first 5 +// includes 3 merged blocks and the last one has 1 merged block. So, only the Review comment: there are 6 merged blocks in total, how can each request include 3 merged blocks? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group
cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group URL: https://github.com/apache/spark/pull/27786#discussion_r387497757 ## File path: core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala ## @@ -367,12 +367,12 @@ final class ShuffleBlockFetcherIterator( // For batch fetch, the actual block in flight should count for merged block. val mayExceedsMaxBlocks = !doBatchFetch && curBlocks.size >= maxBlocksInFlightPerAddress if (curRequestSize >= targetRemoteRequestSize || mayExceedsMaxBlocks) { -createFetchRequests() +createFetchRequests(true) Review comment: let's write down the parameter name. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group
cloud-fan commented on a change in pull request #27786: [SPARK-31034][CORE] ShuffleBlockFetcherIterator should always create request for last block group URL: https://github.com/apache/spark/pull/27786#discussion_r387497646 ## File path: core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala ## @@ -339,14 +339,14 @@ final class ShuffleBlockFetcherIterator( + s"with ${blocks.size} blocks") } -def createFetchRequests(): Unit = { +def createFetchRequests(hasMore: Boolean): Unit = { Review comment: nit: `isLast`? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org