Frank Yin created SPARK-44547: --------------------------------- Summary: BlockManagerDecommissioner throws exceptions when migrating RDD cached blocks to fallback storage Key: SPARK-44547 URL: https://issues.apache.org/jira/browse/SPARK-44547 Project: Spark Issue Type: Bug Components: Spark Core Affects Versions: 3.4.1 Reporter: Frank Yin
Looks like the RDD cache doesn't support fallback storage and we should stop the migration if the only viable peer is the fallback storage. ``` 23/07/25 05:12:58 WARN BlockManager: Failed to replicate rdd_18_25 to BlockManagerId(fallback, remote, 7337, None), failure #0 java.io.IOException: Failed to connect to remote:7337 at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:288) at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218) at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:230) at org.apache.spark.network.netty.NettyBlockTransferService.uploadBlock(NettyBlockTransferService.scala:168) at org.apache.spark.network.BlockTransferService.uploadBlockSync(BlockTransferService.scala:121) at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$replicate(BlockManager.scala:1784) at org.apache.spark.storage.BlockManager.$anonfun$replicateBlock$2(BlockManager.scala:1721) at org.apache.spark.storage.BlockManager.$anonfun$replicateBlock$2$adapted(BlockManager.scala:1707) at scala.Option.forall(Option.scala:390) at org.apache.spark.storage.BlockManager.replicateBlock(BlockManager.scala:1707) at org.apache.spark.storage.BlockManagerDecommissioner.migrateBlock(BlockManagerDecommissioner.scala:356) at org.apache.spark.storage.BlockManagerDecommissioner.$anonfun$decommissionRddCacheBlocks$3(BlockManagerDecommissioner.scala:340) at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286) at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) at scala.collection.TraversableLike.map(TraversableLike.scala:286) at scala.collection.TraversableLike.map$(TraversableLike.scala:279) at scala.collection.AbstractTraversable.map(Traversable.scala:108) at org.apache.spark.storage.BlockManagerDecommissioner.decommissionRddCacheBlocks(BlockManagerDecommissioner.scala:339) at org.apache.spark.storage.BlockManagerDecommissioner$$anon$1.run(BlockManagerDecommissioner.scala:214) at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Unknown Source) at java.base/java.util.concurrent.FutureTask.run(Unknown Source) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) at java.base/java.lang.Thread.run(Unknown Source) Caused by: java.net.UnknownHostException: remote at java.base/java.net.InetAddress$CachedAddresses.get(Unknown Source) at java.base/java.net.InetAddress.getAllByName0(Unknown Source) at java.base/java.net.InetAddress.getAllByName(Unknown Source) at java.base/java.net.InetAddress.getAllByName(Unknown Source) at java.base/java.net.InetAddress.getByName(Unknown Source) at io.netty.util.internal.SocketUtils$8.run(SocketUtils.java:156) at io.netty.util.internal.SocketUtils$8.run(SocketUtils.java:153) at java.base/java.security.AccessController.doPrivileged(Native Method) at io.netty.util.internal.SocketUtils.addressByName(SocketUtils.java:153) at io.netty.resolver.DefaultNameResolver.doResolve(DefaultNameResolver.java:41) at io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:61) at io.netty.resolver.SimpleNameResolver.resolve(SimpleNameResolver.java:53) at io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:55) at io.netty.resolver.InetSocketAddressResolver.doResolve(InetSocketAddressResolver.java:31) at io.netty.resolver.AbstractAddressResolver.resolve(AbstractAddressResolver.java:106) at io.netty.bootstrap.Bootstrap.doResolveAndConnect0(Bootstrap.java:206) at io.netty.bootstrap.Bootstrap.access$000(Bootstrap.java:46) at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:180) at io.netty.bootstrap.Bootstrap$1.operationComplete(Bootstrap.java:166) at io.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:578) at io.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:552) at io.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:491) at io.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:616) at io.netty.util.concurrent.DefaultPromise.setSuccess0(DefaultPromise.java:605) at io.netty.util.concurrent.DefaultPromise.trySuccess(DefaultPromise.java:104) at io.netty.channel.DefaultChannelPromise.trySuccess(DefaultChannelPromise.java:84) at io.netty.channel.AbstractChannel$AbstractUnsafe.safeSetSuccess(AbstractChannel.java:990) at io.netty.channel.AbstractChannel$AbstractUnsafe.register0(AbstractChannel.java:516) at io.netty.channel.AbstractChannel$AbstractUnsafe.access$200(AbstractChannel.java:429) at io.netty.channel.AbstractChannel$AbstractUnsafe$1.run(AbstractChannel.java:486) at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:164) at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:469) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:503) at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986) at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) ``` -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org