[ https://issues.apache.org/jira/browse/RATIS-590?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Tsz Wo Nicholas Sze resolved RATIS-590. --------------------------------------- Resolution: Fixed Fix Version/s: 0.4.0 I have committed this. Thanks [~ljain]! Thanks also [~rakeshr] for discovering the bug. > Deadlock in ratis client > ------------------------ > > Key: RATIS-590 > URL: https://issues.apache.org/jira/browse/RATIS-590 > Project: Ratis > Issue Type: Bug > Components: client > Affects Versions: 0.3.0 > Reporter: Rakesh R > Assignee: Lokesh Jain > Priority: Major > Fix For: 0.4.0 > > Attachments: > Freon_baseline_100Threads_64MB_Keysize_8Keys_10buckets.bin, > RATIS-590.001.patch > > > Ran Freon benchmark in a three node cluster with 100 writer threads. After > some time the client got hanged due to deadlock issue. > +Freon with the args:-+ > --numOfBuckets=10 --numOfKeys=8 --keySize=67108864 --numOfVolumes=100 > --numOfThreads=100 > 3 BLOCKED threads. Attached whole threaddump. > {code} > Found one Java-level deadlock: > ============================= > "grpc-default-executor-6": > waiting for ownable synchronizer 0x000000021546bd00, (a > java.util.concurrent.locks.ReentrantReadWriteLock$FairSync), > which is held by "ForkJoinPool.commonPool-worker-7" > "ForkJoinPool.commonPool-worker-7": > waiting to lock monitor 0x00007f48fc99c448 (object 0x000000021546be30, a > org.apache.ratis.util.SlidingWindow$Client), > which is held by "grpc-default-executor-6" > {code} > {code} > ForkJoinPool.commonPool-worker-7 > priority:5 - threadId:0x00007f48d834b000 - nativeId:0x9ffb - nativeId > (decimal):40955 - state:BLOCKED > stackTrace: > java.lang.Thread.State: BLOCKED (on object monitor) > at > org.apache.ratis.util.SlidingWindow$Client.resetFirstSeqNum(SlidingWindow.java:348) > - waiting to lock <0x000000021546be30> (a > org.apache.ratis.util.SlidingWindow$Client) > at > org.apache.ratis.client.impl.OrderedAsync.resetSlidingWindow(OrderedAsync.java:122) > at > org.apache.ratis.client.impl.OrderedAsync$$Lambda$943/1670264164.accept(Unknown > Source) > at > org.apache.ratis.client.impl.RaftClientImpl.lambda$handleIOException$6(RaftClientImpl.java:352) > at > org.apache.ratis.client.impl.RaftClientImpl$$Lambda$944/769363367.accept(Unknown > Source) > at java.util.Optional.ifPresent(Optional.java:159) > at > org.apache.ratis.client.impl.RaftClientImpl.handleIOException(RaftClientImpl.java:352) > at > org.apache.ratis.client.impl.OrderedAsync.lambda$sendRequest$10(OrderedAsync.java:235) > at > org.apache.ratis.client.impl.OrderedAsync$$Lambda$776/1213731951.apply(Unknown > Source) > at > java.util.concurrent.CompletableFuture.uniExceptionally(CompletableFuture.java:870) > at > java.util.concurrent.CompletableFuture$UniExceptionally.tryFire(CompletableFuture.java:852) > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:474) > at > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1977) > at > org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.completeReplyExceptionally(GrpcClientProtocolClient.java:324) > at > org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.close(GrpcClientProtocolClient.java:313) > at > org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.access$400(GrpcClientProtocolClient.java:245) > at > org.apache.ratis.grpc.client.GrpcClientProtocolClient.lambda$close$1(GrpcClientProtocolClient.java:131) > at > org.apache.ratis.grpc.client.GrpcClientProtocolClient$$Lambda$950/1948156329.accept(Unknown > Source) > at java.util.Optional.ifPresent(Optional.java:159) > at > org.apache.ratis.grpc.client.GrpcClientProtocolClient.close(GrpcClientProtocolClient.java:131) > at > org.apache.ratis.util.PeerProxyMap$PeerAndProxy.lambda$close$1(PeerProxyMap.java:73) > at > org.apache.ratis.util.PeerProxyMap$PeerAndProxy$$Lambda$948/427065222.run(Unknown > Source) > at > org.apache.ratis.util.LifeCycle.lambda$checkStateAndClose$2(LifeCycle.java:231) > at org.apache.ratis.util.LifeCycle$$Lambda$949/1311526821.get(Unknown Source) > at org.apache.ratis.util.LifeCycle.checkStateAndClose(LifeCycle.java:251) > at org.apache.ratis.util.LifeCycle.checkStateAndClose(LifeCycle.java:229) > at org.apache.ratis.util.PeerProxyMap$PeerAndProxy.close(PeerProxyMap.java:70) > - locked <0x00000003e793ef48> (a > org.apache.ratis.util.PeerProxyMap$PeerAndProxy) > at org.apache.ratis.util.PeerProxyMap.resetProxy(PeerProxyMap.java:126) > - locked <0x0000000215453400> (a java.lang.Object) > at org.apache.ratis.util.PeerProxyMap.handleException(PeerProxyMap.java:135) > at > org.apache.ratis.client.impl.RaftClientRpcWithProxy.handleException(RaftClientRpcWithProxy.java:47) > at > org.apache.ratis.client.impl.RaftClientImpl.handleIOException(RaftClientImpl.java:375) > at > org.apache.ratis.client.impl.RaftClientImpl.handleIOException(RaftClientImpl.java:341) > at > org.apache.ratis.client.impl.UnorderedAsync.lambda$sendRequestWithRetry$4(UnorderedAsync.java:108) > at > org.apache.ratis.client.impl.UnorderedAsync$$Lambda$976/655038759.accept(Unknown > Source) > at > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:760) > at > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:736) > at > java.util.concurrent.CompletableFuture$Completion.exec(CompletableFuture.java:443) > at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) > at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056) > at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692) > at > java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:157) > {code} > {code} > orkJoinPool.commonPool-worker-25 > priority:5 - threadId:0x00007f48d8349000 - nativeId:0x9ff4 - nativeId > (decimal):40948 - state:BLOCKED > stackTrace: > java.lang.Thread.State: BLOCKED (on object monitor) > at > org.apache.ratis.util.SlidingWindow$Client.resetFirstSeqNum(SlidingWindow.java:348) > - waiting to lock <0x0000000083738128> (a > org.apache.ratis.util.SlidingWindow$Client) > at > org.apache.ratis.client.impl.OrderedAsync.resetSlidingWindow(OrderedAsync.java:122) > at > org.apache.ratis.client.impl.OrderedAsync$$Lambda$943/1670264164.accept(Unknown > Source) > at > org.apache.ratis.client.impl.RaftClientImpl.lambda$handleIOException$6(RaftClientImpl.java:352) > at > org.apache.ratis.client.impl.RaftClientImpl$$Lambda$944/769363367.accept(Unknown > Source) > at java.util.Optional.ifPresent(Optional.java:159) > at > org.apache.ratis.client.impl.RaftClientImpl.handleIOException(RaftClientImpl.java:352) > at > org.apache.ratis.client.impl.OrderedAsync.lambda$sendRequest$10(OrderedAsync.java:235) > at > org.apache.ratis.client.impl.OrderedAsync$$Lambda$776/1213731951.apply(Unknown > Source) > at > java.util.concurrent.CompletableFuture.uniExceptionally(CompletableFuture.java:870) > at > java.util.concurrent.CompletableFuture$UniExceptionally.tryFire(CompletableFuture.java:852) > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:474) > at > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1977) > at > org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.completeReplyExceptionally(GrpcClientProtocolClient.java:324) > at > org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.close(GrpcClientProtocolClient.java:313) > at > org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.access$400(GrpcClientProtocolClient.java:245) > at > org.apache.ratis.grpc.client.GrpcClientProtocolClient.lambda$close$1(GrpcClientProtocolClient.java:131) > at > org.apache.ratis.grpc.client.GrpcClientProtocolClient$$Lambda$950/1948156329.accept(Unknown > Source) > at java.util.Optional.ifPresent(Optional.java:159) > at > org.apache.ratis.grpc.client.GrpcClientProtocolClient.close(GrpcClientProtocolClient.java:131) > at > org.apache.ratis.util.PeerProxyMap$PeerAndProxy.lambda$close$1(PeerProxyMap.java:73) > at > org.apache.ratis.util.PeerProxyMap$PeerAndProxy$$Lambda$948/427065222.run(Unknown > Source) > at > org.apache.ratis.util.LifeCycle.lambda$checkStateAndClose$2(LifeCycle.java:231) > at org.apache.ratis.util.LifeCycle$$Lambda$949/1311526821.get(Unknown Source) > at org.apache.ratis.util.LifeCycle.checkStateAndClose(LifeCycle.java:251) > at org.apache.ratis.util.LifeCycle.checkStateAndClose(LifeCycle.java:229) > at org.apache.ratis.util.PeerProxyMap$PeerAndProxy.close(PeerProxyMap.java:70) > - locked <0x00000001fced7120> (a > org.apache.ratis.util.PeerProxyMap$PeerAndProxy) > at org.apache.ratis.util.PeerProxyMap.resetProxy(PeerProxyMap.java:126) > - locked <0x0000000083743b10> (a java.lang.Object) > at org.apache.ratis.util.PeerProxyMap.handleException(PeerProxyMap.java:135) > at > org.apache.ratis.client.impl.RaftClientRpcWithProxy.handleException(RaftClientRpcWithProxy.java:47) > at > org.apache.ratis.client.impl.RaftClientImpl.handleIOException(RaftClientImpl.java:375) > at > org.apache.ratis.client.impl.RaftClientImpl.handleIOException(RaftClientImpl.java:341) > at > org.apache.ratis.client.impl.UnorderedAsync.lambda$sendRequestWithRetry$4(UnorderedAsync.java:108) > at > org.apache.ratis.client.impl.UnorderedAsync$$Lambda$976/655038759.accept(Unknown > Source) > at > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:760) > at > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:736) > at > java.util.concurrent.CompletableFuture$Completion.exec(CompletableFuture.java:443) > at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) > at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056) > at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692) > at > java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:157) > {code} > {code} > pool-2-thread-20 > priority:5 - threadId:0x00007f48fd02d000 - nativeId:0x9e26 - nativeId > (decimal):40486 - state:BLOCKED > stackTrace: > java.lang.Thread.State: BLOCKED (on object monitor) > at > org.apache.ratis.util.SlidingWindow$Client.submitNewRequest(SlidingWindow.java:242) > - waiting to lock <0x0000000083738128> (a > org.apache.ratis.util.SlidingWindow$Client) > at org.apache.ratis.client.impl.OrderedAsync.send(OrderedAsync.java:156) > at > org.apache.ratis.client.impl.RaftClientImpl.sendAsync(RaftClientImpl.java:148) > at > org.apache.ratis.client.impl.RaftClientImpl.sendAsync(RaftClientImpl.java:128) > at > org.apache.hadoop.hdds.scm.XceiverClientRatis.sendRequestAsync(XceiverClientRatis.java:224) > at > org.apache.hadoop.hdds.scm.XceiverClientRatis.sendCommandAsync(XceiverClientRatis.java:301) > at > org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.writeChunkAsync(ContainerProtocolCalls.java:310) > at > org.apache.hadoop.hdds.scm.storage.BlockOutputStream.writeChunkToContainer(BlockOutputStream.java:609) > at > org.apache.hadoop.hdds.scm.storage.BlockOutputStream.writeChunk(BlockOutputStream.java:466) > at > org.apache.hadoop.hdds.scm.storage.BlockOutputStream.write(BlockOutputStream.java:248) > at > org.apache.hadoop.ozone.client.io.BlockOutputStreamEntry.write(BlockOutputStreamEntry.java:129) > at > org.apache.hadoop.ozone.client.io.KeyOutputStream.handleWrite(KeyOutputStream.java:211) > at > org.apache.hadoop.ozone.client.io.KeyOutputStream.write(KeyOutputStream.java:193) > at > org.apache.hadoop.ozone.client.io.OzoneOutputStream.write(OzoneOutputStream.java:49) > at > org.apache.hadoop.ozone.freon.RandomKeyGenerator$OfflineProcessor.run(RandomKeyGenerator.java:650) > at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > {code} -- This message was sent by Atlassian JIRA (v7.6.3#76005)