[ 
https://issues.apache.org/jira/browse/RATIS-590?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Tsz Wo Nicholas Sze resolved RATIS-590.
---------------------------------------
       Resolution: Fixed
    Fix Version/s: 0.4.0

I have committed this.  Thanks [~ljain]!

Thanks also [~rakeshr] for discovering the bug.

> Deadlock in ratis client
> ------------------------
>
>                 Key: RATIS-590
>                 URL: https://issues.apache.org/jira/browse/RATIS-590
>             Project: Ratis
>          Issue Type: Bug
>          Components: client
>    Affects Versions: 0.3.0
>            Reporter: Rakesh R
>            Assignee: Lokesh Jain
>            Priority: Major
>             Fix For: 0.4.0
>
>         Attachments: 
> Freon_baseline_100Threads_64MB_Keysize_8Keys_10buckets.bin, 
> RATIS-590.001.patch
>
>
> Ran Freon benchmark in a three node cluster with 100 writer threads. After 
> some time the client got hanged due to deadlock issue.
> +Freon with the args:-+
> --numOfBuckets=10 --numOfKeys=8 --keySize=67108864 --numOfVolumes=100 
> --numOfThreads=100
> 3 BLOCKED threads. Attached whole threaddump.
> {code}
> Found one Java-level deadlock:
> =============================
> "grpc-default-executor-6":
>   waiting for ownable synchronizer 0x000000021546bd00, (a 
> java.util.concurrent.locks.ReentrantReadWriteLock$FairSync),
>   which is held by "ForkJoinPool.commonPool-worker-7"
> "ForkJoinPool.commonPool-worker-7":
>   waiting to lock monitor 0x00007f48fc99c448 (object 0x000000021546be30, a 
> org.apache.ratis.util.SlidingWindow$Client),
>   which is held by "grpc-default-executor-6"
> {code}
> {code}
> ForkJoinPool.commonPool-worker-7
> priority:5 - threadId:0x00007f48d834b000 - nativeId:0x9ffb - nativeId 
> (decimal):40955 - state:BLOCKED
> stackTrace:
> java.lang.Thread.State: BLOCKED (on object monitor)
> at 
> org.apache.ratis.util.SlidingWindow$Client.resetFirstSeqNum(SlidingWindow.java:348)
> - waiting to lock <0x000000021546be30> (a 
> org.apache.ratis.util.SlidingWindow$Client)
> at 
> org.apache.ratis.client.impl.OrderedAsync.resetSlidingWindow(OrderedAsync.java:122)
> at 
> org.apache.ratis.client.impl.OrderedAsync$$Lambda$943/1670264164.accept(Unknown
>  Source)
> at 
> org.apache.ratis.client.impl.RaftClientImpl.lambda$handleIOException$6(RaftClientImpl.java:352)
> at 
> org.apache.ratis.client.impl.RaftClientImpl$$Lambda$944/769363367.accept(Unknown
>  Source)
> at java.util.Optional.ifPresent(Optional.java:159)
> at 
> org.apache.ratis.client.impl.RaftClientImpl.handleIOException(RaftClientImpl.java:352)
> at 
> org.apache.ratis.client.impl.OrderedAsync.lambda$sendRequest$10(OrderedAsync.java:235)
> at 
> org.apache.ratis.client.impl.OrderedAsync$$Lambda$776/1213731951.apply(Unknown
>  Source)
> at 
> java.util.concurrent.CompletableFuture.uniExceptionally(CompletableFuture.java:870)
> at 
> java.util.concurrent.CompletableFuture$UniExceptionally.tryFire(CompletableFuture.java:852)
> at 
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:474)
> at 
> java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1977)
> at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.completeReplyExceptionally(GrpcClientProtocolClient.java:324)
> at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.close(GrpcClientProtocolClient.java:313)
> at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.access$400(GrpcClientProtocolClient.java:245)
> at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient.lambda$close$1(GrpcClientProtocolClient.java:131)
> at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$$Lambda$950/1948156329.accept(Unknown
>  Source)
> at java.util.Optional.ifPresent(Optional.java:159)
> at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient.close(GrpcClientProtocolClient.java:131)
> at 
> org.apache.ratis.util.PeerProxyMap$PeerAndProxy.lambda$close$1(PeerProxyMap.java:73)
> at 
> org.apache.ratis.util.PeerProxyMap$PeerAndProxy$$Lambda$948/427065222.run(Unknown
>  Source)
> at 
> org.apache.ratis.util.LifeCycle.lambda$checkStateAndClose$2(LifeCycle.java:231)
> at org.apache.ratis.util.LifeCycle$$Lambda$949/1311526821.get(Unknown Source)
> at org.apache.ratis.util.LifeCycle.checkStateAndClose(LifeCycle.java:251)
> at org.apache.ratis.util.LifeCycle.checkStateAndClose(LifeCycle.java:229)
> at org.apache.ratis.util.PeerProxyMap$PeerAndProxy.close(PeerProxyMap.java:70)
> - locked <0x00000003e793ef48> (a 
> org.apache.ratis.util.PeerProxyMap$PeerAndProxy)
> at org.apache.ratis.util.PeerProxyMap.resetProxy(PeerProxyMap.java:126)
> - locked <0x0000000215453400> (a java.lang.Object)
> at org.apache.ratis.util.PeerProxyMap.handleException(PeerProxyMap.java:135)
> at 
> org.apache.ratis.client.impl.RaftClientRpcWithProxy.handleException(RaftClientRpcWithProxy.java:47)
> at 
> org.apache.ratis.client.impl.RaftClientImpl.handleIOException(RaftClientImpl.java:375)
> at 
> org.apache.ratis.client.impl.RaftClientImpl.handleIOException(RaftClientImpl.java:341)
> at 
> org.apache.ratis.client.impl.UnorderedAsync.lambda$sendRequestWithRetry$4(UnorderedAsync.java:108)
> at 
> org.apache.ratis.client.impl.UnorderedAsync$$Lambda$976/655038759.accept(Unknown
>  Source)
> at 
> java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:760)
> at 
> java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:736)
> at 
> java.util.concurrent.CompletableFuture$Completion.exec(CompletableFuture.java:443)
> at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
> at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
> at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
> at 
> java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:157)
> {code}
> {code}
> orkJoinPool.commonPool-worker-25
> priority:5 - threadId:0x00007f48d8349000 - nativeId:0x9ff4 - nativeId 
> (decimal):40948 - state:BLOCKED
> stackTrace:
> java.lang.Thread.State: BLOCKED (on object monitor)
> at 
> org.apache.ratis.util.SlidingWindow$Client.resetFirstSeqNum(SlidingWindow.java:348)
> - waiting to lock <0x0000000083738128> (a 
> org.apache.ratis.util.SlidingWindow$Client)
> at 
> org.apache.ratis.client.impl.OrderedAsync.resetSlidingWindow(OrderedAsync.java:122)
> at 
> org.apache.ratis.client.impl.OrderedAsync$$Lambda$943/1670264164.accept(Unknown
>  Source)
> at 
> org.apache.ratis.client.impl.RaftClientImpl.lambda$handleIOException$6(RaftClientImpl.java:352)
> at 
> org.apache.ratis.client.impl.RaftClientImpl$$Lambda$944/769363367.accept(Unknown
>  Source)
> at java.util.Optional.ifPresent(Optional.java:159)
> at 
> org.apache.ratis.client.impl.RaftClientImpl.handleIOException(RaftClientImpl.java:352)
> at 
> org.apache.ratis.client.impl.OrderedAsync.lambda$sendRequest$10(OrderedAsync.java:235)
> at 
> org.apache.ratis.client.impl.OrderedAsync$$Lambda$776/1213731951.apply(Unknown
>  Source)
> at 
> java.util.concurrent.CompletableFuture.uniExceptionally(CompletableFuture.java:870)
> at 
> java.util.concurrent.CompletableFuture$UniExceptionally.tryFire(CompletableFuture.java:852)
> at 
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:474)
> at 
> java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1977)
> at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.completeReplyExceptionally(GrpcClientProtocolClient.java:324)
> at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.close(GrpcClientProtocolClient.java:313)
> at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$AsyncStreamObservers.access$400(GrpcClientProtocolClient.java:245)
> at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient.lambda$close$1(GrpcClientProtocolClient.java:131)
> at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient$$Lambda$950/1948156329.accept(Unknown
>  Source)
> at java.util.Optional.ifPresent(Optional.java:159)
> at 
> org.apache.ratis.grpc.client.GrpcClientProtocolClient.close(GrpcClientProtocolClient.java:131)
> at 
> org.apache.ratis.util.PeerProxyMap$PeerAndProxy.lambda$close$1(PeerProxyMap.java:73)
> at 
> org.apache.ratis.util.PeerProxyMap$PeerAndProxy$$Lambda$948/427065222.run(Unknown
>  Source)
> at 
> org.apache.ratis.util.LifeCycle.lambda$checkStateAndClose$2(LifeCycle.java:231)
> at org.apache.ratis.util.LifeCycle$$Lambda$949/1311526821.get(Unknown Source)
> at org.apache.ratis.util.LifeCycle.checkStateAndClose(LifeCycle.java:251)
> at org.apache.ratis.util.LifeCycle.checkStateAndClose(LifeCycle.java:229)
> at org.apache.ratis.util.PeerProxyMap$PeerAndProxy.close(PeerProxyMap.java:70)
> - locked <0x00000001fced7120> (a 
> org.apache.ratis.util.PeerProxyMap$PeerAndProxy)
> at org.apache.ratis.util.PeerProxyMap.resetProxy(PeerProxyMap.java:126)
> - locked <0x0000000083743b10> (a java.lang.Object)
> at org.apache.ratis.util.PeerProxyMap.handleException(PeerProxyMap.java:135)
> at 
> org.apache.ratis.client.impl.RaftClientRpcWithProxy.handleException(RaftClientRpcWithProxy.java:47)
> at 
> org.apache.ratis.client.impl.RaftClientImpl.handleIOException(RaftClientImpl.java:375)
> at 
> org.apache.ratis.client.impl.RaftClientImpl.handleIOException(RaftClientImpl.java:341)
> at 
> org.apache.ratis.client.impl.UnorderedAsync.lambda$sendRequestWithRetry$4(UnorderedAsync.java:108)
> at 
> org.apache.ratis.client.impl.UnorderedAsync$$Lambda$976/655038759.accept(Unknown
>  Source)
> at 
> java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:760)
> at 
> java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:736)
> at 
> java.util.concurrent.CompletableFuture$Completion.exec(CompletableFuture.java:443)
> at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
> at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
> at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
> at 
> java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:157)
> {code}
> {code}
> pool-2-thread-20
> priority:5 - threadId:0x00007f48fd02d000 - nativeId:0x9e26 - nativeId 
> (decimal):40486 - state:BLOCKED
> stackTrace:
> java.lang.Thread.State: BLOCKED (on object monitor)
> at 
> org.apache.ratis.util.SlidingWindow$Client.submitNewRequest(SlidingWindow.java:242)
> - waiting to lock <0x0000000083738128> (a 
> org.apache.ratis.util.SlidingWindow$Client)
> at org.apache.ratis.client.impl.OrderedAsync.send(OrderedAsync.java:156)
> at 
> org.apache.ratis.client.impl.RaftClientImpl.sendAsync(RaftClientImpl.java:148)
> at 
> org.apache.ratis.client.impl.RaftClientImpl.sendAsync(RaftClientImpl.java:128)
> at 
> org.apache.hadoop.hdds.scm.XceiverClientRatis.sendRequestAsync(XceiverClientRatis.java:224)
> at 
> org.apache.hadoop.hdds.scm.XceiverClientRatis.sendCommandAsync(XceiverClientRatis.java:301)
> at 
> org.apache.hadoop.hdds.scm.storage.ContainerProtocolCalls.writeChunkAsync(ContainerProtocolCalls.java:310)
> at 
> org.apache.hadoop.hdds.scm.storage.BlockOutputStream.writeChunkToContainer(BlockOutputStream.java:609)
> at 
> org.apache.hadoop.hdds.scm.storage.BlockOutputStream.writeChunk(BlockOutputStream.java:466)
> at 
> org.apache.hadoop.hdds.scm.storage.BlockOutputStream.write(BlockOutputStream.java:248)
> at 
> org.apache.hadoop.ozone.client.io.BlockOutputStreamEntry.write(BlockOutputStreamEntry.java:129)
> at 
> org.apache.hadoop.ozone.client.io.KeyOutputStream.handleWrite(KeyOutputStream.java:211)
> at 
> org.apache.hadoop.ozone.client.io.KeyOutputStream.write(KeyOutputStream.java:193)
> at 
> org.apache.hadoop.ozone.client.io.OzoneOutputStream.write(OzoneOutputStream.java:49)
> at 
> org.apache.hadoop.ozone.freon.RandomKeyGenerator$OfflineProcessor.run(RandomKeyGenerator.java:650)
> at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> at java.util.concurrent.FutureTask.run(FutureTask.java:266)
> at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to