[ 
https://issues.apache.org/jira/browse/TEZ-2421?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14529679#comment-14529679
 ] 

Bikas Saha commented on TEZ-2421:
---------------------------------

"App Shared Pool - #1" #102 daemon prio=5 os_prio=0 tid=0x0000000002426000 
nid=0x8bd waiting on condition [0x00007fa2a841d000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000006f58b09c0> (a 
java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:870)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1199)
        at 
java.util.concurrent.locks.ReentrantReadWriteLock$WriteLock.lock(ReentrantReadWriteLock.java:943)
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.scheduleTasks(VertexImpl.java:1389)
        at 
org.apache.tez.dag.app.dag.impl.VertexManager$VertexManagerPluginContextImpl.scheduleVertexTasks(VertexManager.java:206)
        - locked <0x00000006f58d2c08> (a 
org.apache.tez.dag.app.dag.impl.VertexManager$VertexManagerPluginContextImpl)
        at 
org.apache.tez.dag.library.vertexmanager.InputReadyVertexManager.handleSourceTaskFinished(InputReadyVertexManager.java:277)
        at 
org.apache.tez.dag.library.vertexmanager.InputReadyVertexManager.onSourceTaskCompleted(InputReadyVertexManager.java:198)
        - locked <0x00000006f58d2d90> (a 
org.apache.tez.dag.library.vertexmanager.InputReadyVertexManager)
        at 
org.apache.tez.dag.app.dag.impl.VertexManager$VertexManagerEventSourceTaskCompleted.invoke(VertexManager.java:601)
        at 
org.apache.tez.dag.app.dag.impl.VertexManager$VertexManagerEvent$1.run(VertexManager.java:656)
        - locked <0x00000006f58d2d30> (a 
org.apache.tez.dag.app.dag.impl.VertexManager)
        at 
org.apache.tez.dag.app.dag.impl.VertexManager$VertexManagerEvent$1.run(VertexManager.java:651)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:422)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
        at 
org.apache.tez.dag.app.dag.impl.VertexManager$VertexManagerEvent.call(VertexManager.java:651)
        at 
org.apache.tez.dag.app.dag.impl.VertexManager$VertexManagerEvent.call(VertexManager.java:640)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"TaskSchedulerAppCaller #0" #92 daemon prio=5 os_prio=0 tid=0x0000000001884800 
nid=0x8af waiting on condition [0x00007fa2a9127000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000007aa165038> (a 
java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireShared(AbstractQueuedSynchronizer.java:967)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1283)
        at 
java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727)
        at 
org.apache.tez.dag.app.dag.impl.TaskAttemptImpl.getState(TaskAttemptImpl.java:547)
        at 
org.apache.tez.dag.app.dag.impl.TaskImpl.selectBestAttempt(TaskImpl.java:715)
        at 
org.apache.tez.dag.app.dag.impl.TaskImpl.getProgress(TaskImpl.java:473)
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.computeProgress(VertexImpl.java:1179)
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.getProgress(VertexImpl.java:1117)
        at org.apache.tez.dag.app.dag.impl.DAGImpl.getProgress(DAGImpl.java:767)
        at 
org.apache.tez.dag.app.DAGAppMaster.getProgress(DAGAppMaster.java:1134)
        at 
org.apache.tez.dag.app.rm.TaskSchedulerEventHandler.getProgress(TaskSchedulerEventHandler.java:556)
        at 
org.apache.tez.dag.app.rm.TaskSchedulerAppCallbackWrapper$GetProgressCallable.call(TaskSchedulerAppCallbackWrapper.java:291)
        at 
org.apache.tez.dag.app.rm.TaskSchedulerAppCallbackWrapper$GetProgressCallable.call(TaskSchedulerAppCallbackWrapper.java:282)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"IPC Server handler 0 on 59316" #52 daemon prio=5 os_prio=0 
tid=0x00007fa2d967b000 nid=0x887 waiting on condition [0x00007fa2abb4a000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000006f58b09c0> (a 
java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireShared(AbstractQueuedSynchronizer.java:967)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1283)
        at 
java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727)
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.getVertexProgress(VertexImpl.java:1126)
        at 
org.apache.tez.dag.app.dag.impl.DAGImpl.getDAGStatus(DAGImpl.java:806)
        at 
org.apache.tez.dag.app.dag.impl.DAGImpl.getDAGStatus(DAGImpl.java:867)
        at 
org.apache.tez.dag.api.client.DAGClientHandler.getDAGStatus(DAGClientHandler.java:67)
        at 
org.apache.tez.dag.api.client.rpc.DAGClientAMProtocolBlockingPBServerImpl.getDAGStatus(DAGClientAMProtocolBlockingPBServerImpl.java:99)
        at 
org.apache.tez.dag.api.client.rpc.DAGClientAMProtocolRPC$DAGClientAMProtocol$2.callBlockingMethod(DAGClientAMProtocolRPC.java:7465)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:616)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:969)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2049)
        at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2045)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:422)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1657)
        at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2043)

"Dispatcher thread: Central" #36 prio=5 os_prio=0 tid=0x00007fa2bc653800 
nid=0x878 waiting on condition [0x00007fa2ac353000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000006f58b09c0> (a 
java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireShared(AbstractQueuedSynchronizer.java:967)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1283)
        at 
java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:727)
        at 
org.apache.tez.dag.app.dag.impl.VertexImpl.getTaskLocationHint(VertexImpl.java:1162)
        at 
org.apache.tez.dag.app.dag.impl.TaskAttemptImpl.getTaskLocationHint(TaskAttemptImpl.java:940)
        at 
org.apache.tez.dag.app.dag.impl.TaskAttemptImpl$ScheduleTaskattemptTransition.transition(TaskAttemptImpl.java:1033)
        at 
org.apache.tez.dag.app.dag.impl.TaskAttemptImpl$ScheduleTaskattemptTransition.transition(TaskAttemptImpl.java:1003)
        at 
org.apache.hadoop.yarn.state.StateMachineFactory$MultipleInternalArc.doTransition(StateMachineFactory.java:385)
        at 
org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302)
        at 
org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46)
        at 
org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448)
        - locked <0x00000007aa1651b8> (a 
org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine)
        at 
org.apache.tez.dag.app.dag.impl.TaskAttemptImpl.handle(TaskAttemptImpl.java:676)
        at 
org.apache.tez.dag.app.dag.impl.TaskAttemptImpl.handle(TaskAttemptImpl.java:113)
        at 
org.apache.tez.dag.app.DAGAppMaster$TaskAttemptEventDispatcher.handle(DAGAppMaster.java:1920)
        at 
org.apache.tez.dag.app.DAGAppMaster$TaskAttemptEventDispatcher.handle(DAGAppMaster.java:1905)
        at 
org.apache.tez.common.AsyncDispatcher.dispatch(AsyncDispatcher.java:183)
        at org.apache.tez.common.AsyncDispatcher$1.run(AsyncDispatcher.java:114)
        at java.lang.Thread.run(Thread.java:745)



> Deadlock in AM because attempt and vertex locking each other out
> ----------------------------------------------------------------
>
>                 Key: TEZ-2421
>                 URL: https://issues.apache.org/jira/browse/TEZ-2421
>             Project: Apache Tez
>          Issue Type: Bug
>            Reporter: Bikas Saha
>            Assignee: Bikas Saha
>
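> Ideally, locks should be acquired in one direction only - either going down 
> the object hierarchy (parent to child) or going up it (child to parent), 
> never both. Preferably not going up, because most of the data a child needs 
> from its parent can be passed in during object construction.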



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to