[ https://issues.apache.org/jira/browse/TEZ-2267?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Jeff Zhang updated TEZ-2267: ---------------------------- Attachment: syslog_dag_1427965027460_0001_1 jstack2.txt > Deadlock caused by TEZ-2149 > --------------------------- > > Key: TEZ-2267 > URL: https://issues.apache.org/jira/browse/TEZ-2267 > Project: Apache Tez > Issue Type: Bug > Reporter: Jeff Zhang > Priority: Critical > Attachments: jstack.txt, jstack2.txt, syslog_dag_1427965027460_0001_1 > > > {code} > "TaskSchedulerAppCaller #0" daemon prio=10 tid=0x00007f044005e800 nid=0x7be6 > waiting on condition [0x00007f04350ce000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00000000fc279e18> (a > java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:834) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireShared(AbstractQueuedSynchronizer.java:964) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireShared(AbstractQueuedSynchronizer.java:1282) > at > java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.lock(ReentrantReadWriteLock.java:731) > at org.apache.tez.dag.app.dag.impl.TaskImpl.isFinished(TaskImpl.java:394) > at > org.apache.tez.dag.app.dag.impl.VertexImpl.computeProgress(VertexImpl.java:1064) > at > org.apache.tez.dag.app.dag.impl.VertexImpl.getProgress(VertexImpl.java:1002) > at org.apache.tez.dag.app.dag.impl.DAGImpl.getProgress(DAGImpl.java:676) > at org.apache.tez.dag.app.DAGAppMaster.getProgress(DAGAppMaster.java:1067) > at > org.apache.tez.dag.app.rm.TaskSchedulerEventHandler.getProgress(TaskSchedulerEventHandler.java:558) > at > org.apache.tez.dag.app.rm.TaskSchedulerAppCallbackWrapper$GetProgressCallable.call(TaskSchedulerAppCallbackWrapper.java:291) > at > 
org.apache.tez.dag.app.rm.TaskSchedulerAppCallbackWrapper$GetProgressCallable.call(TaskSchedulerAppCallbackWrapper.java:1) > at java.util.concurrent.FutureTask.run(FutureTask.java:262) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > Locked ownable synchronizers: > - <0x00000000fc20aed8> (a java.util.concurrent.ThreadPoolExecutor$Worker) > "IPC Server handler 0 on 47949" daemon prio=10 tid=0x00007f0448036800 > nid=0x7bc0 waiting on condition [0x00007f04372f0000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00000000fc200160> (a > java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:834) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:867) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2090) > at org.apache.tez.dag.app.dag.impl.DAGImpl.getDAGStatus(DAGImpl.java:763) > at > org.apache.tez.dag.api.client.DAGClientHandler.getDAGStatus(DAGClientHandler.java:67) > at > org.apache.tez.dag.api.client.rpc.DAGClientAMProtocolBlockingPBServerImpl.getDAGStatus(DAGClientAMProtocolBlockingPBServerImpl.java:99) > at > org.apache.tez.dag.api.client.rpc.DAGClientAMProtocolRPC$DAGClientAMProtocol$2.callBlockingMethod(DAGClientAMProtocolRPC.java:7465) > at > org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:619) > at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:962) > at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2039) > at 
org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2035) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:415) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1628) > at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2033) > Locked ownable synchronizers: > - None > {code} > Alternatively, the timeoutNanos value may simply be a very large number. -- This message was sent by Atlassian JIRA (v6.3.4#6332)