[ https://issues.apache.org/jira/browse/HADOOP-19061?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Xing Lin updated HADOOP-19061: ------------------------------ Description: _rpcRequestThread.start()_ is called outside of the try-catch{} block. However, it can throw OOM. In such cases, we fail to start the Connection and Connection.rpcRequestThread threads. However, this OOM won't be captured in {_}Connection.setupIOStreams(){_}. Instead, it returns and getConnection() will return a Connection object and we will continue with _connection.sendRpcRequest(call)_. _sendRpcRequest()_ will then hang forever in its while loop, because we don't mark this connection as closed and we don't have the rpcRequestSender thread to poll from the queue. {code:java} IPC.Connection.run() @Override public void run() { // Don't start the ipc parameter sending thread until we start this // thread, because the shutdown logic only gets triggered if this // thread is started. rpcRequestThread.start(); if (LOG.isDebugEnabled()) LOG.debug(getName() + ": starting, having connections " + connections.size()); try { while (waitForWork()) {//wait here for work - read or close connection receiveRpcResponse(); } } catch (Throwable t) { // This truly is unexpected, since we catch IOException in receiveResponse // -- this is only to be really sure that we don't leave a client hanging // forever. LOG.warn("Unexpected error reading responses on connection " + this, t); markClosed(new IOException("Error reading responses", t)); }{code} The while loop in sendRpcRequest(): {code:java} while (!shouldCloseConnection.get()) { if (rpcRequestQueue.offer(Pair.of(call, buf), 1, TimeUnit.SECONDS)) { break; } }{code} The OOM exception thrown when starting the rpcRequestSender thread: 
{code:java} Exception in thread "IPC Client (1664093259) connection to ltx1-holdemnn01.grid.linkedin.com/10.150.1.55:9000 from kafkaetl" java.lang.OutOfMemoryError: unable to create new native thread at java.lang.Thread.start0(Native Method) at java.lang.Thread.start(Thread.java:717) at org.apache.hadoop.ipc.Client$Connection.run(Client.java:1034) {code} Multiple threads are blocked in queue.offer(), and we did not find any "IPC Client" or "IPC Parameter Sending Thread" threads in the thread dump. {code:java} Thread 2156123: (state = BLOCKED) - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information may be imprecise) - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) @bci=20, line=215 (Compiled frame) - java.util.concurrent.SynchronousQueue$TransferQueue.awaitFulfill(java.util.concurrent.SynchronousQueue$TransferQueue$QNode, java.lang.Object, boolean, long) @bci=156, line=764 (Compiled frame) - java.util.concurrent.SynchronousQueue$TransferQueue.transfer(java.lang.Object, boolean, long) @bci=148, line=695 (Compiled frame) - java.util.concurrent.SynchronousQueue.offer(java.lang.Object, long, java.util.concurrent.TimeUnit) @bci=24, line=895 (Compiled frame) - org.apache.hadoop.ipc.Client$Connection.sendRpcRequest(org.apache.hadoop.ipc.Client$Call) @bci=88, line=1134 (Compiled frame) - org.apache.hadoop.ipc.Client.call(org.apache.hadoop.ipc.RPC$RpcKind, org.apache.hadoop.io.Writable, org.apache.hadoop.ipc.Client$ConnectionId, int, java.util.concurrent.atomic.AtomicBoolean, org.apache.hadoop.ipc.AlignmentContext) @bci=36, line=1402 (Interpreted frame) - org.apache.hadoop.ipc.Client.call(org.apache.hadoop.ipc.RPC$RpcKind, org.apache.hadoop.io.Writable, org.apache.hadoop.ipc.Client$ConnectionId, java.util.concurrent.atomic.AtomicBoolean, org.apache.hadoop.ipc.AlignmentContext) @bci=9, line=1349 (Compiled frame) - org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(java.lang.Object, java.lang.reflect.Method, java.lang.Object[]) @bci=248, line=230 
(Compiled frame) - org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(java.lang.Object, java.lang.reflect.Method, java.lang.Object[]) @bci=4, line=118 (Compiled frame) - com.sun.proxy.$Proxy11.getBlockLocations({code} was: _rpcRequestThread.start()_ is called outside of the try-catch{} block. However, it can throw OOM. In such cases, we fail to start the Connection and Connection.rpcRequestThread threads. However, this OOM won't be captured in {_}Connection.setupIOStreams(){_}. Instead, it returns and getConnection() will return an Connection object and we will continue with _connection.sendRpcRequest(call). sendRpcRequest()_ will then be hanging forever at its while loop (because we don't mark this connection as closed). {code:java} IPC.Connection.run() @Override public void run() { // Don't start the ipc parameter sending thread until we start this // thread, because the shutdown logic only gets triggered if this // thread is started. rpcRequestThread.start(); if (LOG.isDebugEnabled()) LOG.debug(getName() + ": starting, having connections " + connections.size()); try { while (waitForWork()) {//wait here for work - read or close connection receiveRpcResponse(); } } catch (Throwable t) { // This truly is unexpected, since we catch IOException in receiveResponse // -- this is only to be really sure that we don't leave a client hanging // forever. LOG.warn("Unexpected error reading responses on connection " + this, t); markClosed(new IOException("Error reading responses", t)); }{code} while loop in sendRpcRequest {code:java} while (!shouldCloseConnection.get()) { if (rpcRequestQueue.offer(Pair.of(call, buf), 1, TimeUnit.SECONDS)) { break; } }{code} OOM exception in starting the rpcRequestSender thread. 
{code:java} Exception in thread "IPC Client (1664093259) connection to ltx1-holdemnn01.grid.linkedin.com/10.150.1.55:9000 from kafkaetl" java.lang.OutOfMemoryError: unable to create new native thread at java.lang.Thread.start0(Native Method) at java.lang.Thread.start(Thread.java:717) at org.apache.hadoop.ipc.Client$Connection.run(Client.java:1034) {code} Multiple threads blocked by queue.offer(). and we don't found any "IPC Client" or "IPC Parameter Sending Thread" in thread dump {code:java} Thread 2156123: (state = BLOCKED) - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information may be imprecise) - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) @bci=20, line=215 (Compiled frame) - java.util.concurrent.SynchronousQueue$TransferQueue.awaitFulfill(java.util.concurrent.SynchronousQueue$TransferQueue$QNode, java.lang.Object, boolean, long) @bci=156, line=764 (Compiled frame) - java.util.concurrent.SynchronousQueue$TransferQueue.transfer(java.lang.Object, boolean, long) @bci=148, line=695 (Compiled frame) - java.util.concurrent.SynchronousQueue.offer(java.lang.Object, long, java.util.concurrent.TimeUnit) @bci=24, line=895 (Compiled frame) - org.apache.hadoop.ipc.Client$Connection.sendRpcRequest(org.apache.hadoop.ipc.Client$Call) @bci=88, line=1134 (Compiled frame) - org.apache.hadoop.ipc.Client.call(org.apache.hadoop.ipc.RPC$RpcKind, org.apache.hadoop.io.Writable, org.apache.hadoop.ipc.Client$ConnectionId, int, java.util.concurrent.atomic.AtomicBoolean, org.apache.hadoop.ipc.AlignmentContext) @bci=36, line=1402 (Interpreted frame) - org.apache.hadoop.ipc.Client.call(org.apache.hadoop.ipc.RPC$RpcKind, org.apache.hadoop.io.Writable, org.apache.hadoop.ipc.Client$ConnectionId, java.util.concurrent.atomic.AtomicBoolean, org.apache.hadoop.ipc.AlignmentContext) @bci=9, line=1349 (Compiled frame) - org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(java.lang.Object, java.lang.reflect.Method, java.lang.Object[]) @bci=248, line=230 
(Compiled frame) - org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(java.lang.Object, java.lang.reflect.Method, java.lang.Object[]) @bci=4, line=118 (Compiled frame) - com.sun.proxy.$Proxy11.getBlockLocations({code} > Capture exception in rpcRequestSender.start() in IPC.Connection.run() > --------------------------------------------------------------------- > > Key: HADOOP-19061 > URL: https://issues.apache.org/jira/browse/HADOOP-19061 > Project: Hadoop Common > Issue Type: Bug > Components: ipc > Affects Versions: 3.5.0 > Reporter: Xing Lin > Priority: Major > > _rpcRequestThread.start()_ is called outside of the try-catch{} block. > However, it can throw OOM. In such cases, we fail to start the Connection and > Connection.rpcRequestThread threads. However, this OOM won't be captured in > {_}Connection.setupIOStreams(){_}. Instead, it returns and getConnection() > will return an Connection object and we will continue with > _connection.sendRpcRequest(call). sendRpcRequest()_ will then be hanging > forever at its while loop, because we don't mark this connection as closed > and we don't have the rpcRequestSender thread to poll from the queue. > > > {code:java} > IPC.Connection.run() > @Override > public void run() { > // Don't start the ipc parameter sending thread until we start this > // thread, because the shutdown logic only gets triggered if this > // thread is started. > rpcRequestThread.start(); > if (LOG.isDebugEnabled()) > LOG.debug(getName() + ": starting, having connections " > + connections.size()); > try { > while (waitForWork()) {//wait here for work - read or close connection > receiveRpcResponse(); > } > } catch (Throwable t) { > // This truly is unexpected, since we catch IOException in > receiveResponse > // -- this is only to be really sure that we don't leave a client > hanging > // forever. 
> LOG.warn("Unexpected error reading responses on connection " + this, > t); > markClosed(new IOException("Error reading responses", t)); > }{code} > > > while loop in sendRpcRequest > {code:java} > while (!shouldCloseConnection.get()) { > if (rpcRequestQueue.offer(Pair.of(call, buf), 1, TimeUnit.SECONDS)) { > break; > } > }{code} > OOM exception in starting the rpcRequestSender thread. > {code:java} > Exception in thread "IPC Client (1664093259) connection to > ltx1-holdemnn01.grid.linkedin.com/10.150.1.55:9000 from kafkaetl" > java.lang.OutOfMemoryError: unable to create new native thread > at java.lang.Thread.start0(Native Method) > at java.lang.Thread.start(Thread.java:717) > at org.apache.hadoop.ipc.Client$Connection.run(Client.java:1034) > {code} > Multiple threads blocked by queue.offer(). and we don't found any "IPC > Client" or "IPC Parameter Sending Thread" in thread dump. > {code:java} > Thread 2156123: (state = BLOCKED) > - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information > may be imprecise) > - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) > @bci=20, line=215 (Compiled frame) > - > java.util.concurrent.SynchronousQueue$TransferQueue.awaitFulfill(java.util.concurrent.SynchronousQueue$TransferQueue$QNode, > java.lang.Object, boolean, long) @bci=156, line=764 (Compiled frame) > - > java.util.concurrent.SynchronousQueue$TransferQueue.transfer(java.lang.Object, > boolean, long) @bci=148, line=695 (Compiled frame) > - java.util.concurrent.SynchronousQueue.offer(java.lang.Object, long, > java.util.concurrent.TimeUnit) @bci=24, line=895 (Compiled frame) > - > org.apache.hadoop.ipc.Client$Connection.sendRpcRequest(org.apache.hadoop.ipc.Client$Call) > @bci=88, line=1134 (Compiled frame) > - org.apache.hadoop.ipc.Client.call(org.apache.hadoop.ipc.RPC$RpcKind, > org.apache.hadoop.io.Writable, org.apache.hadoop.ipc.Client$ConnectionId, > int, java.util.concurrent.atomic.AtomicBoolean, > 
org.apache.hadoop.ipc.AlignmentContext) @bci=36, line=1402 (Interpreted frame) > - org.apache.hadoop.ipc.Client.call(org.apache.hadoop.ipc.RPC$RpcKind, > org.apache.hadoop.io.Writable, org.apache.hadoop.ipc.Client$ConnectionId, > java.util.concurrent.atomic.AtomicBoolean, > org.apache.hadoop.ipc.AlignmentContext) @bci=9, line=1349 (Compiled frame) > - org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(java.lang.Object, > java.lang.reflect.Method, java.lang.Object[]) @bci=248, line=230 (Compiled > frame) > - org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(java.lang.Object, > java.lang.reflect.Method, java.lang.Object[]) @bci=4, line=118 (Compiled > frame) > - com.sun.proxy.$Proxy11.getBlockLocations({code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-issues-h...@hadoop.apache.org