[ https://issues.apache.org/jira/browse/HADOOP-19061?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Xing Lin updated HADOOP-19061: ------------------------------ Description: _rpcRequestThread.start()_ is called outside of the try-catch{} block. However, it can throw an OutOfMemoryError (OOM). In such cases, we fail to start the Connection and Connection.rpcRequestThread threads. However, this OOM won't be captured in {_}Connection.setupIOStreams(){_}. Instead, it returns and getConnection() will return a Connection object and we will continue with _connection.sendRpcRequest(call). sendRpcRequest()_ will then hang forever in its while loop, because we don't mark this connection as closed and we don't have the rpcRequestSender thread to poll the request from the queue. {code:java} IPC.Connection.run() @Override public void run() { // Don't start the ipc parameter sending thread until we start this // thread, because the shutdown logic only gets triggered if this // thread is started. rpcRequestThread.start(); if (LOG.isDebugEnabled()) LOG.debug(getName() + ": starting, having connections " + connections.size()); try { while (waitForWork()) {//wait here for work - read or close connection receiveRpcResponse(); } } catch (Throwable t) { // This truly is unexpected, since we catch IOException in receiveResponse // -- this is only to be really sure that we don't leave a client hanging // forever. LOG.warn("Unexpected error reading responses on connection " + this, t); markClosed(new IOException("Error reading responses", t)); }{code} The while loop in sendRpcRequest: {code:java} while (!shouldCloseConnection.get()) { if (rpcRequestQueue.offer(Pair.of(call, buf), 1, TimeUnit.SECONDS)) { break; } }{code} The OOM exception thrown when starting the rpcRequestSender thread: 
{code:java} Exception in thread "IPC Client (1664093259) connection to ltx1-holdemnn01.grid.linkedin.com/10.150.1.55:9000 from kafkaetl" java.lang.OutOfMemoryError: unable to create new native thread at java.lang.Thread.start0(Native Method) at java.lang.Thread.start(Thread.java:717) at org.apache.hadoop.ipc.Client$Connection.run(Client.java:1034) {code} Multiple threads are blocked in queue.offer(), and we did not find any "IPC Client" or "IPC Parameter Sending Thread" threads in the thread dump. {code:java} Thread 2156123: (state = BLOCKED) - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information may be imprecise) - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) @bci=20, line=215 (Compiled frame) - java.util.concurrent.SynchronousQueue$TransferQueue.awaitFulfill(java.util.concurrent.SynchronousQueue$TransferQueue$QNode, java.lang.Object, boolean, long) @bci=156, line=764 (Compiled frame) - java.util.concurrent.SynchronousQueue$TransferQueue.transfer(java.lang.Object, boolean, long) @bci=148, line=695 (Compiled frame) - java.util.concurrent.SynchronousQueue.offer(java.lang.Object, long, java.util.concurrent.TimeUnit) @bci=24, line=895 (Compiled frame) - org.apache.hadoop.ipc.Client$Connection.sendRpcRequest(org.apache.hadoop.ipc.Client$Call) @bci=88, line=1134 (Compiled frame) - org.apache.hadoop.ipc.Client.call(org.apache.hadoop.ipc.RPC$RpcKind, org.apache.hadoop.io.Writable, org.apache.hadoop.ipc.Client$ConnectionId, int, java.util.concurrent.atomic.AtomicBoolean, org.apache.hadoop.ipc.AlignmentContext) @bci=36, line=1402 (Interpreted frame) - org.apache.hadoop.ipc.Client.call(org.apache.hadoop.ipc.RPC$RpcKind, org.apache.hadoop.io.Writable, org.apache.hadoop.ipc.Client$ConnectionId, java.util.concurrent.atomic.AtomicBoolean, org.apache.hadoop.ipc.AlignmentContext) @bci=9, line=1349 (Compiled frame) - org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(java.lang.Object, java.lang.reflect.Method, java.lang.Object[]) @bci=248, line=230 
(Compiled frame) - org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(java.lang.Object, java.lang.reflect.Method, java.lang.Object[]) @bci=4, line=118 (Compiled frame) - com.sun.proxy.$Proxy11.getBlockLocations({code} was: _rpcRequestThread.start()_ is called outside of the try-catch{} block. However, it can throw OOM. In such cases, we fail to start the Connection and Connection.rpcRequestThread threads. However, this OOM won't be captured in {_}Connection.setupIOStreams(){_}. Instead, it returns and getConnection() will return an Connection object and we will continue with _connection.sendRpcRequest(call). sendRpcRequest()_ will then be hanging forever at its while loop, because we don't mark this connection as closed and we don't have the rpcRequestSender thread to poll from the queue. {code:java} IPC.Connection.run() @Override public void run() { // Don't start the ipc parameter sending thread until we start this // thread, because the shutdown logic only gets triggered if this // thread is started. rpcRequestThread.start(); if (LOG.isDebugEnabled()) LOG.debug(getName() + ": starting, having connections " + connections.size()); try { while (waitForWork()) {//wait here for work - read or close connection receiveRpcResponse(); } } catch (Throwable t) { // This truly is unexpected, since we catch IOException in receiveResponse // -- this is only to be really sure that we don't leave a client hanging // forever. LOG.warn("Unexpected error reading responses on connection " + this, t); markClosed(new IOException("Error reading responses", t)); }{code} while loop in sendRpcRequest {code:java} while (!shouldCloseConnection.get()) { if (rpcRequestQueue.offer(Pair.of(call, buf), 1, TimeUnit.SECONDS)) { break; } }{code} OOM exception in starting the rpcRequestSender thread. 
{code:java} Exception in thread "IPC Client (1664093259) connection to ltx1-holdemnn01.grid.linkedin.com/10.150.1.55:9000 from kafkaetl" java.lang.OutOfMemoryError: unable to create new native thread at java.lang.Thread.start0(Native Method) at java.lang.Thread.start(Thread.java:717) at org.apache.hadoop.ipc.Client$Connection.run(Client.java:1034) {code} Multiple threads blocked by queue.offer(). and we don't found any "IPC Client" or "IPC Parameter Sending Thread" in thread dump. {code:java} Thread 2156123: (state = BLOCKED) - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information may be imprecise) - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) @bci=20, line=215 (Compiled frame) - java.util.concurrent.SynchronousQueue$TransferQueue.awaitFulfill(java.util.concurrent.SynchronousQueue$TransferQueue$QNode, java.lang.Object, boolean, long) @bci=156, line=764 (Compiled frame) - java.util.concurrent.SynchronousQueue$TransferQueue.transfer(java.lang.Object, boolean, long) @bci=148, line=695 (Compiled frame) - java.util.concurrent.SynchronousQueue.offer(java.lang.Object, long, java.util.concurrent.TimeUnit) @bci=24, line=895 (Compiled frame) - org.apache.hadoop.ipc.Client$Connection.sendRpcRequest(org.apache.hadoop.ipc.Client$Call) @bci=88, line=1134 (Compiled frame) - org.apache.hadoop.ipc.Client.call(org.apache.hadoop.ipc.RPC$RpcKind, org.apache.hadoop.io.Writable, org.apache.hadoop.ipc.Client$ConnectionId, int, java.util.concurrent.atomic.AtomicBoolean, org.apache.hadoop.ipc.AlignmentContext) @bci=36, line=1402 (Interpreted frame) - org.apache.hadoop.ipc.Client.call(org.apache.hadoop.ipc.RPC$RpcKind, org.apache.hadoop.io.Writable, org.apache.hadoop.ipc.Client$ConnectionId, java.util.concurrent.atomic.AtomicBoolean, org.apache.hadoop.ipc.AlignmentContext) @bci=9, line=1349 (Compiled frame) - org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(java.lang.Object, java.lang.reflect.Method, java.lang.Object[]) @bci=248, line=230 
(Compiled frame) - org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(java.lang.Object, java.lang.reflect.Method, java.lang.Object[]) @bci=4, line=118 (Compiled frame) - com.sun.proxy.$Proxy11.getBlockLocations({code} > Capture exception in rpcRequestSender.start() in IPC.Connection.run() > --------------------------------------------------------------------- > > Key: HADOOP-19061 > URL: https://issues.apache.org/jira/browse/HADOOP-19061 > Project: Hadoop Common > Issue Type: Bug > Components: ipc > Affects Versions: 3.5.0 > Reporter: Xing Lin > Priority: Major > > _rpcRequestThread.start()_ is called outside of the try-catch{} block. > However, it can throw OOM. In such cases, we fail to start the Connection and > Connection.rpcRequestThread threads. However, this OOM won't be captured in > {_}Connection.setupIOStreams(){_}. Instead, it returns and getConnection() > will return an Connection object and we will continue with > _connection.sendRpcRequest(call). sendRpcRequest()_ will then be hanging > forever at its while loop, because we don't mark this connection as closed > and we don't have the rpcRequestSender thread to poll the request from the > queue. > > > {code:java} > IPC.Connection.run() > @Override > public void run() { > // Don't start the ipc parameter sending thread until we start this > // thread, because the shutdown logic only gets triggered if this > // thread is started. > rpcRequestThread.start(); > if (LOG.isDebugEnabled()) > LOG.debug(getName() + ": starting, having connections " > + connections.size()); > try { > while (waitForWork()) {//wait here for work - read or close connection > receiveRpcResponse(); > } > } catch (Throwable t) { > // This truly is unexpected, since we catch IOException in > receiveResponse > // -- this is only to be really sure that we don't leave a client > hanging > // forever. 
> LOG.warn("Unexpected error reading responses on connection " + this, > t); > markClosed(new IOException("Error reading responses", t)); > }{code} > > > while loop in sendRpcRequest > {code:java} > while (!shouldCloseConnection.get()) { > if (rpcRequestQueue.offer(Pair.of(call, buf), 1, TimeUnit.SECONDS)) { > break; > } > }{code} > OOM exception in starting the rpcRequestSender thread. > {code:java} > Exception in thread "IPC Client (1664093259) connection to > ltx1-holdemnn01.grid.linkedin.com/10.150.1.55:9000 from kafkaetl" > java.lang.OutOfMemoryError: unable to create new native thread > at java.lang.Thread.start0(Native Method) > at java.lang.Thread.start(Thread.java:717) > at org.apache.hadoop.ipc.Client$Connection.run(Client.java:1034) > {code} > Multiple threads blocked by queue.offer(). and we don't found any "IPC > Client" or "IPC Parameter Sending Thread" in thread dump. > {code:java} > Thread 2156123: (state = BLOCKED) > - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information > may be imprecise) > - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) > @bci=20, line=215 (Compiled frame) > - > java.util.concurrent.SynchronousQueue$TransferQueue.awaitFulfill(java.util.concurrent.SynchronousQueue$TransferQueue$QNode, > java.lang.Object, boolean, long) @bci=156, line=764 (Compiled frame) > - > java.util.concurrent.SynchronousQueue$TransferQueue.transfer(java.lang.Object, > boolean, long) @bci=148, line=695 (Compiled frame) > - java.util.concurrent.SynchronousQueue.offer(java.lang.Object, long, > java.util.concurrent.TimeUnit) @bci=24, line=895 (Compiled frame) > - > org.apache.hadoop.ipc.Client$Connection.sendRpcRequest(org.apache.hadoop.ipc.Client$Call) > @bci=88, line=1134 (Compiled frame) > - org.apache.hadoop.ipc.Client.call(org.apache.hadoop.ipc.RPC$RpcKind, > org.apache.hadoop.io.Writable, org.apache.hadoop.ipc.Client$ConnectionId, > int, java.util.concurrent.atomic.AtomicBoolean, > 
org.apache.hadoop.ipc.AlignmentContext) @bci=36, line=1402 (Interpreted frame) > - org.apache.hadoop.ipc.Client.call(org.apache.hadoop.ipc.RPC$RpcKind, > org.apache.hadoop.io.Writable, org.apache.hadoop.ipc.Client$ConnectionId, > java.util.concurrent.atomic.AtomicBoolean, > org.apache.hadoop.ipc.AlignmentContext) @bci=9, line=1349 (Compiled frame) > - org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(java.lang.Object, > java.lang.reflect.Method, java.lang.Object[]) @bci=248, line=230 (Compiled > frame) > - org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(java.lang.Object, > java.lang.reflect.Method, java.lang.Object[]) @bci=4, line=118 (Compiled > frame) > - com.sun.proxy.$Proxy11.getBlockLocations({code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-issues-h...@hadoop.apache.org