[ 
https://issues.apache.org/jira/browse/HDDS-11360?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

JiangHua Zhu updated HDDS-11360:
--------------------------------
    Description: 
Found several NullPointerExceptions in OMRatisHelper. Here are some examples.
OM HA switch (leadership transfer):
{code:java}
2024-08-23 13:41:35,785 [om22-server-thread545] INFO 
org.apache.ratis.server.RaftServer$Division: om22@group-61C56C563FC9: receive 
transferLeadership 
TransferLeadershipRequest:client-CBC5546B4108->om22@group-61C56C563FC9, cid=3, 
seq=null, RO, null
2024-08-23 13:41:35,786 [om22-server-thread545] INFO 
org.apache.ratis.server.impl.TransferLeadership: om22@group-61C56C563FC9: start 
transferring leadership to om21
2024-08-23 13:41:35,787 [om22-server-thread545] INFO 
org.apache.ratis.server.impl.TransferLeadership: om22@group-61C56C563FC9: 
sendStartLeaderElection to follower om21, lastEntry=(t:77, i:12154700362)
2024-08-23 13:41:35,787 [om22-server-thread545] INFO 
org.apache.ratis.server.impl.TransferLeadership: om22@group-61C56C563FC9: 
SUCCESS sent StartLeaderElection to transferee om21 immediately as it already 
has up-to-date log
{code}

OMRatisHelper:
{code:java}
2024-08-23 13:41:35,869 [IPC Server handler 113 on default port 9862] WARN 
org.apache.hadoop.ipc.Server: IPC Server handler 113 on default port 9862, call 
Call#8836 Retry#0 
org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol.submitRequest from 
xxx.xxx.xxx.xxx:33796 / xx.xx.xx.xx:33796
java.lang.NullPointerException: Cannot invoke 
"org.apache.ratis.protocol.Message.getContent()" because the return value of 
"org.apache.ratis.protocol.RaftClientReply.getMessage()" is null
        at 
org.apache.hadoop.ozone.om.helpers.OMRatisHelper.getOMResponseFromRaftClientReply(OMRatisHelper.java:66)
        at 
org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.createOmResponseImpl(OzoneManagerRatisServer.java:524)
        at 
org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.lambda$1(OzoneManagerRatisServer.java:279)
        at 
org.apache.hadoop.util.MetricUtil.captureLatencyNs(MetricUtil.java:45)
        at 
org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.createOmResponse(OzoneManagerRatisServer.java:277)
        at 
org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.submitRequest(OzoneManagerRatisServer.java:257)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequestToRatis(OzoneManagerProtocolServerSideTranslatorPB.java:257)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.internalProcessRequest(OzoneManagerProtocolServerSideTranslatorPB.java:236)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.processRequest(OzoneManagerProtocolServerSideTranslatorPB.java:172)
        at 
org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:89)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequest(OzoneManagerProtocolServerSideTranslatorPB.java:163)
        at 
org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos$OzoneManagerService$2.callBlockingMethod(OzoneManagerProtocolProtos.java)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Server.processCall(ProtobufRpcEngine.java:484)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:595)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:573)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1227)
        at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1098)
        at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1021)
        at 
java.base/java.security.AccessController.doPrivileged(AccessController.java:712)
        at java.base/javax.security.auth.Subject.doAs(Subject.java:439)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1953)
        at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3060)
{code}

s3gateway log:
{code:java}
2024-08-23 13:41:35,801 [qtp1396431506-4981] INFO 
org.apache.hadoop.io.retry.RetryInvocationHandler: 
com.google.protobuf.ServiceException: 
org.apache.hadoop.ipc.RemoteException(java.lang.NullPointerException): Cannot 
invoke "org.apache.ratis.protocol.Message.getContent()" because the return 
value of "org.apache.ratis.protocol.RaftClientReply.getMessage()" is null
        at 
org.apache.hadoop.ozone.om.helpers.OMRatisHelper.getOMResponseFromRaftClientReply(OMRatisHelper.java:66)
        at 
org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.createOmResponseImpl(OzoneManagerRatisServer.java:524)
        at 
org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.lambda$1(OzoneManagerRatisServer.java:279)
        at 
org.apache.hadoop.util.MetricUtil.captureLatencyNs(MetricUtil.java:45)
        at 
org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.createOmResponse(OzoneManagerRatisServer.java:277)
        at 
org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.submitRequest(OzoneManagerRatisServer.java:257)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequestToRatis(OzoneManagerProtocolServerSideTranslatorPB.java:257)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.internalProcessRequest(OzoneManagerProtocolServerSideTranslatorPB.java:236)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.processRequest(OzoneManagerProtocolServerSideTranslatorPB.java:172)
        at 
org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:89)
        at 
org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequest(OzoneManagerProtocolServerSideTranslatorPB.java:163)
        at 
org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos$OzoneManagerService$2.callBlockingMethod(OzoneManagerProtocolProtos.java)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Server.processCall(ProtobufRpcEngine.java:484)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:595)
        at 
org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:573)
        at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1227)
        at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1098)
        at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1021)
        at 
java.base/java.security.AccessController.doPrivileged(AccessController.java:712)
        at java.base/javax.security.auth.Subject.doAs(Subject.java:439)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1953)
        at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3060)
, while invoking $Proxy124.submitRequest over 
nodeId=om22,nodeAddress=xx.xx.xx.xx:9862. Trying to failover after sleeping for 
2000ms. Current retry count: 0.
{code}

It looks like this is related to the OM HA switch.

> NPE in OMRatisHelper
> --------------------
>
>                 Key: HDDS-11360
>                 URL: https://issues.apache.org/jira/browse/HDDS-11360
>             Project: Apache Ozone
>          Issue Type: Improvement
>          Components: OM
>    Affects Versions: 1.4.0
>            Reporter: JiangHua Zhu
>            Priority: Major
>
> Found several NullPointerExceptions in OMRatisHelper. Here are some examples.
> OM HA switch (leadership transfer):
> {code:java}
> 2024-08-23 13:41:35,785 [om22-server-thread545] INFO 
> org.apache.ratis.server.RaftServer$Division: om22@group-61C56C563FC9: receive 
> transferLeadership 
> TransferLeadershipRequest:client-CBC5546B4108->om22@group-61C56C563FC9, 
> cid=3, seq=null, RO, null
> 2024-08-23 13:41:35,786 [om22-server-thread545] INFO 
> org.apache.ratis.server.impl.TransferLeadership: om22@group-61C56C563FC9: 
> start transferring leadership to om21
> 2024-08-23 13:41:35,787 [om22-server-thread545] INFO 
> org.apache.ratis.server.impl.TransferLeadership: om22@group-61C56C563FC9: 
> sendStartLeaderElection to follower om21, lastEntry=(t:77, i:12154700362)
> 2024-08-23 13:41:35,787 [om22-server-thread545] INFO 
> org.apache.ratis.server.impl.TransferLeadership: om22@group-61C56C563FC9: 
> SUCCESS sent StartLeaderElection to transferee om21 immediately as it already 
> has up-to-date log
> {code}
> OMRatisHelper:
> {code:java}
> 2024-08-23 13:41:35,869 [IPC Server handler 113 on default port 9862] WARN 
> org.apache.hadoop.ipc.Server: IPC Server handler 113 on default port 9862, 
> call Call#8836 Retry#0 
> org.apache.hadoop.ozone.om.protocol.OzoneManagerProtocol.submitRequest from 
> xxx.xxx.xxx.xxx:33796 / xx.xx.xx.xx:33796
> java.lang.NullPointerException: Cannot invoke 
> "org.apache.ratis.protocol.Message.getContent()" because the return value of 
> "org.apache.ratis.protocol.RaftClientReply.getMessage()" is null
>       at 
> org.apache.hadoop.ozone.om.helpers.OMRatisHelper.getOMResponseFromRaftClientReply(OMRatisHelper.java:66)
>       at 
> org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.createOmResponseImpl(OzoneManagerRatisServer.java:524)
>       at 
> org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.lambda$1(OzoneManagerRatisServer.java:279)
>       at 
> org.apache.hadoop.util.MetricUtil.captureLatencyNs(MetricUtil.java:45)
>       at 
> org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.createOmResponse(OzoneManagerRatisServer.java:277)
>       at 
> org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.submitRequest(OzoneManagerRatisServer.java:257)
>       at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequestToRatis(OzoneManagerProtocolServerSideTranslatorPB.java:257)
>       at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.internalProcessRequest(OzoneManagerProtocolServerSideTranslatorPB.java:236)
>       at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.processRequest(OzoneManagerProtocolServerSideTranslatorPB.java:172)
>       at 
> org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:89)
>       at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequest(OzoneManagerProtocolServerSideTranslatorPB.java:163)
>       at 
> org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos$OzoneManagerService$2.callBlockingMethod(OzoneManagerProtocolProtos.java)
>       at 
> org.apache.hadoop.ipc.ProtobufRpcEngine$Server.processCall(ProtobufRpcEngine.java:484)
>       at 
> org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:595)
>       at 
> org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:573)
>       at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1227)
>       at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1098)
>       at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1021)
>       at 
> java.base/java.security.AccessController.doPrivileged(AccessController.java:712)
>       at java.base/javax.security.auth.Subject.doAs(Subject.java:439)
>       at 
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1953)
>       at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3060)
> {code}
> s3gateway log:
> {code:java}
> 2024-08-23 13:41:35,801 [qtp1396431506-4981] INFO 
> org.apache.hadoop.io.retry.RetryInvocationHandler: 
> com.google.protobuf.ServiceException: 
> org.apache.hadoop.ipc.RemoteException(java.lang.NullPointerException): Cannot 
> invoke "org.apache.ratis.protocol.Message.getContent()" because the return 
> value of "org.apache.ratis.protocol.RaftClientReply.getMessage()" is null
>       at 
> org.apache.hadoop.ozone.om.helpers.OMRatisHelper.getOMResponseFromRaftClientReply(OMRatisHelper.java:66)
>       at 
> org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.createOmResponseImpl(OzoneManagerRatisServer.java:524)
>       at 
> org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.lambda$1(OzoneManagerRatisServer.java:279)
>       at 
> org.apache.hadoop.util.MetricUtil.captureLatencyNs(MetricUtil.java:45)
>       at 
> org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.createOmResponse(OzoneManagerRatisServer.java:277)
>       at 
> org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer.submitRequest(OzoneManagerRatisServer.java:257)
>       at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequestToRatis(OzoneManagerProtocolServerSideTranslatorPB.java:257)
>       at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.internalProcessRequest(OzoneManagerProtocolServerSideTranslatorPB.java:236)
>       at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.processRequest(OzoneManagerProtocolServerSideTranslatorPB.java:172)
>       at 
> org.apache.hadoop.hdds.server.OzoneProtocolMessageDispatcher.processRequest(OzoneProtocolMessageDispatcher.java:89)
>       at 
> org.apache.hadoop.ozone.protocolPB.OzoneManagerProtocolServerSideTranslatorPB.submitRequest(OzoneManagerProtocolServerSideTranslatorPB.java:163)
>       at 
> org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos$OzoneManagerService$2.callBlockingMethod(OzoneManagerProtocolProtos.java)
>       at 
> org.apache.hadoop.ipc.ProtobufRpcEngine$Server.processCall(ProtobufRpcEngine.java:484)
>       at 
> org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:595)
>       at 
> org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:573)
>       at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1227)
>       at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1098)
>       at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1021)
>       at 
> java.base/java.security.AccessController.doPrivileged(AccessController.java:712)
>       at java.base/javax.security.auth.Subject.doAs(Subject.java:439)
>       at 
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1953)
>       at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3060)
> , while invoking $Proxy124.submitRequest over 
> nodeId=om22,nodeAddress=xx.xx.xx.xx:9862. Trying to failover after sleeping 
> for 2000ms. Current retry count: 0.
> {code}
> It looks like this is related to the OM HA switch.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to