[ https://issues.apache.org/jira/browse/RATIS-977?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17135737#comment-17135737 ]
runzhiwang commented on RATIS-977: ---------------------------------- [~ljain] Thanks for review. The root cause is thread unsafe of reply. The PR: https://github.com/apache/incubator-ratis/pull/127 > Fix gRPC failed to read message > ------------------------------- > > Key: RATIS-977 > URL: https://issues.apache.org/jira/browse/RATIS-977 > Project: Ratis > Issue Type: Bug > Reporter: runzhiwang > Assignee: runzhiwang > Priority: Major > > 1. s0 reply to s3, but s3 failed to read message. > {code:java} > 2020-06-15T02:09:05.5568987Z 2020-06-15 02:09:05,554 [Thread-82] DEBUG > impl.RaftServerImpl (RaftServerImpl.java:logAppendEntries(916)) - > s0@group-E95D39D4BA60: succeeded to handle AppendEntries. Reply: > s3<-s0#9:OK,SUCCESS,nextIndex:2,term:2,followerCommit:0,matchIndex:1 > 2020-06-15T02:09:05.6204785Z 2020-06-15 02:09:05,605 > [grpc-default-executor-1] WARN server.GrpcLogAppender > (LogUtils.java:warn(122)) - > s3@group-E95D39D4BA60->s0-AppendLogResponseHandler: Failed appendEntries: > org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: CANCELLED: Failed > to read message. > {code} > 2. The stack is as follow: > {code:java} > 2020-06-15T02:09:05.6092588Z > org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: CANCELLED: Failed > to read message. > 2020-06-15T02:09:05.6092751Z at > org.apache.ratis.thirdparty.io.grpc.Status.asRuntimeException(Status.java:533) > 2020-06-15T02:09:05.6092920Z at > org.apache.ratis.thirdparty.io.grpc.stub.ClientCalls$StreamObserverToCallListenerAdapter.onClose(ClientCalls.java:449) > 2020-06-15T02:09:05.6093083Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl.closeObserver(ClientCallImpl.java:426) > 2020-06-15T02:09:05.6093245Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl.access$500(ClientCallImpl.java:66) > 2020-06-15T02:09:05.6093420Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl.close(ClientCallImpl.java:689) > 2020-06-15T02:09:05.6093787Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl.access$900(ClientCallImpl.java:577) > 2020-06-15T02:09:05.6093967Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInternal(ClientCallImpl.java:670) > 2020-06-15T02:09:05.6094134Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInContext(ClientCallImpl.java:643) > 2020-06-15T02:09:05.6094300Z at > org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37) > 2020-06-15T02:09:05.6094460Z at > org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:123) > 2020-06-15T02:09:05.6094615Z at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > 2020-06-15T02:09:05.6094846Z at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > 2020-06-15T02:09:05.6094997Z at java.lang.Thread.run(Thread.java:748) > 2020-06-15T02:09:05.6095159Z Caused by: > org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: INTERNAL: Invalid > protobuf byte sequence > 2020-06-15T02:09:05.6095316Z at > org.apache.ratis.thirdparty.io.grpc.Status.asRuntimeException(Status.java:524) > 2020-06-15T02:09:05.6095480Z at > org.apache.ratis.thirdparty.io.grpc.protobuf.lite.ProtoLiteUtils$MessageMarshaller.parse(ProtoLiteUtils.java:218) > 2020-06-15T02:09:05.6095649Z at > org.apache.ratis.thirdparty.io.grpc.protobuf.lite.ProtoLiteUtils$MessageMarshaller.parse(ProtoLiteUtils.java:118) > 2020-06-15T02:09:05.6095816Z at > org.apache.ratis.thirdparty.io.grpc.MethodDescriptor.parseResponse(MethodDescriptor.java:275) > 2020-06-15T02:09:05.6095972Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInternal(ClientCallImpl.java:658) > 2020-06-15T02:09:05.6096126Z ... 6 more > 2020-06-15T02:09:05.6096283Z Caused by: > org.apache.ratis.thirdparty.com.google.protobuf.InvalidProtocolBufferException: > Protocol message contained an invalid tag (zero). > 2020-06-15T02:09:05.6096473Z at > org.apache.ratis.thirdparty.com.google.protobuf.InvalidProtocolBufferException.invalidTag(InvalidProtocolBufferException.java:102) > 2020-06-15T02:09:05.6096697Z at > org.apache.ratis.thirdparty.com.google.protobuf.CodedInputStream$ArrayDecoder.readTag(CodedInputStream.java:627) > 2020-06-15T02:09:05.6096864Z at > org.apache.ratis.proto.RaftProtos$AppendEntriesReplyProto.<init>(RaftProtos.java:16335) > {code} > Another stack. > {code:java} > 2020-06-15T04:48:06.1567901Z 2020-06-15 04:48:06,155 [Thread-296] DEBUG > impl.RaftServerImpl (RaftServerImpl.java:logAppendEntries(916)) - > s1@group-1D71FE491F6A: succeeded to handle AppendEntries. Reply: > <ByteString@3d2c5532 size=2 contents="s3"> <- <ByteString@5b64bed size=2 > contents="s1"> group:<ByteString@24ec82ef size=16 > contents="\\p\001\222r\003@\030\226b\035q\376I\037j">#74:OK,SUCCESS,nextIndex:3,term:2,followerCommit:1,matchIndex:2 > 2020-06-15T04:48:06.2709970Z java.lang.AssertionError > 2020-06-15T04:48:06.2710285Z at org.junit.Assert.fail(Assert.java:86) > 2020-06-15T04:48:06.2710573Z at org.junit.Assert.assertTrue(Assert.java:41) > 2020-06-15T04:48:06.2710881Z at org.junit.Assert.assertTrue(Assert.java:52) > 2020-06-15T04:48:06.2711370Z at > org.apache.ratis.grpc.TestRaftWithGrpc.lambda$null$6(TestRaftWithGrpc.java:99) > 2020-06-15T04:48:06.2711707Z at > org.apache.ratis.util.function.CheckedRunnable.lambda$asCheckedSupplier$0(CheckedRunnable.java:32) > 2020-06-15T04:48:06.2712016Z at > org.apache.ratis.util.JavaUtils.attempt(JavaUtils.java:160) > 2020-06-15T04:48:06.2712316Z at > org.apache.ratis.util.JavaUtils.attemptRepeatedly(JavaUtils.java:146) > 2020-06-15T04:48:06.2712612Z at > org.apache.ratis.util.JavaUtils.attempt(JavaUtils.java:180) > 2020-06-15T04:48:06.2712905Z at > org.apache.ratis.grpc.TestRaftWithGrpc.lambda$null$7(TestRaftWithGrpc.java:93) > 2020-06-15T04:48:06.2713189Z at > org.apache.ratis.util.JavaUtils.runAsUnchecked(JavaUtils.java:89) > 2020-06-15T04:48:06.2713484Z at > org.apache.ratis.util.JavaUtils.runAsUnchecked(JavaUtils.java:83) > 2020-06-15T04:48:06.2713785Z at > org.apache.ratis.grpc.TestRaftWithGrpc.lambda$runTestUpdateViaHeartbeat$8(TestRaftWithGrpc.java:93) > 2020-06-15T04:48:06.2714249Z at > java.util.Spliterators$ArraySpliterator.forEachRemaining(Spliterators.java:948) > 2020-06-15T04:48:06.2714570Z at > java.util.stream.ReferencePipeline$Head.forEach(ReferencePipeline.java:647) > 2020-06-15T04:48:06.2714882Z at > org.apache.ratis.grpc.TestRaftWithGrpc.runTestUpdateViaHeartbeat(TestRaftWithGrpc.java:92) > 2020-06-15T04:48:06.2715192Z at > org.apache.ratis.MiniRaftCluster$Factory$Get.runWithNewCluster(MiniRaftCluster.java:125) > 2020-06-15T04:48:06.2715499Z at > org.apache.ratis.MiniRaftCluster$Factory$Get.runWithNewCluster(MiniRaftCluster.java:113) > 2020-06-15T04:48:06.2715805Z at > org.apache.ratis.grpc.TestRaftWithGrpc.testUpdateViaHeartbeat(TestRaftWithGrpc.java:53) > 2020-06-15T04:48:06.2716106Z at > sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > 2020-06-15T04:48:06.2716402Z at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > 2020-06-15T04:48:06.2716707Z at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > 2020-06-15T04:48:06.2717012Z at > java.lang.reflect.Method.invoke(Method.java:498) > 2020-06-15T04:48:06.2717318Z at > org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50) > 2020-06-15T04:48:06.2717627Z at > org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) > 2020-06-15T04:48:06.2717928Z at > org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47) > 2020-06-15T04:48:06.2718228Z at > org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) > 2020-06-15T04:48:06.2718545Z at > org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27) > 2020-06-15T04:48:06.2766658Z at > org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:298) > 2020-06-15T04:48:06.2767094Z at > org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:292) > 2020-06-15T04:48:06.2767640Z at > java.util.concurrent.FutureTask.run(FutureTask.java:266) > 2020-06-15T04:48:06.2767986Z at java.lang.Thread.run(Thread.java:748) > 2020-06-15T04:48:06.2768844Z 2020-06-15 04:48:06,264 > [grpc-default-executor-8] ERROR server.GrpcLogAppender > (GrpcLogAppender.java:onError(332)) - grpc error stack > 2020-06-15T04:48:06.2769220Z > org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: CANCELLED: Failed > to read message. > 2020-06-15T04:48:06.2769558Z at > org.apache.ratis.thirdparty.io.grpc.Status.asRuntimeException(Status.java:533) > 2020-06-15T04:48:06.2769901Z at > org.apache.ratis.thirdparty.io.grpc.stub.ClientCalls$StreamObserverToCallListenerAdapter.onClose(ClientCalls.java:449) > 2020-06-15T04:48:06.2770245Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl.closeObserver(ClientCallImpl.java:426) > 2020-06-15T04:48:06.2770692Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl.access$500(ClientCallImpl.java:66) > 2020-06-15T04:48:06.2771051Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl.close(ClientCallImpl.java:689) > 2020-06-15T04:48:06.2771568Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl.access$900(ClientCallImpl.java:577) > 2020-06-15T04:48:06.2771947Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInternal(ClientCallImpl.java:670) > 2020-06-15T04:48:06.2772301Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInContext(ClientCallImpl.java:643) > 2020-06-15T04:48:06.2772635Z at > org.apache.ratis.thirdparty.io.grpc.internal.ContextRunnable.run(ContextRunnable.java:37) > 2020-06-15T04:48:06.2772969Z at > org.apache.ratis.thirdparty.io.grpc.internal.SerializingExecutor.run(SerializingExecutor.java:123) > 2020-06-15T04:48:06.2773301Z at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > 2020-06-15T04:48:06.2773631Z at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > 2020-06-15T04:48:06.2774050Z at java.lang.Thread.run(Thread.java:748) > 2020-06-15T04:48:06.2774488Z Caused by: > org.apache.ratis.thirdparty.io.grpc.StatusRuntimeException: INTERNAL: Invalid > protobuf byte sequence > 2020-06-15T04:48:06.2774799Z at > org.apache.ratis.thirdparty.io.grpc.Status.asRuntimeException(Status.java:524) > 2020-06-15T04:48:06.2775113Z at > org.apache.ratis.thirdparty.io.grpc.protobuf.lite.ProtoLiteUtils$MessageMarshaller.parse(ProtoLiteUtils.java:218) > 2020-06-15T04:48:06.2775431Z at > org.apache.ratis.thirdparty.io.grpc.protobuf.lite.ProtoLiteUtils$MessageMarshaller.parse(ProtoLiteUtils.java:118) > 2020-06-15T04:48:06.2775748Z at > org.apache.ratis.thirdparty.io.grpc.MethodDescriptor.parseResponse(MethodDescriptor.java:275) > 2020-06-15T04:48:06.2776618Z at > org.apache.ratis.thirdparty.io.grpc.internal.ClientCallImpl$ClientStreamListenerImpl$1MessagesAvailable.runInternal(ClientCallImpl.java:658) > 2020-06-15T04:48:06.2776889Z ... 6 more > 2020-06-15T04:48:06.2777205Z Caused by: > org.apache.ratis.thirdparty.com.google.protobuf.InvalidProtocolBufferException: > While parsing a protocol message, the input ended unexpectedly in the middle > of a field. This could mean either that the input has been truncated or that > an embedded message misreported its own length. > 2020-06-15T04:48:06.2777518Z at > org.apache.ratis.thirdparty.com.google.protobuf.InvalidProtocolBufferException.truncatedMessage(InvalidProtocolBufferException.java:84) > 2020-06-15T04:48:06.2777791Z at > org.apache.ratis.thirdparty.com.google.protobuf.CodedInputStream$ArrayDecoder.readRawByte(CodedInputStream.java:1238) > 2020-06-15T04:48:06.2778063Z at > org.apache.ratis.thirdparty.com.google.protobuf.CodedInputStream$ArrayDecoder.readRawVarint64SlowPath(CodedInputStream.java:1126) > 2020-06-15T04:48:06.2778328Z at > org.apache.ratis.thirdparty.com.google.protobuf.CodedInputStream$ArrayDecoder.readRawVarint64(CodedInputStream.java:1119) > 2020-06-15T04:48:06.2778588Z at > org.apache.ratis.thirdparty.com.google.protobuf.CodedInputStream$ArrayDecoder.readUInt64(CodedInputStream.java:757) > 2020-06-15T04:48:06.2778843Z at > org.apache.ratis.proto.RaftProtos$AppendEntriesReplyProto.<init>(RaftProtos.java:16376) > 2020-06-15T04:48:06.2779093Z at > org.apache.ratis.proto.RaftProtos$AppendEntriesReplyProto.<init>(RaftProtos.java:16297) > 2020-06-15T04:48:06.2779348Z at > org.apache.ratis.proto.RaftProtos$AppendEntriesReplyProto$1.parsePartialFrom(RaftProtos.java:17411) > 2020-06-15T04:48:06.2779582Z at > org.apache.ratis.proto.RaftProtos$AppendEntriesReplyProto$1.parsePartialFrom(RaftProtos.java:17405) > 2020-06-15T04:48:06.2779829Z at > org.apache.ratis.thirdparty.com.google.protobuf.AbstractParser.parseFrom(AbstractParser.java:86) > 2020-06-15T04:48:06.2780077Z at > org.apache.ratis.thirdparty.com.google.protobuf.AbstractParser.parseFrom(AbstractParser.java:48) > 2020-06-15T04:48:06.2780390Z at > org.apache.ratis.thirdparty.io.grpc.protobuf.lite.ProtoLiteUtils$MessageMarshaller.parseFrom(ProtoLiteUtils.java:223) > 2020-06-15T04:48:06.2783772Z at > org.apache.ratis.thirdparty.io.grpc.protobuf.lite.ProtoLiteUtils$MessageMarshaller.parse(ProtoLiteUtils.java:215) > 2020-06-15T04:48:06.2784921Z ... 9 more > {code} -- This message was sent by Atlassian Jira (v8.3.4#803005)