[ https://issues.apache.org/jira/browse/YARN-7176?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
lujie updated YARN-7176: ------------------------ Description: I submit a job, but I need to kill it immediately due to some reason. Then I found the RM was killed. I check the RMLog and found ArrayIndexOutOfBoundsException and NullPointerException.According to the log,RM was killed due to NullPointerException, but i still don't understand why those Exception happen I attath the whole RM log. {code:java} 2017-09-08 02:34:37,967 INFO org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncher: Error launching appattempt_1504809243340_0001_000001. Got exception: java.lang.ArrayIndexOutOfBoundsException: 3 at java.util.ArrayList.add(ArrayList.java:441) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:330) at org.apache.hadoop.yarn.proto.YarnProtos$ContainerLaunchContextProto$Builder.addAllApplicationACLs(YarnProtos.java:39956) at org.apache.hadoop.yarn.api.records.impl.pb.ContainerLaunchContextPBImpl.addApplicationACLs(ContainerLaunchContextPBImpl.java:446) at org.apache.hadoop.yarn.api.records.impl.pb.ContainerLaunchContextPBImpl.mergeLocalToBuilder(ContainerLaunchContextPBImpl.java:121) at org.apache.hadoop.yarn.api.records.impl.pb.ContainerLaunchContextPBImpl.mergeLocalToProto(ContainerLaunchContextPBImpl.java:128) at org.apache.hadoop.yarn.api.records.impl.pb.ContainerLaunchContextPBImpl.getProto(ContainerLaunchContextPBImpl.java:70) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl.convertToProtoFormat(StartContainerRequestPBImpl.java:156) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl.mergeLocalToBuilder(StartContainerRequestPBImpl.java:85) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl.mergeLocalToProto(StartContainerRequestPBImpl.java:95) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl.getProto(StartContainerRequestPBImpl.java:57) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.convertToProtoFormat(StartContainersRequestPBImpl.java:137) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.addLocalRequestsToProto(StartContainersRequestPBImpl.java:97) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.mergeLocalToBuilder(StartContainersRequestPBImpl.java:79) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.mergeLocalToProto(StartContainersRequestPBImpl.java:72) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.getProto(StartContainersRequestPBImpl.java:48) at org.apache.hadoop.yarn.api.impl.pb.client.ContainerManagementProtocolPBClientImpl.startContainers(ContainerManagementProtocolPBClientImpl.java:93) at org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncher.launch(AMLauncher.java:119) at org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncher.run(AMLauncher.java:254) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745) 2017-09-08 02:34:37,968 ERROR org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore: Error updating app: application_1504809243340_0001 java.lang.NullPointerException at com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) at com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) at org.apache.hadoop.yarn.proto.YarnProtos$ContainerLaunchContextProto.getSerializedSize(YarnProtos.java:38512) at com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) at com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) at org.apache.hadoop.yarn.proto.YarnProtos$ApplicationSubmissionContextProto.getSerializedSize(YarnProtos.java:28481) at com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) at com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) at org.apache.hadoop.yarn.proto.YarnServerResourceManagerRecoveryProtos$ApplicationStateDataProto.getSerializedSize(YarnServerResourceManagerRecoveryProtos.java:816) at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62) at org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore.updateApplicationStateInternal(FileSystemRMStateStore.java:426) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$UpdateAppTransition.transition(RMStateStore.java:163) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$UpdateAppTransition.transition(RMStateStore.java:148) at org.apache.hadoop.yarn.state.StateMachineFactory$SingleInternalArc.doTransition(StateMachineFactory.java:362) at org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302) at org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46) at org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.handleStoreEvent(RMStateStore.java:810) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$ForwardingEventHandler.handle(RMStateStore.java:864) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$ForwardingEventHandler.handle(RMStateStore.java:859) at org.apache.hadoop.yarn.event.AsyncDispatcher.dispatch(AsyncDispatcher.java:173) at org.apache.hadoop.yarn.event.AsyncDispatcher$1.run(AsyncDispatcher.java:106) at java.lang.Thread.run(Thread.java:745) 2017-09-08 02:34:37,978 FATAL org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Received a org.apache.hadoop.yarn.server.resourcemanager.RMFatalEvent of type STATE_STORE_OP_FAILED. Cause: java.lang.NullPointerException at com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) at com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) at org.apache.hadoop.yarn.proto.YarnProtos$ContainerLaunchContextProto.getSerializedSize(YarnProtos.java:38512) at com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) at com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) at org.apache.hadoop.yarn.proto.YarnProtos$ApplicationSubmissionContextProto.getSerializedSize(YarnProtos.java:28481) at com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) at com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) at org.apache.hadoop.yarn.proto.YarnServerResourceManagerRecoveryProtos$ApplicationStateDataProto.getSerializedSize(YarnServerResourceManagerRecoveryProtos.java:816) at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62) at org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore.updateApplicationStateInternal(FileSystemRMStateStore.java:426) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$UpdateAppTransition.transition(RMStateStore.java:163) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$UpdateAppTransition.transition(RMStateStore.java:148) at org.apache.hadoop.yarn.state.StateMachineFactory$SingleInternalArc.doTransition(StateMachineFactory.java:362) at org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302) at org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46) at org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.handleStoreEvent(RMStateStore.java:810) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$ForwardingEventHandler.handle(RMStateStore.java:864) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$ForwardingEventHandler.handle(RMStateStore.java:859) at org.apache.hadoop.yarn.event.AsyncDispatcher.dispatch(AsyncDispatcher.java:173) at org.apache.hadoop.yarn.event.AsyncDispatcher$1.run(AsyncDispatcher.java:106) at java.lang.Thread.run(Thread.java:745) 2017-09-08 02:34:37,987 INFO org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl: container_1504809243340_0001_01_000001 Container Transitioned from ACQUIRED to KILLED 2017-09-08 02:34:37,987 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp: Completed container: container_1504809243340_0001_01_000001 in state: KILLED event:KILL 2017-09-08 02:34:37,987 INFO org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger: USER=hires OPERATION=AM Released Container TARGET=SchedulerApp RESULT=SUCCESS APPID=application_1504809243340_0001 CONTAINERID=container_1504809243340_0001_01_000001 2017-09-08 02:34:37,988 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode: Released container container_1504809243340_0001_01_000001 of capacity <memory:2048, vCores:1> on host hadoop11:45454, which currently has 0 containers, <memory:0, vCores:0> used and <memory:8096, vCores:8> available, release resources=true 2017-09-08 02:34:37,988 INFO org.apache.hadoop.util.ExitUtil: Exiting with status 1 2017-09-08 02:34:37,988 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue: default used=<memory:0, vCores:0> numContainers=0 user=hires user-resources=<memory:0, vCores:0> 2017-09-08 02:34:37,989 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue: completedContainer container=Container: [ContainerId: container_1504809243340_0001_01_000001, NodeId: hadoop11:45454, NodeHttpAddress: hadoop11:8042, Resource: <memory:2048, vCores:1>, Priority: 0, Token: Token { kind: ContainerToken, service: 10.3.1.11:45454 }, ] queue=default: capacity=1.0, absoluteCapacity=1.0, usedResources=<memory:0, vCores:0>, usedCapacity=0.0, absoluteUsedCapacity=0.0, numApps=1, numContainers=0 cluster=<memory:16192, vCores:16> 2017-09-08 02:34:37,989 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue: completedContainer queue=root usedCapacity=0.0 absoluteUsedCapacity=0.0 used=<memory:0, vCores:0> cluster=<memory:16192, vCores:16> 2017-09-08 02:34:37,990 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue: Re-sorting completed queue: root.default stats: default: capacity=1.0, absoluteCapacity=1.0, usedResources=<memory:0, vCores:0>, usedCapacity=0.0, absoluteUsedCapacity=0.0, numApps=1, numContainers=0 2017-09-08 02:34:37,990 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler: Application attempt appattempt_1504809243340_0001_000001 released container container_1504809243340_0001_01_000001 on node: host: hadoop11:45454 #containers=0 available=8096 used=0 with event: KILL 2017-09-08 02:34:37,990 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppSchedulingInfo: Application application_1504809243340_0001 requests cleared 2017-09-08 02:34:37,990 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue: Application removed - appId: application_1504809243340_0001 user: hires queue: default #user-pending-applications: 0 #user-active-applications: 0 #queue-pending-applications: 0 #queue-active-applications: 0 2017-09-08 02:34:38,001 ERROR org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSecretManager: ExpiredTokenRemover received java.lang.InterruptedException: sleep interrupted 2017-09-08 02:34:38,005 INFO org.mortbay.log: Stopped HttpServer2$SelectChannelConnectorWithSafeStartup@hadoop11:8088 2017-09-08 02:34:38,005 ERROR org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSecretManager: ExpiredTokenRemover received java.lang.InterruptedException: sleep interrupted 2017-09-08 02:34:38,006 ERROR org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSecretManager: ExpiredTokenRemover received java.lang.InterruptedException: sleep interrupted 2017-09-08 02:34:38,108 INFO org.apache.hadoop.ipc.Server: Stopping server on 8032 2017-09-08 02:34:38,113 INFO org.apache.hadoop.ipc.Server: Stopping IPC Server listener on 8032 2017-09-08 02:34:38,113 INFO org.apache.hadoop.ipc.Server: Stopping server on 8033 2017-09-08 02:34:38,114 INFO org.apache.hadoop.ipc.Server: Stopping IPC Server Responder 2017-09-08 02:34:38,114 INFO org.apache.hadoop.ipc.Server: Stopping IPC Server listener on 8033 2017-09-08 02:34:38,114 INFO org.apache.hadoop.ipc.Server: Stopping IPC Server Responder[^attachment-name.zip] {code} was: I submit a job, but I need to kill it immediately due to some reason. Then I found the RM was killed. I check the RMLog and found ArrayIndexOutOfBoundsException and NullPointerException.According to the log,RM was killed due to NullPointerException, but i still don't understand why those Exception happen I attath the whole RM log {code:java} 2017-09-08 02:34:37,967 INFO org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncher: Error launching appattempt_1504809243340_0001_000001. Got exception: java.lang.ArrayIndexOutOfBoundsException: 3 at java.util.ArrayList.add(ArrayList.java:441) at com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:330) at org.apache.hadoop.yarn.proto.YarnProtos$ContainerLaunchContextProto$Builder.addAllApplicationACLs(YarnProtos.java:39956) at org.apache.hadoop.yarn.api.records.impl.pb.ContainerLaunchContextPBImpl.addApplicationACLs(ContainerLaunchContextPBImpl.java:446) at org.apache.hadoop.yarn.api.records.impl.pb.ContainerLaunchContextPBImpl.mergeLocalToBuilder(ContainerLaunchContextPBImpl.java:121) at org.apache.hadoop.yarn.api.records.impl.pb.ContainerLaunchContextPBImpl.mergeLocalToProto(ContainerLaunchContextPBImpl.java:128) at org.apache.hadoop.yarn.api.records.impl.pb.ContainerLaunchContextPBImpl.getProto(ContainerLaunchContextPBImpl.java:70) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl.convertToProtoFormat(StartContainerRequestPBImpl.java:156) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl.mergeLocalToBuilder(StartContainerRequestPBImpl.java:85) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl.mergeLocalToProto(StartContainerRequestPBImpl.java:95) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl.getProto(StartContainerRequestPBImpl.java:57) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.convertToProtoFormat(StartContainersRequestPBImpl.java:137) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.addLocalRequestsToProto(StartContainersRequestPBImpl.java:97) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.mergeLocalToBuilder(StartContainersRequestPBImpl.java:79) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.mergeLocalToProto(StartContainersRequestPBImpl.java:72) at org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.getProto(StartContainersRequestPBImpl.java:48) at org.apache.hadoop.yarn.api.impl.pb.client.ContainerManagementProtocolPBClientImpl.startContainers(ContainerManagementProtocolPBClientImpl.java:93) at org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncher.launch(AMLauncher.java:119) at org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncher.run(AMLauncher.java:254) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745) 2017-09-08 02:34:37,968 ERROR org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore: Error updating app: application_1504809243340_0001 java.lang.NullPointerException at com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) at com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) at org.apache.hadoop.yarn.proto.YarnProtos$ContainerLaunchContextProto.getSerializedSize(YarnProtos.java:38512) at com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) at com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) at org.apache.hadoop.yarn.proto.YarnProtos$ApplicationSubmissionContextProto.getSerializedSize(YarnProtos.java:28481) at com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) at com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) at org.apache.hadoop.yarn.proto.YarnServerResourceManagerRecoveryProtos$ApplicationStateDataProto.getSerializedSize(YarnServerResourceManagerRecoveryProtos.java:816) at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62) at org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore.updateApplicationStateInternal(FileSystemRMStateStore.java:426) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$UpdateAppTransition.transition(RMStateStore.java:163) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$UpdateAppTransition.transition(RMStateStore.java:148) at org.apache.hadoop.yarn.state.StateMachineFactory$SingleInternalArc.doTransition(StateMachineFactory.java:362) at org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302) at org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46) at org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.handleStoreEvent(RMStateStore.java:810) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$ForwardingEventHandler.handle(RMStateStore.java:864) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$ForwardingEventHandler.handle(RMStateStore.java:859) at org.apache.hadoop.yarn.event.AsyncDispatcher.dispatch(AsyncDispatcher.java:173) at org.apache.hadoop.yarn.event.AsyncDispatcher$1.run(AsyncDispatcher.java:106) at java.lang.Thread.run(Thread.java:745) 2017-09-08 02:34:37,978 FATAL org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Received a org.apache.hadoop.yarn.server.resourcemanager.RMFatalEvent of type STATE_STORE_OP_FAILED. Cause: java.lang.NullPointerException at com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) at com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) at org.apache.hadoop.yarn.proto.YarnProtos$ContainerLaunchContextProto.getSerializedSize(YarnProtos.java:38512) at com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) at com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) at org.apache.hadoop.yarn.proto.YarnProtos$ApplicationSubmissionContextProto.getSerializedSize(YarnProtos.java:28481) at com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) at com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) at org.apache.hadoop.yarn.proto.YarnServerResourceManagerRecoveryProtos$ApplicationStateDataProto.getSerializedSize(YarnServerResourceManagerRecoveryProtos.java:816) at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62) at org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore.updateApplicationStateInternal(FileSystemRMStateStore.java:426) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$UpdateAppTransition.transition(RMStateStore.java:163) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$UpdateAppTransition.transition(RMStateStore.java:148) at org.apache.hadoop.yarn.state.StateMachineFactory$SingleInternalArc.doTransition(StateMachineFactory.java:362) at org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302) at org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46) at org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.handleStoreEvent(RMStateStore.java:810) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$ForwardingEventHandler.handle(RMStateStore.java:864) at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$ForwardingEventHandler.handle(RMStateStore.java:859) at org.apache.hadoop.yarn.event.AsyncDispatcher.dispatch(AsyncDispatcher.java:173) at org.apache.hadoop.yarn.event.AsyncDispatcher$1.run(AsyncDispatcher.java:106) at java.lang.Thread.run(Thread.java:745) 2017-09-08 02:34:37,987 INFO org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl: container_1504809243340_0001_01_000001 Container Transitioned from ACQUIRED to KILLED 2017-09-08 02:34:37,987 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp: Completed container: container_1504809243340_0001_01_000001 in state: KILLED event:KILL 2017-09-08 02:34:37,987 INFO org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger: USER=hires OPERATION=AM Released Container TARGET=SchedulerApp RESULT=SUCCESS APPID=application_1504809243340_0001 CONTAINERID=container_1504809243340_0001_01_000001 2017-09-08 02:34:37,988 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode: Released container container_1504809243340_0001_01_000001 of capacity <memory:2048, vCores:1> on host hadoop11:45454, which currently has 0 containers, <memory:0, vCores:0> used and <memory:8096, vCores:8> available, release resources=true 2017-09-08 02:34:37,988 INFO org.apache.hadoop.util.ExitUtil: Exiting with status 1 2017-09-08 02:34:37,988 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue: default used=<memory:0, vCores:0> numContainers=0 user=hires user-resources=<memory:0, vCores:0> 2017-09-08 02:34:37,989 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue: completedContainer container=Container: [ContainerId: container_1504809243340_0001_01_000001, NodeId: hadoop11:45454, NodeHttpAddress: hadoop11:8042, Resource: <memory:2048, vCores:1>, Priority: 0, Token: Token { kind: ContainerToken, service: 10.3.1.11:45454 }, ] queue=default: capacity=1.0, absoluteCapacity=1.0, usedResources=<memory:0, vCores:0>, usedCapacity=0.0, absoluteUsedCapacity=0.0, numApps=1, numContainers=0 cluster=<memory:16192, vCores:16> 2017-09-08 02:34:37,989 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue: completedContainer queue=root usedCapacity=0.0 absoluteUsedCapacity=0.0 used=<memory:0, vCores:0> cluster=<memory:16192, vCores:16> 2017-09-08 02:34:37,990 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue: Re-sorting completed queue: root.default stats: default: capacity=1.0, absoluteCapacity=1.0, usedResources=<memory:0, vCores:0>, usedCapacity=0.0, absoluteUsedCapacity=0.0, numApps=1, numContainers=0 2017-09-08 02:34:37,990 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler: Application attempt appattempt_1504809243340_0001_000001 released container container_1504809243340_0001_01_000001 on node: host: hadoop11:45454 #containers=0 available=8096 used=0 with event: KILL 2017-09-08 02:34:37,990 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppSchedulingInfo: Application application_1504809243340_0001 requests cleared 2017-09-08 02:34:37,990 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue: Application removed - appId: application_1504809243340_0001 user: hires queue: default #user-pending-applications: 0 #user-active-applications: 0 #queue-pending-applications: 0 #queue-active-applications: 0 2017-09-08 02:34:38,001 ERROR org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSecretManager: ExpiredTokenRemover received java.lang.InterruptedException: sleep interrupted 2017-09-08 02:34:38,005 INFO org.mortbay.log: Stopped HttpServer2$SelectChannelConnectorWithSafeStartup@hadoop11:8088 2017-09-08 02:34:38,005 ERROR org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSecretManager: ExpiredTokenRemover received java.lang.InterruptedException: sleep interrupted 2017-09-08 02:34:38,006 ERROR org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSecretManager: ExpiredTokenRemover received java.lang.InterruptedException: sleep interrupted 2017-09-08 02:34:38,108 INFO org.apache.hadoop.ipc.Server: Stopping server on 8032 2017-09-08 02:34:38,113 INFO org.apache.hadoop.ipc.Server: Stopping IPC Server listener on 8032 2017-09-08 02:34:38,113 INFO org.apache.hadoop.ipc.Server: Stopping server on 8033 2017-09-08 02:34:38,114 INFO org.apache.hadoop.ipc.Server: Stopping IPC Server Responder 2017-09-08 02:34:38,114 INFO org.apache.hadoop.ipc.Server: Stopping IPC Server listener on 8033 2017-09-08 02:34:38,114 INFO org.apache.hadoop.ipc.Server: Stopping IPC Server Responder[^attachment-name.zip] {code} > After job kill command is send, the ResourceManager was killed > --------------------------------------------------------------- > > Key: YARN-7176 > URL: https://issues.apache.org/jira/browse/YARN-7176 > Project: Hadoop YARN > Issue Type: Bug > Components: RM > Affects Versions: 2.6.0 > Reporter: lujie > Priority: Critical > Attachments: logs.rar > > > I submit a job, but I need to kill it immediately due to some reason. Then I > found the RM was killed. > I check the RMLog and found ArrayIndexOutOfBoundsException and > NullPointerException.According to the log,RM was killed due to > NullPointerException, but i still don't understand why those Exception happen > I attath the whole RM log. > {code:java} > 2017-09-08 02:34:37,967 INFO > org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncher: Error > launching appattempt_1504809243340_0001_000001. Got exception: > java.lang.ArrayIndexOutOfBoundsException: 3 > at java.util.ArrayList.add(ArrayList.java:441) > at > com.google.protobuf.AbstractMessageLite$Builder.addAll(AbstractMessageLite.java:330) > at > org.apache.hadoop.yarn.proto.YarnProtos$ContainerLaunchContextProto$Builder.addAllApplicationACLs(YarnProtos.java:39956) > at > org.apache.hadoop.yarn.api.records.impl.pb.ContainerLaunchContextPBImpl.addApplicationACLs(ContainerLaunchContextPBImpl.java:446) > at > org.apache.hadoop.yarn.api.records.impl.pb.ContainerLaunchContextPBImpl.mergeLocalToBuilder(ContainerLaunchContextPBImpl.java:121) > at > org.apache.hadoop.yarn.api.records.impl.pb.ContainerLaunchContextPBImpl.mergeLocalToProto(ContainerLaunchContextPBImpl.java:128) > at > org.apache.hadoop.yarn.api.records.impl.pb.ContainerLaunchContextPBImpl.getProto(ContainerLaunchContextPBImpl.java:70) > at > org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl.convertToProtoFormat(StartContainerRequestPBImpl.java:156) > at > org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl.mergeLocalToBuilder(StartContainerRequestPBImpl.java:85) > at > org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl.mergeLocalToProto(StartContainerRequestPBImpl.java:95) > at > org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainerRequestPBImpl.getProto(StartContainerRequestPBImpl.java:57) > at > org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.convertToProtoFormat(StartContainersRequestPBImpl.java:137) > at > org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.addLocalRequestsToProto(StartContainersRequestPBImpl.java:97) > at > org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.mergeLocalToBuilder(StartContainersRequestPBImpl.java:79) > at > org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.mergeLocalToProto(StartContainersRequestPBImpl.java:72) > at > org.apache.hadoop.yarn.api.protocolrecords.impl.pb.StartContainersRequestPBImpl.getProto(StartContainersRequestPBImpl.java:48) > at > org.apache.hadoop.yarn.api.impl.pb.client.ContainerManagementProtocolPBClientImpl.startContainers(ContainerManagementProtocolPBClientImpl.java:93) > at > org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncher.launch(AMLauncher.java:119) > at > org.apache.hadoop.yarn.server.resourcemanager.amlauncher.AMLauncher.run(AMLauncher.java:254) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:745) > 2017-09-08 02:34:37,968 ERROR > org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore: Error > updating app: application_1504809243340_0001 > java.lang.NullPointerException > at > com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) > at > com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) > at > org.apache.hadoop.yarn.proto.YarnProtos$ContainerLaunchContextProto.getSerializedSize(YarnProtos.java:38512) > at > com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) > at > com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) > at > org.apache.hadoop.yarn.proto.YarnProtos$ApplicationSubmissionContextProto.getSerializedSize(YarnProtos.java:28481) > at > com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) > at > com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) > at > org.apache.hadoop.yarn.proto.YarnServerResourceManagerRecoveryProtos$ApplicationStateDataProto.getSerializedSize(YarnServerResourceManagerRecoveryProtos.java:816) > at > com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62) > at > org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore.updateApplicationStateInternal(FileSystemRMStateStore.java:426) > at > org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$UpdateAppTransition.transition(RMStateStore.java:163) > at > org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$UpdateAppTransition.transition(RMStateStore.java:148) > at > org.apache.hadoop.yarn.state.StateMachineFactory$SingleInternalArc.doTransition(StateMachineFactory.java:362) > at > org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302) > at > org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46) > at > org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448) > at > org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.handleStoreEvent(RMStateStore.java:810) > at > org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$ForwardingEventHandler.handle(RMStateStore.java:864) > at > org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$ForwardingEventHandler.handle(RMStateStore.java:859) > at > org.apache.hadoop.yarn.event.AsyncDispatcher.dispatch(AsyncDispatcher.java:173) > at > org.apache.hadoop.yarn.event.AsyncDispatcher$1.run(AsyncDispatcher.java:106) > at java.lang.Thread.run(Thread.java:745) > 2017-09-08 02:34:37,978 FATAL > org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Received a > org.apache.hadoop.yarn.server.resourcemanager.RMFatalEvent of type > STATE_STORE_OP_FAILED. Cause: > java.lang.NullPointerException > at > com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) > at > com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) > at > org.apache.hadoop.yarn.proto.YarnProtos$ContainerLaunchContextProto.getSerializedSize(YarnProtos.java:38512) > at > com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) > at > com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) > at > org.apache.hadoop.yarn.proto.YarnProtos$ApplicationSubmissionContextProto.getSerializedSize(YarnProtos.java:28481) > at > com.google.protobuf.CodedOutputStream.computeMessageSizeNoTag(CodedOutputStream.java:749) > at > com.google.protobuf.CodedOutputStream.computeMessageSize(CodedOutputStream.java:530) > at > org.apache.hadoop.yarn.proto.YarnServerResourceManagerRecoveryProtos$ApplicationStateDataProto.getSerializedSize(YarnServerResourceManagerRecoveryProtos.java:816) > at > com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62) > at > org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore.updateApplicationStateInternal(FileSystemRMStateStore.java:426) > at > org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$UpdateAppTransition.transition(RMStateStore.java:163) > at > org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$UpdateAppTransition.transition(RMStateStore.java:148) > at > org.apache.hadoop.yarn.state.StateMachineFactory$SingleInternalArc.doTransition(StateMachineFactory.java:362) > at > org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302) > at > org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46) > at > org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448) > at > org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.handleStoreEvent(RMStateStore.java:810) > at > org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$ForwardingEventHandler.handle(RMStateStore.java:864) > at > org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore$ForwardingEventHandler.handle(RMStateStore.java:859) > at > org.apache.hadoop.yarn.event.AsyncDispatcher.dispatch(AsyncDispatcher.java:173) > at > org.apache.hadoop.yarn.event.AsyncDispatcher$1.run(AsyncDispatcher.java:106) > at java.lang.Thread.run(Thread.java:745) > 2017-09-08 02:34:37,987 INFO > org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl: > container_1504809243340_0001_01_000001 Container Transitioned from ACQUIRED > to KILLED > 2017-09-08 02:34:37,987 INFO > org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp: > Completed container: container_1504809243340_0001_01_000001 in state: KILLED > event:KILL > 2017-09-08 02:34:37,987 INFO > org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger: USER=hires > OPERATION=AM Released Container TARGET=SchedulerApp RESULT=SUCCESS > APPID=application_1504809243340_0001 > CONTAINERID=container_1504809243340_0001_01_000001 > 2017-09-08 02:34:37,988 INFO > org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode: > Released container container_1504809243340_0001_01_000001 of capacity > <memory:2048, vCores:1> on host hadoop11:45454, which currently has 0 > containers, <memory:0, vCores:0> used and <memory:8096, vCores:8> available, > release resources=true > 2017-09-08 02:34:37,988 INFO org.apache.hadoop.util.ExitUtil: Exiting with > status 1 > 2017-09-08 02:34:37,988 INFO > org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue: > default used=<memory:0, vCores:0> numContainers=0 user=hires > user-resources=<memory:0, vCores:0> > 2017-09-08 02:34:37,989 INFO > org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue: > completedContainer container=Container: [ContainerId: > container_1504809243340_0001_01_000001, NodeId: hadoop11:45454, > NodeHttpAddress: hadoop11:8042, Resource: <memory:2048, vCores:1>, Priority: > 0, Token: Token { kind: ContainerToken, service: 10.3.1.11:45454 }, ] > queue=default: capacity=1.0, absoluteCapacity=1.0, usedResources=<memory:0, > vCores:0>, usedCapacity=0.0, absoluteUsedCapacity=0.0, numApps=1, > numContainers=0 cluster=<memory:16192, vCores:16> > 2017-09-08 02:34:37,989 INFO > org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue: > completedContainer queue=root usedCapacity=0.0 absoluteUsedCapacity=0.0 > used=<memory:0, vCores:0> cluster=<memory:16192, vCores:16> > 2017-09-08 02:34:37,990 INFO > org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue: > Re-sorting completed queue: root.default stats: default: capacity=1.0, > absoluteCapacity=1.0, usedResources=<memory:0, vCores:0>, usedCapacity=0.0, > absoluteUsedCapacity=0.0, numApps=1, numContainers=0 > 2017-09-08 02:34:37,990 INFO > org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler: > Application attempt appattempt_1504809243340_0001_000001 released container > container_1504809243340_0001_01_000001 on node: host: hadoop11:45454 > #containers=0 available=8096 used=0 with event: KILL > 2017-09-08 02:34:37,990 INFO > org.apache.hadoop.yarn.server.resourcemanager.scheduler.AppSchedulingInfo: > Application application_1504809243340_0001 requests cleared > 2017-09-08 02:34:37,990 INFO > org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue: > Application removed - appId: application_1504809243340_0001 user: hires > queue: default #user-pending-applications: 0 #user-active-applications: 0 > #queue-pending-applications: 0 #queue-active-applications: 0 > 2017-09-08 02:34:38,001 ERROR > org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSecretManager: > ExpiredTokenRemover received java.lang.InterruptedException: sleep > interrupted > 2017-09-08 02:34:38,005 INFO org.mortbay.log: Stopped > HttpServer2$SelectChannelConnectorWithSafeStartup@hadoop11:8088 > 2017-09-08 02:34:38,005 ERROR > org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSecretManager: > ExpiredTokenRemover received java.lang.InterruptedException: sleep > interrupted > 2017-09-08 02:34:38,006 ERROR > org.apache.hadoop.security.token.delegation.AbstractDelegationTokenSecretManager: > ExpiredTokenRemover received java.lang.InterruptedException: sleep > interrupted > 2017-09-08 02:34:38,108 INFO org.apache.hadoop.ipc.Server: Stopping server on > 8032 > 2017-09-08 02:34:38,113 INFO org.apache.hadoop.ipc.Server: Stopping IPC > Server listener on 8032 > 2017-09-08 02:34:38,113 INFO org.apache.hadoop.ipc.Server: Stopping server on > 8033 > 2017-09-08 02:34:38,114 INFO org.apache.hadoop.ipc.Server: Stopping IPC > Server Responder > 2017-09-08 02:34:38,114 INFO org.apache.hadoop.ipc.Server: Stopping IPC > Server listener on 8033 > 2017-09-08 02:34:38,114 INFO org.apache.hadoop.ipc.Server: Stopping IPC > Server Responder[^attachment-name.zip] > {code} -- This message was sent by Atlassian JIRA (v6.4.14#64029) --------------------------------------------------------------------- To unsubscribe, e-mail: yarn-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: yarn-issues-h...@hadoop.apache.org