[ https://issues.apache.org/jira/browse/YARN-10290?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Wilfred Spiegelenburg resolved YARN-10290. ------------------------------------------ Resolution: Duplicate This issue is fixed in YARN-7913. That change fixes a number of issues around restores that fail. The change was not backported to Hadoop 2.x > Resourcemanager recover failed when fair scheduler queue acl changed > -------------------------------------------------------------------- > > Key: YARN-10290 > URL: https://issues.apache.org/jira/browse/YARN-10290 > Project: Hadoop YARN > Issue Type: Bug > Components: resourcemanager > Affects Versions: 2.7.2 > Reporter: yehuanhuan > Priority: Blocker > > Resourcemanager recover failed when fair scheduler queue acl changed. Because > of queue acl changed, when recover the application (addApplication() in > fairscheduler) is rejected. Then recover the applicationAttempt > (addApplicationAttempt() in fairscheduler) get Application is null. This will > lead to two RM is at standby. Repeat as follows. > > # user run a long running application. > # change queue acl (aclSubmitApps) so that the user does not have permission. > # restart the RM. > {code:java} > 2020-05-25 16:04:06,191 INFO > org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl: Updating > application application_1590393162216_0005 with final state: FAILED > 2020-05-25 16:04:06,192 ERROR > org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Failed to > load/recover state > java.lang.NullPointerException > at > org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler.addApplicationAttempt(FairScheduler.java:663) > at > org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler.handle(FairScheduler.java:1246) > at > org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler.handle(FairScheduler.java:116) > at > org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl$AttemptRecoveredTransition.transition(RMAppAttemptImpl.java:1072) > at > org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl$AttemptRecoveredTransition.transition(RMAppAttemptImpl.java:1036) > at > org.apache.hadoop.yarn.state.StateMachineFactory$MultipleInternalArc.doTransition(StateMachineFactory.java:385) > at > org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302) > at > org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46) > at > org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448) > at > org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl.handle(RMAppAttemptImpl.java:789) > at > org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl.handle(RMAppAttemptImpl.java:105) > at > org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.recoverAppAttempts(RMAppImpl.java:845) > at > org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.access$1900(RMAppImpl.java:102) > at > org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:897) > at > org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl$RMAppRecoveredTransition.transition(RMAppImpl.java:850) > at > org.apache.hadoop.yarn.state.StateMachineFactory$MultipleInternalArc.doTransition(StateMachineFactory.java:385) > at > org.apache.hadoop.yarn.state.StateMachineFactory.doTransition(StateMachineFactory.java:302) > at > org.apache.hadoop.yarn.state.StateMachineFactory.access$300(StateMachineFactory.java:46) > at > org.apache.hadoop.yarn.state.StateMachineFactory$InternalStateMachine.doTransition(StateMachineFactory.java:448) > at > org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl.handle(RMAppImpl.java:723) > at > org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recoverApplication(RMAppManager.java:322) > at > org.apache.hadoop.yarn.server.resourcemanager.RMAppManager.recover(RMAppManager.java:427) > at > org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.recover(ResourceManager.java:1173) > at > org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$RMActiveServices.serviceStart(ResourceManager.java:584) > at > org.apache.hadoop.service.AbstractService.start(AbstractService.java:193) > at > org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.startActiveServices(ResourceManager.java:980) > at > org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1021) > at > org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1017) > at java.security.AccessController.doPrivileged(Native Method) > at javax.security.auth.Subject.doAs(Subject.java:422) > at > org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1659) > at > org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.transitionToActive(ResourceManager.java:1017) > at > org.apache.hadoop.yarn.server.resourcemanager.AdminService.transitionToActive(AdminService.java:301) > at > org.apache.hadoop.yarn.server.resourcemanager.EmbeddedElectorService.becomeActive(EmbeddedElectorService.java:126) > at > org.apache.hadoop.ha.ActiveStandbyElector.becomeActive(ActiveStandbyElector.java:813) > at > org.apache.hadoop.ha.ActiveStandbyElector.processResult(ActiveStandbyElector.java:418) > at > org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:599) > at > org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:498) > {code} -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: yarn-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: yarn-issues-h...@hadoop.apache.org