[ https://issues.apache.org/jira/browse/AURORA-327?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15496537#comment-15496537 ]
John Sirois commented on AURORA-327: ------------------------------------ Actually - it does make sense to resolve this now since the default configuration now ships with the fix enabled. > Aurora shutdown blocks when attempting to abdicate leadership and zk ensemble > is down > ------------------------------------------------------------------------------------- > > Key: AURORA-327 > URL: https://issues.apache.org/jira/browse/AURORA-327 > Project: Aurora > Issue Type: Bug > Components: Reliability, Scheduler > Reporter: Kevin Sweeney > Assignee: John Sirois > > Observed this yesterday - Aurora appears to retry abdicating its membership > if the zk ensemble is down. This will halt scheduler shutdown. > {noformat} > W0416 23:05:27.344 THREAD23 > com.twitter.common.zookeeper.Group$ActiveMembership$1.get: Temporary error > cancelling membership: /twitter/service/mesos/test/scheduler/member_0000002820 > org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode > = ConnectionLoss for /twitter/service/mesos/test/scheduler/member_0000002820 > at > org.apache.zookeeper.KeeperException.create(KeeperException.java:90) > at > org.apache.zookeeper.KeeperException.create(KeeperException.java:42) > at org.apache.zookeeper.ZooKeeper.delete(ZooKeeper.java:734) > at > com.twitter.common.zookeeper.Group$ActiveMembership$1.get(Group.java:370) > at > com.twitter.common.zookeeper.Group$ActiveMembership$1.get(Group.java:367) > at com.twitter.common.util.BackoffHelper$1.get(BackoffHelper.java:109) > at com.twitter.common.util.BackoffHelper$1.get(BackoffHelper.java:107) > at > com.twitter.common.util.BackoffHelper.doUntilResult(BackoffHelper.java:127) > at > com.twitter.common.util.BackoffHelper.doUntilSuccess(BackoffHelper.java:107) > at > com.twitter.common.zookeeper.Group$ActiveMembership.cancel(Group.java:367) > at > com.twitter.common.zookeeper.ServerSetImpl$MemberStatus.leave(ServerSetImpl.java:255) > at > com.twitter.common.zookeeper.ServerSetImpl$2.leave(ServerSetImpl.java:199) > at > com.twitter.common.zookeeper.SingletonService$1$1.leave(SingletonService.java:185) > at > org.apache.aurora.scheduler.SchedulerLifecycle$8.execute(SchedulerLifecycle.java:369) > at > org.apache.aurora.scheduler.SchedulerLifecycle$8.execute(SchedulerLifecycle.java:354) > at com.twitter.common.base.Closures$4.execute(Closures.java:120) > at com.twitter.common.base.Closures$3.execute(Closures.java:98) > at > com.twitter.common.util.StateMachine.transition(StateMachine.java:191) > at > org.apache.aurora.scheduler.SchedulerLifecycle$4.execute(SchedulerLifecycle.java:235) > at > com.twitter.common.application.ShutdownRegistry$ShutdownRegistryImpl.execute(ShutdownRegistry.java:88) > at > com.twitter.common.application.Lifecycle.shutdown(Lifecycle.java:92) > at > org.apache.aurora.scheduler.log.mesos.MesosLog$LogStream.disableLog(MesosLog.java:354) > at > org.apache.aurora.scheduler.log.mesos.MesosLog$LogStream.mutate(MesosLog.java:372) > at > org.apache.aurora.scheduler.log.mesos.MesosLog$LogStream.append(MesosLog.java:318) > at > org.apache.aurora.scheduler.log.mesos.MesosLog$LogStream.append(MesosLog.java:148) > at > org.apache.aurora.scheduler.storage.log.LogManager$StreamManager.appendAndGetPosition(LogManager.java:308) > at > org.apache.aurora.scheduler.storage.log.LogManager$StreamManager.snapshot(LogManager.java:296) > at > org.apache.aurora.scheduler.storage.log.LogStorage.persist(LogStorage.java:491) > at > org.apache.aurora.scheduler.storage.log.LogStorage$$EnhancerByGuice$$8f32e797.CGLIB$persist$10(<generated>) > at > org.apache.aurora.scheduler.storage.log.LogStorage$$EnhancerByGuice$$8f32e797$$FastClassByGuice$$6565b2c0.invoke(<generated>) > at > com.google.inject.internal.cglib.proxy.$MethodProxy.invokeSuper(MethodProxy.java:228) > at > com.google.inject.internal.InterceptorStackCallback$InterceptedMethodInvocation.proceed(InterceptorStackCallback.java:72) > at > com.twitter.common.inject.TimedInterceptor.invoke(TimedInterceptor.java:87) > at > com.google.inject.internal.InterceptorStackCallback$InterceptedMethodInvocation.proceed(InterceptorStackCallback.java:72) > at > com.google.inject.internal.InterceptorStackCallback.intercept(InterceptorStackCallback.java:52) > at > org.apache.aurora.scheduler.storage.log.LogStorage$$EnhancerByGuice$$8f32e797.persist(<generated>) > at > org.apache.aurora.scheduler.storage.log.LogStorage$6.execute(LogStorage.java:481) > at > org.apache.aurora.scheduler.storage.Storage$MutateWork$NoResult.apply(Storage.java:131) > at > org.apache.aurora.scheduler.storage.Storage$MutateWork$NoResult.apply(Storage.java:127) > at > org.apache.aurora.scheduler.storage.log.LogStorage$7.apply(LogStorage.java:515) > at > org.apache.aurora.scheduler.storage.log.LogStorage$7.apply(LogStorage.java:512) > at > org.apache.aurora.scheduler.storage.mem.MemStorage.doWork(MemStorage.java:149) > at > org.apache.aurora.scheduler.storage.mem.MemStorage.write(MemStorage.java:165) > at > org.apache.aurora.scheduler.storage.log.LogStorage.write(LogStorage.java:512) > at > org.apache.aurora.scheduler.storage.log.LogStorage.doSnapshot(LogStorage.java:476) > at > org.apache.aurora.scheduler.storage.log.LogStorage$$EnhancerByGuice$$8f32e797.CGLIB$doSnapshot$9(<generated>) > at > org.apache.aurora.scheduler.storage.log.LogStorage$$EnhancerByGuice$$8f32e797$$FastClassByGuice$$6565b2c0.invoke(<generated>) > at > com.google.inject.internal.cglib.proxy.$MethodProxy.invokeSuper(MethodProxy.java:228) > at > com.google.inject.internal.InterceptorStackCallback$InterceptedMethodInvocation.proceed(InterceptorStackCallback.java:72) > at > com.twitter.common.inject.TimedInterceptor.invoke(TimedInterceptor.java:87) > at > com.google.inject.internal.InterceptorStackCallback$InterceptedMethodInvocation.proceed(InterceptorStackCallback.java:72) > at > com.google.inject.internal.InterceptorStackCallback.intercept(InterceptorStackCallback.java:52) > at > org.apache.aurora.scheduler.storage.log.LogStorage$$EnhancerByGuice$$8f32e797.doSnapshot(<generated>) > at > org.apache.aurora.scheduler.storage.log.LogStorage.snapshot(LogStorage.java:548) > at > org.apache.aurora.scheduler.storage.log.LogStorage$5.run(LogStorage.java:454) > at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471) > at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:304) > at > java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:178) > at > java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:744) > {noformat} -- This message was sent by Atlassian JIRA (v6.3.4#6332)