Repository: brooklyn-server Updated Branches: refs/heads/master f1d89ad97 -> be89a8dd2
BROOKLYN-325: alert if provisioning/termination aborted on rebind Project: http://git-wip-us.apache.org/repos/asf/brooklyn-server/repo Commit: http://git-wip-us.apache.org/repos/asf/brooklyn-server/commit/829ae6aa Tree: http://git-wip-us.apache.org/repos/asf/brooklyn-server/tree/829ae6aa Diff: http://git-wip-us.apache.org/repos/asf/brooklyn-server/diff/829ae6aa Branch: refs/heads/master Commit: 829ae6aab90217b6d8f702f8bcb1f27af71d5520 Parents: fbfe2fe Author: Aled Sage <aled.s...@gmail.com> Authored: Mon Nov 14 17:42:36 2016 +0000 Committer: Aled Sage <aled.s...@gmail.com> Committed: Wed Nov 16 12:47:50 2016 +0000 ---------------------------------------------------------------------- .../entity/internal/AttributesInternal.java | 12 +++- .../mgmt/rebind/BasicEntityRebindSupport.java | 56 ++++++++++++++--- .../MachineLifecycleEffectorTasks.java | 13 +++- ...ftwareProcessRebindNotRunningEntityTest.java | 64 +++++++++++++++++--- 4 files changed, 122 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/brooklyn-server/blob/829ae6aa/core/src/main/java/org/apache/brooklyn/core/entity/internal/AttributesInternal.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/brooklyn/core/entity/internal/AttributesInternal.java b/core/src/main/java/org/apache/brooklyn/core/entity/internal/AttributesInternal.java index 6051e05..7bccb4d 100644 --- a/core/src/main/java/org/apache/brooklyn/core/entity/internal/AttributesInternal.java +++ b/core/src/main/java/org/apache/brooklyn/core/entity/internal/AttributesInternal.java @@ -30,12 +30,18 @@ public interface AttributesInternal extends Attributes { public static final AttributeSensor<ProvisioningTaskState> INTERNAL_PROVISIONING_TASK_STATE = new BasicAttributeSensor<ProvisioningTaskState>( TypeToken.of(ProvisioningTaskState.class), "internal.provisioning.task.state", - "Internal transient sensor (do not use) for tracking the provisioning of a machine (to better handle aborting/rebind)", - AttributeSensor.SensorPersistenceMode.NONE); + "Internal transient sensor (do not use) for tracking the provisioning of a machine (to better handle aborting/rebind)"); + @Beta + public static final AttributeSensor<ProvisioningTaskState> INTERNAL_TERMINATION_TASK_STATE = new BasicAttributeSensor<ProvisioningTaskState>( + TypeToken.of(ProvisioningTaskState.class), + "internal.termination.task.state", + "Internal transient sensor (do not use) for tracking the termination of a machine (to better handle aborting/rebind)"); + /** * Used only internally by {@link org.apache.brooklyn.entity.software.base.lifecycle.MachineLifecycleEffectorTasks} - * to track provisioning, so machine can be terminated if stopped while opaque provision call is being made. + * to track provisioning/termination. This is used so the machine can be terminated if stopped while opaque provision + * call is being made; and is used to report if termination was prematurely aborted (e.g. during Brooklyn restart). */ @Beta @VisibleForTesting http://git-wip-us.apache.org/repos/asf/brooklyn-server/blob/829ae6aa/core/src/main/java/org/apache/brooklyn/core/mgmt/rebind/BasicEntityRebindSupport.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/brooklyn/core/mgmt/rebind/BasicEntityRebindSupport.java b/core/src/main/java/org/apache/brooklyn/core/mgmt/rebind/BasicEntityRebindSupport.java index b85b768..f7b5cd8 100644 --- a/core/src/main/java/org/apache/brooklyn/core/mgmt/rebind/BasicEntityRebindSupport.java +++ b/core/src/main/java/org/apache/brooklyn/core/mgmt/rebind/BasicEntityRebindSupport.java @@ -26,6 +26,7 @@ import org.apache.brooklyn.api.effector.Effector; import org.apache.brooklyn.api.entity.Entity; import org.apache.brooklyn.api.entity.EntityLocal; import org.apache.brooklyn.api.location.Location; +import org.apache.brooklyn.api.location.MachineLocation; import org.apache.brooklyn.api.mgmt.rebind.RebindContext; import org.apache.brooklyn.api.mgmt.rebind.mementos.EntityMemento; import org.apache.brooklyn.api.objs.BrooklynObjectType; @@ -35,15 +36,18 @@ import org.apache.brooklyn.core.enricher.AbstractEnricher; import org.apache.brooklyn.core.entity.AbstractEntity; import org.apache.brooklyn.core.entity.Attributes; import org.apache.brooklyn.core.entity.EntityInternal; +import org.apache.brooklyn.core.entity.internal.AttributesInternal; +import org.apache.brooklyn.core.entity.internal.AttributesInternal.ProvisioningTaskState; import org.apache.brooklyn.core.entity.lifecycle.Lifecycle; import org.apache.brooklyn.core.entity.lifecycle.ServiceStateLogic; import org.apache.brooklyn.core.feed.AbstractFeed; +import org.apache.brooklyn.core.location.Machines; import org.apache.brooklyn.core.mgmt.rebind.dto.MementosGenerators; import org.apache.brooklyn.core.objs.AbstractBrooklynObject; import org.apache.brooklyn.core.policy.AbstractPolicy; import org.apache.brooklyn.entity.group.AbstractGroupImpl; -import org.apache.brooklyn.entity.stock.BasicApplication; import org.apache.brooklyn.util.exceptions.Exceptions; +import org.apache.brooklyn.util.guava.Maybe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -245,14 +249,52 @@ public class BasicEntityRebindSupport extends AbstractBrooklynObjectRebindSuppor Preconditions.checkState(instance == entity, "Expected %s and %s to match, but different objects", instance, entity); Lifecycle expectedState = ServiceStateLogic.getExpectedState(entity); if (expectedState == Lifecycle.STARTING || expectedState == Lifecycle.STOPPING) { - LOG.warn("Entity {} goes on-fire because it was in state {} on rebind", entity, expectedState); - LOG.warn("not-up-indicators={}", entity.getAttribute(Attributes.SERVICE_NOT_UP_INDICATORS)); - ServiceStateLogic.setExpectedState(entity, Lifecycle.ON_FIRE); + // If we were previously "starting" or "stopping", then those tasks will have been + // aborted. We don't want to continue showing that state (e.g. the web-console would + // then show the it as in-progress with the "spinning" icon). + // Therefore we set the entity as on-fire, and add the indicator that says why. + markTransitioningEntityOnFireOnRebind((EntityInternal) entity, expectedState); + } + + // Clear the provisioning/termination task-state; the task will have been aborted, so wrong to keep this state. + ((EntityInternal)entity).sensors().remove(AttributesInternal.INTERNAL_PROVISIONING_TASK_STATE); + ((EntityInternal)entity).sensors().remove(AttributesInternal.INTERNAL_TERMINATION_TASK_STATE); + + super.instanceRebind(instance); + } + + protected void markTransitioningEntityOnFireOnRebind(EntityInternal entity, Lifecycle expectedState) { + LOG.warn("Entity {} being marked as on-fire because it was in state {} on rebind; indicators={}", new Object[] {entity, expectedState, entity.getAttribute(Attributes.SERVICE_NOT_UP_INDICATORS)}); + ServiceStateLogic.setExpectedState(entity, Lifecycle.ON_FIRE); + ServiceStateLogic.ServiceNotUpLogic.updateNotUpIndicator( + entity, + "Task aborted on rebind", + "Set to on-fire (from previous expected state "+expectedState+") because tasks aborted on rebind"); + + // Check if we were in the process of provisioning a machine. If so, a VM might have + // been left behind. E.g. we might have submitted to jclouds the request to provision + // the VM, but not yet received back the id (so have lost details of the VM). + // Also see MachineLifecycleEffectorTasks.ProvisionMachineTask. + Maybe<MachineLocation> machine = Machines.findUniqueMachineLocation(entity.getLocations()); + ProvisioningTaskState provisioningState = entity.sensors().get(AttributesInternal.INTERNAL_PROVISIONING_TASK_STATE); + if (machine.isAbsent() && provisioningState == ProvisioningTaskState.RUNNING) { + LOG.warn("Entity {} was provisioning; VM may have been left running", entity); ServiceStateLogic.ServiceNotUpLogic.updateNotUpIndicator( entity, - "Task aborted on rebind", - "Set to on-fire (from previous expected state "+expectedState+") because tasks aborted on rebind"); + "VM may be lost on rebind", + "VM provisioning may have been in-progress and now lost, because tasks aborted on rebind"); + } + + // Similar to the provisioning case, if we were terminating the VM then we may or may + // not have finished. This means the VM might have been left running. + // Also see MachineLifecycleEffectorTasks.stopAnyProvisionedMachines() + ProvisioningTaskState terminationState = entity.sensors().get(AttributesInternal.INTERNAL_TERMINATION_TASK_STATE); + if (machine.isAbsent() && terminationState == ProvisioningTaskState.RUNNING) { + LOG.warn("Entity {} was terminating; VM may have been left running", entity); + ServiceStateLogic.ServiceNotUpLogic.updateNotUpIndicator( + entity, + "VM may be lost on rebind", + "VM termination may have been in-progress and now lost, because tasks aborted on rebind"); } - super.instanceRebind(instance); } } http://git-wip-us.apache.org/repos/asf/brooklyn-server/blob/829ae6aa/software/base/src/main/java/org/apache/brooklyn/entity/software/base/lifecycle/MachineLifecycleEffectorTasks.java ---------------------------------------------------------------------- diff --git a/software/base/src/main/java/org/apache/brooklyn/entity/software/base/lifecycle/MachineLifecycleEffectorTasks.java b/software/base/src/main/java/org/apache/brooklyn/entity/software/base/lifecycle/MachineLifecycleEffectorTasks.java index 439770a..8748861 100644 --- a/software/base/src/main/java/org/apache/brooklyn/entity/software/base/lifecycle/MachineLifecycleEffectorTasks.java +++ b/software/base/src/main/java/org/apache/brooklyn/entity/software/base/lifecycle/MachineLifecycleEffectorTasks.java @@ -1001,10 +1001,17 @@ public abstract class MachineLifecycleEffectorTasks { log.debug("No decommissioning necessary for "+entity()+" - not a machine location ("+machine+")"); return new StopMachineDetails<Integer>("No machine decommissioning necessary - not a machine ("+machine+")", 0); } - - clearEntityLocationAttributes(machine); - provisioner.release((MachineLocation)machine); + entity().sensors().set(AttributesInternal.INTERNAL_TERMINATION_TASK_STATE, ProvisioningTaskState.RUNNING); + try { + clearEntityLocationAttributes(machine); + provisioner.release((MachineLocation)machine); + } finally { + // TODO On exception, should we add the machine back to the entity (because it might not really be terminated)? + // Do we need a better exception hierarchy for that? + entity().sensors().remove(AttributesInternal.INTERNAL_TERMINATION_TASK_STATE); + } + return new StopMachineDetails<Integer>("Decommissioned "+machine, 1); } http://git-wip-us.apache.org/repos/asf/brooklyn-server/blob/829ae6aa/software/base/src/test/java/org/apache/brooklyn/entity/software/base/SoftwareProcessRebindNotRunningEntityTest.java ---------------------------------------------------------------------- diff --git a/software/base/src/test/java/org/apache/brooklyn/entity/software/base/SoftwareProcessRebindNotRunningEntityTest.java b/software/base/src/test/java/org/apache/brooklyn/entity/software/base/SoftwareProcessRebindNotRunningEntityTest.java index e5e7709..89a177a 100644 --- a/software/base/src/test/java/org/apache/brooklyn/entity/software/base/SoftwareProcessRebindNotRunningEntityTest.java +++ b/software/base/src/test/java/org/apache/brooklyn/entity/software/base/SoftwareProcessRebindNotRunningEntityTest.java @@ -43,6 +43,7 @@ import org.apache.brooklyn.config.ConfigKey; import org.apache.brooklyn.core.config.ConfigKeys; import org.apache.brooklyn.core.entity.Attributes; import org.apache.brooklyn.core.entity.EntityAsserts; +import org.apache.brooklyn.core.entity.internal.AttributesInternal; import org.apache.brooklyn.core.entity.lifecycle.Lifecycle; import org.apache.brooklyn.core.entity.trait.Startable; import org.apache.brooklyn.core.location.AbstractLocation; @@ -75,11 +76,6 @@ import com.google.common.util.concurrent.MoreExecutors; public class SoftwareProcessRebindNotRunningEntityTest extends RebindTestFixtureWithApp { - // TODO If we fail during provisioningLocation.obtain() or provisioningLocation.release(), then we - // should tell the user that a VM might have started being provisioned but been forgotten about; or - // that termination of the VM may or may not have completed. - // We could use the Attributes.SERVICE_NOT_UP_INDICATORS to achieve that. - private static final Logger LOG = LoggerFactory.getLogger(SoftwareProcessRebindNotRunningEntityTest.class); private ListeningExecutorService executor; @@ -243,6 +239,11 @@ public class SoftwareProcessRebindNotRunningEntityTest extends RebindTestFixture assertMarkedAsOnfire(newEntity, Lifecycle.STARTING); assertMarkedAsOnfire(newApp, Lifecycle.STARTING); + assertMarkedAsVmLost(newEntity, Lifecycle.STARTING); + + // Expect the marker to have been cleared on rebind (sensible because task is not running). + EntityAsserts.assertAttributeEquals(newEntity, AttributesInternal.INTERNAL_PROVISIONING_TASK_STATE, null); + EntityAsserts.assertAttributeEquals(newEntity, AttributesInternal.INTERNAL_TERMINATION_TASK_STATE, null); } @Test @@ -267,8 +268,14 @@ public class SoftwareProcessRebindNotRunningEntityTest extends RebindTestFixture TestApplication newApp = rebind(); final VanillaSoftwareProcess newEntity = (VanillaSoftwareProcess) Iterables.find(newApp.getChildren(), Predicates.instanceOf(VanillaSoftwareProcess.class)); - + + assertMarkedAsOnfire(newEntity, Lifecycle.STOPPING); + assertMarkedAsVmLost(newEntity, Lifecycle.STOPPING); + + // Expect the marker to have been cleared on rebind (sensible because task is not running). + EntityAsserts.assertAttributeEquals(newEntity, AttributesInternal.INTERNAL_PROVISIONING_TASK_STATE, null); + EntityAsserts.assertAttributeEquals(newEntity, AttributesInternal.INTERNAL_TERMINATION_TASK_STATE, null); } @Test @@ -299,7 +306,7 @@ public class SoftwareProcessRebindNotRunningEntityTest extends RebindTestFixture assertNotMarkedOnfire(newEntity, Lifecycle.STARTING); assertNotMarkedOnfire(newApp, Lifecycle.STARTING); } - + protected ListenableFuture<Void> startAsync(final Startable entity, final Collection<? extends Location> locs) { return executor.submit(new Callable<Void>() { @Override public Void call() throws Exception { @@ -327,15 +334,24 @@ public class SoftwareProcessRebindNotRunningEntityTest extends RebindTestFixture return result; } - protected void assertMarkedAsOnfire(final Entity entity, final Lifecycle previousState) throws Exception { + protected void assertMarkedAsOnfire(Entity entity, Lifecycle previousState) throws Exception { EntityAsserts.assertAttributeEqualsEventually(entity, Attributes.SERVICE_STATE_ACTUAL, Lifecycle.ON_FIRE); EntityAsserts.assertAttributeEqualsEventually(entity, Attributes.SERVICE_UP, false); + assertNotUpIndicatorIncludesEventually(entity, "Task aborted on rebind", + "Set to on-fire (from previous expected state "+previousState+") because tasks aborted on rebind"); + } + + protected void assertMarkedAsVmLost(Entity entity, Lifecycle previousState) throws Exception { + String expectedReason = "VM " + (previousState == Lifecycle.STARTING ? "provisioning" : "termination") + + " may have been in-progress and now lost, because tasks aborted on rebind"; + assertNotUpIndicatorIncludesEventually(entity, "VM may be lost on rebind", expectedReason); + } + + protected void assertNotUpIndicatorIncludesEventually(final Entity entity, final String expectedKey, final String expectedVal) throws Exception { EntityAsserts.assertAttributeEventually(entity, Attributes.SERVICE_NOT_UP_INDICATORS, new Predicate<Map<?,?>>() { @Override public boolean apply(Map<?, ?> input) { if (input == null) return false; - String expectedKey = "Task aborted on rebind"; - String expectedVal = "Set to on-fire (from previous expected state "+previousState+") because tasks aborted on rebind"; for (Map.Entry<?, ?> entry : input.entrySet()) { boolean keyMatches = expectedKey.equals(entry.getKey()); boolean valueMatches = expectedVal.equals(entry.getValue()); @@ -363,6 +379,8 @@ public class SoftwareProcessRebindNotRunningEntityTest extends RebindTestFixture new TypeToken<LocationSpec<SshMachineLocation>>() {}, "machineSpec"); + protected List<CallInfo> callHistory = Collections.synchronizedList(Lists.<CallInfo>newArrayList()); + @Override public MachineProvisioningLocation<SshMachineLocation> newSubLocation(Map<?, ?> newFlags) { throw new UnsupportedOperationException(); @@ -370,6 +388,8 @@ public class SoftwareProcessRebindNotRunningEntityTest extends RebindTestFixture @Override public SshMachineLocation obtain(Map<?,?> flags) throws NoMachinesAvailableException { + callHistory.add(new CallInfo("obtain", ImmutableList.of(flags))); + CountDownLatch calledLatch = config().get(OBTAIN_CALLED_LATCH); CountDownLatch blockedLatch = config().get(OBTAIN_BLOCKED_LATCH); LocationSpec<SshMachineLocation> machineSpec = config().get(MACHINE_SPEC); @@ -385,6 +405,8 @@ public class SoftwareProcessRebindNotRunningEntityTest extends RebindTestFixture @Override public void release(SshMachineLocation machine) { + callHistory.add(new CallInfo("release", ImmutableList.of(machine))); + CountDownLatch calledLatch = config().get(RELEASE_CALLED_LATCH); CountDownLatch blockedLatch = config().get(RELEASE_BLOCKED_LATCH); @@ -400,5 +422,27 @@ public class SoftwareProcessRebindNotRunningEntityTest extends RebindTestFixture public Map getProvisioningFlags(Collection<String> tags) { return Collections.emptyMap(); } + + public List<CallInfo> getCallHistory() { + synchronized (callHistory) { + return ImmutableList.copyOf(callHistory); + } + } + + public CallInfo getLastCall() { + synchronized (callHistory) { + return callHistory.get(callHistory.size()-1); + } + } + + static class CallInfo { + public final String name; + public final List<? extends Object> args; + + public CallInfo(String name, List<? extends Object> args) { + this.name = name; + this.args = args; + } + } } }