good for merge
On Fri, 2009-04-17 at 14:13 +1200, angus salkeld wrote:
> If a component on node A had locked up, and node B failed over, then
> once node A component's healthcheck failed, node A wouldn't reboot.
> This was because node A's component failover was deferred until processing
> of the node B failover completed. The problem is that, because the node A
> component has locked up/exited, node B failover processing will never
> complete, so node A would never reboot.
> Processing the node failover immediately seems to work fine.
>
> exec/amfsg.c
> + amf_sg_failover_node_req() - process the node failover immediately if the SG
> state machine is currently trying to assign this node the active workload.
> + is_acsm_assigning_node_active() - new function to check if a given node
> is currently being assigned the active workload.
>
> -Angus
>
> Index: services/amfsg.c
> ===================================================================
> --- services/amfsg.c (revision 1791)
> +++ services/amfsg.c (working copy)
> @@ -2210,6 +2210,46 @@
> }
>
> /**
> + * Checks if a SG ACSM is waiting for a active SI assignment on a given node
> + * before it will transition to the next state. This is used to determine if we
> + * can safely defer processing a node leave/failover until the ACSM transitions
> + * back to idle (if the current ACSM action involves the node specified, then
> + * it may NEVER complete).
> + *
> + * @return SA_TRUE if activating an SI assignment on node specified, SA_FALSE if not
> + */
> +static int is_acsm_assigning_node_active (struct amf_sg *sg, struct amf_node
> *node)
> +{
> + struct amf_si *si;
> + struct amf_si_assignment *si_assignment;
> + int activating_node_su = SA_FALSE;
> +
> + for (si = sg->application->si_head; si != NULL; si = si->next) {
> + if (name_match (&si->saAmfSIProtectedbySG, &sg->name)) {
> +
> + for (si_assignment = si->assigned_sis;
> + si_assignment != NULL;
> + si_assignment = si_assignment->next) {
> +
> + /* Check if an SU on the node is in the process of activating */
> + if (name_match(&node->name,
> +
> &si_assignment->su->saAmfSUHostedByNode) &&
> + si_assignment->requested_ha_state !=
> + si_assignment->saAmfSISUHAState &&
> + si_assignment->requested_ha_state ==
> + SA_AMF_HA_ACTIVE) {
> + activating_node_su = SA_TRUE;
> + break;
> + }
> + }
> + }
> + }
> + ENTER("'%s, %s' %u",node->name.value, sg->name.value,
> activating_node_su);
> +
> + return activating_node_su;
> +}
> +
> +/**
> * This function is called because an error has been detected and the analysis
> * (done elsewhere) indicated that this error shall be recovered by a Node
> * failover. This function initiates the recovery action 'Node failover'.
> @@ -2223,6 +2263,20 @@
> sg_event_t sg_event;
>
> switch (sg->avail_state) {
> + case SG_AC_ActivatingStandby:
> + /* check if failover can be safely deferred */
> + if (!is_acsm_assigning_node_active (sg, node)) {
> + sg_set_event (SG_FAILOVER_NODE_EV, sg, 0, 0,
> node, &sg_event);
> + sg_defer_event (SG_FAILOVER_NODE_EV,
> &sg_event);
> + break;
> + } else {
> + log_printf (LOG_LEVEL_NOTICE,
> + "Cannot defer node
> '%s' failover (%s ACSM state %u)",
> + node->name.value,
> sg->name.value, sg->avail_state);
> +
> + /* fall-through and process now */
> + }
> +
> case SG_AC_Idle:
> set_scope_for_failover_node(sg, node);
> if (has_any_su_in_scope_active_workload (sg)) {
> @@ -2260,7 +2314,6 @@
> break;
> case SG_AC_DeactivatingDependantWorkload:
> case SG_AC_TerminatingSuspected:
> - case SG_AC_ActivatingStandby:
> case SG_AC_AssigningStandbyToSpare:
> case SG_AC_ReparingComponent:
> case SG_AC_ReparingSu:
> @@ -2846,4 +2899,3 @@
>
> return sg;
> }
> -
>
> _______________________________________________
> Openais mailing list
> [email protected]
> https://lists.linux-foundation.org/mailman/listinfo/openais
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais