good for merge

On Fri, 2009-04-17 at 14:13 +1200, angus salkeld wrote:
> If a component on node A had locked up and node B failed over, then
> once the node A component's healthcheck failed, node A wouldn't reboot.
> This was because node A's component failover was deferred until processing
> of the node B failover completed. The problem is that, because the node A
> component has locked up/exited, node B failover processing will never
> complete, so node A would never reboot.
> Processing the node failover immediately seems to work fine.
> 
> exec/amfsg.c
> + amf_sg_failover_node_req() - process the node failover immediately if the SG
> state machine is currently trying to assign this node the active workload.
> + is_acsm_assigning_node_active() - new function to check if a given node
> is currently being assigned the active workload.
> 
> -Angus
> 
> Index: services/amfsg.c
> ===================================================================
> --- services/amfsg.c    (revision 1791)
> +++ services/amfsg.c    (working copy)
> @@ -2210,6 +2210,46 @@
>  }
>  
>  /**
> + * Checks if a SG ACSM is waiting for a active SI assignment on a given node
> + * before it will transition to the next state. This is used to determine if 
> we
> + * can safely defer processing a node leave/failover until the ACSM 
> transitions
> + * back to idle (if the current ACSM action involves the node specified, then
> + * it may NEVER complete).
> + *
> + * @return SA_TRUE if activating an SI assignment on node specified, 
> SA_FALSE if not
> + */
> +static int is_acsm_assigning_node_active (struct amf_sg *sg, struct amf_node 
> *node)
> +{
> +       struct amf_si *si;
> +       struct amf_si_assignment *si_assignment;
> +       int activating_node_su = SA_FALSE;
> +
> +       for (si = sg->application->si_head; si != NULL; si = si->next) {
> +               if (name_match (&si->saAmfSIProtectedbySG, &sg->name)) {
> +
> +                       for (si_assignment = si->assigned_sis;
> +                               si_assignment != NULL;
> +                               si_assignment = si_assignment->next) {
> +
> +                               /* Check if an SU on the node is in the 
> process of activating */
> +                               if (name_match(&node->name,
> +                                       
> &si_assignment->su->saAmfSUHostedByNode) &&
> +                                       si_assignment->requested_ha_state !=
> +                                       si_assignment->saAmfSISUHAState &&
> +                                       si_assignment->requested_ha_state ==
> +                                       SA_AMF_HA_ACTIVE) {
> +                                       activating_node_su = SA_TRUE;
> +                                       break;
> +                               }
> +                       }
> +               }
> +       }
> +       ENTER("'%s, %s' %u",node->name.value, sg->name.value, 
> activating_node_su);
> +
> +       return activating_node_su;
> +}
> +
> +/**
>   * This function is called because an error has been detected and the 
> analysis
>   * (done elsewhere) indicated that this error shall be recovered by a Node
>   * failover. This function initiates the recovery action 'Node failover'.
> @@ -2223,6 +2263,20 @@
>         sg_event_t sg_event;
>  
>         switch (sg->avail_state) {
> +               case SG_AC_ActivatingStandby:
> +                       /* check if failover can be safely deferred */
> +                       if (!is_acsm_assigning_node_active (sg, node)) {
> +                               sg_set_event (SG_FAILOVER_NODE_EV, sg, 0, 0, 
> node, &sg_event); 
> +                               sg_defer_event (SG_FAILOVER_NODE_EV, 
> &sg_event);
> +                               break;
> +                       } else {
> +                               log_printf (LOG_LEVEL_NOTICE,
> +                                                       "Cannot defer node 
> '%s' failover (%s ACSM state %u)",
> +                                                       node->name.value, 
> sg->name.value, sg->avail_state);
> +
> +                               /* fall-through and process now */
> +                       }
> +
>                 case SG_AC_Idle:
>                         set_scope_for_failover_node(sg, node);
>                         if (has_any_su_in_scope_active_workload (sg)) {
> @@ -2260,7 +2314,6 @@
>                         break;
>                 case SG_AC_DeactivatingDependantWorkload:
>                 case SG_AC_TerminatingSuspected:
> -               case SG_AC_ActivatingStandby:
>                 case SG_AC_AssigningStandbyToSpare:
>                 case SG_AC_ReparingComponent:
>                 case SG_AC_ReparingSu:
> @@ -2846,4 +2899,3 @@
>  
>         return sg;
>  }
> -
> 
> _______________________________________________
> Openais mailing list
> [email protected]
> https://lists.linux-foundation.org/mailman/listinfo/openais

_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to