If a component on node A had locked up and node B then failed over, node A
would not reboot once its component's healthcheck failed.
This was because node A's component failover was deferred until processing of
node B's failover had completed. The problem is that, because the node A
component has locked up/exited, node B's failover processing will never
complete, so node A would never reboot.
Processing the node failover immediately seems to work fine.

exec/amfsg.c
+ amf_sg_failover_node_req() - process the node failover immediately if the SG
state machine is currently trying to assign this node the active workload.
+ is_acsm_assigning_node_active() - new function to check if a given node
is currently being assigned the active workload.

-Angus

Index: services/amfsg.c
===================================================================
--- services/amfsg.c    (revision 1791)
+++ services/amfsg.c    (working copy)
@@ -2210,6 +2210,46 @@
 }
 
 /**
+ * Checks if an SG ACSM is waiting for an active SI assignment on a given node
+ * before it will transition to the next state. This is used to determine if we
+ * can safely defer processing a node leave/failover until the ACSM transitions
+ * back to idle (if the current ACSM action involves the node specified, then
+ * it may NEVER complete).
+ *
+ * @return SA_TRUE if activating an SI assignment on node specified, SA_FALSE if not
+ */
+static int is_acsm_assigning_node_active (struct amf_sg *sg, struct amf_node 
*node)
+{
+       struct amf_si *si;
+       struct amf_si_assignment *si_assignment;
+       int activating_node_su = SA_FALSE;
+
+       for (si = sg->application->si_head; si != NULL; si = si->next) {
+               if (name_match (&si->saAmfSIProtectedbySG, &sg->name)) {
+
+                       for (si_assignment = si->assigned_sis;
+                               si_assignment != NULL;
+                               si_assignment = si_assignment->next) {
+
+                               /* Check if an SU on the node is in the process 
of activating */
+                               if (name_match(&node->name,
+                                       
&si_assignment->su->saAmfSUHostedByNode) &&
+                                       si_assignment->requested_ha_state !=
+                                       si_assignment->saAmfSISUHAState &&
+                                       si_assignment->requested_ha_state ==
+                                       SA_AMF_HA_ACTIVE) {
+                                       activating_node_su = SA_TRUE;
+                                       break;
+                               }
+                       }
+               }
+       }
+       ENTER("'%s, %s' %u",node->name.value, sg->name.value, 
activating_node_su);
+
+       return activating_node_su;
+}
+
+/**
  * This function is called because an error has been detected and the analysis
  * (done elsewhere) indicated that this error shall be recovered by a Node
  * failover. This function initiates the recovery action 'Node failover'.
@@ -2223,6 +2263,20 @@
        sg_event_t sg_event;
 
        switch (sg->avail_state) {
+               case SG_AC_ActivatingStandby:
+                       /* check if failover can be safely deferred */
+                       if (!is_acsm_assigning_node_active (sg, node)) {
+                               sg_set_event (SG_FAILOVER_NODE_EV, sg, 0, 0, 
node, &sg_event); 
+                               sg_defer_event (SG_FAILOVER_NODE_EV, &sg_event);
+                               break;
+                       } else {
+                               log_printf (LOG_LEVEL_NOTICE,
+                                                       "Cannot defer node '%s' 
failover (%s ACSM state %u)",
+                                                       node->name.value, 
sg->name.value, sg->avail_state);
+
+                               /* fall-through and process now */
+                       }
+
                case SG_AC_Idle:
                        set_scope_for_failover_node(sg, node);
                        if (has_any_su_in_scope_active_workload (sg)) {
@@ -2260,7 +2314,6 @@
                        break;
                case SG_AC_DeactivatingDependantWorkload:
                case SG_AC_TerminatingSuspected:
-               case SG_AC_ActivatingStandby:
                case SG_AC_AssigningStandbyToSpare:
                case SG_AC_ReparingComponent:
                case SG_AC_ReparingSu:
@@ -2846,4 +2899,3 @@
 
        return sg;
 }
-

_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais

Reply via email to