If a component on node A had locked up, and node B then failed over,
node A wouldn't reboot once the node A component's healthcheck failed.
This was because node A's component failover was deferred until processing
of the node B failover had completed. The problem is that, because the node A
component has locked up/exited, node B failover processing will never complete,
so node A would never reboot.
Processing the node failover immediately seems to work fine.
services/amfsg.c
+ amf_sg_failover_node_req() - process the node failover immediately if the SG
state machine is currently trying to assign this node the active workload.
+ is_acsm_assigning_node_active() - new function to check if a given node
is currently being assigned the active workload.
-Angus
Index: services/amfsg.c
===================================================================
--- services/amfsg.c (revision 1791)
+++ services/amfsg.c (working copy)
@@ -2210,6 +2210,46 @@
}
/**
+ * Checks if an SG ACSM is waiting for an active SI assignment on a given node
+ * before it will transition to the next state. This is used to determine if we
+ * can safely defer processing a node leave/failover until the ACSM transitions
+ * back to idle (if the current ACSM action involves the node specified, then
+ * it may NEVER complete).
+ *
+ * @return SA_TRUE if activating an SI assignment on node specified, SA_FALSE
if not
+ */
+static int is_acsm_assigning_node_active (struct amf_sg *sg, struct amf_node
*node)
+{
+ struct amf_si *si;
+ struct amf_si_assignment *si_assignment;
+ int activating_node_su = SA_FALSE;
+
+ for (si = sg->application->si_head; si != NULL; si = si->next) {
+ if (name_match (&si->saAmfSIProtectedbySG, &sg->name)) {
+
+ for (si_assignment = si->assigned_sis;
+ si_assignment != NULL;
+ si_assignment = si_assignment->next) {
+
+ /* Check if an SU on the node is in the process
of activating */
+ if (name_match(&node->name,
+
&si_assignment->su->saAmfSUHostedByNode) &&
+ si_assignment->requested_ha_state !=
+ si_assignment->saAmfSISUHAState &&
+ si_assignment->requested_ha_state ==
+ SA_AMF_HA_ACTIVE) {
+ activating_node_su = SA_TRUE;
+ break;
+ }
+ }
+ }
+ }
+ ENTER("'%s, %s' %u",node->name.value, sg->name.value,
activating_node_su);
+
+ return activating_node_su;
+}
+
+/**
* This function is called because an error has been detected and the analysis
* (done elsewhere) indicated that this error shall be recovered by a Node
* failover. This function initiates the recovery action 'Node failover'.
@@ -2223,6 +2263,20 @@
sg_event_t sg_event;
switch (sg->avail_state) {
+ case SG_AC_ActivatingStandby:
+ /* check if failover can be safely deferred */
+ if (!is_acsm_assigning_node_active (sg, node)) {
+ sg_set_event (SG_FAILOVER_NODE_EV, sg, 0, 0,
node, &sg_event);
+ sg_defer_event (SG_FAILOVER_NODE_EV, &sg_event);
+ break;
+ } else {
+ log_printf (LOG_LEVEL_NOTICE,
+ "Cannot defer node '%s'
failover (%s ACSM state %u)",
+ node->name.value,
sg->name.value, sg->avail_state);
+
+ /* fall-through and process now */
+ }
+
case SG_AC_Idle:
set_scope_for_failover_node(sg, node);
if (has_any_su_in_scope_active_workload (sg)) {
@@ -2260,7 +2314,6 @@
break;
case SG_AC_DeactivatingDependantWorkload:
case SG_AC_TerminatingSuspected:
- case SG_AC_ActivatingStandby:
case SG_AC_AssigningStandbyToSpare:
case SG_AC_ReparingComponent:
case SG_AC_ReparingSu:
@@ -2846,4 +2899,3 @@
return sg;
}
-
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais