During split-brain, there is a possibility of mismatches in the global counters
held by IMMNDs, which might cause a cluster reboot at split-brain recovery.
This patch changes IMMD so that it does not update its global counters
from joining IMMNDs if the coord is already elected.
---
src/imm/immd/immd_evt.c | 34 +++++++++++++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)
diff --git a/src/imm/immd/immd_evt.c b/src/imm/immd/immd_evt.c
index 1114a81d4..237b3a4bb 100644
--- a/src/imm/immd/immd_evt.c
+++ b/src/imm/immd/immd_evt.c
@@ -1629,6 +1629,15 @@ static uint32_t immd_evt_proc_immnd_req_sync(IMMD_CB
*cb, IMMD_EVT *evt,
return proc_rc;
}
+/* Return true if any global counter carried in evt is ahead of the one in cb. */
+static bool immd_evt_proc_check_diff(const IMMD_CB *cb, const IMMD_EVT *evt)
+{
+ return ((cb->fevsSendCount < evt->info.ctrl_msg.fevs_count) ||
+ (cb->admo_id_count < evt->info.ctrl_msg.admo_id_count) ||
+ (cb->ccb_id_count < evt->info.ctrl_msg.ccb_id_count) ||
+ (cb->impl_count < evt->info.ctrl_msg.impl_count));
+}
+
/****************************************************************************
* Name : immd_evt_proc_immnd_intro
*
@@ -1718,7 +1727,14 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb,
IMMD_EVT *evt,
cb->mRulingEpoch);
}
- if (cb->mRulingEpoch < node_info->epoch) {
+ /*
+ Don't update the ruling epoch from joining IMMND if the coord
+ is already elected to avoid cluster reboot after split-brain
+ recovery. This rule is also applied for all below global
+ counters (fevs counter, ccb id counter, etc.).
+ */
+ if (!(cb->immnd_coord && node_info->immnd_key != cb->node_id) &&
+ (cb->mRulingEpoch < node_info->epoch)) {
cb->mRulingEpoch = node_info->epoch;
LOG_NO("Ruling epoch changed to:%u", cb->mRulingEpoch);
}
@@ -1769,6 +1785,21 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb,
IMMD_EVT *evt,
veteranImmndNode = true;
+ if (cb->immnd_coord && node_info->immnd_key !=
cb->node_id) {
+ if (immd_evt_proc_check_diff(cb, evt)) {
+ IMMSV_ND2D_CONTROL* msg =
&(evt->info.ctrl_msg);
+ LOG_NO("Ignore updating counters from
%x:"
+ "fevs(%llu/%llu), admid (%u/%u),"
+ "ccbid(%u/%u), impid(%u/%u)",
+ node_info->immnd_key,
+ cb->fevsSendCount,
msg->fevs_count,
+ cb->admo_id_count,
msg->admo_id_count,
+ cb->ccb_id_count,
msg->ccb_id_count,
+ cb->impl_count, msg->impl_count);
+ }
+ goto accept_node;
+ }
+
if (cb->fevsSendCount < evt->info.ctrl_msg.fevs_count) {
LOG_NO(
"Refresh of fevs count from %llu to %llu
from %x.",
@@ -1839,6 +1870,7 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb,
IMMD_EVT *evt,
}
}
+
/* Determine type of node. */
if (sinfo->dest == cb->loc_immnd_dest) {
node_info->isOnController = true;
--
2.18.0
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel