During split-brain, the global counters held by the IMMNDs can become
mismatched, which might cause a cluster reboot at split-brain recovery.
This patch changes IMMD so that it does not update the global counters
from joining IMMNDs if the coord is already elected.
---
src/imm/immd/immd_evt.c | 50 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 49 insertions(+), 1 deletion(-)
diff --git a/src/imm/immd/immd_evt.c b/src/imm/immd/immd_evt.c
index 1114a81d4..9c4f1f016 100644
--- a/src/imm/immd/immd_evt.c
+++ b/src/imm/immd/immd_evt.c
@@ -1629,6 +1629,25 @@ static uint32_t immd_evt_proc_immnd_req_sync(IMMD_CB *cb, IMMD_EVT *evt,
return proc_rc;
}
+/****************************************************************************
+ * Name : is_global_counter_less
+ *
+ * Description : Check if any IMMD global counter (fevsSendCount, admo_id_count,
+ * ccb_id_count, impl_count) is less than the one in evt's ctrl_msg.
+ * Arguments : IMMD_CB *cb - IMMD CB pointer, holds the global counters (read only)
+ * IMMD_EVT *evt - Received Event structure; only info.ctrl_msg is read
+ *
+ * Return Values : true if at least one global counter is strictly less than
+ * its ctrl_msg counterpart, false otherwise (including equality).
+ *****************************************************************************/
+static bool is_global_counter_less(const IMMD_CB *cb, const IMMD_EVT *evt)
+{
+ return ((cb->fevsSendCount < evt->info.ctrl_msg.fevs_count) ||
+ (cb->admo_id_count < evt->info.ctrl_msg.admo_id_count) ||
+ (cb->ccb_id_count < evt->info.ctrl_msg.ccb_id_count) ||
+ (cb->impl_count < evt->info.ctrl_msg.impl_count));
+}
+
/****************************************************************************
* Name : immd_evt_proc_immnd_intro
*
@@ -1718,7 +1737,16 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb, IMMD_EVT *evt,
cb->mRulingEpoch);
}
- if (cb->mRulingEpoch < node_info->epoch) {
+ /*
+ Don't update the ruling epoch from a joining IMMND if the coord
+ is already elected, unless the change comes from the coord itself,
+ i.e. update only while no coord exists (immnd_coord == 0) or when
+ the sender is the coord. This avoids a cluster reboot after
+ split-brain recovery; the global counters below follow the same rule.
+ */
+ if ((cb->mRulingEpoch < node_info->epoch) &&
+ (cb->immnd_coord == 0 ||
+ cb->immnd_coord == node_info->immnd_key)) {
cb->mRulingEpoch = node_info->epoch;
LOG_NO("Ruling epoch changed to:%u", cb->mRulingEpoch);
}
@@ -1769,6 +1797,26 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb, IMMD_EVT *evt,
veteranImmndNode = true;
+ /* Don't update global counters when the coord is
+ already elected, but except ones come from the coord.
+ */
+ if (cb->immnd_coord != 0 &&
+ node_info->immnd_key != cb->immnd_coord) {
+ if (is_global_counter_less(cb, evt)) {
+ IMMSV_ND2D_CONTROL* msg =
&(evt->info.ctrl_msg);
+ LOG_NO("Ignore updating counters from
%x. "
+ "Diffs (global/intro):"
+ "fevs(%llu/%llu), admid (%u/%u),"
+ "ccbid(%u/%u), impid(%u/%u)",
+ node_info->immnd_key,
+ cb->fevsSendCount,
msg->fevs_count,
+ cb->admo_id_count,
msg->admo_id_count,
+ cb->ccb_id_count,
msg->ccb_id_count,
+ cb->impl_count, msg->impl_count);
+ }
+ goto accept_node;
+ }
+
if (cb->fevsSendCount < evt->info.ctrl_msg.fevs_count) {
LOG_NO(
"Refresh of fevs count from %llu to %llu
from %x.",
--
2.18.0
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel