During split-brain, there is a possibility of mismatches in the global counters
held by IMMNDs, which might cause a cluster reboot at split-brain recovery.

This patch changes IMMD so that it does not update its global counters
from joining IMMNDs if the coord is already elected.
---
 src/imm/immd/immd_evt.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/imm/immd/immd_evt.c b/src/imm/immd/immd_evt.c
index 1114a81d4..237b3a4bb 100644
--- a/src/imm/immd/immd_evt.c
+++ b/src/imm/immd/immd_evt.c
@@ -1629,6 +1629,15 @@ static uint32_t immd_evt_proc_immnd_req_sync(IMMD_CB 
*cb, IMMD_EVT *evt,
        return proc_rc;
 }
 
+/* Return true if any counter in evt's ctrl_msg is greater than cb's own value. */
+static bool immd_evt_proc_check_diff(const IMMD_CB *cb, const IMMD_EVT *evt)
+{
+       return ((cb->fevsSendCount < evt->info.ctrl_msg.fevs_count) ||
+               (cb->admo_id_count < evt->info.ctrl_msg.admo_id_count) ||
+               (cb->ccb_id_count  < evt->info.ctrl_msg.ccb_id_count) ||
+               (cb->impl_count    < evt->info.ctrl_msg.impl_count));
+}
+
 /****************************************************************************
  * Name          : immd_evt_proc_immnd_intro
  *
@@ -1718,7 +1727,14 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb, 
IMMD_EVT *evt,
                            cb->mRulingEpoch);
                }
 
-               if (cb->mRulingEpoch < node_info->epoch) {
+               /*
+                 Don't update the ruling epoch from joining IMMND if the coord
+                 is already elected to avoid cluster reboot after split-brain
+                 recovery. This rule is also applied for all below global
+                 counters (fevs counter, ccb id counter, etc.).
+               */
+               if (!(cb->immnd_coord && node_info->immnd_key != cb->node_id) &&
+                   (cb->mRulingEpoch < node_info->epoch)) {
                        cb->mRulingEpoch = node_info->epoch;
                        LOG_NO("Ruling epoch changed to:%u", cb->mRulingEpoch);
                }
@@ -1769,6 +1785,21 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb, 
IMMD_EVT *evt,
 
                        veteranImmndNode = true;
 
+                       if (cb->immnd_coord && node_info->immnd_key != 
cb->node_id) {
+                               if (immd_evt_proc_check_diff(cb, evt)) {
+                                       IMMSV_ND2D_CONTROL* msg = 
&(evt->info.ctrl_msg);
+                                       LOG_NO("Ignore updating counters from 
%x:"
+                                              "fevs(%llu/%llu), admid (%u/%u),"
+                                              "ccbid(%u/%u), impid(%u/%u)",
+                                              node_info->immnd_key,
+                                              cb->fevsSendCount, 
msg->fevs_count,
+                                              cb->admo_id_count, 
msg->admo_id_count,
+                                              cb->ccb_id_count, 
msg->ccb_id_count,
+                                              cb->impl_count, msg->impl_count);
+                               }
+                               goto accept_node;
+                       }
+
                        if (cb->fevsSendCount < evt->info.ctrl_msg.fevs_count) {
                                LOG_NO(
                                    "Refresh of fevs count from %llu to %llu 
from %x.",
@@ -1839,6 +1870,7 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb, 
IMMD_EVT *evt,
                }
        }
 
+
        /* Determine type of node. */
        if (sinfo->dest == cb->loc_immnd_dest) {
                node_info->isOnController = true;
-- 
2.18.0



_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to