During split-brain, there is a possibility of mismatches in the global counters
held by IMMNDs, which might cause the cluster to be rebooted at split-brain
recovery.

This patch changes IMMD so that it does not update the global counters from
joining IMMNDs if the coord is already elected, unless the update comes from
the coord itself.
---
 src/imm/immd/immd_evt.c | 50 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/src/imm/immd/immd_evt.c b/src/imm/immd/immd_evt.c
index 1114a81d4..9c4f1f016 100644
--- a/src/imm/immd/immd_evt.c
+++ b/src/imm/immd/immd_evt.c
@@ -1629,6 +1629,25 @@ static uint32_t immd_evt_proc_immnd_req_sync(IMMD_CB 
*cb, IMMD_EVT *evt,
        return proc_rc;
 }
 
+/****************************************************************************
+ * Name          : is_global_counter_less
+ *
+ * Description   : Check if any global counter is less than that sent by IMMND.
+ *
+ * Arguments     : IMMD_CB  *cb  - IMMD CB pointer
+ *                 IMMD_EVT *evt - Received Event structure
+ *
+ * Return Values : true if at least one is less, false otherwise.
+ *
+ *****************************************************************************/
+static bool is_global_counter_less(const IMMD_CB *cb, const IMMD_EVT *evt)
+{
+       return ((cb->fevsSendCount < evt->info.ctrl_msg.fevs_count) ||
+               (cb->admo_id_count < evt->info.ctrl_msg.admo_id_count) ||
+               (cb->ccb_id_count  < evt->info.ctrl_msg.ccb_id_count) ||
+               (cb->impl_count    < evt->info.ctrl_msg.impl_count));
+}
+
 /****************************************************************************
  * Name          : immd_evt_proc_immnd_intro
  *
@@ -1718,7 +1737,16 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb, 
IMMD_EVT *evt,
                            cb->mRulingEpoch);
                }
 
-               if (cb->mRulingEpoch < node_info->epoch) {
+               /*
+                 Don't update the ruling epoch from a joining IMMND if the
+                 coord is already elected, unless the change comes from the
+                 coord itself. This avoids a cluster reboot after split-brain
+                 recovery; the same rule is applied to the global counters
+                 below (fevs counter, ccb id counter, etc.).
+               */
+               if ((cb->mRulingEpoch < node_info->epoch) &&
+                   (cb->immnd_coord == 0 ||
+                    cb->immnd_coord == node_info->immnd_key)) {
                        cb->mRulingEpoch = node_info->epoch;
                        LOG_NO("Ruling epoch changed to:%u", cb->mRulingEpoch);
                }
@@ -1769,6 +1797,26 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb, 
IMMD_EVT *evt,
 
                        veteranImmndNode = true;
 
+                       /* Don't update global counters when the coord is
+                          already elected, unless they come from the coord.
+                       */
+                       if (cb->immnd_coord != 0 &&
+                           node_info->immnd_key != cb->immnd_coord) {
+                               if (is_global_counter_less(cb, evt)) {
+                                       IMMSV_ND2D_CONTROL* msg = 
&(evt->info.ctrl_msg);
+                                       LOG_NO("Ignore updating counters from 
%x. "
+                                              "Diffs (global/intro):"
+                                              "fevs(%llu/%llu), admid (%u/%u),"
+                                              "ccbid(%u/%u), impid(%u/%u)",
+                                              node_info->immnd_key,
+                                              cb->fevsSendCount, 
msg->fevs_count,
+                                              cb->admo_id_count, 
msg->admo_id_count,
+                                              cb->ccb_id_count, 
msg->ccb_id_count,
+                                              cb->impl_count, msg->impl_count);
+                               }
+                               goto accept_node;
+                       }
+
                        if (cb->fevsSendCount < evt->info.ctrl_msg.fevs_count) {
                                LOG_NO(
                                    "Refresh of fevs count from %llu to %llu 
from %x.",
-- 
2.18.0



_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to