- IMMND coordinator takes a longer time to sync because it incorrectly
postpones the sync while waiting for an incorrect number of down nodes.
- IMMND should restart after its re-intro is accepted if it is not the
new coordinator, so that it syncs again with the new coordinator.
- Active IMMD only updates ex-IMMD from the coordinator if the info exists.
It sets ex-IMMD to its own node id when the new coord announces sync.
- Update #3228 solution: active IMMD should not drop the re-intro
from the local IMMND; dropping it causes an unexpected IMMND coord to be
selected and the local IMMND to restart unexpectedly later.
- IMMND on the active IMMD node will start a split-brain detection timer
to reboot the node if it sees another active IMMD, instead of rebooting
immediately, to avoid interfering with the RDE split-brain detection
mechanism.
- Quick reboot is sometimes not quick enough, so the active IMMD on the
node may impact the newly promoted Active node. Stop AMFND and kill
AMFD/IMMD to avoid any impact.
---
scripts/opensaf_reboot | 5 +-
src/imm/immd/immd_evt.c | 19 ++++--
src/imm/immd/immd_mds.c | 1 +
src/imm/immnd/immnd.h | 1 +
src/imm/immnd/immnd_cb.h | 2 +
src/imm/immnd/immnd_evt.c | 122 ++++++++++++++++++++++---------------
src/imm/immnd/immnd_main.c | 2 +
7 files changed, 98 insertions(+), 54 deletions(-)
diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot
index e2a0ca944..8e5bd8c40 100644
--- a/scripts/opensaf_reboot
+++ b/scripts/opensaf_reboot
@@ -143,8 +143,9 @@ unset tipc
# If clm cluster reboot requested argument one and two are set but not used,
# argument 3 is set to 1, "safe reboot" request.
if [ "$#" = 0 ]; then
- $icmd pkill -STOP osafamfd
- $icmd pkill -STOP osafimmd
+ $icmd pkill -STOP osafamfnd
+ $icmd pkill -KILL osafamfd
+ $icmd pkill -KILL osafimmd
quick_local_node_reboot
elif [ "$safe_reboot" = 1 ]; then
opensaf_safe_reboot
diff --git a/src/imm/immd/immd_evt.c b/src/imm/immd/immd_evt.c
index 51cc8e4f7..297761d13 100644
--- a/src/imm/immd/immd_evt.c
+++ b/src/imm/immd/immd_evt.c
@@ -897,7 +897,8 @@ static void immd_accept_node(IMMD_CB *cb,
IMMD_IMMND_INFO_NODE *node_info,
LOG_NO(
"IMMND coord at %x with ex-IMMD %x",
node_info->immnd_key, node_info->ex_immd_node_id);
- cb->ex_immd_node_id = node_info->ex_immd_node_id;
+ if (check_ex_immd_node_id && node_info->ex_immd_node_id)
+ cb->ex_immd_node_id = node_info->ex_immd_node_id;
}
mbcp_msg.type = IMMD_A2S_MSG_INTRO_RSP; /* Mbcp intro to SBY. */
@@ -1253,6 +1254,7 @@ static uint32_t immd_evt_proc_immnd_announce_sync(IMMD_CB
*cb, IMMD_EVT *evt,
Loop through all nodes */
cb->mRulingEpoch++;
+ cb->ex_immd_node_id = cb->node_id;
/*Only updates epoch for coord. */
/*node_info->epoch = cb->mRulingEpoch; */
@@ -1691,8 +1693,9 @@ static uint32_t immd_evt_proc_immnd_intro(IMMD_CB *cb,
IMMD_EVT *evt,
immd_immnd_info_node_get(&cb->immnd_tree, &sinfo->dest, &node_info);
if (!node_info) {
- if (evt->info.ctrl_msg.refresh == 3) {
- LOG_WA("Drop re-intro from old IMMND dest %" PRIu64,
sinfo->dest);
+ if ((evt->info.ctrl_msg.refresh == 3) &&
+ (sinfo->node_id != cb->node_id)) {
+ TRACE("Drop re-intro from old IMMND %x",
sinfo->node_id);
goto done;
}
LOG_WA("Node not found dest %" PRIu64
@@ -3308,7 +3311,15 @@ static uint32_t immd_evt_proc_mds_evt(IMMD_CB *cb,
IMMD_EVT *evt)
mds_info->dest);
goto done;
} else {
- TRACE_5("IMMND DOWN PROCESS detected by IMMD");
+ if (node_info->immnd_execPid == 0) {
+ TRACE_5(
+ "Ignore IMMND %x DOWN not yet
accepted intro",
+ node_info->immnd_key);
+ immd_immnd_info_node_delete(cb,
node_info);
+ goto done;
+ }
+ TRACE_5("IMMND %x DOWN PROCESS detected by
IMMD",
+ node_info->immnd_key);
immd_process_immnd_down(cb, node_info, true);
}
}
diff --git a/src/imm/immd/immd_mds.c b/src/imm/immd/immd_mds.c
index 7610a45fa..9688b49ad 100644
--- a/src/imm/immd/immd_mds.c
+++ b/src/imm/immd/immd_mds.c
@@ -495,6 +495,7 @@ static uint32_t immd_mds_rcv(IMMD_CB *cb,
MDS_CALLBACK_RECEIVE_INFO *rcv_info)
pEvt->sinfo.ctxt = rcv_info->i_msg_ctxt;
pEvt->sinfo.dest = rcv_info->i_fr_dest;
pEvt->sinfo.to_svc = rcv_info->i_fr_svc_id;
+ pEvt->sinfo.node_id = rcv_info->i_node_id;
if (rcv_info->i_rsp_reqd) {
pEvt->sinfo.stype = MDS_SENDTYPE_RSP;
}
diff --git a/src/imm/immnd/immnd.h b/src/imm/immnd/immnd.h
index 7b0818de7..23edf004b 100644
--- a/src/imm/immnd/immnd.h
+++ b/src/imm/immnd/immnd.h
@@ -33,6 +33,7 @@
#endif
#include "imm/common/immsv.h"
+#include "base/ncssysf_tmr.h"
#include "immnd_cb.h"
#include "immnd_init.h"
diff --git a/src/imm/immnd/immnd_cb.h b/src/imm/immnd/immnd_cb.h
index 3dc03d88b..bb3bb8493 100644
--- a/src/imm/immnd/immnd_cb.h
+++ b/src/imm/immnd/immnd_cb.h
@@ -207,6 +207,8 @@ typedef struct immnd_cb_tag {
clm_init_sel_obj; /* Selection object wait for clms intialization*/
bool isClmNodeJoined; /* True => If clm joined the cluster*/
NCS_PATRICIA_TREE immnd_clm_list; /* IMMND_IMM_CLIENT_NODE - node */
+ tmr_t splitbrain_tmr;
+ bool splitbrain_tmr_run;
} IMMND_CB;
/* CB prototypes */
diff --git a/src/imm/immnd/immnd_evt.c b/src/imm/immnd/immnd_evt.c
index e405d3ce4..670823a45 100644
--- a/src/imm/immnd/immnd_evt.c
+++ b/src/imm/immnd/immnd_evt.c
@@ -10541,6 +10541,10 @@ static uint32_t immnd_evt_proc_intro_rsp(IMMND_CB *cb,
IMMND_EVT *evt,
LOG_IN("2PBE SYNC CASE CAUGHT oldCanBeCoord:%u",
oldCanBeCoord);
}
+ if ((cb->mIntroduced == 2) && (!evt->info.ctrl.isCoord)) {
+ LOG_WA("Restart to sync with Coord! Exit");
+ exit(EXIT_SUCCESS);
+ }
cb->mIntroduced = 1;
cb->mCanBeCoord = evt->info.ctrl.canBeCoord;
if ((cb->mCanBeCoord == IMMSV_2PBE_PRELOAD) && (cb->m2Pbe < 2)
&&
@@ -12202,6 +12206,12 @@ void immnd_evt_ccb_augment_admo(IMMND_CB *cb,
IMMND_EVT *evt,
TRACE_LEAVE();
}
+void splitbrain_tmr_exp(void *arg)
+{
+ (void)arg;
+ LOG_ER("Split-brain detected! Rebooting...");
+ opensaf_quick_reboot("Split-brain detected! Rebooting...");
+}
/****************************************************************************
* Name : immnd_evt_proc_mds_evt
*
@@ -12219,52 +12229,69 @@ static uint32_t immnd_evt_proc_mds_evt(IMMND_CB *cb,
IMMND_EVT *evt)
/*TRACE_ENTER(); */
uint32_t rc = NCSCC_RC_SUCCESS;
bool is_headless = false;
+ IMMSV_MDS_INFO *mdsInfo = &evt->info.mds_info;
- if ((evt->info.mds_info.change == NCSMDS_DOWN) &&
- (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMA_OM ||
- evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMA_OI)) {
+ if ((mdsInfo->change == NCSMDS_DOWN) &&
+ (mdsInfo->svc_id == NCSMDS_SVC_ID_IMMA_OM ||
+ mdsInfo->svc_id == NCSMDS_SVC_ID_IMMA_OI)) {
TRACE_2("IMMA DOWN EVENT");
- immnd_proc_imma_down(cb, evt->info.mds_info.dest,
- evt->info.mds_info.svc_id);
+ immnd_proc_imma_down(cb, mdsInfo->dest,
+ mdsInfo->svc_id);
}
/* In multi partitioned clusters rejoin, IMMND may not realize
* headless due to see IMMDs from different partitions */
- if ((evt->info.mds_info.change == NCSMDS_DOWN) &&
- (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD)) {
- is_headless = true;
- cb->immd_node_id = 0;
- cb->other_immd_id = 0;
- } else if ((evt->info.mds_info.change == NCSMDS_RED_UP) &&
- (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD) &&
- (evt->info.mds_info.node_id != cb->immd_node_id)) {
- if ((evt->info.mds_info.role == V_DEST_RL_STANDBY) &&
- (cb->other_immd_id == 0)) {
- cb->other_immd_id = evt->info.mds_info.node_id;
- TRACE_2("IMMD RED_UP EVENT %x role=%d ==> ACT:%x
SBY:%x",
- evt->info.mds_info.node_id, evt->info.mds_info.role,
- cb->immd_node_id, cb->other_immd_id);
- } else if ((evt->info.mds_info.role == V_DEST_RL_ACTIVE) &&
- (cb->immd_node_id != 0) &&
- (cb->node_id != cb->immd_node_id)) {
- LOG_WA("See two Active IMMD: %x %x, going to headless",
- cb->immd_node_id, evt->info.mds_info.node_id);
+ if (mdsInfo->svc_id == NCSMDS_SVC_ID_IMMD) {
+ switch (mdsInfo->change) {
+ case NCSMDS_DOWN:
is_headless = true;
cb->immd_node_id = 0;
cb->other_immd_id = 0;
- }
- } else if ((evt->info.mds_info.change == NCSMDS_RED_DOWN) &&
- (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD)) {
- if (cb->immd_node_id == evt->info.mds_info.node_id)
- cb->immd_node_id = 0;
- if (cb->other_immd_id == evt->info.mds_info.node_id)
- cb->other_immd_id = 0;
- TRACE_2("IMMD RED_DOWN EVENT %x role=%d ==> ACT:%x SBY:%x",
- evt->info.mds_info.node_id, evt->info.mds_info.role,
- cb->immd_node_id, cb->other_immd_id);
- if ((cb->immd_node_id == 0) && (cb->other_immd_id == 0)) {
- LOG_WA("Both Active & Standby DOWN, going to headless");
- is_headless = true;
+ break;
+ case NCSMDS_RED_UP:
+ if ((mdsInfo->role == V_DEST_RL_STANDBY) &&
+ (cb->other_immd_id == 0)) {
+ cb->other_immd_id = mdsInfo->node_id;
+ } else if (mdsInfo->role == V_DEST_RL_ACTIVE) {
+ if ((cb->immd_node_id != 0) &&
+ (cb->immd_node_id != mdsInfo->node_id)) {
+ if (cb->node_id != cb->immd_node_id) {
+ LOG_WA(
+ "See two Active IMMD: %x
%x, going to headless",
+ cb->immd_node_id,
mdsInfo->node_id);
+ is_headless = true;
+ cb->immd_node_id = 0;
+ cb->other_immd_id = 0;
+ } else if (!cb->splitbrain_tmr_run) {
+ // Normally, RDE will handle
split-brain detection.
+ // In roaming SC split/join, sometimes
RDE don't detect
+ // split-brain but IMMND does, start
timer reboot node.
+ LOG_WA(
+ "Another Active IMMD %x.
Start split-brain timer",
+ mdsInfo->node_id);
+ cb->splitbrain_tmr =
ncs_tmr_start(
+ cb->splitbrain_tmr, 1000,
// 10s
+ splitbrain_tmr_exp, NULL,
NULL, 0);
+ cb->splitbrain_tmr_run = true;
+ }
+ }
+ }
+ break;
+ case NCSMDS_RED_DOWN:
+ if (cb->immd_node_id == mdsInfo->node_id)
+ cb->immd_node_id = 0;
+ if (cb->other_immd_id == mdsInfo->node_id)
+ cb->other_immd_id = 0;
+ TRACE_2("IMMD RED_DOWN EVENT %x role=%d ==> ACT:%x
SBY:%x",
+ mdsInfo->node_id, mdsInfo->role,
+ cb->immd_node_id, cb->other_immd_id);
+ if ((cb->immd_node_id == 0) && (cb->other_immd_id ==
0)) {
+ LOG_WA("Both Active & Standby DOWN, going to
headless");
+ is_headless = true;
+ }
+ break;
+ default:
+ break;
}
}
@@ -12394,29 +12421,28 @@ static uint32_t immnd_evt_proc_mds_evt(IMMND_CB *cb,
IMMND_EVT *evt)
}
}
- } else if ((evt->info.mds_info.change == NCSMDS_UP) &&
- (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD)) {
+ } else if ((mdsInfo->change == NCSMDS_UP) &&
+ (mdsInfo->svc_id == NCSMDS_SVC_ID_IMMD)) {
LOG_NO(
"IMMD(%x) service is UP ... ScAbsenseAllowed?:%u
introduced?:%u",
- evt->info.mds_info.node_id,
+ mdsInfo->node_id,
cb->mScAbsenceAllowed, cb->mIntroduced);
if ((cb->mIntroduced == 2) &&
(immnd_introduceMe(cb) != NCSCC_RC_SUCCESS)) {
LOG_WA(
"IMMND re-introduceMe after IMMD restart failed,
will retry");
}
- } else if ((evt->info.mds_info.change == NCSMDS_UP) &&
- (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMA_OI ||
- evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMA_OM)) {
+ } else if ((mdsInfo->change == NCSMDS_UP) &&
+ (mdsInfo->svc_id == NCSMDS_SVC_ID_IMMA_OI ||
+ mdsInfo->svc_id == NCSMDS_SVC_ID_IMMA_OM)) {
TRACE_2("IMMA UP EVENT");
- } else if ((evt->info.mds_info.change == NCSMDS_CHG_ROLE) &&
- (evt->info.mds_info.role == V_DEST_RL_ACTIVE) &&
- (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMD)) {
-
+ } else if ((mdsInfo->change == NCSMDS_CHG_ROLE) &&
+ (mdsInfo->role == V_DEST_RL_ACTIVE) &&
+ (mdsInfo->svc_id == NCSMDS_SVC_ID_IMMD)) {
TRACE_2("IMMD FAILOVER");
/* The IMMD has failed over. */
immnd_proc_imma_discard_stales(cb);
- } else if (evt->info.mds_info.svc_id == NCSMDS_SVC_ID_IMMND) {
+ } else if (mdsInfo->svc_id == NCSMDS_SVC_ID_IMMND) {
LOG_NO("MDS SERVICE EVENT OF TYPE IMMND!!");
}
/*TRACE_LEAVE(); */
diff --git a/src/imm/immnd/immnd_main.c b/src/imm/immnd/immnd_main.c
index 0cd004053..410134c97 100644
--- a/src/imm/immnd/immnd_main.c
+++ b/src/imm/immnd/immnd_main.c
@@ -301,6 +301,8 @@ static uint32_t immnd_initialize(char *progname)
immnd_cb->clmSelectionObject = -1;
immnd_cb->immd_node_id = 0;
immnd_cb->other_immd_id = 0;
+ immnd_cb->splitbrain_tmr = ncs_tmr_alloc(NULL, 0);
+ immnd_cb->splitbrain_tmr_run = false;
populate_reserved_class_names(immnd_cb);
--
2.17.1
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel