- Quick reboot is sometimes not quick cause RDE continue cause
split-brain detection for another SC. Need kill director services
to avoid impact other SCs.
- Active IMMD pause itself if see another active IMMD. Node will
reboot by RDE or split-brain timer of local IMMND.
- Improve log messages to avoid confusion about intro/re-intro
accept or just epoch update.
---
scripts/opensaf_reboot | 10 ++++++---
src/imm/immd/immd_evt.c | 47 ++++++++++++++++++++++++++---------------
2 files changed, 37 insertions(+), 20 deletions(-)
diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot
index 8e5bd8c40..5fbb1dd54 100644
--- a/scripts/opensaf_reboot
+++ b/scripts/opensaf_reboot
@@ -107,6 +107,13 @@ quick_local_node_reboot()
{
logger -t "opensaf_reboot" "Do quick local node reboot"
+ for service in osafamfnd osafimmnd; do
+ $icmd pkill -STOP $service
+ done
+ for service in osafrded osafamfd osafimmd osaflogd osafntfd osafclmd; do
+ $icmd pkill -KILL $service
+ done
+
$icmd /bin/sh -c "/bin/echo -n 'b' 2> /dev/null > /proc/sysrq-trigger"
ret_code=$?
@@ -143,9 +150,6 @@ unset tipc
# If clm cluster reboot requested argument one and two are set but not used,
# argument 3 is set to 1, "safe reboot" request.
if [ "$#" = 0 ]; then
- $icmd pkill -STOP osafamfnd
- $icmd pkill -KILL osafamfd
- $icmd pkill -KILL osafimmd
quick_local_node_reboot
elif [ "$safe_reboot" = 1 ]; then
opensaf_safe_reboot
diff --git a/src/imm/immd/immd_evt.c b/src/imm/immd/immd_evt.c
index 297761d13..eb579c489 100644
--- a/src/imm/immd/immd_evt.c
+++ b/src/imm/immd/immd_evt.c
@@ -785,13 +785,15 @@ static void immd_kill_node(IMMD_CB *cb,
IMMD_IMMND_INFO_NODE *node_info)
static uint16_t accepted_nodes = 0;
static void immd_accept_node(IMMD_CB *cb, IMMD_IMMND_INFO_NODE *node_info,
- bool doReply, bool knownVeteran, bool
check_ex_immd_node_id)
+ bool doReply, bool knownVeteran, bool
check_ex_immd)
{
IMMSV_EVT accept_evt;
IMMD_MBCSV_MSG mbcp_msg;
bool isOnController = node_info->isOnController;
bool fsParamMbcp = false;
- TRACE_ENTER();
+ TRACE_ENTER2(
+ "Accept IMMND %x doReply=%d knownVeteran=%d check_ex_immd=%d",
+ node_info->immnd_key, doReply, knownVeteran, check_ex_immd);
memset(&accept_evt, 0, sizeof(IMMSV_EVT));
memset(&mbcp_msg, 0, sizeof(IMMD_MBCSV_MSG));
@@ -799,9 +801,6 @@ static void immd_accept_node(IMMD_CB *cb,
IMMD_IMMND_INFO_NODE *node_info,
++accepted_nodes;
}
- LOG_NO(
- "Accept intro from %x with ex-IMMD %x",
- node_info->immnd_key, node_info->ex_immd_node_id);
accept_evt.type = IMMSV_EVT_TYPE_IMMND;
accept_evt.info.immnd.type = IMMND_EVT_D2ND_INTRO_RSP;
accept_evt.info.immnd.info.ctrl.nodeId = node_info->immnd_key;
@@ -844,7 +843,7 @@ static void immd_accept_node(IMMD_CB *cb,
IMMD_IMMND_INFO_NODE *node_info,
cb->immnd_coord = node_info->immnd_key;
node_info->isCoord = true;
} else if (cb->mScAbsenceAllowed && doReply) {
- if ((check_ex_immd_node_id) &&
+ if ((check_ex_immd) &&
(cb->node_id == node_info->immnd_key)) {
LOG_NO(
"IMMND re-introduce to IMMD on same
this node. "
@@ -897,13 +896,13 @@ static void immd_accept_node(IMMD_CB *cb,
IMMD_IMMND_INFO_NODE *node_info,
LOG_NO(
"IMMND coord at %x with ex-IMMD %x",
node_info->immnd_key, node_info->ex_immd_node_id);
- if (check_ex_immd_node_id && node_info->ex_immd_node_id)
+ if (check_ex_immd && node_info->ex_immd_node_id)
cb->ex_immd_node_id = node_info->ex_immd_node_id;
}
mbcp_msg.type = IMMD_A2S_MSG_INTRO_RSP; /* Mbcp intro to SBY. */
mbcp_msg.info.ctrl = accept_evt.info.immnd.info.ctrl;
- if (check_ex_immd_node_id) {
+ if (check_ex_immd) {
mbcp_msg.type = IMMD_A2S_MSG_INTRO_RSP_2;
mbcp_msg.info.ctrl.ex_immd_node_id = node_info->ex_immd_node_id;
}
@@ -949,10 +948,19 @@ static void immd_accept_node(IMMD_CB *cb,
IMMD_IMMND_INFO_NODE *node_info,
.canBeCoord = IMMSV_VETERAN_COORD;
/* Allow all nodes including payloads to be coord */
- if (check_ex_immd_node_id &&
- !is_on_same_partition_with_coord(cb, node_info)) {
- LOG_WA("Going to reboot node 0x%x",
node_info->immnd_key);
- accept_evt.info.immnd.info.ctrl.canBeCoord =
IMMSV_UNKNOWN;
+ if (check_ex_immd) {
+ if (!is_on_same_partition_with_coord(cb,
node_info)) {
+ LOG_WA(
+ "Going to reboot node 0x%x",
node_info->immnd_key);
+ accept_evt.info.immnd.info.ctrl
+ .canBeCoord = IMMSV_UNKNOWN;
+ } else {
+ LOG_NO(
+ "Accept re-intro from %x with
ex-IMMD %x",
+ node_info->immnd_key,
node_info->ex_immd_node_id);
+ }
+ } else {
+ LOG_NO("Accept intro from %x",
node_info->immnd_key);
}
accept_evt.info.immnd.info.ctrl.ndExecPid =
@@ -3253,11 +3261,16 @@ static uint32_t immd_evt_proc_mds_evt(IMMD_CB *cb,
IMMD_EVT *evt)
}
} else if ((mds_info->node_id != cb->immd_self_id) &&
(mds_info->node_id != cb->immd_remote_id) &&
- (mds_info->role == V_DEST_RL_ACTIVE) &&
- (cb->ha_state == SA_AMF_HA_STANDBY)) {
- LOG_ER("Standby peer see two peers: %x and %x",
- cb->immd_remote_id, mds_info->node_id);
- opensaf_reboot(0, NULL, "Standby peer see two peers");
+ (mds_info->role == V_DEST_RL_ACTIVE)) {
+ if (cb->ha_state == SA_AMF_HA_STANDBY) {
+ LOG_ER("Standby peer see two peers: %x and %x",
+ cb->immd_remote_id, mds_info->node_id);
+ opensaf_reboot(0, NULL, "Standby peer see two
peers");
+ } else if (cb->ha_state == SA_AMF_HA_ACTIVE) {
+ // Node will be ordered reboot by RDE or IMMND
+ LOG_WA("Another Active IMMD %x",
mds_info->node_id);
+ for (;;) pause();
+ }
}
break;
--
2.25.1
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel