During SC failover, a message sent by the ACTIVE AMFD may not have
been checkpointed to the AMFD on the STANDBY SC, but the AMFND still
increments its receive/send msg id counts. When the STANDBY SC then
takes over as ACTIVE, the msg ids of the AMFND and the new ACTIVE AMFD
no longer match. The solution is to realign the msg id counts between
AMFD and AMFND in this case.
---
src/amf/amfnd/avnd_cb.h | 1 +
src/amf/amfnd/di.cc | 22 +++++++++++++++++++---
src/amf/amfnd/main.cc | 2 ++
src/amf/amfnd/verify.cc | 28 +++++++++++++++++++---------
4 files changed, 41 insertions(+), 12 deletions(-)
diff --git a/src/amf/amfnd/avnd_cb.h b/src/amf/amfnd/avnd_cb.h
index 8af5e5fe1..a8241b965 100644
--- a/src/amf/amfnd/avnd_cb.h
+++ b/src/amf/amfnd/avnd_cb.h
@@ -96,6 +96,7 @@ typedef struct avnd_cb_tag {
uint32_t rcv_msg_id; /* Message ID of the last message received */
/* AvD messaging params (retransmit list etc.) */
uint32_t snd_msg_id; /* send msg id */
+ uint32_t active_ack_msg_id; // msg id acked by active
/** List of messages sent to director but not yet acked.
* Messages are removed when acked with the ACK message.
diff --git a/src/amf/amfnd/di.cc b/src/amf/amfnd/di.cc
index 40229438d..d1d83bab6 100644
--- a/src/amf/amfnd/di.cc
+++ b/src/amf/amfnd/di.cc
@@ -819,6 +819,7 @@ uint32_t avnd_evt_mds_avd_dn_evh(AVND_CB *cb, AVND_EVT
*evt) {
// reset msg_id counter
cb->rcv_msg_id = 0;
cb->snd_msg_id = 0;
+ cb->active_ack_msg_id = 0;
//Inform AMFA about SCs absence now.
avnd_send_sc_status_message(OSAF_AMF_SC_ABSENT);
@@ -1260,10 +1261,23 @@ uint32_t avnd_di_ack_nack_msg_send(AVND_CB *cb,
uint32_t rcv_id,
msg.info.avd->msg_info.n2d_ack_nack_info.msg_id = (cb->snd_msg_id + 1);
msg.info.avd->msg_info.n2d_ack_nack_info.node_id = cb->node_info.nodeId;
- if (rcv_id != cb->rcv_msg_id)
- msg.info.avd->msg_info.n2d_ack_nack_info.ack = false;
- else
+ if (rcv_id != cb->rcv_msg_id) {
+ LOG_WA("Mismatch msg id, AVD send ID count: %u, "
+ "AVND receive ID count: %u", rcv_id, cb->rcv_msg_id);
+ // During SC failover, a message sent by the ACTIVE AMFD may not have
+ // been checkpointed to the AMFD on the STANDBY SC, although the AMFND
+ // already counted its msg id. When the STANDBY SC takes ACTIVE, the msg
+ // ids of AMFD and AMFND mismatch: the AVND receive ID count is greater
+ // than the AVD send ID count. Realign and respond with ack(true).
+ if (cb->rcv_msg_id > rcv_id) {
+ cb->rcv_msg_id = rcv_id;
+ msg.info.avd->msg_info.n2d_ack_nack_info.ack = true;
+ } else {
+ msg.info.avd->msg_info.n2d_ack_nack_info.ack = false;
+ }
+ } else {
msg.info.avd->msg_info.n2d_ack_nack_info.ack = true;
+ }
TRACE_1("MsgId=%u,ACK=%u", msg.info.avd->msg_info.n2d_ack_nack_info.msg_id,
msg.info.avd->msg_info.n2d_ack_nack_info.ack);
@@ -1363,6 +1377,8 @@ uint32_t avnd_di_node_down_msg_send(AVND_CB *cb)
void avnd_di_msg_ack_process(AVND_CB *cb, uint32_t mid) {
TRACE_ENTER2("%u", mid);
+ cb->active_ack_msg_id = mid;
+
for (auto iter = cb->dnd_list.begin(); iter != cb->dnd_list.end(); ++iter)
{
auto rec = *iter;
osafassert(rec->msg.type == AVND_MSG_AVD);
diff --git a/src/amf/amfnd/main.cc b/src/amf/amfnd/main.cc
index 265907917..24c2e9b85 100644
--- a/src/amf/amfnd/main.cc
+++ b/src/amf/amfnd/main.cc
@@ -343,6 +343,8 @@ AVND_CB *avnd_cb_create() {
cb->is_avd_down = true;
cb->amfd_sync_required = false;
+ cb->active_ack_msg_id = 0;
+
// retrieve hydra configuration from IMM
hydra_config_get(cb);
cb->sc_absence_tmr.is_active = false;
diff --git a/src/amf/amfnd/verify.cc b/src/amf/amfnd/verify.cc
index e5b1e7793..325d170e7 100644
--- a/src/amf/amfnd/verify.cc
+++ b/src/amf/amfnd/verify.cc
@@ -128,15 +128,25 @@ uint32_t avnd_evt_avd_verify_evh(AVND_CB *cb, AVND_EVT
*evt) {
}
if ((cb->snd_msg_id != info->rcv_id_cnt) && (msg_found == false)) {
- /* Log error, seems to be some problem.*/
- LOG_EM(
- "AVND record not found, after failover, snd_msg_id = %u, receive id =
%u",
- cb->snd_msg_id, info->rcv_id_cnt);
- opensaf_reboot(
- avnd_cb->node_info.nodeId,
- osaf_extended_name_borrow(&avnd_cb->node_info.executionEnvironment),
- "AVND record not found, after failover");
- exit(0);
+ if (cb->snd_msg_id == cb->active_ack_msg_id) {
+ // During SC failover, a message received by the ACTIVE AMFD may not
+ // have been checkpointed to the AMFD on the STANDBY SC, yet the AMFND
+ // already processed the ack for that message and removed it from the
+ // queue. When the STANDBY SC takes ACTIVE, the msg ids of AMFD and
+ // AMFND mismatch: the AVND send ID count is greater than the AVD
+ // receive ID count on the new ACTIVE. Should realign.
+ cb->snd_msg_id = info->rcv_id_cnt;
+ } else {
+ /* Log error, seems to be some problem.*/
+ LOG_EM(
+ "AVND record not found, after failover, snd_msg_id = %u, receive id =
%u",
+ cb->snd_msg_id, info->rcv_id_cnt);
+ opensaf_reboot(
+ avnd_cb->node_info.nodeId,
+
osaf_extended_name_borrow(&avnd_cb->node_info.executionEnvironment),
+ "AVND record not found, after failover");
+ exit(0);
+ }
}
/*