During SC failover, a message sent by the ACTIVE AMFD may not have been checkpointed to the AMFD on the STANDBY SC, but the AMFND still increments its receive/send msg id counts. When the STANDBY SC then takes over as ACTIVE, the msg ids of the AMFND and the new active AMFD no longer match. The solution is to realign the msg id counts between AMFD and AMFND in this case. --- src/amf/amfnd/avnd_cb.h | 1 + src/amf/amfnd/di.cc | 19 ++++++++++++++++--- src/amf/amfnd/verify.cc | 24 +++++++++++++++++------- 3 files changed, 34 insertions(+), 10 deletions(-)
diff --git a/src/amf/amfnd/avnd_cb.h b/src/amf/amfnd/avnd_cb.h index 8af5e5fe1..a8241b965 100644 --- a/src/amf/amfnd/avnd_cb.h +++ b/src/amf/amfnd/avnd_cb.h @@ -96,6 +96,7 @@ typedef struct avnd_cb_tag { uint32_t rcv_msg_id; /* Message ID of the last message received */ /* AvD messaging params (retransmit list etc.) */ uint32_t snd_msg_id; /* send msg id */ + uint32_t active_ack_msg_id; // msg id acked by active /** List of messages sent to director but not yet acked. * Messages are removed when acked with the ACK message. diff --git a/src/amf/amfnd/di.cc b/src/amf/amfnd/di.cc index 40229438d..1ebf22d65 100644 --- a/src/amf/amfnd/di.cc +++ b/src/amf/amfnd/di.cc @@ -1260,10 +1260,23 @@ uint32_t avnd_di_ack_nack_msg_send(AVND_CB *cb, uint32_t rcv_id, msg.info.avd->msg_info.n2d_ack_nack_info.msg_id = (cb->snd_msg_id + 1); msg.info.avd->msg_info.n2d_ack_nack_info.node_id = cb->node_info.nodeId; - if (rcv_id != cb->rcv_msg_id) - msg.info.avd->msg_info.n2d_ack_nack_info.ack = false; - else + if (rcv_id != cb->rcv_msg_id) { + LOG_WA("Mismatch msg id, AVD send ID count: %u, " + "AVND receive ID count: %u", rcv_id, cb->rcv_msg_id); + // During SC failover, message sent on ACTIVE AMFD can not + // be checked point to AMFD on STANDBY SC. But the AMFND still + // receive msg id. STANDBY SC takes ACTIVE and mismatch message + // id b/w AMFD and AMFND on new ACTIVE. In this case AVND receive + // ID count greater than AVD sent id count. Shoudl rsp ack(true). 
+ if (cb->rcv_msg_id > rcv_id) { + cb->rcv_msg_id = rcv_id; + msg.info.avd->msg_info.n2d_ack_nack_info.ack = true; + } else { + msg.info.avd->msg_info.n2d_ack_nack_info.ack = false; + } + } else { msg.info.avd->msg_info.n2d_ack_nack_info.ack = true; + } TRACE_1("MsgId=%u,ACK=%u", msg.info.avd->msg_info.n2d_ack_nack_info.msg_id, msg.info.avd->msg_info.n2d_ack_nack_info.ack); diff --git a/src/amf/amfnd/verify.cc b/src/amf/amfnd/verify.cc index e5b1e7793..d6edc8855 100644 --- a/src/amf/amfnd/verify.cc +++ b/src/amf/amfnd/verify.cc @@ -128,15 +128,25 @@ uint32_t avnd_evt_avd_verify_evh(AVND_CB *cb, AVND_EVT *evt) { } if ((cb->snd_msg_id != info->rcv_id_cnt) && (msg_found == false)) { + if (cb->snd_msg_id == cb->active_ack_msg_id) { + // During SC failover, message received on ACTIVE AMFD can not + // be checked point to AMFD on STANDBY SC. But the AMFND still + // process the message ack for that message then it remove from queue. + // STANDBY SC takes ACTIVE and mismatch message id b/w AMFD and AMFND + // on new ACTIVE. In this case AVND send ID count greater than AVD receive + // ID count on new ACTIVE. Shoudl realign. + cb->snd_msg_id = info->rcv_id_cnt; + } else { /* Log error, seems to be some problem.*/ LOG_EM( - "AVND record not found, after failover, snd_msg_id = %u, receive id = %u", - cb->snd_msg_id, info->rcv_id_cnt); - opensaf_reboot( - avnd_cb->node_info.nodeId, - osaf_extended_name_borrow(&avnd_cb->node_info.executionEnvironment), - "AVND record not found, after failover"); - exit(0); + "AVND record not found, after failover, snd_msg_id = %u, receive id = %u", + cb->snd_msg_id, info->rcv_id_cnt); + opensaf_reboot( + avnd_cb->node_info.nodeId, + osaf_extended_name_borrow(&avnd_cb->node_info.executionEnvironment), + "AVND record not found, after failover"); + exit(0); + } } /* -- 2.25.1 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel