If the active and standby SCs are split into separate network partitions, it is possible that the RED_UP event never arrives even though we have already received MBC PEER_UP. The service using MBC will then get stuck in an infinite loop, re-posting the peer-up event forever, and will probably fail its health checks.
To cater for 'normal' race conditions between MDS topology and data messages, allow only up to 255 loops. If this is exceeded, the msg will be discarded. --- src/mbc/mbcsv_evt_msg.h | 2 ++ src/mbc/mbcsv_peer.c | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/src/mbc/mbcsv_evt_msg.h b/src/mbc/mbcsv_evt_msg.h index f11a553..9eef747 100644 --- a/src/mbc/mbcsv_evt_msg.h +++ b/src/mbc/mbcsv_evt_msg.h @@ -197,6 +197,8 @@ typedef struct mbcsv_evt { MBCSV_EVT_MDS_SUBSCR_INFO mds_sub_evt; } info; + uint32_t hops; + } MBCSV_EVT; /*********************************************************************************** diff --git a/src/mbc/mbcsv_peer.c b/src/mbc/mbcsv_peer.c index b45904f..1d4b257 100644 --- a/src/mbc/mbcsv_peer.c +++ b/src/mbc/mbcsv_peer.c @@ -826,6 +826,15 @@ uint32_t mbcsv_process_peer_up_info(MBCSV_EVT *msg, CKPT_INST *ckpt, memcpy(evt, msg, sizeof(MBCSV_EVT)); TRACE_4("Still RED_UP event not arrived of the peer"); + if (evt->hops < 255) { + ++evt->hops; + } else { + LOG_WA("RED_UP missing, discarding peer up"); + m_NCS_UNLOCK(&mbcsv_cb.peer_list_lock, + NCS_LOCK_WRITE); + m_MMGR_FREE_MBCSV_EVT(evt); + return NCSCC_RC_FAILURE; + } /* Again post the event, till RED_UP event arrives */ if (NCSCC_RC_SUCCESS != @@ -833,6 +842,7 @@ uint32_t mbcsv_process_peer_up_info(MBCSV_EVT *msg, CKPT_INST *ckpt, TRACE_LEAVE2("ipc send failed"); m_NCS_UNLOCK(&mbcsv_cb.peer_list_lock, NCS_LOCK_WRITE); + m_MMGR_FREE_MBCSV_EVT(evt); return NCSCC_RC_FAILURE; } -- 2.7.4 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel