If the active and standby SCs are split into separate network partitions,
it is possible that the RED_UP event never arrives even though we have
already received the MBC PEER_UP. The service using MBC will then get stuck
in an infinite loop, re-posting the peer-up event, and will probably fail
its health checks.

To cater for 'normal' race conditions between MDS topology and data
messages, allow the peer-up event to be re-posted at most 255 times. If
this limit is exceeded, the message is discarded.
---
 src/mbc/mbcsv_evt_msg.h |  2 ++
 src/mbc/mbcsv_peer.c    | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/src/mbc/mbcsv_evt_msg.h b/src/mbc/mbcsv_evt_msg.h
index f11a553..9eef747 100644
--- a/src/mbc/mbcsv_evt_msg.h
+++ b/src/mbc/mbcsv_evt_msg.h
@@ -197,6 +197,8 @@ typedef struct mbcsv_evt {
     MBCSV_EVT_MDS_SUBSCR_INFO mds_sub_evt;
   } info;
 
+  uint32_t hops;
+
 } MBCSV_EVT;
 
 
/***********************************************************************************
diff --git a/src/mbc/mbcsv_peer.c b/src/mbc/mbcsv_peer.c
index b45904f..1d4b257 100644
--- a/src/mbc/mbcsv_peer.c
+++ b/src/mbc/mbcsv_peer.c
@@ -826,6 +826,15 @@ uint32_t mbcsv_process_peer_up_info(MBCSV_EVT *msg, CKPT_INST *ckpt,
                        memcpy(evt, msg, sizeof(MBCSV_EVT));
 
                        TRACE_4("Still RED_UP event not arrived of the peer");
+                       if (evt->hops < 255) {
+                               ++evt->hops;
+                       } else {
+                               LOG_WA("RED_UP missing, discarding peer up");
+                               m_NCS_UNLOCK(&mbcsv_cb.peer_list_lock,
+                                       NCS_LOCK_WRITE);
+                               m_MMGR_FREE_MBCSV_EVT(evt);
+                               return NCSCC_RC_FAILURE;
+                       }
 
                        /* Again post the event, till RED_UP event arrives */
                        if (NCSCC_RC_SUCCESS !=
@@ -833,6 +842,7 @@ uint32_t mbcsv_process_peer_up_info(MBCSV_EVT *msg, CKPT_INST *ckpt,
                                TRACE_LEAVE2("ipc send failed");
                                m_NCS_UNLOCK(&mbcsv_cb.peer_list_lock,
                                             NCS_LOCK_WRITE);
+                               m_MMGR_FREE_MBCSV_EVT(evt);
                                return NCSCC_RC_FAILURE;
                        }
 
-- 
2.7.4



_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to