Hi Minh, I created a new ticket https://sourceforge.net/p/opensaf/tickets/3294/ to refactor these functions. We will refactor it in that ticket instead of this.
Best regards, Hieu -----Original Message----- From: Minh Hon Chau <minh.c...@dektech.com.au> Sent: Tuesday, October 26, 2021 5:52 AM To: Hieu Hong Hoang <hieu.h.ho...@dektech.com.au>; Thang Duc Nguyen <thang.d.ngu...@dektech.com.au>; Thien Minh Huynh <thien.m.hu...@dektech.com.au> Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] mds: Resolve active MxN VDEST conflict in split brain [#3281] Hi Hieu, ack from me, but we should refactor this function into smaller subfunctions/logics, the function currently has 1.5K+ line of code. Thanks Minh On 18/10/21 6:33 pm, hieu.h.hoang wrote: > If split brain happens and network merges back, one active SC is possible to > stay alive due to other active SCs reboot too fast. Although alive SC doesn't > detect the split brain, it still detects some services are up in active > rebooting SCs. > In the ticket description, the status of cluster before network merges is > [[SC-1(ACT), SC-2(STB)], [SC-3(ACT), SC-4, SC-5(STB), SC-6], [SC-7(STB), > SC-8(ATC), SC-9], [SC-10(ATC)]]. On SC-1, the ntfa received wrong > NCSMDS_NO_ACTIVE in the below scenario: > - Detect ntfs on active SC-3 is up. > - Because ntfs on SC-1 is active, there are not SVC up event. > - Update the active vdest to new ntfs(on SC-3). > - Detect ntfs on active SC-3 is down. > - The active vdest is removed. A NCSMDS_NO_ACTIVE is generated, although ntfs > on SC-1 is still active. > Solution: don't replace current active vdest by a new active vdest. When > current active vdest is down, replace it by other active vdest if any. > --- > src/mds/mds_c_api.c | 260 ++++++++++++++++++++++++++++++++++---------- > src/mds/mds_c_db.c | 22 ++-- > 2 files changed, 213 insertions(+), 69 deletions(-) > > diff --git a/src/mds/mds_c_api.c b/src/mds/mds_c_api.c index > 2a297a1e2..8e2f899a9 100644 > --- a/src/mds/mds_c_api.c > +++ b/src/mds/mds_c_api.c > @@ -2799,21 +2799,16 @@ uint32_t mds_mcm_svc_up(PW_ENV_ID pwe_id, MDS_SVC_ID > svc_id, V_DEST_RL role, > } > } > > - /* Make this as active > - */ > - > mds_subtn_res_tbl_change_active( > - local_svc_hdl, > - svc_id, > - (MDS_VDEST_ID) > - vdest_id, > - subtn_result_info, > - svc_sub_part_ver, > - archword_type); > - > - if ((tmr_running == > - true) || > - (local_subtn_view == > - MDS_VIEW_NORMAL)) { > + if (tmr_running == > true) { > + /* Make this as > active */ > + > mds_subtn_res_tbl_change_active( > + > local_svc_hdl, > + svc_id, > + > (MDS_VDEST_ID) > + > vdest_id, > + > subtn_result_info, > + > svc_sub_part_ver, > + > archword_type); > /* Call user > * callback UP > */ > @@ -2862,6 +2857,11 @@ uint32_t mds_mcm_svc_up(PW_ENV_ID pwe_id, MDS_SVC_ID > svc_id, V_DEST_RL role, > > ->sub_adest_details, > > svc_sub_part_ver, > > archword_type); > + } else { > + /* > + * Conflict > active entry, > + * keep old > active entry. > + */ > } > } > } > @@ -2973,6 +2973,63 @@ uint32_t mds_mcm_svc_up(PW_ENV_ID pwe_id, MDS_SVC_ID > svc_id, V_DEST_RL role, > svc_sub_part_ver, > archword_type); > } > + > + status = > + > mds_subtn_res_tbl_query_next_active( > + local_svc_hdl, svc_id, > + vdest_id, > + subtn_result_info, > + > &next_active_result_info); > + if (status == > + NCSCC_RC_SUCCESS) { > + /* > + * Other active present > + * Change active entry > + */ > + > mds_subtn_res_tbl_change_active( > + local_svc_hdl, > + svc_id, > vdest_id, > + > next_active_result_info, > + > svc_sub_part_ver, > + archword_type); > + status = > mds_mcm_user_event_callback( > + local_svc_hdl, > pwe_id, > + svc_id, role, > vdest_id, 0, > + > NCSMDS_NEW_ACTIVE, > + > svc_sub_part_ver, > + > MDS_SVC_ARCHWORD_TYPE_UNSPECIFIED); > + if (status != > NCSCC_RC_SUCCESS) { > + /* Callback > failure */ > + m_MDS_LOG_ERR( > + > "MCM:API: svc_up :" > + " > NCSMDS_NEW_ACTIVE Callback" > + " > Failure for svc_id = %s(%d)", > + > get_svc_names( > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + > local_svc_hdl)), > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + > local_svc_hdl)); > + m_MDS_LEAVE(); > + return > NCSCC_RC_FAILURE; > + } > + m_MDS_LOG_INFO( > + "MCM:API: > svc_up : svc_id = %s(%d)" > + " on DEST id = > %d got NCSMDS_NEW_ACTIVE for" > + " svc_id = > %s(%d) on Vdest id = %d" > + " Adest = %s, > rem_svc_pvt_ver=%d", > + get_svc_names( > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + > local_svc_hdl)), > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + local_svc_hdl), > + > m_MDS_GET_VDEST_ID_FROM_SVC_HDL( > + local_svc_hdl), > + > get_svc_names(svc_id), > + svc_id, > vdest_id, > + > next_active_result_info > + > ->sub_adest_details, > + > svc_sub_part_ver); > + } > } else { /* Some other entry is active > */ > > @@ -3779,7 +3836,7 @@ uint32_t mds_mcm_svc_down(PW_ENV_ID pwe_id, MDS_SVC_ID > svc_id, V_DEST_RL role, > m_MDS_LOG_INFO("MCM:API: svc_down : " > "svc_id = %s(%d) on DEST id = > %d " > "got NO_ACTIVE for svc_id = > %s(%d) " > - "on Vdest id = %d Adest = %s, rem_svc_pvt_ver=%d", > + "on Vdest id = %d Adest = %s, > rem_svc_pvt_ver=%d", > get_svc_names( > > m_MDS_GET_SVC_ID_FROM_SVC_HDL(local_svc_hdl)), > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > @@ -3799,49 +3856,142 @@ > uint32_t mds_mcm_svc_down(PW_ENV_ID pwe_id, MDS_SVC_ID svc_id, V_DEST_RL role, > if (active_adest == adest) { > if (vdest_policy == > NCS_VDEST_TYPE_MxN) { > - mds_subtn_res_tbl_remove_active( > - local_svc_hdl, svc_id, > - vdest_id); > + status = > + > mds_subtn_res_tbl_query_next_active( > + local_svc_hdl, svc_id, > + vdest_id, > + subtn_result_info, > + > &next_active_result_info); > + if (status == > + NCSCC_RC_FAILURE) { > + /* No other active > present */ > + > mds_subtn_res_tbl_remove_active( > + local_svc_hdl, > svc_id, > + vdest_id); > > - /* Call user call back with NO > - * ACTIVE */ > - status = NCSCC_RC_SUCCESS; > - status = > mds_mcm_user_event_callback( > - local_svc_hdl, pwe_id, > - svc_id, role, vdest_id, 0, > - NCSMDS_NO_ACTIVE, > - svc_sub_part_ver, > - > MDS_SVC_ARCHWORD_TYPE_UNSPECIFIED); > + /* Call user call back > with NO ACTIVE */ > + status = > NCSCC_RC_SUCCESS; > + status = > mds_mcm_user_event_callback( > + local_svc_hdl, > pwe_id, > + svc_id, role, > vdest_id, 0, > + > NCSMDS_NO_ACTIVE, > + > svc_sub_part_ver, > + > MDS_SVC_ARCHWORD_TYPE_UNSPECIFIED); > > - if (status != > - NCSCC_RC_SUCCESS) { > - /* Callback failure */ > - m_MDS_LOG_ERR( > - "MCM:API: svc_down > : NO_ACTIVE Callback Failure for svc_id = %s(%d)", > - get_svc_names( > + if (status != > + > NCSCC_RC_SUCCESS) { > + /* Callback > failure */ > + m_MDS_LOG_ERR( > + > "MCM:API: svc_down : NO_ACTIVE Callback Failure for svc_id = %s(%d)", > + > get_svc_names( > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + > local_svc_hdl)), > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + > local_svc_hdl)); > + m_MDS_LEAVE(); > + return > NCSCC_RC_FAILURE; > + } > + > + m_MDS_LOG_INFO( > + "MCM:API: > svc_down : svc_id = %s(%d) on DEST id = %d got NO_ACTIVE for " > + "svc_id = > %s(%d) on Vdest id = %d Adest = %s, rem_svc_pvt_ver=%d", > + get_svc_names( > > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > - > local_svc_hdl)), > - > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > - local_svc_hdl)); > - m_MDS_LEAVE(); > - return NCSCC_RC_FAILURE; > + > local_svc_hdl)), > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + local_svc_hdl), > + > m_MDS_GET_VDEST_ID_FROM_SVC_HDL( > + local_svc_hdl), > + > get_svc_names(svc_id), > + svc_id, > vdest_id, > + > log_subtn_result_info > + > ->sub_adest_details, > + > svc_sub_part_ver); > + } else { > + status = > mds_mcm_user_event_callback( > + local_svc_hdl, > pwe_id, > + svc_id, role, > vdest_id, 0, > + > NCSMDS_NO_ACTIVE, > + > svc_sub_part_ver, > + > MDS_SVC_ARCHWORD_TYPE_UNSPECIFIED); > + if (status != > NCSCC_RC_SUCCESS) { > + /* Callback > failure */ > + m_MDS_LOG_ERR( > + > "MCM:API: svc_down : NCSMDS_NO_ACTIVE" > + " > Callback Failure for svc_id = %s(%d)", > + > get_svc_names( > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + > local_svc_hdl)), > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + > local_svc_hdl)); > + m_MDS_LEAVE(); > + return > NCSCC_RC_FAILURE; > + } > + m_MDS_LOG_INFO( > + "MCM:API: > svc_down : svc_id = %s(%d)" > + " on DEST id = > %d got NCSMDS_DOWN for" > + " svc_id = > %s(%d) on Vdest id = %d" > + " Adest = %s, > rem_svc_pvt_ver=%d", > + get_svc_names( > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + > local_svc_hdl)), > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + local_svc_hdl), > + > m_MDS_GET_VDEST_ID_FROM_SVC_HDL( > + local_svc_hdl), > + > get_svc_names(svc_id), > + svc_id, > vdest_id, > + > log_subtn_result_info > + > ->sub_adest_details, > + > svc_sub_part_ver); > + /* > + * Change Active entry > + */ > + > mds_subtn_res_tbl_change_active( > + local_svc_hdl, > + svc_id, > vdest_id, > + > next_active_result_info, > + > svc_sub_part_ver, > + archword_type); > + status = > mds_mcm_user_event_callback( > + local_svc_hdl, > pwe_id, > + svc_id, role, > vdest_id, 0, > + > NCSMDS_NEW_ACTIVE, > + > svc_sub_part_ver, > + > MDS_SVC_ARCHWORD_TYPE_UNSPECIFIED); > + if (status != > NCSCC_RC_SUCCESS) { > + /* > + * Callback > failure > + */ > + m_MDS_LOG_ERR( > + > "MCM:API: svc_up : NCSMDS_NEW_ACTIVE" > + " > Callback Failure for svc_id = %s(%d)", > + > get_svc_names( > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + > local_svc_hdl)), > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + > local_svc_hdl)); > + m_MDS_LEAVE(); > + return > NCSCC_RC_FAILURE; > + } > + m_MDS_LOG_INFO( > + "MCM:API: > svc_up : svc_id = %s(%d)" > + " on DEST id = > %d got NCSMDS_NEW_ACTIVE for" > + " svc_id = > %s(%d) on Vdest id = %d" > + " Adest = %s, > rem_svc_pvt_ver=%d", > + get_svc_names( > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + > local_svc_hdl)), > + > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > + local_svc_hdl), > + > m_MDS_GET_VDEST_ID_FROM_SVC_HDL( > + local_svc_hdl), > + > get_svc_names(svc_id), > + svc_id, > vdest_id, > + > next_active_result_info > + > ->sub_adest_details, > + > svc_sub_part_ver); > } > - > - m_MDS_LOG_INFO( > - "MCM:API: svc_down : svc_id > = %s(%d) on DEST id = %d got NO_ACTIVE for " > - "svc_id = %s(%d) on Vdest > id = %d Adest = %s, rem_svc_pvt_ver=%d", > - get_svc_names( > - > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > - local_svc_hdl)), > - > m_MDS_GET_SVC_ID_FROM_SVC_HDL( > - local_svc_hdl), > - > m_MDS_GET_VDEST_ID_FROM_SVC_HDL( > - local_svc_hdl), > - get_svc_names(svc_id), > - svc_id, vdest_id, > - log_subtn_result_info > - ->sub_adest_details, > - svc_sub_part_ver); > { > if (adest_exists == > false) { > diff --git a/src/mds/mds_c_db.c b/src/mds/mds_c_db.c index > 337f0cb23..2213c664b 100644 > --- a/src/mds/mds_c_db.c > +++ b/src/mds/mds_c_db.c > @@ -1991,20 +1991,14 @@ uint32_t mds_subtn_res_tbl_add(MDS_SVC_HDL svc_hdl, > MDS_SVC_ID subscr_svc_id, > .active_route_info > ->next_active_in_turn = > subtn_res_info; > - } else { /* Present entry is Active Entry */ > - > - if (local_vdest_policy == > - NCS_VDEST_TYPE_MxN) { > - /* Change active to point to > - * this active */ > - active_subtn_res_info->info > - .active_vdest > - .active_route_info > - ->next_active_in_turn = > - subtn_res_info; > - } else { > - /* Do nothing just add entry */ > - } > + } else { > + /* > + * Present entry is Active Entry. > + * Do nothing just add entry. > + * It is an active entry conflict > + * in NCS_VDEST_TYPE_MxN mode, just wait > + * for an actice entry goes down. > + */ > } > } > } else { /* role == V_DEST_RL_STANDBY */ _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel