Hi Vu, Thanks for comments. I reply my answer inline.
Best Regards, ThuanTr -----Original Message----- From: Nguyen Minh Vu <vu.m.ngu...@dektech.com.au> Sent: Tuesday, October 22, 2019 4:42 PM To: thuan.tran <thuan.t...@dektech.com.au>; 'Minh Chau' <minh.c...@dektech.com.au>; hans.nordeb...@ericsson.com; gary....@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net Subject: Re: [PATCH 1/1] mds: not waste 1.5s in waiting Adest already down to send response message [#3102] Hi Thuan, I have few comments below. Regards, Vu On 10/22/19 7:14 AM, thuan.tran wrote: > - When sending response message which Adest not exist (already down) > current MDS try to wait for 1.5 seconds before conclude no route to > send response message. > > - There are 2 scenarios may have: > UP -> DOWN -> receive SNDRSP -> response timeout after 1.5s > UP -> receive SNDRSP -> DOWN -> response timeout after 1.5s > > - With this change, MDS will not waste for 1.5s which can cause trouble > for higher layer services, e.g: ntf, imm, etc... > --- > src/mds/mds_c_api.c | 70 ++++++++++++++++++++++++++++++++++++++++- > src/mds/mds_c_sndrcv.c | 52 ++++++++++++++++++++++++++++-- > src/mds/mds_core.h | 25 +++++++++++++-- > src/mds/mds_dt2c.h | 2 +- > src/mds/mds_dt_common.c | 22 ++++++++++++- > 5 files changed, 162 insertions(+), 9 deletions(-) > > diff --git a/src/mds/mds_c_api.c b/src/mds/mds_c_api.c > index 132555b8e..5dd30c536 100644 > --- a/src/mds/mds_c_api.c > +++ b/src/mds/mds_c_api.c > @@ -1900,6 +1900,32 @@ uint32_t mds_mcm_svc_up(PW_ENV_ID pwe_id, MDS_SVC_ID > svc_id, V_DEST_RL role, > > /*************** Validation for SCOPE **********************/ > > + if (adest != m_MDS_GET_ADEST) { > + MDS_ADEST_INFO *adest_info = > + (MDS_ADEST_INFO *)ncs_patricia_tree_get( > + &gl_mds_mcm_cb->adest_list, > + (uint8_t *)&adest); > + if (!adest_info) { > + /* Add adest to adest list */ > + adest_info = m_MMGR_ALLOC_ADEST_INFO; > + memset(adest_info, 0, sizeof(MDS_ADEST_INFO)); > + adest_info->adest = adest; > + adest_info->node.key_info = > + (uint8_t *)&adest_info->adest; > + adest_info->svc_cnt = 1; > + adest_info->tmr_start = false; > + ncs_patricia_tree_add( > + &gl_mds_mcm_cb->adest_list, > + (NCS_PATRICIA_NODE *)adest_info); > + m_MDS_LOG_DBG("MCM:API: Adest=%" PRIu64 > + " svc_cnt=%u", adest, adest_info->svc_cnt); > + } else { > + adest_info->svc_cnt++; > + m_MDS_LOG_DBG("MCM:API: Adest=%" PRIu64 > + " svc_cnt=%u", adest, adest_info->svc_cnt); > + } > + } [Vu] This new database, adest_list, is shared b/w internal osaf_mds thread and mds's user thread, hence should be protected by mutex. [Thuan] It's protected by gl_mds_library_mutex > + > status = mds_get_subtn_res_tbl_by_adest(local_svc_hdl, svc_id, vdest_id, > adest, &log_subtn_result_info); > > @@ -3571,6 +3597,24 @@ uint32_t mds_mcm_svc_down(PW_ENV_ID pwe_id, MDS_SVC_ID > svc_id, V_DEST_RL role, > /* Discard : Getting down before getting up */ > } else { /* Entry exist in subscription result table */ > > + /* If adest exist and no sndrsp, start a timer */ > + MDS_ADEST_INFO *adest_info = > + (MDS_ADEST_INFO *)ncs_patricia_tree_get( > + &gl_mds_mcm_cb->adest_list, > + (uint8_t *)&adest); > + if (adest_info) { > + adest_info->svc_cnt--; > + if (adest_info->svc_cnt == 0 && > + adest_info->sndrsp_cnt == 0) { > + m_MDS_LOG_INFO("MCM:API: Adest=%" PRIu64 > + " down timer start", adest); > + if (adest_info->tmr_start == false) { > + adest_info->tmr_start = true; > + start_mds_down_tmr(adest, svc_id); [Vu] Seems mds_down tmr is started twice. The first start is at the beginning of the function. [Thuan] That timer is not same with this timer, that timer for current process. This timer for remote adest [Vu] But what is the reason to start the mds down timer here? What if we won't start the tmr? [Thuan] Timer for handle UP -> DOWN -> receive SNDRSP -> send RSP /* potentially clean up the process info database */ MDS_PROCESS_INFO *info = mds_process_info_get(adest, svc_id); if (info != NULL) { /* Process info exist, delay the cleanup with a timeout to avoid * race */ start_mds_down_tmr(adest, svc_id); } > + } > + } > + } > + > if (vdest_id == m_VDEST_ID_FOR_ADEST_ENTRY) { > status = mds_subtn_res_tbl_del( > local_svc_hdl, svc_id, vdest_id, adest, > @@ -4956,6 +5000,17 @@ uint32_t mds_mcm_init(void) > return NCSCC_RC_FAILURE; > } > > + /* ADEST TREE */ > + memset(&pat_tree_params, 0, sizeof(NCS_PATRICIA_PARAMS)); > + pat_tree_params.key_size = sizeof(MDS_DEST); > + if (NCSCC_RC_SUCCESS != > + ncs_patricia_tree_init(&gl_mds_mcm_cb->adest_list, > + &pat_tree_params)) { > + m_MDS_LOG_ERR( > + "MCM:API: patricia_tree_init: adest :failure, L > mds_mcm_init"); > + return NCSCC_RC_FAILURE; > + } > + > /* SERVICE TREE */ > memset(&pat_tree_params, 0, sizeof(NCS_PATRICIA_PARAMS)); > pat_tree_params.key_size = sizeof(MDS_SVC_HDL); > @@ -4966,7 +5021,12 @@ uint32_t mds_mcm_init(void) > if (NCSCC_RC_SUCCESS != > ncs_patricia_tree_destroy(&gl_mds_mcm_cb->vdest_list)) { > m_MDS_LOG_ERR( > - "MCM:API: patricia_tree_destroy: service :failure, > L mds_mcm_init"); > + "MCM:API: patricia_tree_destroy: vdest :failure, L > mds_mcm_init"); > + } > + if (NCSCC_RC_SUCCESS != > + ncs_patricia_tree_destroy(&gl_mds_mcm_cb->adest_list)) { > + m_MDS_LOG_ERR( > + "MCM:API: patricia_tree_destroy: adest :failure, L > mds_mcm_init"); > } > return NCSCC_RC_FAILURE; > } > @@ -4989,6 +5049,11 @@ uint32_t mds_mcm_init(void) > m_MDS_LOG_ERR( > "MCM:API: patricia_tree_destroy: vdest :failure, L > mds_mcm_init"); > } > + if (NCSCC_RC_SUCCESS != > + ncs_patricia_tree_destroy(&gl_mds_mcm_cb->adest_list)) { > + m_MDS_LOG_ERR( > + "MCM:API: patricia_tree_destroy: adest :failure, L > mds_mcm_init"); > + } > return NCSCC_RC_FAILURE; > } > > @@ -5036,6 +5101,9 @@ uint32_t mds_mcm_destroy(void) > /* VDEST TREE */ > ncs_patricia_tree_destroy(&gl_mds_mcm_cb->vdest_list); > > + /* ADEST TREE */ > + ncs_patricia_tree_destroy(&gl_mds_mcm_cb->adest_list); > + > /* Free MCM control block */ > m_MMGR_FREE_MCM_CB(gl_mds_mcm_cb); > > diff --git a/src/mds/mds_c_sndrcv.c b/src/mds/mds_c_sndrcv.c > index 7850ac714..4d8865b3a 100644 > --- a/src/mds/mds_c_sndrcv.c > +++ b/src/mds/mds_c_sndrcv.c > @@ -2650,6 +2650,22 @@ static uint32_t mcm_pvt_red_snd_process_common( > } > } > > + /* Update adest sndrsp counter */ > + if ((MDS_SENDTYPE_RSP == req->i_sendtype) || > + (MDS_SENDTYPE_RRSP == req->i_sendtype)) { [Vu] How about other send types? In current code, it also wait up to 1.5 seconds before returning timeout to caller. case MDS_SENDTYPE_SND: case MDS_SENDTYPE_RSP: case MDS_SENDTYPE_RED: case MDS_SENDTYPE_RRSP: case MDS_SENDTYPE_BCAST: case MDS_SENDTYPE_RBCAST: [Thuan] From mds.log of issue, the wait time only set for RSP/RRSP in mds_mcm_process_disc_queue_checks_redundant() > + MDS_ADEST_INFO *adest_info = > + (MDS_ADEST_INFO *)ncs_patricia_tree_get( > + &gl_mds_mcm_cb->adest_list, (uint8_t *)&dest); > + if (adest_info) { > + adest_info->sndrsp_cnt--; > + if (adest_info->sndrsp_cnt == 0 && > + adest_info->svc_cnt == 0) > + ncs_patricia_tree_del( > + &gl_mds_mcm_cb->adest_list, > + (NCS_PATRICIA_NODE *)adest_info); > + } > + } > + > if (NCSCC_RC_SUCCESS != mds_subtn_res_tbl_get_by_adest( > svc_cb->svc_hdl, to_svc_id, dest_vdest_id, > dest, &role_ret, &subs_result_hdl)) { > @@ -2716,9 +2732,27 @@ static uint32_t > mds_mcm_process_disc_queue_checks_redundant( > } else if (sub_info->tmr_flag != true) { > if ((MDS_SENDTYPE_RSP == req->i_sendtype) || > (MDS_SENDTYPE_RRSP == req->i_sendtype)) { > - time_wait = true; > - m_MDS_LOG_INFO( > - "MDS_SND_RCV:Disc queue red: Subscr exists no timer > running: Waiting for some time\n"); > + MDS_ADEST_INFO *adest_info = > + (MDS_ADEST_INFO *)ncs_patricia_tree_get( > + &gl_mds_mcm_cb->adest_list, > + (uint8_t *)&anchor); > + if (adest_info && adest_info->svc_cnt == 0) { > + adest_info->sndrsp_cnt--; > + m_MDS_LOG_NOTIFY( > + "MDS_SND_RCV: Adest already down," > + " skip response, remain sndrsp_cnt=%d", > + adest_info->sndrsp_cnt); > + if (adest_info->sndrsp_cnt == 0) > + ncs_patricia_tree_del( > + &gl_mds_mcm_cb->adest_list, > + (NCS_PATRICIA_NODE *)adest_info); > + return NCSCC_RC_FAILURE; [Vu] since we know the destination is down, the error code should not be a failure. [Thuan] I think it should be FAILURE, we just save time, still respect the error. > + } else { > + time_wait = true; > + m_MDS_LOG_INFO( > + "MDS_SND_RCV:Disc queue red: Subscr exists" > + " no timer running: Waiting for some time"); > + } > } else { > m_MDS_LOG_INFO( > "MDS_SND_RCV: Subscription exists but Timer has > expired\n"); > @@ -5024,6 +5058,18 @@ static uint32_t > mds_mcm_process_recv_snd_msg_common(MDS_SVC_INFO *svccb, > mds_mcm_free_msg_memory(recv->msg); > return NCSCC_RC_FAILURE; > } > + > + /* Update adest sndrsp counter */ > + if (recv->snd_type == MDS_SENDTYPE_SNDRSP || > + recv->snd_type == MDS_SENDTYPE_REDRSP) { > + MDS_ADEST_INFO *adest_info = > + (MDS_ADEST_INFO *)ncs_patricia_tree_get( > + &gl_mds_mcm_cb->adest_list, > + (uint8_t *)&recv->src_adest); > + if (adest_info) > + adest_info->sndrsp_cnt++; > + } [Vu] If adest_info does not exist from the adest database, mds should not fwd the message to upper layer as the rsp will be dropped later on. [Thuan] The adest can UP later. It mean if not yet get any UP for adest, the flow will be same as origin. > + > cbinfo.info.receive.pid = recv->pid; > cbinfo.info.receive.uid = recv->uid; > cbinfo.info.receive.gid = recv->gid; > diff --git a/src/mds/mds_core.h b/src/mds/mds_core.h > index c09b428ba..f5d4d22a1 100644 > --- a/src/mds/mds_core.h > +++ b/src/mds/mds_core.h > @@ -204,12 +204,12 @@ typedef struct mds_subscription_info { > #define m_GET_HDL_FROM_MDS_SVC_INFO(info) \ > ((info->svc-id)<<32 || \ > (info->parent_pwe->pwe_id)<<16 || \ > - (info->parent_pwe->parent_vdest->vdest-id))) > + (info->parent_pwe->parent_vdest->vdest-id)) > > #define m_GET_HDL_FROM_MDS_SVC_PWE_VDEST(s, p, v) \ > ((s)<<32 || \ > (p)<<16 || \ > - (v))) > + (v)) > > /**************************************\ > MDS PWE related declarations > @@ -251,6 +251,16 @@ typedef struct mds_vdest_info { > > } MDS_VDEST_INFO; > > +typedef struct mds_adest_info { > + /* Indexing info */ > + NCS_PATRICIA_NODE node; > + /* Adest info */ > + MDS_DEST adest; /* Key for Patricia node */ > + uint16_t svc_cnt; /* Adest SVC counter */ > + uint32_t sndrsp_cnt; /* Counter for adest send sndrsp */ > + bool tmr_start; /* Adest down timer start flag */ [Vu] What is the purpose of tmr_start flag? [Thuan] To not start timer many times, save resource. E.g: UP -> DOWN (timer start) -> UP -> DOWN (not start new timer if timer already started) > +} MDS_ADEST_INFO; > + > typedef struct mds_svc_info { > /* Indexing info */ > NCS_PATRICIA_NODE svc_list_node; > @@ -301,6 +311,7 @@ typedef struct mds_mcm_cb { > NCS_PATRICIA_TREE subtn_results; > NCS_PATRICIA_TREE svc_list; /* Tree of MDS_SVC_INFO information */ > NCS_PATRICIA_TREE vdest_list; /* Tree of MDS_VDEST_INFO information */ > + NCS_PATRICIA_TREE adest_list; /* Tree of MDS_ADEST_INFO information */ > } MDS_MCM_CB; > > /* Global MDSCB */ > @@ -681,6 +692,15 @@ uint32_t (*mds_mdtm_node_unsubscribe)(MDS_SUBTN_REF_VAL > subtn_ref_val); > m_NCS_MEM_FREE(p, NCS_MEM_REGION_TRANSIENT, NCS_SERVICE_ID_MDS, \ > MDS_MEM_VDEST_INFO) > > +#define m_MMGR_ALLOC_ADEST_INFO \ > + (MDS_ADEST_INFO *)m_NCS_MEM_ALLOC(sizeof(MDS_ADEST_INFO), \ > + NCS_MEM_REGION_TRANSIENT, \ > + NCS_SERVICE_ID_MDS, MDS_MEM_ADEST_INFO) > + > +#define m_MMGR_FREE_ADEST_INFO(p) \ > + m_NCS_MEM_FREE(p, NCS_MEM_REGION_TRANSIENT, NCS_SERVICE_ID_MDS, \ > + MDS_MEM_ADEST_INFO) > + > #define m_MMGR_ALLOC_PWE_INFO \ > (MDS_PWE_INFO *)m_NCS_MEM_ALLOC(sizeof(MDS_PWE_INFO), \ > NCS_MEM_REGION_TRANSIENT, \ > @@ -768,7 +788,6 @@ typedef struct mds_mcm_msg_elem { > NCSMDS_CALLBACK_INFO cbinfo; > } event; > } info; > - > } MDS_MCM_MSG_ELEM; > > /* ******************************************** */ > diff --git a/src/mds/mds_dt2c.h b/src/mds/mds_dt2c.h > index c92fecbba..49c9b09c4 100644 > --- a/src/mds/mds_dt2c.h > +++ b/src/mds/mds_dt2c.h > @@ -223,7 +223,7 @@ typedef enum { > MDS_MEM_DIRECT_BUFF, > MDS_MEM_AWAIT_ACTIVE, > MDS_MEM_MSGELEM, > - MDS_MEM_ADEST_LIST, > + MDS_MEM_ADEST_INFO, > MDS_MEM_REASSEM_QUEUE, > MDS_MEM_HDL_LIST, > MDS_MEM_CACHED_EVENTS_LIST, > diff --git a/src/mds/mds_dt_common.c b/src/mds/mds_dt_common.c > index de1388367..19000b387 100644 > --- a/src/mds/mds_dt_common.c > +++ b/src/mds/mds_dt_common.c > @@ -998,6 +998,26 @@ uint32_t mds_tmr_mailbox_processing(void) > info->mds_dest, info->pid); > (void)mds_process_info_del(info); > free(info); > + } else { > + MDS_DEST adest = > + tmr_req_info->info > + .down_event_tmr_info.adest; > + m_MDS_LOG_INFO( > + "TIMEOUT Adest=%" PRIu64 > + " down", adest); > + MDS_ADEST_INFO *adest_info = > + (MDS_ADEST_INFO *) > + ncs_patricia_tree_get( > + &gl_mds_mcm_cb->adest_list, > + (uint8_t *)&adest); > + if (adest_info) { > + adest_info->tmr_start = false; > + if (adest_info->svc_cnt == 0 && > + adest_info->sndrsp_cnt == 0) > + ncs_patricia_tree_del( > + > &gl_mds_mcm_cb->adest_list, > + (NCS_PATRICIA_NODE > *)adest_info); > + } > } > > if (tmr_req_info->info.down_event_tmr_info > @@ -1700,7 +1720,7 @@ uint16_t mds_checksum(uint32_t length, uint8_t buff[]) > /* Take the one's complement of sum */ > sum = ~sum; > > - return ((uint16_t)sum); > + return (uint16_t)sum; [Vu] Any reason to change the format of this code line? [Thuan] I saw error reported in static code check, I update it to avoid annoy report in future. > } > > > /**************************************************************************** _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel