Hi Minh, Ack, reviewed and tested
Thanks, Ravi -----Original Message----- From: Minh Chau [mailto:minh.c...@dektech.com.au] Sent: Friday, November 03, 2017 12:28 PM To: hans.nordeb...@ericsson.com; ravisekhar.ko...@oracle.com; gary....@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net; Minh Chau <minh.c...@dektech.com.au> Subject: [PATCH 1/1] amfd: Add retry mechanism for ClmTrackStart/Stop as job queue V2 [#2631] Currenty amfd does not retry ClmTrackStart/Stop after swicthover if the CLM APIs are still unavailable due to high loaded system or connectivity issue. This patch adds a retry mechanism for CLM APIs as other existing IMM/NTF APIs --- src/amf/amfd/clm.cc | 79 +++++++++++++++++++++++++++++++++++++++++------ src/amf/amfd/clm.h | 27 ++++++++++++++-- src/amf/amfd/imm.cc | 28 +++++++++++++++++ src/amf/amfd/imm.h | 3 ++ src/amf/amfd/role.cc | 16 ++++------ src/amf/amfd/sg_2n_fsm.cc | 3 +- src/amf/amfd/sgproc.cc | 3 +- 7 files changed, 135 insertions(+), 24 deletions(-) diff --git a/src/amf/amfd/clm.cc b/src/amf/amfd/clm.cc index d8342ca..2bcea2d 100644 --- a/src/amf/amfd/clm.cc +++ b/src/amf/amfd/clm.cc @@ -475,46 +475,63 @@ done: return error; } -SaAisErrorT avd_clm_track_start(void) { +SaAisErrorT avd_clm_track_start(AVD_CL_CB* cb) { SaUint8T trackFlags = SA_TRACK_CURRENT | SA_TRACK_CHANGES_ONLY | SA_TRACK_VALIDATE_STEP | SA_TRACK_START_STEP; TRACE_ENTER(); - SaAisErrorT error = - saClmClusterTrack_4(avd_cb->clmHandle, trackFlags, nullptr); + SaAisErrorT error = SA_AIS_OK; + + if (cb->is_clm_track_started == true) { + // abort all pending and unsuccessful jobs that stop tracking + // because at this moment, amfd wants to start cluster tracking + Fifo::remove(cb, JOB_TYPE_CLM); + } + + error = saClmClusterTrack_4(cb->clmHandle, trackFlags, nullptr); if (error != SA_AIS_OK) { if (error == SA_AIS_ERR_TRY_AGAIN || error == SA_AIS_ERR_TIMEOUT || error == SA_AIS_ERR_UNAVAILABLE) { LOG_WA("Failed to start cluster tracking %u", error); + error = SA_AIS_ERR_TRY_AGAIN; } else { LOG_ER("Failed to start cluster tracking %u", error); } } else { - avd_cb->is_clm_track_started = true; + cb->is_clm_track_started = true; } TRACE_LEAVE(); return error; } -SaAisErrorT avd_clm_track_stop(void) { +SaAisErrorT avd_clm_track_stop(AVD_CL_CB* cb) { TRACE_ENTER(); - SaAisErrorT error = saClmClusterTrackStop(avd_cb->clmHandle); + SaAisErrorT error = SA_AIS_OK; + + if (cb->is_clm_track_started == false) { + // abort all pending and unsuccessful jobs that start tracking + // because at this moment, amfd wants to sttop cluster tracking + Fifo::remove(cb, JOB_TYPE_CLM); + } + + error = saClmClusterTrackStop(cb->clmHandle); if (error != SA_AIS_OK) { if (error == SA_AIS_ERR_TRY_AGAIN || error == SA_AIS_ERR_TIMEOUT || error == SA_AIS_ERR_UNAVAILABLE) { LOG_WA("Failed to stop cluster tracking %u", error); + error = SA_AIS_ERR_TRY_AGAIN; } else if (error == SA_AIS_ERR_NOT_EXIST) { /* track changes was not started or stopped successfully */ LOG_WA("Failed to stop cluster tracking %u", error); - avd_cb->is_clm_track_started = false; + cb->is_clm_track_started = false; + error = SA_AIS_OK; } else { LOG_ER("Failed to stop cluster tracking %u", error); } } else { TRACE("Sucessfully stops cluster tracking"); - avd_cb->is_clm_track_started = false; + cb->is_clm_track_started = false; } - TRACE_LEAVE(); return error; } @@ -550,7 +567,7 @@ static void *avd_clm_init_thread(void *arg) { if (cb->avail_state_avd == SA_AMF_HA_ACTIVE) { for (;;) { - error = avd_clm_track_start(); + error = avd_clm_track_start(cb); if (error == SA_AIS_ERR_TRY_AGAIN || error == SA_AIS_ERR_TIMEOUT || error == SA_AIS_ERR_UNAVAILABLE) { osaf_nanosleep(&kHundredMilliseconds); @@ -584,3 +601,45 @@ SaAisErrorT avd_start_clm_init_bg(void) { pthread_attr_destroy(&attr); return SA_AIS_OK; } + +AvdJobDequeueResultT ClmTrackStart::exec(const AVD_CL_CB* cb) { + AvdJobDequeueResultT res; + TRACE_ENTER(); + + SaAisErrorT rc = avd_clm_track_start(const_cast<AVD_CL_CB*>(cb)); + if (rc == SA_AIS_OK) { + delete Fifo::dequeue(); + res = JOB_EXECUTED; + } else if (rc == SA_AIS_ERR_TRY_AGAIN) { + TRACE("TRY-AGAIN"); + res = JOB_ETRYAGAIN; + } else { + delete Fifo::dequeue(); + LOG_ER("%s: ClmTrackStart FAILED %u", __FUNCTION__, rc); + res = JOB_ERR; + } + + TRACE_LEAVE(); + return res; +} + +AvdJobDequeueResultT ClmTrackStop::exec(const AVD_CL_CB* cb) { + AvdJobDequeueResultT res; + TRACE_ENTER(); + + SaAisErrorT rc = avd_clm_track_stop(const_cast<AVD_CL_CB*>(cb)); + if (rc == SA_AIS_OK) { + delete Fifo::dequeue(); + res = JOB_EXECUTED; + } else if (rc == SA_AIS_ERR_TRY_AGAIN) { + TRACE("TRY-AGAIN"); + res = JOB_ETRYAGAIN; + } else { + delete Fifo::dequeue(); + LOG_ER("%s: ClmTrackStop FAILED %u", __FUNCTION__, rc); + res = JOB_ERR; + } + + TRACE_LEAVE(); + return res; +} diff --git a/src/amf/amfd/clm.h b/src/amf/amfd/clm.h index 6adf796..2bbe320 100644 --- a/src/amf/amfd/clm.h +++ b/src/amf/amfd/clm.h @@ -1,6 +1,7 @@ /* -*- OpenSAF -*- * * (C) Copyright 2010 The OpenSAF Foundation + * Copyright Ericsson AB 2017 - All Rights Reserved. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY @@ -16,6 +17,7 @@ */ #include <saClm.h> +#include "amf/amfd/imm.h" #ifndef AMF_AMFD_CLM_H_ #define AMF_AMFD_CLM_H_ @@ -23,9 +25,30 @@ struct cl_cb_tag; extern SaAisErrorT avd_clm_init(struct cl_cb_tag *); -extern SaAisErrorT avd_clm_track_start(void); -extern SaAisErrorT avd_clm_track_stop(void); +extern SaAisErrorT avd_clm_track_start(struct cl_cb_tag *); extern +SaAisErrorT avd_clm_track_stop(struct cl_cb_tag *); extern void clm_node_terminate(AVD_AVND *node); extern SaAisErrorT avd_start_clm_init_bg(void); +class ClmJob : public Job { +public: + ClmJob() {} + AvdJobTypeT getJobType() { return JOB_TYPE_CLM; } + bool isRunnable(const AVD_CL_CB *cb) { return true;} }; + +class ClmTrackStart : public ClmJob { + public: + ClmTrackStart() : ClmJob(){}; + AvdJobDequeueResultT exec(const struct cl_cb_tag *cb); + ~ClmTrackStart() {} +}; + +class ClmTrackStop : public ClmJob { + public: + ClmTrackStop() : ClmJob(){}; + AvdJobDequeueResultT exec(const struct cl_cb_tag *cb); + ~ClmTrackStop() {} +}; + #endif // AMF_AMFD_CLM_H_ diff --git a/src/amf/amfd/imm.cc b/src/amf/amfd/imm.cc index baf646c..bf7e3d3 100644 --- a/src/amf/amfd/imm.cc +++ b/src/amf/amfd/imm.cc @@ -461,6 +461,34 @@ AvdJobDequeueResultT Fifo::executeAll(const AVD_CL_CB *cb, AvdJobTypeT job_type) return ret; } +void Fifo::remove(const AVD_CL_CB *cb, AvdJobTypeT job_type) { + + Job *ajob, *firstjob; + + TRACE_ENTER(); + firstjob = nullptr; + + while ((ajob = peek()) != nullptr) { + if (ajob->getJobType() == job_type) { + delete Fifo::dequeue(); + } else { + // push back + ajob = Fifo::dequeue(); + Fifo::queue(ajob); + + // check if we have gone through all jobs of queue + if (firstjob == nullptr) { + firstjob = ajob; + } else { + if (firstjob == ajob) + break; + } + } + } + + TRACE_LEAVE(); +} + AvdJobDequeueResultT Fifo::executeAdminResp(const AVD_CL_CB *cb) { Job *ajob; AvdJobDequeueResultT ret = JOB_EXECUTED; diff --git a/src/amf/amfd/imm.h b/src/amf/amfd/imm.h index f0152ac..83c5686 100644 --- a/src/amf/amfd/imm.h +++ b/src/amf/amfd/imm.h @@ -56,6 +56,7 @@ typedef enum { typedef enum { JOB_TYPE_IMM = 1, /* A IMM job */ JOB_TYPE_NTF = 2, /* A NTF job */ + JOB_TYPE_CLM = 3, /* A CLM job */ JOB_TYPE_ANY } AvdJobTypeT; @@ -166,6 +167,8 @@ class Fifo { static AvdJobDequeueResultT execute(const AVD_CL_CB *cb); static AvdJobDequeueResultT executeAll(const AVD_CL_CB *cb, AvdJobTypeT job_type = JOB_TYPE_ANY); + static void remove(const AVD_CL_CB *cb, + AvdJobTypeT job_type = JOB_TYPE_ANY); static AvdJobDequeueResultT executeAdminResp(const AVD_CL_CB *cb); static void empty(); diff --git a/src/amf/amfd/role.cc b/src/amf/amfd/role.cc index ec13c3b..865d89d 100644 --- a/src/amf/amfd/role.cc +++ b/src/amf/amfd/role.cc @@ -1066,7 +1066,8 @@ uint32_t amfd_switch_actv_qsd(AVD_CL_CB *cb) { /* Mark AVD as Quiesced. */ cb->avail_state_avd = SA_AMF_HA_QUIESCED; - avd_clm_track_stop(); + if (avd_clm_track_stop(cb) == SA_AIS_ERR_TRY_AGAIN) + Fifo::queue(new ClmTrackStop()); /* Go ahead and set mds role as already the NCS SU has been switched */ if (NCSCC_RC_SUCCESS != @@ -1105,7 +1106,6 @@ uint32_t amfd_switch_actv_qsd(AVD_CL_CB *cb) { uint32_t amfd_switch_qsd_stdby(AVD_CL_CB *cb) { uint32_t status = NCSCC_RC_SUCCESS; - SaAisErrorT ais_rc; TRACE_ENTER(); LOG_NO("Switching Quiesced --> StandBy"); @@ -1138,12 +1138,8 @@ uint32_t amfd_switch_qsd_stdby(AVD_CL_CB *cb) { avd_pg_node_csi_del_all(cb, avnd); } - if (cb->is_clm_track_started == true) { - ais_rc = avd_clm_track_stop(); - if (ais_rc != SA_AIS_OK && ais_rc != SA_AIS_ERR_NOT_EXIST) { - LOG_ER("Failed to stop cluster tracking after switch over"); - } - } + if (avd_clm_track_stop(cb) == SA_AIS_ERR_TRY_AGAIN) + Fifo::queue(new ClmTrackStop()); LOG_NO("Controller switch over done"); saflog(LOG_NOTICE, amfSvcUsrName, "Controller switch over done at %x", @@ -1274,9 +1270,9 @@ uint32_t amfd_switch_stdby_actv(AVD_CL_CB *cb) { if (NCSCC_RC_SUCCESS != avd_rde_set_role(SA_AMF_HA_ACTIVE)) { LOG_ER("rde role change failed from stdy -> Active"); } - - if (avd_clm_track_start() != SA_AIS_OK) { + if (avd_clm_track_start(cb) == SA_AIS_ERR_TRY_AGAIN) { LOG_ER("Switch Standby --> Active, clm track start failed"); + Fifo::queue(new ClmTrackStart()); avd_d2d_chg_role_rsp(cb, NCSCC_RC_FAILURE, SA_AMF_HA_ACTIVE); return NCSCC_RC_FAILURE; } diff --git a/src/amf/amfd/sg_2n_fsm.cc b/src/amf/amfd/sg_2n_fsm.cc index 3a7609e..c7d5844 100644 --- a/src/amf/amfd/sg_2n_fsm.cc +++ b/src/amf/amfd/sg_2n_fsm.cc @@ -1767,7 +1767,8 @@ uint32_t SG_2N::susi_success_sg_realign(AVD_SU *su, AVD_SU_SI_REL *susi, if ((state == SA_AMF_HA_ACTIVE) && (cb->node_id_avd == su->su_on_node->node_info.nodeId)) { /* This is as a result of failover, start CLM tracking*/ - (void)avd_clm_track_start(); + if (avd_clm_track_start(cb) == SA_AIS_ERR_TRY_AGAIN) + Fifo::queue(new ClmTrackStart()); } // Set active_services_exist at error conditions e.g. controller fail-over diff --git a/src/amf/amfd/sgproc.cc b/src/amf/amfd/sgproc.cc index d305b22..610c205 100644 --- a/src/amf/amfd/sgproc.cc +++ b/src/amf/amfd/sgproc.cc @@ -2201,7 +2201,8 @@ void avd_node_down_mw_susi_failover(AVD_CL_CB *cb, AVD_AVND *avnd) { in avd_sg_2n_susi_sucss_sg_reln, so start here.*/ if ((i_su->sg_of_su->sg_redundancy_model == SA_AMF_2N_REDUNDANCY_MODEL) && (i_su->sg_of_su->sg_fsm_state == AVD_SG_FSM_STABLE)) - (void)avd_clm_track_start(); + if (avd_clm_track_start(cb) == SA_AIS_ERR_TRY_AGAIN) + Fifo::queue(new ClmTrackStart()); /* Free all the SU SI assignments*/ i_su->delete_all_susis(); -- 2.7.4 ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel