Thanks Minh for your comment and your time to review.

Hi Minh/Thang/Nagendra/Paul,
I am planning to push the patch by 30th July(thursday).
Please kindly find some time to review by 29th July(tomorrow) and
send your comments or Ack.

Thanks
Anand Sundararaj
Senior Solutions Architect | +1 480 686 4772
www.GetHighAvailability.com 
Get High Availability Today!
NJ, USA: +1 508-507-6507

> On 07/27/2020 8:28 PM minhchau <minh.c...@dektech.com.au> wrote:
> 
>  
> Hi,
> 
> It works now.
> 
> /Minh
> 
> On 28/7/20 11:43 am, s.an...@gethighavailability.com wrote:
> > Hi Minh,
> > Good catch.
> > The following patch will solve the issue, please confirm:
> >
> > diff --git a/src/amf/amfd/clm.cc b/src/amf/amfd/clm.cc
> > index cfbe36a..743a6c2 100644
> > --- a/src/amf/amfd/clm.cc
> > +++ b/src/amf/amfd/clm.cc
> > @@ -24,10 +24,12 @@
> >   #include "amf/amfd/node.h"
> >   #include "base/osaf_time.h"
> >
> > -static void clm_node_join_complete(AVD_AVND *node) {
> > -  TRACE_ENTER();
> > -  /* Enable the node in any case. */
> > -  avd_node_oper_state_set(node, SA_AMF_OPERATIONAL_ENABLED);
> > +static void clm_node_join_complete(AVD_AVND *node, bool 
> > member_transition_state) {
> > +  TRACE_ENTER2("%u", member_transition_state);
> > +  /* Don't enable in case, node was up and running and node 
> > failover/switchover
> > +     case was reported i.e. member_transition_state == false */
> > +  if (member_transition_state == true)
> > +    avd_node_oper_state_set(node, SA_AMF_OPERATIONAL_ENABLED);
> >
> >     /* For each of the SUs calculate the readiness state.
> >      ** call the SG FSM with the new readiness state.
> > @@ -415,6 +417,12 @@ static void clm_track_cb(
> >               avd_node_add_nodeid(node);
> >               TRACE("Node Joined '%s' '%zu'", node_name.c_str(),
> >                     node_name.length());
> > +           // To differentiate between CLM lock and track cbk in case of
> > +            // si-swap
> > +           bool member_transition_state = false;
> > +           if (node->node_info.member == SA_FALSE)
> > +              member_transition_state = true;
> > +           TRACE("member_transition_state %u", member_transition_state);
> >
> >               node->node_info.member = SA_TRUE;
> >               if (avd_cb->avd_peer_ver < AVD_MBCSV_SUB_PART_VERSION_4) {
> > @@ -427,7 +435,7 @@ static void clm_track_cb(
> >               if (node->node_state == AVD_AVND_STATE_PRESENT) {
> >                 TRACE("Node already up and configured");
> >                 /* now try to instantiate all the SUs that need to be */
> > -              clm_node_join_complete(node);
> > +              clm_node_join_complete(node, member_transition_state);
> >               }
> >             } else {
> >               LOG_IN("AMF-node not configured on this CLM-node '%s'",
> >
> >
> > Thanks
> >
> > Anand Sundararaj
> > Senior Solutions Architect | 480 686 4772
> >   
> >
> > www.GetHighAvailability.com
> >
> > Get High Availability Today!
> > NJ, USA: +1 508-507-6507
> >
> >> On 07/27/2020 12:29 PM minhchau <minh.c...@dektech.com.au> wrote:
> >>
> >>   
> >> Hi Anand,
> >>
> >> I have run the following test:
> >>
> >> 1. Install 2N amf demo on PL-4 and PL-5
> >>
> >> 2. Disable saAmfNodeAutoRepair on both PLs
> >>
> >> 3. Kill amf_demo on PL-5 to escalate Node Failover recovery
> >>
> >> 4. Run amf-adm repaired <PL-5 dn>
> >>
> >>       -> The command works, PL-5 is repaired
> >>
> >> However, after step 3, do 2N OpenSAF si-swap, then the 'repaired' does
> >> not work
> >>
> >> root@PL-5:~#  amf-adm repaired safAmfNode=PL-5,safAmfCluster=myAmfCluster
> >> error - saImmOmAdminOperationInvoke_2 admin-op RETURNED:
> >> SA_AIS_ERR_NO_OP (28)
> >> error-string: Admin repair request for
> >> 'safAmfNode=PL-5,safAmfCluster=myAmfCluster', op state already enabled
> >>
> >> Replace si-swap by a SC failover, the command also has the same error.
> >>
> >> Thanks
> >>
> >> Minh
> >>
> >> On 24/7/20 9:04 am, s.an...@gethighavailability.com wrote:
> >>> From: Anand Sundararaj <s.an...@gethighavailability.com>
> >>>
> >>> ---
> >>>    src/amf/amfd/node.cc    | 56 
> >>> +++++++++++++++++++++++++++++++++++++++++++-
> >>>    src/amf/amfd/node.h     |  2 ++
> >>>    src/amf/amfd/sgproc.cc  | 18 ++++++++++++++
> >>>    src/amf/amfnd/avnd_su.h |  1 +
> >>>    src/amf/amfnd/di.cc     | 62 
> >>> +++++++++++++++++++++++++++++++++++++++++++++++++
> >>>    src/amf/amfnd/err.cc    |  2 +-
> >>>    src/amf/amfnd/su.cc     |  2 +-
> >>>    7 files changed, 140 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/src/amf/amfd/node.cc b/src/amf/amfd/node.cc
> >>> index 2c04fca..4c63055 100644
> >>> --- a/src/amf/amfd/node.cc
> >>> +++ b/src/amf/amfd/node.cc
> >>> @@ -108,6 +108,7 @@ void AVD_AVND::initialize() {
> >>>      saAmfNodeOperState = SA_AMF_OPERATIONAL_DISABLED;
> >>>      admin_node_pend_cbk = {};
> >>>      su_cnt_admin_oper = {};
> >>> +  su_cnt_admin_repair = {};
> >>>      node_state = AVD_AVND_STATE_ABSENT;
> >>>      list_of_ncs_su = {};
> >>>      list_of_su = {};
> >>> @@ -1247,6 +1248,23 @@ void AVD_AVND::node_sus_termstate_set(bool 
> >>> term_state) const {
> >>>    }
> >>>    
> >>>    /**
> >>> + * Update count of application SUs of the given node.
> >>> + *
> >>> + * @param node pointer
> >>> + * @return None
> >>> + */
> >>> +static uint32_t application_sus_count(AVD_AVND *node) {
> >>> +  uint32_t count = 0;
> >>> +  /* Count the applications SUs hosted on this node. */
> >>> +  for (const auto &su : node->list_of_su) {
> >>> +    if (su->su_is_external == false)
> >>> +      count++;
> >>> +  }
> >>> +  TRACE("count '%u'", count);
> >>> +  return count;
> >>> +}
> >>> +
> >>> +/**
> >>>     * Handle admin operations on SaAmfNode objects.
> >>>     *
> >>>     * @param immOiHandle
> >>> @@ -1532,9 +1550,44 @@ static void node_admin_op_cb(SaImmOiHandleT 
> >>> immOiHandle,
> >>>          }
> >>>          break;
> >>>    
> >>> +    case SA_AMF_ADMIN_REPAIRED:
> >>> +      if (node->saAmfNodeOperState == SA_AMF_OPERATIONAL_ENABLED) {
> >>> +        report_admin_op_error(
> >>> +            immOiHandle, invocation, SA_AIS_ERR_NO_OP, nullptr,
> >>> +            "Admin repair request for '%s', op state already enabled",
> >>> +            node->name.c_str());
> >>> +        goto done;
> >>> +      }
> >>> +
> >>> +      if (node->node_info.member == false) {
> >>> +        LOG_NO("'%s' ADMIN_REPAIRED: CLM node is not member",
> >>> +            node->name.c_str());
> >>> +        avd_saImmOiAdminOperationResult(immOiHandle, invocation, 
> >>> SA_AIS_ERR_TRY_AGAIN);
> >>> +        goto done;
> >>> +      }
> >>> +
> >>> +      if (0 == application_sus_count(node)) {
> >>> +        // Node is configured without appl su, return from here.
> >>> +        avd_node_oper_state_set(node, SA_AMF_OPERATIONAL_ENABLED);
> >>> +        avd_saImmOiAdminOperationResult(immOiHandle, invocation, 
> >>> SA_AIS_OK);
> >>> +        goto done;
> >>> +      }
> >>> +
> >>> +      /* forward the admin op req to the node director */
> >>> +      if (avd_admin_op_msg_snd(node->name, AVSV_SA_AMF_NODE, 
> >>> SA_AMF_ADMIN_REPAIRED,
> >>> +            node) == NCSCC_RC_SUCCESS) {
> >>> +        node->admin_node_pend_cbk.admin_oper = SA_AMF_ADMIN_REPAIRED;
> >>> +        node->admin_node_pend_cbk.invocation = invocation;
> >>> +        node->su_cnt_admin_repair = application_sus_count(node);
> >>> +      } else {
> >>> +        report_admin_op_error(immOiHandle, invocation, 
> >>> SA_AIS_ERR_TIMEOUT, nullptr,
> >>> +            "Admin op request send failed '%s'", node->name.c_str());
> >>> +      }
> >>> +      break;
> >>> +
> >>>        default:
> >>>          report_admin_op_error(immOiHandle, invocation, 
> >>> SA_AIS_ERR_NOT_SUPPORTED,
> >>> -                            nullptr, "UNSUPPORTED ADMIN OPERATION 
> >>> (%llu)",
> >>> +          nullptr, "UNSUPPORTED ADMIN OPERATION (%llu)",
> >>>                                operationId);
> >>>          break;
> >>>      }
> >>> @@ -1591,6 +1644,7 @@ void node_reset_su_try_inst_counter(const AVD_AVND 
> >>> *node) {
> >>>        su->sg_of_su->try_inst_counter = 0;
> >>>      }
> >>>    }
> >>> +
> >>>    /**
> >>>     * @brief  Checks all  nodegroup of nodes are in UNLOCKED state.
> >>>     * @param  ptr to Node (AVD_AVND).
> >>> diff --git a/src/amf/amfd/node.h b/src/amf/amfd/node.h
> >>> index 097f54b..37b8ce9 100644
> >>> --- a/src/amf/amfd/node.h
> >>> +++ b/src/amf/amfd/node.h
> >>> @@ -108,6 +108,8 @@ class AVD_AVND {
> >>>                                                callbacks on this node */
> >>>      uint32_t su_cnt_admin_oper;             /* count to keep track SUs 
> >>> on this node
> >>>                                                 undergoing node admin op 
> >>> */
> >>> +  uint32_t su_cnt_admin_repair; /* Count to keep track SUs on this node
> >>> +                                   being repaired by node admin repair 
> >>> op */
> >>>    
> >>>      /************ AMF B.04 
> >>> **************************************************/
> >>>    
> >>> diff --git a/src/amf/amfd/sgproc.cc b/src/amf/amfd/sgproc.cc
> >>> index 1d7e62a..335d043 100644
> >>> --- a/src/amf/amfd/sgproc.cc
> >>> +++ b/src/amf/amfd/sgproc.cc
> >>> @@ -1025,6 +1025,23 @@ void avd_su_oper_state_evh(AVD_CL_CB *cb, AVD_EVT 
> >>> *evt) {
> >>>            }
> >>>          }
> >>>        } else { /* if(su->sg_of_su->sg_ncs_spec == true) */
> >>> +      // This is a case of Node repair
> >>> +      if ((n2d_msg->msg_info.n2d_opr_state.node_oper_state ==
> >>> +            SA_AMF_OPERATIONAL_ENABLED) &&
> >>> +          (node->admin_node_pend_cbk.invocation) &&
> >>> +          (node->admin_node_pend_cbk.admin_oper == 
> >>> SA_AMF_ADMIN_REPAIRED)) {
> >>> +        avd_node_oper_state_set(node, SA_AMF_OPERATIONAL_ENABLED);
> >>> +        if (node->su_cnt_admin_repair > 0) node->su_cnt_admin_repair--;
> >>> +        if (node->su_cnt_admin_repair == 0) {
> >>> +          /* if this is the last SU then send out the pending callback */
> >>> +          avd_saImmOiAdminOperationResult(
> >>> +              cb->immOiHandle, node->admin_node_pend_cbk.invocation,
> >>> +              SA_AIS_OK);
> >>> +          node->admin_node_pend_cbk.invocation = 0;
> >>> +          node->admin_node_pend_cbk.admin_oper =
> >>> +            static_cast<SaAmfAdminOperationIdT>(0);
> >>> +        }
> >>> +      }
> >>>          /* If oper state of Uninstantiated SU got ENABLED so try to 
> >>> instantiate it
> >>>             after evaluating SG. */
> >>>          if (su->saAmfSUPresenceState == SA_AMF_PRESENCE_UNINSTANTIATED) {
> >>> @@ -2326,6 +2343,7 @@ void avd_node_down_mw_susi_failover(AVD_CL_CB *cb, 
> >>> AVD_AVND *avnd) {
> >>>                              SA_AIS_ERR_REPAIR_PENDING, 
> >>> &avnd->admin_node_pend_cbk,
> >>>                              "node failure");
> >>>        avnd->su_cnt_admin_oper = 0;
> >>> +    avnd->su_cnt_admin_repair = 0;
> >>>      }
> >>>    
> >>>      TRACE_LEAVE();
> >>> diff --git a/src/amf/amfnd/avnd_su.h b/src/amf/amfnd/avnd_su.h
> >>> index 45effd6..46a711f 100644
> >>> --- a/src/amf/amfnd/avnd_su.h
> >>> +++ b/src/amf/amfnd/avnd_su.h
> >>> @@ -461,5 +461,6 @@ extern AVND_SU 
> >>> *avnd_sudb_rec_get_next(AmfDb<std::string, AVND_SU> &sudb,
> >>>    extern void sudb_rec_comp_add(AVND_SU *su, AVND_COMP *comp, uint32_t 
> >>> *rc);
> >>>    uint32_t avnd_evt_avd_compcsi_evh(struct avnd_cb_tag *cb,
> >>>                                      struct avnd_evt_tag *evt);
> >>> +extern bool comp_in_term_failed_state(void);
> >>>    
> >>>    #endif  // AMF_AMFND_AVND_SU_H_
> >>> diff --git a/src/amf/amfnd/di.cc b/src/amf/amfnd/di.cc
> >>> index 7b01868..5bff121 100644
> >>> --- a/src/amf/amfnd/di.cc
> >>> +++ b/src/amf/amfnd/di.cc
> >>> @@ -92,6 +92,68 @@ static uint32_t avnd_evt_node_admin_op_req(AVND_CB 
> >>> *cb, AVND_EVT *evt) {
> >>>      cb->rcv_msg_id = info->msg_id;
> >>>    
> >>>      switch (info->oper_id) {
> >>> +    case SA_AMF_ADMIN_REPAIRED: {
> >>> +      /* Node has been repaired. Reset states of this node and all 
> >>> DISABLED
> >>> +       * SUs/comps. And update AMF director accordingly.
> >>> +       * No need to reset states of ENABLED SUs(in case of
> >>> +       * SA_AMF_NODE_SWITCHOVER, only faulty components/SU are disabled,
> >>> +       * other SUs assignments are simply switched over.)
> >>> +       * But for ENABLED Sus, send oper state message, so that, they
> >>> +       * are evaluated for assignments.
> >>> +       */
> >>> +      LOG_NO("Repair request for Node'%s'", 
> >>> Amf::to_string(&info->dn).c_str());
> >>> +
> >>> +      // Reset this node state first, so that when SU enabled message
> >>> +      // reaches at Amfd, it starts giving assignments to those SUs.
> >>> +      cb->oper_state = SA_AMF_OPERATIONAL_ENABLED;
> >>> +      cb->term_state = AVND_TERM_STATE_UP;
> >>> +      cb->node_err_esc_level = AVND_ERR_ESC_LEVEL_0;
> >>> +      cb->failed_su = nullptr;
> >>> +      cb->su_failover_cnt = 0;
> >>> +
> >>> +      /* Reset all faulty SUs and Comps */
> >>> +      for (AVND_SU *su = avnd_sudb_rec_get_next(cb->sudb, ""); su != 
> >>> nullptr;
> >>> +          su = avnd_sudb_rec_get_next(cb->sudb, su->name)) {
> >>> +        if (su->is_ncs || su->su_is_external)
> >>> +          continue;
> >>> +        su->comp_restart_cnt = 0;
> >>> +        su->su_restart_cnt = 0;
> >>> +        su->su_err_esc_level = AVND_ERR_ESC_LEVEL_0;
> >>> +        if (m_AVND_SU_OPER_STATE_IS_ENABLED(su)){
> >>> +          rc = avnd_di_oper_send(cb, su, 0);
> >>> +          continue;
> >>> +        }
> >>> +        // Reset the components states.
> >>> +        for (AVND_COMP *comp = m_AVND_COMP_FROM_SU_DLL_NODE_GET(
> >>> +              m_NCS_DBLIST_FIND_FIRST(&su->comp_list));
> >>> +            comp; comp = m_AVND_COMP_FROM_SU_DLL_NODE_GET(
> >>> +              m_NCS_DBLIST_FIND_NEXT(&comp->su_dll_node))) {
> >>> +          comp->admin_oper = false;
> >>> +          m_AVND_COMP_STATE_RESET(comp);
> >>> +          avnd_comp_pres_state_set(cb, comp, 
> >>> SA_AMF_PRESENCE_UNINSTANTIATED);
> >>> +
> >>> +          m_AVND_COMP_OPER_STATE_SET(comp, SA_AMF_OPERATIONAL_ENABLED);
> >>> +          avnd_di_uns32_upd_send(AVSV_SA_AMF_COMP, saAmfCompOperState_ID,
> >>> +              comp->name, comp->oper);
> >>> +        }
> >>> +
> >>> +        // Reset the SUs states.
> >>> +        if ((su->pres == SA_AMF_PRESENCE_TERMINATION_FAILED) &&
> >>> +            (comp_in_term_failed_state() == false))
> >>> +          avnd_failed_state_file_delete();
> >>> +
> >>> +        su->admin_op_Id = static_cast<SaAmfAdminOperationIdT>(0);
> >>> +        reset_suRestart_flag(su);
> >>> +        m_AVND_SU_STATE_RESET(su);
> >>> +        m_AVND_SU_OPER_STATE_SET(su, SA_AMF_OPERATIONAL_ENABLED);
> >>> +        avnd_di_uns32_upd_send(AVSV_SA_AMF_SU, saAmfSUOperState_ID, 
> >>> su->name,
> >>> +            su->oper);
> >>> +        avnd_su_pres_state_set(cb, su, SA_AMF_PRESENCE_UNINSTANTIATED);
> >>> +        rc = avnd_di_oper_send(cb, su, 0);
> >>> +      } // End of for loop for su
> >>> +      break;
> >>> +    } // End of case SA_AMF_ADMIN_REPAIRED:
> >>> +
> >>>        default:
> >>>          LOG_NO("%s: unsupported adm op %u", __FUNCTION__, info->oper_id);
> >>>          rc = NCSCC_RC_FAILURE;
> >>> diff --git a/src/amf/amfnd/err.cc b/src/amf/amfnd/err.cc
> >>> index aeec373..c549047 100644
> >>> --- a/src/amf/amfnd/err.cc
> >>> +++ b/src/amf/amfnd/err.cc
> >>> @@ -1135,7 +1135,7 @@ uint32_t avnd_err_rcvr_node_failover(AVND_CB *cb, 
> >>> AVND_SU *failed_su,
> >>>      for (comp = avnd_compdb_rec_get_next(cb->compdb, ""); comp != 
> >>> nullptr;
> >>>           comp = avnd_compdb_rec_get_next(cb->compdb, comp->name)) {
> >>>        if (comp->su->is_ncs || comp->su->su_is_external) continue;
> >>> -
> >>> +    rc = avnd_comp_curr_info_del(cb, comp);
> >>>        rc = avnd_comp_clc_fsm_run(cb, comp, 
> >>> AVND_COMP_CLC_PRES_FSM_EV_CLEANUP);
> >>>        if (rc != NCSCC_RC_SUCCESS) {
> >>>          LOG_ER("'%s' termination failed", comp->name.c_str());
> >>> diff --git a/src/amf/amfnd/su.cc b/src/amf/amfnd/su.cc
> >>> index d8e388a..cb4928d 100644
> >>> --- a/src/amf/amfnd/su.cc
> >>> +++ b/src/amf/amfnd/su.cc
> >>> @@ -639,7 +639,7 @@ done:
> >>>      return rc;
> >>>    }
> >>>    
> >>> -static bool comp_in_term_failed_state(void) {
> >>> +bool comp_in_term_failed_state(void) {
> >>>      for (AVND_COMP *comp = avnd_compdb_rec_get_next(avnd_cb->compdb, "");
> >>>           comp != nullptr;
> >>>           comp = avnd_compdb_rec_get_next(avnd_cb->compdb, comp->name)) {


_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to