Hi Minh,

Yes, #3185 is only help for multi partition cluster rejoin.
As verbal discussion, I will move this patch under #2936 umbrella.

See my replies for inline comments.

Best Regards,
ThuanTr

-----Original Message-----
From: Minh Hon Chau <minh.c...@dektech.com.au> 
Sent: Thursday, July 9, 2020 8:39 PM
To: Thuan Tran <thuan.t...@dektech.com.au>; Thang Duc Nguyen 
<thang.d.ngu...@dektech.com.au>
Cc: opensaf-devel@lists.sourceforge.net
Subject: Re: [PATCH 1/1] amf: enhance to work in roaming SC and headless [#3185]

Hi Thuan,

Is this changed only needed if roamingSC is enabled? I remember last 
time I tested #2936 Vu's prototype without roamingSC (headless 
partitions rejoin) I didn't need any change from amf.

And some comments inline.

Thanks

Minh


On 3/7/20 7:23 pm, thuan.tran wrote:
> - amfd reset msg id counter for node that ignore amfnd down
> event to avoid nodes reboot once more due to mismatch msg id after
> reboot up from reboot order for sending node_up after sync window.
>
> - amfd active order reboot its standby if it detect another
> active amfd (multi partition cluster rejoin).
[M] how's about the "another active", is it going to reboot too?
[T] Two active will be handled by RDE detect split-brain.
If AMF active reboot, other active may not reboot (as not yet see this active)
then it break legacy split-brain detected behavior of RDE.
>
> - amfd standby should reboot itself if see two active peers to
> avoid standby do cold-sync or be updated with wrong active.
[M] how's about one of "two active peers", is it going to reboot too?
[T] As above explain, it will reboot but by RDE.
>
> - amfd just become standby (out of sync) but see active down
> should reboot itself.
> ---
>   src/amf/amfd/dmsg.cc   |  8 ++++++++
>   src/amf/amfd/evt.h     |  1 +
>   src/amf/amfd/main.cc   |  3 +++
>   src/amf/amfd/mds.cc    | 36 ++++++++++++++++++++++++++++++++++--
>   src/amf/amfd/msg.h     |  1 +
>   src/amf/amfd/ndfsm.cc  |  2 ++
>   src/amf/amfd/proc.h    |  1 +
>   src/amf/amfd/role.cc   | 27 +++++++++++++++++++++++++++
>   src/amf/amfd/util.cc   |  2 +-
>   src/amf/amfnd/amfnd.cc |  2 +-
>   10 files changed, 79 insertions(+), 4 deletions(-)
>
> diff --git a/src/amf/amfd/dmsg.cc b/src/amf/amfd/dmsg.cc
> index cf4019d8a..5273f358c 100644
> --- a/src/amf/amfd/dmsg.cc
> +++ b/src/amf/amfd/dmsg.cc
> @@ -75,6 +75,8 @@ void avd_mds_d_enc(MDS_CALLBACK_ENC_INFO *enc_info) {
>         ncs_encode_32bit(&data, msg->msg_info.d2d_chg_role_rsp.role);
>         ncs_encode_32bit(&data, msg->msg_info.d2d_chg_role_rsp.status);
>         break;
> +    case AVD_D2D_ROAMING_SC_SPLITBRAIN:
[M] We can name AVD_D2D_REBOOT, the message name should be reflect the 
purpose of the message.
[T] OK, will rename it.
> +      break;
>       default:
>         LOG_ER("%s: unknown msg %u", __FUNCTION__, msg->msg_type);
>         break;
> @@ -120,6 +122,8 @@ void avd_mds_d_dec(MDS_CALLBACK_DEC_INFO *dec_info) {
>             static_cast<SaAmfHAStateT>(ncs_decode_32bit(&data));
>         d2d_msg->msg_info.d2d_chg_role_rsp.status = ncs_decode_32bit(&data);
>         break;
> +    case AVD_D2D_ROAMING_SC_SPLITBRAIN:
> +      break;
>       default:
>         LOG_ER("%s: unknown msg %u", __FUNCTION__, d2d_msg->msg_type);
>         break;
> @@ -210,6 +214,10 @@ uint32_t avd_d2d_msg_rcv(AVD_D2D_MSG *rcv_msg) {
>           osafassert(0);
>         }
>         break;
> +    case AVD_D2D_ROAMING_SC_SPLITBRAIN:
> +      LOG_ER("Reboot order from Active as roaming SC split-brain detected");
> +      opensaf_quick_reboot("Split-brain detected");
> +      break;
>       default:
>         LOG_ER("%s: unknown msg %u", __FUNCTION__, rcv_msg->msg_type);
>         break;
> diff --git a/src/amf/amfd/evt.h b/src/amf/amfd/evt.h
> index a9028cde3..a08dccebb 100644
> --- a/src/amf/amfd/evt.h
> +++ b/src/amf/amfd/evt.h
> @@ -72,6 +72,7 @@ typedef enum avd_evt_type {
>     AVD_IMM_REINITIALIZED,
>     AVD_EVT_UNASSIGN_SI_DEP_STATE,
>     AVD_EVT_ND_MDS_VER_INFO,
> +  AVD_EVT_ROAMING_SC_SPLITBRAIN,
>     AVD_EVT_MAX
>   } AVD_EVT_TYPE;
>   
> diff --git a/src/amf/amfd/main.cc b/src/amf/amfd/main.cc
> index 3b1536721..3cc0d9741 100644
> --- a/src/amf/amfd/main.cc
> +++ b/src/amf/amfd/main.cc
> @@ -132,6 +132,9 @@ static const AVD_EVT_HDLR g_actv_list[AVD_EVT_MAX] = {
>       invalid_evh,                /* AVD_EVT_INVALID */
>       avd_sidep_unassign_evh,     /* AVD_EVT_UNASSIGN_SI_DEP_STATE */
>       avd_avnd_mds_info_evh,      /* AVD_EVT_ND_MDS_VER_INFO*/
> +
> +    /* Roaming SC split-brain processing */
> +    avd_roaming_sc_split_brain_evh, /* AVD_EVT_ROAMING_SC_SPLITBRAIN */
>   };
>   
>   /* list of all the function pointers related to handling the events
> diff --git a/src/amf/amfd/mds.cc b/src/amf/amfd/mds.cc
> index 108f9b8bd..625616f5a 100644
> --- a/src/amf/amfd/mds.cc
> +++ b/src/amf/amfd/mds.cc
> @@ -396,13 +396,38 @@ static uint32_t 
> avd_mds_svc_evt(MDS_CALLBACK_SVC_EVENT_INFO *evt_info) {
>     uint32_t rc = NCSCC_RC_SUCCESS;
>     AVD_CL_CB *cb = avd_cb;
>   
> +  if ((evt_info->i_svc_id == NCSMDS_SVC_ID_AVD) &&
> +      (evt_info->i_change == NCSMDS_RED_UP) &&
> +      (evt_info->i_role == V_DEST_RL_ACTIVE) &&
> +      (cb->node_id_avd != evt_info->i_node_id) &&
> +      (cb->other_avd_adest) &&
> +      (cb->node_id_avd_other != evt_info->i_node_id)) {
> +    if (cb->avail_state_avd == SA_AMF_HA_STANDBY) {
> +      LOG_ER("Standby peer see two peers: %x and %x",
> +        cb->node_id_avd_other, evt_info->i_node_id);
> +      opensaf_reboot(0, NULL, "Standby peer see two peers");
> +    } else if (cb->avail_state_avd == SA_AMF_HA_ACTIVE) {
> +      // Send reboot order to known standby (multi clusters rejoin)
> +      AVD_EVT *evt = new AVD_EVT();
> +      evt->rcv_evt = AVD_EVT_ROAMING_SC_SPLITBRAIN;
> +      if (m_NCS_IPC_SEND(&cb->avd_mbx, evt, NCS_IPC_PRIORITY_HIGH) !=
> +          NCSCC_RC_SUCCESS) {
> +        LOG_ER("%s: ncs_ipc_send failed", __FUNCTION__);
> +        delete evt;
> +      }
> +    }
> +    return rc;
> +  }
> +

[M]: We can move the evt_info->i_change, evt_info->svc_id nested in the 
current "switch". One question, we have mds telling that 
evt_info->i_node_id having the role RL_ACTIVE, why we don't reboot 
evt_info->i_node_id?
[T]: I try to avoid touch current "switch" make it difficult for reading/review.
All new code in one separate block is better for reading/review.
[T]: Why active detect active not reboot, see explain at comment (1), (2)
>     switch (evt_info->i_change) {
>       case NCSMDS_UP:
>         switch (evt_info->i_svc_id) {
>           case NCSMDS_SVC_ID_AVD:
> +          TRACE("NCSMDS_UP AVD %x", evt_info->i_node_id);
>             /* if((Is this up from other node) && (Is this Up from an Adest)) 
> */
>             if ((evt_info->i_node_id != cb->node_id_avd) &&
> -              (m_MDS_DEST_IS_AN_ADEST(evt_info->i_dest))) {
> +              (m_MDS_DEST_IS_AN_ADEST(evt_info->i_dest)) &&
> +              (cb->other_avd_adest == 0)) {
>               cb->node_id_avd_other = evt_info->i_node_id;
>               cb->other_avd_adest = evt_info->i_dest;
>               cb->stby_sync_state = AVD_STBY_OUT_OF_SYNC;
> @@ -444,11 +469,18 @@ static uint32_t 
> avd_mds_svc_evt(MDS_CALLBACK_SVC_EVENT_INFO *evt_info) {
>       case NCSMDS_DOWN:
>         switch (evt_info->i_svc_id) {
>           case NCSMDS_SVC_ID_AVD:
> +          TRACE("NCSMDS_DOWN AVD %x", evt_info->i_node_id);
>             /* if(Is this down from an Adest) && (Is this adest same as Adest 
> in
>              * CB) */
>             if (m_MDS_DEST_IS_AN_ADEST(evt_info->i_dest) &&
>                 m_NCS_MDS_DEST_EQUAL(&evt_info->i_dest, 
> &cb->other_avd_adest)) {
> -            memset(&cb->other_avd_adest, '\0', sizeof(MDS_DEST));
> +            cb->other_avd_adest = 0;
> +            if ((cb->avail_state_avd == SA_AMF_HA_STANDBY) &&
> +                (cb->stby_sync_state == AVD_STBY_OUT_OF_SYNC)) {
> +              LOG_ER("FAILOVER StandBy --> Active FAILED,"
> +                " Standby OUT OF SYNC");
> +              opensaf_quick_reboot("FAILOVER failed");
> +            }
>             }
>             break;
>   
> diff --git a/src/amf/amfd/msg.h b/src/amf/amfd/msg.h
> index 4b699db71..ef5b40b07 100644
> --- a/src/amf/amfd/msg.h
> +++ b/src/amf/amfd/msg.h
> @@ -40,6 +40,7 @@
>   typedef enum {
>     AVD_D2D_CHANGE_ROLE_REQ = AVSV_D2D_CHANGE_ROLE_REQ,
>     AVD_D2D_CHANGE_ROLE_RSP,
> +  AVD_D2D_ROAMING_SC_SPLITBRAIN,
>     AVD_D2D_MSG_MAX,
>   } AVD_D2D_MSG_TYPE;
>   
> diff --git a/src/amf/amfd/ndfsm.cc b/src/amf/amfd/ndfsm.cc
> index 674ef863a..089c41971 100644
> --- a/src/amf/amfd/ndfsm.cc
> +++ b/src/amf/amfd/ndfsm.cc
> @@ -812,6 +812,8 @@ void avd_mds_avnd_down_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
>           // Ignore amfnd down event in late after clm cb node left then 
> joined
>           // But not ignore if after headless
>           LOG_WA("Ignore '%s' amfnd down event", node->node_name.c_str());
> +        node->rcv_msg_id = 0;
> +        node->snd_msg_id = 0;
>           TRACE_LEAVE();
>           return;
>         }
> diff --git a/src/amf/amfd/proc.h b/src/amf/amfd/proc.h
> index 4052aecac..188aa4860 100644
> --- a/src/amf/amfd/proc.h
> +++ b/src/amf/amfd/proc.h
> @@ -86,6 +86,7 @@ void avd_node_down_func(AVD_CL_CB *cb, AVD_AVND *avnd);
>   void avd_nd_sisu_state_info_evh(AVD_CL_CB *cb, struct AVD_EVT *evt);
>   void avd_nd_compcsi_state_info_evh(AVD_CL_CB *cb, struct AVD_EVT *evt);
>   void avd_avnd_mds_info_evh(AVD_CL_CB *cb, AVD_EVT *evt);
> +void avd_roaming_sc_split_brain_evh(AVD_CL_CB *cb, AVD_EVT *evt);
>   uint32_t avd_node_down(AVD_CL_CB *cb, SaClmNodeIdT node_id);
>   AVD_AVND *avd_msg_sanity_chk(AVD_EVT *evt, SaClmNodeIdT node_id,
>                                AVSV_DND_MSG_TYPE msg_typ, uint32_t msg_id);
> diff --git a/src/amf/amfd/role.cc b/src/amf/amfd/role.cc
> index 24374de7c..bd497579b 100644
> --- a/src/amf/amfd/role.cc
> +++ b/src/amf/amfd/role.cc
> @@ -1394,3 +1394,30 @@ uint32_t avd_rde_set_role(SaAmfHAStateT role) {
>     }
>     return rc;
>   }
> +
> +/****************************************************************************\
> + * Function: avd_roaming_sc_split_brain_evh
> + *
> + * Purpose:  Send reboot order to standby.
> + *
> + * Input: cb - the AVD control block
> + *        evt - The event information.
> + *
> + * Returns: None.
> + *
> +\**************************************************************************/
> +void avd_roaming_sc_split_brain_evh(AVD_CL_CB *cb, AVD_EVT *evt) {
> +  (void)evt;
> +  AVD_D2D_MSG d2d_msg;
> +  TRACE_ENTER();
> +
> +  if ((cb->node_id_avd_other == 0) || (cb->other_avd_adest == 0)) {
> +    return;
> +  }
> +  d2d_msg.msg_type = AVD_D2D_ROAMING_SC_SPLITBRAIN;
> +  if (avd_d2d_msg_snd(cb, &d2d_msg) != NCSCC_RC_SUCCESS) {
> +    LOG_ER("Reboot order send failed for Standby AMFD");
> +  }
> +
> +  TRACE_LEAVE();
> +}
> diff --git a/src/amf/amfd/util.cc b/src/amf/amfd/util.cc
> index f2b3c5f2a..6915bddb6 100644
> --- a/src/amf/amfd/util.cc
> +++ b/src/amf/amfd/util.cc
> @@ -1823,7 +1823,7 @@ void avd_d2n_reboot_snd(AVD_AVND *node) {
>     AVD_DND_MSG *d2n_msg = new AVD_DND_MSG();
>   
>     d2n_msg->msg_type = AVSV_D2N_REBOOT_MSG;
> -  d2n_msg->msg_info.d2n_reboot_info.node_id = node->node_info.nodeId;
> +  d2n_msg->msg_info.d2n_reboot_info.node_id = avd_cb->node_id_avd;
>     d2n_msg->msg_info.d2n_reboot_info.msg_id = ++(node->snd_msg_id);
>   
>     if (avd_d2n_msg_snd(avd_cb, node, d2n_msg) != NCSCC_RC_SUCCESS) {
> diff --git a/src/amf/amfnd/amfnd.cc b/src/amf/amfnd/amfnd.cc
> index 9e8739bee..54a3d75fa 100644
> --- a/src/amf/amfnd/amfnd.cc
> +++ b/src/amf/amfnd/amfnd.cc
> @@ -432,7 +432,7 @@ uint32_t avnd_evt_avd_reboot_evh(AVND_CB *cb, AVND_EVT 
> *evt) {
>       }
>     }
>   
> -  LOG_NO("Received reboot order, ordering reboot now!");
> +  LOG_NO("Received reboot order from %x, ordering reboot now!", 
> info->node_id);
>     opensaf_reboot(cb->node_info.nodeId,
>                    
> osaf_extended_name_borrow(&cb->node_info.executionEnvironment),
>                    "Received reboot order");

_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to