Hi Vu,

Ack

Thanks
Lennart

> -----Original Message-----
> From: Vu Minh Nguyen <[email protected]>
> Sent: den 16 oktober 2018 09:09
> To: Hans Nordebäck <[email protected]>; Lennart Lund
> <[email protected]>; Gary Lee <[email protected]>
> Cc: [email protected]; Vu Minh Nguyen
> <[email protected]>
> Subject: [PATCH 1/1] imm: cluster is rebooted after split-brain recovery
> [#2934]
> 
> During split-brain, there is a possibility of mismatches in the global
> counters
> held by IMMNDs, which might cause the cluster to reboot at split-brain recovery.
> 
> This patch changes IMMD so that it does not update global counters
> from joining IMMNDs if the coord is already elected.
> ---
>  src/imm/immd/immd_evt.c | 50
> ++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 49 insertions(+), 1 deletion(-)
> 
> diff --git a/src/imm/immd/immd_evt.c b/src/imm/immd/immd_evt.c
> index 1114a81d4..9c4f1f016 100644
> --- a/src/imm/immd/immd_evt.c
> +++ b/src/imm/immd/immd_evt.c
> @@ -1629,6 +1629,25 @@ static uint32_t
> immd_evt_proc_immnd_req_sync(IMMD_CB *cb, IMMD_EVT *evt,
>       return proc_rc;
>  }
> 
> +/*********************************************************
> *******************
> + * Name          : is_global_counter_less
> + *
> + * Description   : Check if any global counters is less than ones sent by
> IMMND.
> + *
> + * Arguments     : IMMD_CB  *cb  - IMMD CB pointer
> + *                 IMMD_EVT *evt - Received Event structure
> + *
> + * Return Values : true if having at least one, false otherwise.
> + *
> +
> **********************************************************
> *******************/
> +static bool is_global_counter_less(const IMMD_CB *cb, const IMMD_EVT
> *evt)
> +{
> +     return ((cb->fevsSendCount < evt->info.ctrl_msg.fevs_count) ||
> +             (cb->admo_id_count < evt->info.ctrl_msg.admo_id_count)
> ||
> +             (cb->ccb_id_count  < evt->info.ctrl_msg.ccb_id_count) ||
> +             (cb->impl_count    < evt->info.ctrl_msg.impl_count));
> +}
> +
> 
> /**********************************************************
> ******************
>   * Name          : immd_evt_proc_immnd_intro
>   *
> @@ -1718,7 +1737,16 @@ static uint32_t
> immd_evt_proc_immnd_intro(IMMD_CB *cb, IMMD_EVT *evt,
>                           cb->mRulingEpoch);
>               }
> 
> -             if (cb->mRulingEpoch < node_info->epoch) {
> +             /*
> +               Don't update the ruling epoch from joining IMMND if the
> coord
> +               is already elected, except the change comes from the coord.
> +               This check is to avoid cluster reboot after split-brain
> +               recovery; this rule is also applied for all below global
> +               counters (fevs counter, ccb id counter, etc.).
> +             */
> +             if ((cb->mRulingEpoch < node_info->epoch) &&
> +                 (cb->immnd_coord != 0 ||
> +                  cb->immnd_coord == node_info->immnd_key)) {
>                       cb->mRulingEpoch = node_info->epoch;
>                       LOG_NO("Ruling epoch changed to:%u", cb-
> >mRulingEpoch);
>               }
> @@ -1769,6 +1797,26 @@ static uint32_t
> immd_evt_proc_immnd_intro(IMMD_CB *cb, IMMD_EVT *evt,
> 
>                       veteranImmndNode = true;
> 
> +                     /* Don't update global counters when the coord is
> +                        already elected, but except ones come from the
> coord.
> +                     */
> +                     if (cb->immnd_coord != 0 &&
> +                         node_info->immnd_key != cb->immnd_coord) {
> +                             if (is_global_counter_less(cb, evt)) {
> +                                     IMMSV_ND2D_CONTROL* msg =
> &(evt->info.ctrl_msg);
> +                                     LOG_NO("Ignore updating counters
> from %x. "
> +                                            "Diffs (global/intro):"
> +                                            "fevs(%llu/%llu), admid
> (%u/%u),"
> +                                            "ccbid(%u/%u), impid(%u/%u)",
> +                                            node_info->immnd_key,
> +                                            cb->fevsSendCount, msg-
> >fevs_count,
> +                                            cb->admo_id_count, msg-
> >admo_id_count,
> +                                            cb->ccb_id_count, msg-
> >ccb_id_count,
> +                                            cb->impl_count, msg-
> >impl_count);
> +                             }
> +                             goto accept_node;
> +                     }
> +
>                       if (cb->fevsSendCount < evt-
> >info.ctrl_msg.fevs_count) {
>                               LOG_NO(
>                                   "Refresh of fevs count from %llu to %llu
> from %x.",
> --
> 2.18.0



_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to