Hi Vu, Ack
Thanks Lennart > -----Original Message----- > From: Vu Minh Nguyen <[email protected]> > Sent: den 16 oktober 2018 09:09 > To: Hans Nordebäck <[email protected]>; Lennart Lund > <[email protected]>; Gary Lee <[email protected]> > Cc: [email protected]; Vu Minh Nguyen > <[email protected]> > Subject: [PATCH 1/1] imm: cluster is rebooted after split-brain recovery > [#2934] > > During split-brain, there is a possibility of having mismatches in global > counters > held by IMMNDs, which might cause the cluster to reboot at split-brain recovery. > > This patch introduces some changes in IMMD - not to update global counters > from joining IMMNDs if the coord is already elected. > --- > src/imm/immd/immd_evt.c | 50 > ++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 49 insertions(+), 1 deletion(-) > > diff --git a/src/imm/immd/immd_evt.c b/src/imm/immd/immd_evt.c > index 1114a81d4..9c4f1f016 100644 > --- a/src/imm/immd/immd_evt.c > +++ b/src/imm/immd/immd_evt.c > @@ -1629,6 +1629,25 @@ static uint32_t > immd_evt_proc_immnd_req_sync(IMMD_CB *cb, IMMD_EVT *evt, > return proc_rc; > } > > +/********************************************************* > ******************* > + * Name : is_global_counter_less > + * > + * Description : Check if any global counters is less than ones sent by > IMMND. > + * > + * Arguments : IMMD_CB *cb - IMMD CB pointer > + * IMMD_EVT *evt - Received Event structure > + * > + * Return Values : true if having at least one, false otherwise. 
> + * > + > ********************************************************** > *******************/ > +static bool is_global_counter_less(const IMMD_CB *cb, const IMMD_EVT > *evt) > +{ > + return ((cb->fevsSendCount < evt->info.ctrl_msg.fevs_count) || > + (cb->admo_id_count < evt->info.ctrl_msg.admo_id_count) > || > + (cb->ccb_id_count < evt->info.ctrl_msg.ccb_id_count) || > + (cb->impl_count < evt->info.ctrl_msg.impl_count)); > +} > + > > /********************************************************** > ****************** > * Name : immd_evt_proc_immnd_intro > * > @@ -1718,7 +1737,16 @@ static uint32_t > immd_evt_proc_immnd_intro(IMMD_CB *cb, IMMD_EVT *evt, > cb->mRulingEpoch); > } > > - if (cb->mRulingEpoch < node_info->epoch) { > + /* > + Don't update the ruling epoch from joining IMMND if the > coord > + is already elected, except the change comes from the coord. > + This check is to avoid cluster reboot after split-brain > + recovery; this rule is also applied for all below global > + counters (fevs counter, ccb id counter, etc.). > + */ > + if ((cb->mRulingEpoch < node_info->epoch) && > + (cb->immnd_coord != 0 || > + cb->immnd_coord == node_info->immnd_key)) { > cb->mRulingEpoch = node_info->epoch; > LOG_NO("Ruling epoch changed to:%u", cb- > >mRulingEpoch); > } > @@ -1769,6 +1797,26 @@ static uint32_t > immd_evt_proc_immnd_intro(IMMD_CB *cb, IMMD_EVT *evt, > > veteranImmndNode = true; > > + /* Don't update global counters when the coord is > + already elected, but except ones come from the > coord. > + */ > + if (cb->immnd_coord != 0 && > + node_info->immnd_key != cb->immnd_coord) { > + if (is_global_counter_less(cb, evt)) { > + IMMSV_ND2D_CONTROL* msg = > &(evt->info.ctrl_msg); > + LOG_NO("Ignore updating counters > from %x. 
" > + "Diffs (global/intro):" > + "fevs(%llu/%llu), admid > (%u/%u)," > + "ccbid(%u/%u), impid(%u/%u)", > + node_info->immnd_key, > + cb->fevsSendCount, msg- > >fevs_count, > + cb->admo_id_count, msg- > >admo_id_count, > + cb->ccb_id_count, msg- > >ccb_id_count, > + cb->impl_count, msg- > >impl_count); > + } > + goto accept_node; > + } > + > if (cb->fevsSendCount < evt- > >info.ctrl_msg.fevs_count) { > LOG_NO( > "Refresh of fevs count from %llu to %llu > from %x.", > -- > 2.18.0 _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
