Hi Hans, Thanks for your comments. See my comment inline. Thanks
Regards, Vu > -----Original Message----- > From: Hans Nordebäck <[email protected]> > Sent: Monday, January 28, 2019 4:37 PM > To: Hans Nordebäck <[email protected]>; Vu Minh Nguyen > <[email protected]>; Gary Lee <[email protected]>; > Minh Hon Chau <[email protected]> > Cc: [email protected] > Subject: Re: [devel] [PATCH 1/1] osaf: do quick local node reboot when split > network [#3001] > > Hi Vu, > See one more comment below/Thanks HansN > > -----Original Message----- > From: Hans Nordebäck <[email protected]> > Sent: den 28 januari 2019 10:15 > To: Vu Minh Nguyen <[email protected]>; Gary Lee > <[email protected]>; Minh Hon Chau <[email protected]> > Cc: [email protected] > Subject: Re: [devel] [PATCH 1/1] osaf: do quick local node reboot when split > network [#3001] > > Hi Vu, ack review only. Two comments below/Thanks HansN > > On 1/25/19 12:34, Vu Minh Nguyen wrote: > > --- > > scripts/opensaf_reboot | 33 +++++++++++++++++++++++++++------ > > src/amf/amfd/ndproc.cc | 4 ++-- > > src/base/ncssysf_def.h | 6 ++++++ > > src/base/sysf_def.c | 10 ++++++++++ > > src/fm/fmd/fm_main.cc | 6 +++--- > > src/fm/fmd/fm_rda.cc | 5 ++--- > > src/rde/rded/rde_main.cc | 6 ++---- > > 7 files changed, 52 insertions(+), 18 deletions(-) > > > > diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot index > > 727272e1d..2f7a7daeb 100644 > > --- a/scripts/opensaf_reboot > > +++ b/scripts/opensaf_reboot > > @@ -31,7 +31,7 @@ export > LD_LIBRARY_PATH=$libdir:$LD_LIBRARY_PATH > > > > # Node fencing: OpenSAF cannot reboot a node when there's no CLM > node to > > # PLM EE mapping in the information model. In such cases rebooting > > would be done -# through proprietary mechanisms, i.e. not through PLM. > > Node_id is (the only > > +# through proprietary mechanisms, i.e. not through PLM. Node_id is > > +(the only > > # entity) at the disposal of such a mechanism. > > > > if [ -f "$pkgsysconfdir/fmd.conf" ]; then @@ -81,7 +81,6 @@ > > opensaf_reboot_with_remote_fencing() > > #if plm exists in the system,then the reboot is performed using the > eename. > > opensaf_reboot_with_plm() > > { > > - > > immadm -o 7 $ee_name > > retval=$? > > if [ $retval != 0 ]; then > > @@ -96,12 +95,29 @@ opensaf_reboot_with_plm() > > logger -t "opensaf_reboot" "abrupt restart failed for $ee_name: unable to > restart remote node" > > exit 1 > > fi > > -fi > > +fi > > fi > > #Note: Operation Id SA_PLM_ADMIN_RESTART=7 > > #In the example the $ee_name would expand to (for eg:-) > safEE=my_linux_os,safHE=64bitmulticore,safDomain=my_domain > > } > > > > +# Force local node reboot as fast as possible > > +quick_local_node_reboot() > > +{ > > +logger -t "opensaf_reboot" "Do quick local node reboot" > [HansN] perhaps reuse the same logic as in sysf_def.c, i.e. use the sysrq as > fallback and use a short timeout [Vu] Forcing node reboot by touching /proc/sysrq-trigger is not allowed on containers such as LXC (as container is immutable), therefore I provided 02 more alternatives below in case the first try is failed. > > + > > +$icmd /bin/echo -n 'b' 2> /dev/null > /proc/sysrq-trigger > [HansN] if not run as root, i.e. icmd is sudo, I think you need to use > cmd: /bin/echo -n 'b' | $icmd tee /proc/sysrq-trigger , please check > [HansN] or $icmd /bin/sh -c "/bin/echo -n 'b' 2> /dev/null > /proc/sysrq- > trigger" [Vu] Thanks for your suggestion. Will update accordingly. > > +ret_code=$? > > + > > +if [ $ret_code != 0 ] && [ -x /bin/systemctl ]; then > > +$icmd /bin/systemctl --force --force reboot > > +ret_code=$? > > +fi > > + > > +if [ $ret_code != 0 ]; then > > +$icmd /sbin/reboot -f > > +fi > > +} > > > > if ! test -f "$NODE_ID_FILE"; then > > logger -t "opensaf_reboot" "$NODE_ID_FILE doesnt exists,reboot failed " > > @@ -112,8 +128,13 @@ temp_node_id=`cat "$NODE_ID_FILE"` > > temp_node_id=`echo "$temp_node_id" |sed -e 's:^0[bBxX]::'| sed -e > 's:^:0x:'` > > self_node_id=`printf "%d" $temp_node_id` > > > > -# If clm cluster reboot requested argument one and two are set but > > not used, argument 3 is set to 1, "safe reboot" request -if [ > > "$safe_reboot" = 1 ]; then > > +# If no argument is provided, forcing node reboot immediately without > > +log # flushing, process terminating, disk un-mounting. > > +# If clm cluster reboot requested argument one and two are set but > > +not used, # argument 3 is set to 1, "safe reboot" request. > > +if [ "$#" = 0 ]; then > > +quick_local_node_reboot > > +elif [ "$safe_reboot" = 1 ]; then > > opensaf_safe_reboot > > else > > # A node ID of zero(0) means an order to reboot the local node @@ > > -165,7 +186,7 @@ else > > logger -t "opensaf_reboot" "Not rebooting remote node $ee_name as it is > not in INSTANTIATED state" > > elif [ $plm_node_state != 2 ]; then > > opensaf_reboot_with_plm > > -else > > +else > > logger -t "opensaf_reboot" "Not rebooting remote node $ee_name as it is > already in locked state" > > fi > > else > > diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc index > > ec347fc71..c5060e67b 100644 > > --- a/src/amf/amfd/ndproc.cc > > +++ b/src/amf/amfd/ndproc.cc > > @@ -1262,7 +1262,7 @@ void check_quorum(AVD_CL_CB *cb) { > > > > // remote fencing is disabled and we have lost write access > > // reboot this node to prevent split brain > > - opensaf_reboot(0, nullptr, > > - "Quorum lost. Rebooting this node to prevent split-brain"); > > + opensaf_quick_reboot("Quorum lost. Rebooting this node to " > > + "prevent split-brain"); > > } > > } > > diff --git a/src/base/ncssysf_def.h b/src/base/ncssysf_def.h index > > 3df8b221e..3de6f44f6 100644 > > --- a/src/base/ncssysf_def.h > > +++ b/src/base/ncssysf_def.h > > @@ -76,6 +76,12 @@ void opensaf_reboot_prepare(void); > > */ > > void opensaf_reboot(unsigned node_id, const char* ee_name, const > > char* reason); > > > > +/** > > + * Do quick local node reboot - without first unmounting file systems > > + * or syncing disks attached to the system. > > +*/ > > +void opensaf_quick_reboot(const char* reason); > > + > > > /************************************************************** > *************** > > ** ** > > ** ncs_os_get_time_stamp: Return the current timestamp as "time_t" > in ** > > diff --git a/src/base/sysf_def.c b/src/base/sysf_def.c index > > d0cfc94ef..880f08ec4 100644 > > --- a/src/base/sysf_def.c > > +++ b/src/base/sysf_def.c > > @@ -271,6 +271,16 @@ void opensaf_reboot(unsigned node_id, const > char *ee_name, const char *reason) > > } > > } > > > > +void opensaf_quick_reboot(const char *reason) { > > +syslog(LOG_CRIT, "Quick local node rebooting, Reason: %s ", reason); > > +int reboot_result = system(PKGLIBDIR "/opensaf_reboot"); > > +if (reboot_result != EXIT_SUCCESS) { > > +syslog(LOG_CRIT, "node reboot failure: exit code %d", > > + WEXITSTATUS(reboot_result)); > > +} > > +} > > + > > /** > > * syslog and abort to generate a core dump (if enabled) > > * @param __file > > diff --git a/src/fm/fmd/fm_main.cc b/src/fm/fmd/fm_main.cc index > > 122b2175c..b473b68aa 100644 > > --- a/src/fm/fmd/fm_main.cc > > +++ b/src/fm/fmd/fm_main.cc > > @@ -665,9 +665,9 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, > FM_EVT *fm_mbx_evt) { > > "Two active controllers observed in a cluster, newActive: %x and " > > "old-Active: %x", > > unsigned(fm_cb->node_id), unsigned(fm_cb->peer_node_id)); > > - opensaf_reboot(0, NULL, > > - "Received svc up from peer node (old-active is not " > > - "fully DOWN), hence rebooting the new Active"); > > + opensaf_quick_reboot("Received svc up from peer node (old-active " > > + "is not fully DOWN), " > > + "hence rebooting the new Active"); > > } > > } > > > > diff --git a/src/fm/fmd/fm_rda.cc b/src/fm/fmd/fm_rda.cc index > > 2a9f1664f..028bfa380 100644 > > --- a/src/fm/fmd/fm_rda.cc > > +++ b/src/fm/fmd/fm_rda.cc > > @@ -105,9 +105,8 @@ uint32_t fm_rda_set_role(FM_CB *fm_cb, > PCS_RDA_ROLE role) { > > LOG_ER( > > "A controller is already active. We were separated from the " > > "cluster?"); > > - opensaf_reboot(0, nullptr, > > - "A controller is already active. We were separated " > > - "from the cluster?"); > > + opensaf_quick_reboot("A controller is already active. We were > separated " > > + "from the cluster?"); > > } > > } > > > > diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc index > > 2d9aa51dc..b4c842942 100644 > > --- a/src/rde/rded/rde_main.cc > > +++ b/src/rde/rded/rde_main.cc > > @@ -140,9 +140,7 @@ static void handle_mbx_event() { > > active_controller.c_str()); > > if (consensus_service.IsRemoteFencingEnabled() == false) { > > LOG_ER("Probable split-brain. Rebooting this node"); > > - > > - opensaf_reboot(0, nullptr, > > - "Split-brain detected by consensus service"); > > + opensaf_quick_reboot("Split-brain detected by consensus > > + service"); > > } > > } > > > > @@ -258,7 +256,7 @@ static void CheckForSplitBrain(const rde_msg *msg) > { > > PCS_RDA_ROLE own_role = role->role(); > > PCS_RDA_ROLE other_role = msg->info.peer_info.ha_role; > > if (own_role == PCS_RDA_ACTIVE && other_role == PCS_RDA_ACTIVE) { > > - opensaf_reboot(0, nullptr, "Split-brain detected"); > > + opensaf_quick_reboot("Split-brain detected"); > > } > > } > > > > _______________________________________________ > Opensaf-devel mailing list > [email protected] > https://lists.sourceforge.net/lists/listinfo/opensaf-devel _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
