Hi Hans,

Thanks for your comments. See my comment inline. Thanks

Regards, Vu

> -----Original Message-----
> From: Hans Nordebäck <[email protected]>
> Sent: Monday, January 28, 2019 4:37 PM
> To: Hans Nordebäck <[email protected]>; Vu Minh Nguyen
> <[email protected]>; Gary Lee <[email protected]>;
> Minh Hon Chau <[email protected]>
> Cc: [email protected]
> Subject: Re: [devel] [PATCH 1/1] osaf: do quick local node reboot when
split
> network [#3001]
> 
> Hi Vu,
> See one more comment below/Thanks HansN
> 
> -----Original Message-----
> From: Hans Nordebäck <[email protected]>
> Sent: den 28 januari 2019 10:15
> To: Vu Minh Nguyen <[email protected]>; Gary Lee
> <[email protected]>; Minh Hon Chau <[email protected]>
> Cc: [email protected]
> Subject: Re: [devel] [PATCH 1/1] osaf: do quick local node reboot when
split
> network [#3001]
> 
> Hi Vu, ack review only. Two comments below/Thanks HansN
> 
> On 1/25/19 12:34, Vu Minh Nguyen wrote:
> > ---
> >   scripts/opensaf_reboot   | 33 +++++++++++++++++++++++++++------
> >   src/amf/amfd/ndproc.cc   |  4 ++--
> >   src/base/ncssysf_def.h   |  6 ++++++
> >   src/base/sysf_def.c      | 10 ++++++++++
> >   src/fm/fmd/fm_main.cc    |  6 +++---
> >   src/fm/fmd/fm_rda.cc     |  5 ++---
> >   src/rde/rded/rde_main.cc |  6 ++----
> >   7 files changed, 52 insertions(+), 18 deletions(-)
> >
> > diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot index
> > 727272e1d..2f7a7daeb 100644
> > --- a/scripts/opensaf_reboot
> > +++ b/scripts/opensaf_reboot
> > @@ -31,7 +31,7 @@ export
> LD_LIBRARY_PATH=$libdir:$LD_LIBRARY_PATH
> >
> >   # Node fencing: OpenSAF cannot reboot a node when there's no CLM
> node to
> >   # PLM EE mapping in the information model. In such cases rebooting
> > would be done -# through proprietary mechanisms, i.e. not through PLM.
> > Node_id is (the only
> > +# through proprietary mechanisms, i.e. not through PLM. Node_id is
> > +(the only
> >   # entity) at the disposal of such a mechanism.
> >
> >   if [ -f "$pkgsysconfdir/fmd.conf" ]; then @@ -81,7 +81,6 @@
> > opensaf_reboot_with_remote_fencing()
> >   #if plm exists in the system,then the reboot is performed using the
> eename.
> >   opensaf_reboot_with_plm()
> >   {
> > -
> >   immadm -o 7 $ee_name
> >   retval=$?
> >   if [ $retval != 0 ]; then
> > @@ -96,12 +95,29 @@ opensaf_reboot_with_plm()
> >   logger -t "opensaf_reboot" "abrupt restart failed for $ee_name: unable
to
> restart remote node"
> >   exit 1
> >   fi
> > -fi
> > +fi
> >   fi
> >   #Note: Operation Id SA_PLM_ADMIN_RESTART=7
> >   #In the example the $ee_name would expand to (for eg:-)
> safEE=my_linux_os,safHE=64bitmulticore,safDomain=my_domain
> >   }
> >
> > +# Force local node reboot as fast as possible
> > +quick_local_node_reboot()
> > +{
> > +logger -t "opensaf_reboot" "Do quick local node reboot"
> [HansN] perhaps reuse the same logic as in sysf_def.c, i.e. use the sysrq
as
> fallback and use a short timeout
[Vu]
Forcing node reboot by touching /proc/sysrq-trigger is not allowed on
containers such as LXC 
(as container is immutable), therefore I provided 02 more alternatives below
in case the first try is failed.
> > +
> > +$icmd /bin/echo -n 'b' 2> /dev/null > /proc/sysrq-trigger
> [HansN] if not run as root, i.e. icmd is sudo, I think you need to use
> cmd: /bin/echo -n 'b' | $icmd tee /proc/sysrq-trigger , please check
> [HansN] or $icmd  /bin/sh -c "/bin/echo -n 'b' 2> /dev/null > /proc/sysrq-
> trigger"
[Vu] Thanks for your suggestion. Will update accordingly.
> > +ret_code=$?
> > +
> > +if [ $ret_code != 0 ] && [ -x /bin/systemctl ]; then
> > +$icmd /bin/systemctl --force --force reboot
> > +ret_code=$?
> > +fi
> > +
> > +if [ $ret_code != 0 ]; then
> > +$icmd /sbin/reboot -f
> > +fi
> > +}
> >
> >   if ! test -f "$NODE_ID_FILE"; then
> >   logger -t "opensaf_reboot" "$NODE_ID_FILE doesnt exists,reboot failed
"
> > @@ -112,8 +128,13 @@ temp_node_id=`cat "$NODE_ID_FILE"`
> >   temp_node_id=`echo "$temp_node_id" |sed -e 's:^0[bBxX]::'| sed -e
> 's:^:0x:'`
> >   self_node_id=`printf "%d" $temp_node_id`
> >
> > -# If clm cluster reboot requested argument one and two are set but
> > not used, argument 3 is set to 1, "safe reboot" request -if [
> > "$safe_reboot" = 1 ]; then
> > +# If no argument is provided, forcing node reboot immediately without
> > +log # flushing, process terminating, disk un-mounting.
> > +# If clm cluster reboot requested argument one and two are set but
> > +not used, # argument 3 is set to 1, "safe reboot" request.
> > +if [ "$#" = 0 ]; then
> > +quick_local_node_reboot
> > +elif [ "$safe_reboot" = 1 ]; then
> >   opensaf_safe_reboot
> >   else
> >   # A node ID of zero(0) means an order to reboot the local node @@
> > -165,7 +186,7 @@ else
> >   logger -t "opensaf_reboot" "Not rebooting remote node $ee_name as it
is
> not in INSTANTIATED state"
> >   elif [ $plm_node_state != 2 ]; then
> >   opensaf_reboot_with_plm
> > -else
> > +else
> >   logger -t "opensaf_reboot" "Not rebooting remote node $ee_name as it
is
> already in locked state"
> >   fi
> >   else
> > diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc index
> > ec347fc71..c5060e67b 100644
> > --- a/src/amf/amfd/ndproc.cc
> > +++ b/src/amf/amfd/ndproc.cc
> > @@ -1262,7 +1262,7 @@ void check_quorum(AVD_CL_CB *cb) {
> >
> >       // remote fencing is disabled and we have lost write access
> >       // reboot this node to prevent split brain
> > -    opensaf_reboot(0, nullptr,
> > -      "Quorum lost. Rebooting this node to prevent split-brain");
> > +    opensaf_quick_reboot("Quorum lost. Rebooting this node to "
> > +                         "prevent split-brain");
> >     }
> >   }
> > diff --git a/src/base/ncssysf_def.h b/src/base/ncssysf_def.h index
> > 3df8b221e..3de6f44f6 100644
> > --- a/src/base/ncssysf_def.h
> > +++ b/src/base/ncssysf_def.h
> > @@ -76,6 +76,12 @@ void opensaf_reboot_prepare(void);
> >    */
> >   void opensaf_reboot(unsigned node_id, const char* ee_name, const
> > char* reason);
> >
> > +/**
> > + * Do quick local node reboot - without first unmounting file systems
> > + * or syncing disks attached to the system.
> > +*/
> > +void opensaf_quick_reboot(const char* reason);
> > +
> >
> /**************************************************************
> ***************
> >    **
**
> >    ** ncs_os_get_time_stamp:      Return the current timestamp as
"time_t"
> in **
> > diff --git a/src/base/sysf_def.c b/src/base/sysf_def.c index
> > d0cfc94ef..880f08ec4 100644
> > --- a/src/base/sysf_def.c
> > +++ b/src/base/sysf_def.c
> > @@ -271,6 +271,16 @@ void opensaf_reboot(unsigned node_id, const
> char *ee_name, const char *reason)
> >   }
> >   }
> >
> > +void opensaf_quick_reboot(const char *reason) {
> > +syslog(LOG_CRIT, "Quick local node rebooting, Reason: %s ", reason);
> > +int reboot_result = system(PKGLIBDIR "/opensaf_reboot");
> > +if (reboot_result != EXIT_SUCCESS) {
> > +syslog(LOG_CRIT, "node reboot failure: exit code %d",
> > +       WEXITSTATUS(reboot_result));
> > +}
> > +}
> > +
> >   /**
> >    * syslog and abort to generate a core dump (if enabled)
> >    * @param __file
> > diff --git a/src/fm/fmd/fm_main.cc b/src/fm/fmd/fm_main.cc index
> > 122b2175c..b473b68aa 100644
> > --- a/src/fm/fmd/fm_main.cc
> > +++ b/src/fm/fmd/fm_main.cc
> > @@ -665,9 +665,9 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb,
> FM_EVT *fm_mbx_evt) {
> >                 "Two active controllers observed in a cluster,
newActive: %x and "
> >                 "old-Active: %x",
> >                 unsigned(fm_cb->node_id),
unsigned(fm_cb->peer_node_id));
> > -          opensaf_reboot(0, NULL,
> > -                         "Received svc up from peer node (old-active is
not "
> > -                         "fully DOWN), hence rebooting the new
Active");
> > +          opensaf_quick_reboot("Received svc up from peer node
(old-active "
> > +                               "is not fully DOWN), "
> > +                               "hence rebooting the new Active");
> >           }
> >         }
> >
> > diff --git a/src/fm/fmd/fm_rda.cc b/src/fm/fmd/fm_rda.cc index
> > 2a9f1664f..028bfa380 100644
> > --- a/src/fm/fmd/fm_rda.cc
> > +++ b/src/fm/fmd/fm_rda.cc
> > @@ -105,9 +105,8 @@ uint32_t fm_rda_set_role(FM_CB *fm_cb,
> PCS_RDA_ROLE role) {
> >         LOG_ER(
> >             "A controller is already active. We were separated from the
"
> >             "cluster?");
> > -      opensaf_reboot(0, nullptr,
> > -                     "A controller is already active. We were separated
"
> > -                     "from the cluster?");
> > +      opensaf_quick_reboot("A controller is already active. We were
> separated "
> > +                           "from the cluster?");
> >       }
> >     }
> >
> > diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc index
> > 2d9aa51dc..b4c842942 100644
> > --- a/src/rde/rded/rde_main.cc
> > +++ b/src/rde/rded/rde_main.cc
> > @@ -140,9 +140,7 @@ static void handle_mbx_event() {
> >                    active_controller.c_str());
> >             if (consensus_service.IsRemoteFencingEnabled() == false) {
> >               LOG_ER("Probable split-brain. Rebooting this node");
> > -
> > -            opensaf_reboot(0, nullptr,
> > -                           "Split-brain detected by consensus
service");
> > +            opensaf_quick_reboot("Split-brain detected by consensus
> > + service");
> >             }
> >           }
> >
> > @@ -258,7 +256,7 @@ static void CheckForSplitBrain(const rde_msg *msg)
> {
> >     PCS_RDA_ROLE own_role = role->role();
> >     PCS_RDA_ROLE other_role = msg->info.peer_info.ha_role;
> >     if (own_role == PCS_RDA_ACTIVE && other_role == PCS_RDA_ACTIVE) {
> > -    opensaf_reboot(0, nullptr, "Split-brain detected");
> > +    opensaf_quick_reboot("Split-brain detected");
> >     }
> >   }
> >
> 
> _______________________________________________
> Opensaf-devel mailing list
> [email protected]
> https://lists.sourceforge.net/lists/listinfo/opensaf-devel



_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to