Thanks for the details and I agree.
I was actually enquiring about:
- why a new "opensaf_safe_reboot" option is added in the opensaf_reboot script, and
- why an osaf_safe_reboot() is added in osaf_utility.c instead of using the
opensaf_reboot() function itself.

For the first question, I think the obvious guess would be that we want to use 
'shutdown -r' such that rc scripts are invoked when doing the reboot.

Thanks,
Mathi.

> -----Original Message-----
> From: Hans Nordebäck [mailto:hans.nordeb...@ericsson.com]
> Sent: Friday, September 30, 2016 2:14 PM
> To: Mathivanan Naickan Palanivelu; anders.wid...@ericsson.com
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: Re: [PATCH 1 of 1] clm: add support for cluster reboot V3 [#2053]
> 
> Hi Mathi,
> 
> Cluster reboots are ordered at several places, both outside OpenSAF
> and inside OpenSAF, e.g. SMF.
> 
> There are several different solutions to order a cluster reboot and in many
> cases they don't work well, so it would be
> 
> good if OpenSAF could provide one way to perform a cluster reboot. AMF
> may also use this for implementing SA_AMF_CLUSTER_RESET.
> 
> One common use case is upgrade with a following cluster reboot. The
> payloads are normally PXE booted and the DHCP
> 
> is stopped at the controllers before ordering a cluster reboot so the payloads
> will not start until the controllers have rebooted and DHCP is started.
> 
> Then each node is rebooted by doing ssh to each node in sequence and ordering a
> reboot. This may take time, and if CLM can broadcast this reboot request the
> 
> reboot will be considerably faster. Therefore, in the first version of the
> cluster reboot support I don't think we need to consider implementing
> phases, this can
> 
> be added later, with e.g. a flag in CLM as AndersW suggested.
> 
> /Thanks HansN
> 
> 
> On 09/30/2016 09:43 AM, Mathivanan Naickan Palanivelu wrote:
> > Hi Hans,
> >
> > Could you provide some background on the need for
> opensaf_safe_reboot().
> > What would be the need for this?
> >
> > Thanks,
> > Mathi.
> >
> >
> >> -----Original Message-----
> >> From: Hans Nordeback [mailto:hans.nordeb...@ericsson.com]
> >> Sent: Wednesday, September 28, 2016 5:26 PM
> >> To: anders.wid...@ericsson.com; Mathivanan Naickan Palanivelu
> >> Cc: opensaf-devel@lists.sourceforge.net
> >> Subject: [PATCH 1 of 1] clm: add support for cluster reboot V3
> >> [#2053]
> >>
> >>   osaf/libs/common/clmsv/include/clmsv_msg.h   |   6 +++
> >>   osaf/libs/core/common/include/osaf_utility.h |   5 +++
> >>   osaf/libs/core/common/osaf_utility.c         |  22 +++++++++++++
> >>   osaf/services/saf/clmsv/clms/clms.h          |   3 +-
> >>   osaf/services/saf/clmsv/clms/clms_imm.c      |  18 ++++++++++
> >>   osaf/services/saf/clmsv/clms/clms_mds.c      |  46
> >> +++++++++++++++++++++++++++-
> >>   osaf/services/saf/clmsv/clms/clms_util.c     |  12 +++++++
> >>   osaf/services/saf/clmsv/nodeagent/main.c     |  12 +++++++
> >>   scripts/opensaf_reboot                       |  22 ++++++++++---
> >>   9 files changed, 139 insertions(+), 7 deletions(-)
> >>
> >>
> >> Admin command to request cluster reboot:
> >> immadm -o 1 safCluster=myClmCluster
> >>
> >> diff --git a/osaf/libs/common/clmsv/include/clmsv_msg.h
> >> b/osaf/libs/common/clmsv/include/clmsv_msg.h
> >> --- a/osaf/libs/common/clmsv/include/clmsv_msg.h
> >> +++ b/osaf/libs/common/clmsv/include/clmsv_msg.h
> >> @@ -23,6 +23,7 @@ typedef enum clms_msg_type {
> >>     CLMSV_CLMS_TO_CLMA_CBK_MSG,
> >>     CLMSV_CLMS_TO_CLMA_API_RESP_MSG,
> >>     CLMSV_CLMS_TO_CLMA_IS_MEMBER_MSG,
> >> +  CLMSV_CLMS_TO_CLMNA_REBOOT_MSG,
> >>     CLMSV_MSG_MAX
> >>   } CLMSV_MSG_TYPE;
> >>
> >> @@ -174,6 +175,10 @@ typedef struct clmsv_is_member_info_t {
> >>     SaUint32T client_id;
> >>   }CLMSV_IS_MEMBER_INFO;
> >>
> >> +typedef struct clmsv_reboot_info_t {
> >> +  SaClmNodeIdT node_id;
> >> +} CLMSV_REBOOT_INFO;
> >> +
> >>   /* Top Level CLMSv MDS message structure for use between CLMS->
> >> CLMA && CLMA -> CLMS */  typedef struct clmsv_msg_t {
> >>     struct clmsv_msg_t *next;       /* Mailbox processing */
> >> @@ -183,6 +188,7 @@ typedef struct clmsv_msg_t {
> >>       CLMSV_CBK_INFO cbk_info;        /* Callback Messages from CLMS to
> CLA
> >> */
> >>       CLMSV_API_RESP_INFO api_resp_info;      /* Response Messages from
> >> CLMS to CLA */
> >>       CLMSV_IS_MEMBER_INFO is_member_info;    /*Is node member or
> not
> >> Message from CLMS to CLA*/
> >> +    CLMSV_REBOOT_INFO reboot_info;        /* Reboot request from
> >> CLMS to CLMNA */
> >>     } info;
> >>   } CLMSV_MSG;
> >>
> >> diff --git a/osaf/libs/core/common/include/osaf_utility.h
> >> b/osaf/libs/core/common/include/osaf_utility.h
> >> --- a/osaf/libs/core/common/include/osaf_utility.h
> >> +++ b/osaf/libs/core/common/include/osaf_utility.h
> >> @@ -24,6 +24,8 @@
> >>   #ifndef OPENSAF_CORE_OSAF_UTILITY_H_
> >>   #define OPENSAF_CORE_OSAF_UTILITY_H_
> >>
> >> +#define USE_SAFE_REBOOT 1
> >> +
> >>   #include <pthread.h>
> >>
> >>   #ifdef  __cplusplus
> >> @@ -68,6 +70,9 @@ extern void osaf_abort(long i_cause)  #endif
> >>           nothrow, noreturn));
> >>
> >> +extern void osaf_safe_reboot()
> >> +    __attribute__ ((nothrow));
> >> +
> >>   static inline void osaf_mutex_lock_ordie(pthread_mutex_t* io_mutex) {
> >>     int result = pthread_mutex_lock(io_mutex);
> >>     if (result != 0) osaf_abort(result); diff --git
> >> a/osaf/libs/core/common/osaf_utility.c
> >> b/osaf/libs/core/common/osaf_utility.c
> >> --- a/osaf/libs/core/common/osaf_utility.c
> >> +++ b/osaf/libs/core/common/osaf_utility.c
> >> @@ -16,9 +16,12 @@
> >>    */
> >>
> >>   #include "osaf_utility.h"
> >> +#include "ncssysf_def.h"
> >> +#include "configmake.h"
> >>   #include <stdlib.h>
> >>   #include <errno.h>
> >>   #include <syslog.h>
> >> +#include <stdio.h>
> >>
> >>   void osaf_abort(long i_cause)
> >>   {
> >> @@ -26,3 +29,22 @@ void osaf_abort(long i_cause)
> >>            i_cause, __builtin_return_address(0), errno);
> >>    abort();
> >>   }
> >> +
> >> +void osaf_safe_reboot()
> >> +{
> >> +  char str[256];
> >> +
> >> +  snprintf(str, sizeof(str), PKGLIBDIR "/opensaf_reboot %u %s %u", 0,
> >> "not_used", USE_SAFE_REBOOT);
> >> +  syslog(LOG_NOTICE, "Reboot ordered using command: %s", str);
> >> +
> >> +  int rc = system(str);
> >> +  if (rc < 0) {
> >> +          syslog(LOG_CRIT, "Node reboot failure: exit code %d",
> >> WEXITSTATUS(rc));
> >> +  } else {
> >> +           if (WIFEXITED(rc) && WEXITSTATUS(rc) == 0) {
> >> +                  syslog(LOG_NOTICE, "Command: %s successfully
> >> executed", str);
> >> +          } else {
> >> +                  syslog(LOG_CRIT, "Command: %s failed with exit
> >> code %d", str, WEXITSTATUS(rc));
> >> +          }
> >> +  }
> >> +}
> >> diff --git a/osaf/services/saf/clmsv/clms/clms.h
> >> b/osaf/services/saf/clmsv/clms/clms.h
> >> --- a/osaf/services/saf/clmsv/clms/clms.h
> >> +++ b/osaf/services/saf/clmsv/clms/clms.h
> >> @@ -99,6 +99,7 @@ extern uint32_t clms_mds_msg_send(CLMS_C
> >>                                     MDS_DEST *dest,
> >>                                     MDS_SYNC_SND_CTXT *mds_ctxt,
> >> MDS_SEND_PRIORITY_TYPE prio, NCSMDS_SVC_ID svc_id);
> >>
> >> +extern uint32_t clms_mds_msg_bcast(CLMS_CB *cb, CLMSV_MSG
> >> *bcast_msg);
> >>   extern SaAisErrorT clms_imm_activate(CLMS_CB * cb);  extern
> >> uint32_t clms_node_trackresplist_empty(CLMS_CLUSTER_NODE *
> op_node);
> >> extern uint32_t clms_send_cbk_start_sub(CLMS_CB * cb,
> >> CLMS_CLUSTER_NODE * node); @@ -125,5 +126,5 @@ extern void
> >> clms_cb_dump(void);  extern uint32_t
> clms_send_is_member_info(CLMS_CB
> >> * cb, SaClmNodeIdT node_id,  SaBoolT member, SaBoolT is_configured);
> >> extern void clm_imm_reinit_bg(CLMS_CB * cb);  extern void
> >> proc_downs_during_rolechange (void);
> >> -
> >> +extern void clms_cluster_reboot();
> >>   #endif   /* ifndef CLMS_H */
> >> diff --git a/osaf/services/saf/clmsv/clms/clms_imm.c
> >> b/osaf/services/saf/clmsv/clms/clms_imm.c
> >> --- a/osaf/services/saf/clmsv/clms/clms_imm.c
> >> +++ b/osaf/services/saf/clmsv/clms/clms_imm.c
> >> @@ -19,6 +19,7 @@
> >>
> >>   #include "clms.h"
> >>   #include "osaf_extended_name.h"
> >> +#include "osaf_utility.h"
> >>
> >>   extern struct ImmutilWrapperProfile immutilWrapperProfile;
> >>
> >> @@ -886,6 +887,23 @@ static void clms_imm_admin_op_callback(S
> >>
> >>    TRACE_ENTER2("Admin callback for nodename:%s, opId:%llu",
> >> objectName->value, opId);
> >>
> >> +  // E.g. immadm -o 1 safCluster=myClmCluster
> >> +  if (strncmp(osaf_extended_name_borrow(objectName),
> >> +                  osaf_extended_name_borrow(&osaf_cluster->name),
> >> +                  osaf_extended_name_length(objectName)) == 0) {
> >> +          if (opId == 1) {
> >> +                  LOG_WA("Cluster reboot requested. Ordering
> >> cluster reboot");
> >> +                  // MDS broadcast/multi cast call is synchronous
> >> +                  clms_cluster_reboot();
> >> +                  sleep(1);
> >> +                  osaf_safe_reboot();
> >> +          } else {
> >> +                  LOG_ER("Admin Operation not supported for %s",
> >> osaf_extended_name_borrow(objectName));
> >> +
> >>    immutil_saImmOiAdminOperationResult(immOiHandle, invocation,
> >> SA_AIS_ERR_INVALID_PARAM);
> >> +          }
> >> +          goto done;
> >> +  }
> >> +
> >>    /*Lookup by the node_name and get the cluster node for CLM
> Admin
> >> oper */
> >>    nodeop = clms_node_get_by_name(objectName);
> >>    if (nodeop == NULL) {
> >> diff --git a/osaf/services/saf/clmsv/clms/clms_mds.c
> >> b/osaf/services/saf/clmsv/clms/clms_mds.c
> >> --- a/osaf/services/saf/clmsv/clms/clms_mds.c
> >> +++ b/osaf/services/saf/clmsv/clms/clms_mds.c
> >> @@ -659,7 +659,17 @@ uint32_t clms_mds_enc(struct ncsmds_call
> >>    ncs_enc_claim_space(uba, 4);
> >>    total_bytes += 4;
> >>
> >> -  if (CLMSV_CLMS_TO_CLMA_API_RESP_MSG == msg->evt_type) {
> >> +  if (CLMSV_CLMS_TO_CLMNA_REBOOT_MSG == msg->evt_type) {
> >> +          /* encode the reboot msg **/
> >> +          p8 = ncs_enc_reserve_space(uba, 4);
> >> +          if (!p8) {
> >> +                  TRACE("ncs_enc_reserve_space failed");
> >> +                  goto err;
> >> +          }
> >> +          ncs_encode_32bit(&p8, msg->info.reboot_info.node_id);
> >> +          ncs_enc_claim_space(uba, 4);
> >> +          total_bytes += 4;
> >> +  } else if (CLMSV_CLMS_TO_CLMA_API_RESP_MSG == msg-
> >>> evt_type) {
> >>    /** encode the API RSP msg subtype **/
> >>            p8 = ncs_enc_reserve_space(uba, 4);
> >>            if (!p8) {
> >> @@ -1517,3 +1527,37 @@ uint32_t clms_mds_msg_send(CLMS_CB * cb,
> >>    TRACE_LEAVE();
> >>    return rc;
> >>   }
> >> +
> >>
> +/*********************************************************
> >> *******************
> >> +  Name          : clms_mds_msg_bcast
> >> +
> >> +  Description   : This routine sends a broadcast message to CLMNA.
> >> +
> >> +  Arguments     : cb  - ptr to the CLMA CB
> >> +                  bcast_msg - ptr to the CLMSv broadcast message
> >> +
> >> +  Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE
> >> +
> >> +  Notes         : None.
> >>
> +*********************************************************
> >> **************
> >> +*******/ uint32_t clms_mds_msg_bcast(CLMS_CB *cb, CLMSV_MSG
> >> *bcast_msg)
> >> +{
> >> +  NCSMDS_INFO snd_mds = {0};
> >> +  uint32_t rc;
> >> +
> >> +  snd_mds.i_mds_hdl = cb->mds_hdl;
> >> +  snd_mds.i_svc_id = NCSMDS_SVC_ID_CLMS;
> >> +  snd_mds.i_op = MDS_SEND;
> >> +  snd_mds.info.svc_send.i_msg = (NCSCONTEXT)bcast_msg;
> >> +  snd_mds.info.svc_send.i_to_svc = NCSMDS_SVC_ID_CLMNA;
> >> +  snd_mds.info.svc_send.i_priority = MDS_SEND_PRIORITY_HIGH;
> >> +  snd_mds.info.svc_send.i_sendtype = MDS_SENDTYPE_BCAST;
> >> +  snd_mds.info.svc_send.info.bcast.i_bcast_scope =
> >> NCSMDS_SCOPE_NONE;
> >> +
> >> +  if ((rc = ncsmds_api(&snd_mds)) != NCSCC_RC_SUCCESS) {
> >> +          LOG_ER("%s: ncsmds_api MDS_SEND failed %u",
> >> __FUNCTION__ ,rc);
> >> +          return rc;
> >> +  }
> >> +
> >> +  return NCSCC_RC_SUCCESS;
> >> +}
> >> \ No newline at end of file
> >> diff --git a/osaf/services/saf/clmsv/clms/clms_util.c
> >> b/osaf/services/saf/clmsv/clms/clms_util.c
> >> --- a/osaf/services/saf/clmsv/clms/clms_util.c
> >> +++ b/osaf/services/saf/clmsv/clms/clms_util.c
> >> @@ -1200,3 +1200,15 @@ bool ip_matched(uint16_t family1, uint8_
> >>    return true;
> >>   }
> >>
> >> +//
> >> +void clms_cluster_reboot()
> >> +{
> >> +  CLMSV_MSG bcast_msg;
> >> +  bcast_msg.evt_type = CLMSV_CLMS_TO_CLMNA_REBOOT_MSG;
> >> +  bcast_msg.info.reboot_info.node_id = clms_cb->node_id;
> >> +  if (clms_mds_msg_bcast(clms_cb, &bcast_msg) ==
> >> NCSCC_RC_SUCCESS) {
> >> +          LOG_NO("Sending cluster reboot broadcast message
> >> succeeded");
> >> +  } else {
> >> +          LOG_ER("Sending cluster reboot broadcast message failed");
> >> +  }
> >> +}
> >> diff --git a/osaf/services/saf/clmsv/nodeagent/main.c
> >> b/osaf/services/saf/clmsv/nodeagent/main.c
> >> --- a/osaf/services/saf/clmsv/nodeagent/main.c
> >> +++ b/osaf/services/saf/clmsv/nodeagent/main.c
> >> @@ -114,6 +114,18 @@ static uint32_t clmna_mds_dec(struct ncs
> >>    total_bytes += 4;
> >>
> >>    switch (msg->evt_type) {
> >> +  case    CLMSV_CLMS_TO_CLMNA_REBOOT_MSG:
> >> +          {
> >> +                  p8 = ncs_dec_flatten_space(uba, local_data, 4);
> >> +                  msg->info.reboot_info.node_id =
> >> ncs_decode_32bit(&p8);
> >> +                  ncs_dec_skip_space(uba, 4);
> >> +                  total_bytes += 4;
> >> +                  // Reboot will be performed by CLMS for this node.
> >> +                  if (clmna_cb->node_info.node_id != msg-
> >>> info.reboot_info.node_id) {
> >> +                          osaf_safe_reboot();
> >> +                  }
> >> +                  break;
> >> +          }
> >>    case CLMSV_CLMS_TO_CLMA_API_RESP_MSG:
> >>            {
> >>                    p8 = ncs_dec_flatten_space(uba, local_data, 8); diff -
> -git
> >> a/scripts/opensaf_reboot b/scripts/opensaf_reboot
> >> --- a/scripts/opensaf_reboot
> >> +++ b/scripts/opensaf_reboot
> >> @@ -40,10 +40,17 @@ NODE_ID_FILE=$pkglocalstatedir/node_id
> >>
> >>   node_id=$1
> >>   ee_name=$2
> >> +safe_reboot=$3
> >>
> >>   # Run commands through sudo when not superuser  test $(id -u) -ne 0
> >> && icmd=$(which sudo 2> /dev/null)
> >>
> >> +opensaf_safe_reboot()
> >> +{
> >> +    logger -t "opensaf_reboot" "Rebooting local node using shutdown"
> >> +    $icmd /sbin/shutdown -r now
> >> +}
> >> +
> >>   ## Use stonith for remote fencing
> >>   opensaf_reboot_with_remote_fencing()
> >>   {
> >> @@ -91,8 +98,12 @@ temp_node_id=`cat "$NODE_ID_FILE"`
> >> temp_node_id=`echo "$temp_node_id" |sed -e 's:^0[bBxX]::'| sed -e
> >> 's:^:0x:'`  self_node_id=`printf "%d" $temp_node_id`
> >>
> >> -# A node ID of zero(0) means an order to reboot the local node -if [
> >> "$self_node_id" = "$node_id" ] || [ $node_id = 0 ]; then
> >> +
> >> +if [ "$safe_reboot" = 1 ]; then
> >> +    opensaf_safe_reboot
> >> +else
> >> +    # A node ID of zero(0) means an order to reboot the local node
> >> +    if [ "$self_node_id" = "$node_id" ] || [ $node_id = 0 ]; then
> >>    # uncomment the following line if debugging errors that keep
> >> restarting the node
> >>    # exit 0
> >>
> >> @@ -114,8 +125,8 @@ if [ "$self_node_id" = "$node_id" ] || [
> >>
> >>    # Reboot (not shutdown) system WITH file system sync
> >>    $icmd /sbin/reboot -f
> >> -else
> >> -  if [ "$FMS_USE_REMOTE_FENCING" = "1" ]; then
> >> +    else
> >> +  if [ "$FMS_USE_REMOTE_FENCING" = 1 ]; then
> >>            opensaf_reboot_with_remote_fencing
> >>    else
> >>            if [ ":$ee_name" != ":" ]; then
> >> @@ -133,4 +144,5 @@ else
> >>                    logger -t "opensaf_reboot" "Rebooting remote node
> in the
> >> absence of PLM is outside the scope of OpenSAF"
> >>            fi
> >>    fi
> >> -fi
> >> +    fi
> >> +fi
> >> \ No newline at end of file
> 

------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to