Thanks for the details, and I agree. I was actually enquiring about two things: (1) why a new "opensaf_safe_reboot" option was added to the opensaf_reboot script, and (2) why an osaf_safe_reboot() function was added to osaf_utility.c instead of using the opensaf_reboot() function itself.
For the first question, I think the obvious guess would be that we want to use 'shutdown -r' so that the rc scripts are invoked when doing the reboot. Thanks, Mathi. > -----Original Message----- > From: Hans Nordebäck [mailto:hans.nordeb...@ericsson.com] > Sent: Friday, September 30, 2016 2:14 PM > To: Mathivanan Naickan Palanivelu; anders.wid...@ericsson.com > Cc: opensaf-devel@lists.sourceforge.net > Subject: Re: [PATCH 1 of 1] clm: add support for cluster reboot V3 [#2053] > > Hi Mathi, > > Cluster reboots are ordered at several places, both outside OpenSAF > and inside OpenSAF, e.g. SMF. > > There are several different solutions to order a cluster reboot and in many > cases they don't work well, so it would be > > good if OpenSAF could provide one way to perform a cluster reboot. AMF > may also use this for implementing SA_AMF_CLUSTER_RESET. > > One common use case is an upgrade followed by a cluster reboot. The > payloads are normally PXE booted and the DHCP > > is stopped at the controllers before ordering a cluster reboot so the payloads > will not start until the controllers have rebooted and DHCP is started. > > Then each node is rebooted by, in sequence, doing ssh to each node and ordering a > reboot. This may take time, and if CLM can broadcast this reboot request the > > reboot will be considerably faster. Therefore, in the first version of the > cluster reboot support I don't think we need to consider implementing > phases; this can > > be added later, with e.g. a flag in CLM as AndersW suggested. > > /Thanks HansN > > > On 09/30/2016 09:43 AM, Mathivanan Naickan Palanivelu wrote: > > Hi Hans, > > > > Could you provide some background on the need for > opensaf_safe_reboot(). > > What would be the need for this? > > > > Thanks, > > Mathi. 
> > > > > >> -----Original Message----- > >> From: Hans Nordeback [mailto:hans.nordeb...@ericsson.com] > >> Sent: Wednesday, September 28, 2016 5:26 PM > >> To: anders.wid...@ericsson.com; Mathivanan Naickan Palanivelu > >> Cc: opensaf-devel@lists.sourceforge.net > >> Subject: [PATCH 1 of 1] clm: add support for cluster reboot V3 > >> [#2053] > >> > >> osaf/libs/common/clmsv/include/clmsv_msg.h | 6 +++ > >> osaf/libs/core/common/include/osaf_utility.h | 5 +++ > >> osaf/libs/core/common/osaf_utility.c | 22 +++++++++++++ > >> osaf/services/saf/clmsv/clms/clms.h | 3 +- > >> osaf/services/saf/clmsv/clms/clms_imm.c | 18 ++++++++++ > >> osaf/services/saf/clmsv/clms/clms_mds.c | 46 > >> +++++++++++++++++++++++++++- > >> osaf/services/saf/clmsv/clms/clms_util.c | 12 +++++++ > >> osaf/services/saf/clmsv/nodeagent/main.c | 12 +++++++ > >> scripts/opensaf_reboot | 22 ++++++++++--- > >> 9 files changed, 139 insertions(+), 7 deletions(-) > >> > >> > >> Admin command to request cluster reboot: > >> immadm -o 1 safCluster=myClmCluster > >> > >> diff --git a/osaf/libs/common/clmsv/include/clmsv_msg.h > >> b/osaf/libs/common/clmsv/include/clmsv_msg.h > >> --- a/osaf/libs/common/clmsv/include/clmsv_msg.h > >> +++ b/osaf/libs/common/clmsv/include/clmsv_msg.h > >> @@ -23,6 +23,7 @@ typedef enum clms_msg_type { > >> CLMSV_CLMS_TO_CLMA_CBK_MSG, > >> CLMSV_CLMS_TO_CLMA_API_RESP_MSG, > >> CLMSV_CLMS_TO_CLMA_IS_MEMBER_MSG, > >> + CLMSV_CLMS_TO_CLMNA_REBOOT_MSG, > >> CLMSV_MSG_MAX > >> } CLMSV_MSG_TYPE; > >> > >> @@ -174,6 +175,10 @@ typedef struct clmsv_is_member_info_t { > >> SaUint32T client_id; > >> }CLMSV_IS_MEMBER_INFO; > >> > >> +typedef struct clmsv_reboot_info_t { > >> + SaClmNodeIdT node_id; > >> +} CLMSV_REBOOT_INFO; > >> + > >> /* Top Level CLMSv MDS message structure for use between CLMS-> > >> CLMA && CLMA -> CLMS */ typedef struct clmsv_msg_t { > >> struct clmsv_msg_t *next; /* Mailbox processing */ > >> @@ -183,6 +188,7 @@ typedef struct clmsv_msg_t { > >> CLMSV_CBK_INFO 
cbk_info; /* Callback Messages from CLMS to > CLA > >> */ > >> CLMSV_API_RESP_INFO api_resp_info; /* Response Messages from > >> CLMS to CLA */ > >> CLMSV_IS_MEMBER_INFO is_member_info; /*Is node member or > not > >> Message from CLMS to CLA*/ > >> + CLMSV_REBOOT_INFO reboot_info; /* Reboot request from > >> CLMS to CLMNA */ > >> } info; > >> } CLMSV_MSG; > >> > >> diff --git a/osaf/libs/core/common/include/osaf_utility.h > >> b/osaf/libs/core/common/include/osaf_utility.h > >> --- a/osaf/libs/core/common/include/osaf_utility.h > >> +++ b/osaf/libs/core/common/include/osaf_utility.h > >> @@ -24,6 +24,8 @@ > >> #ifndef OPENSAF_CORE_OSAF_UTILITY_H_ > >> #define OPENSAF_CORE_OSAF_UTILITY_H_ > >> > >> +#define USE_SAFE_REBOOT 1 > >> + > >> #include <pthread.h> > >> > >> #ifdef __cplusplus > >> @@ -68,6 +70,9 @@ extern void osaf_abort(long i_cause) #endif > >> nothrow, noreturn)); > >> > >> +extern void osaf_safe_reboot() > >> + __attribute__ ((nothrow)); > >> + > >> static inline void osaf_mutex_lock_ordie(pthread_mutex_t* io_mutex) { > >> int result = pthread_mutex_lock(io_mutex); > >> if (result != 0) osaf_abort(result); diff --git > >> a/osaf/libs/core/common/osaf_utility.c > >> b/osaf/libs/core/common/osaf_utility.c > >> --- a/osaf/libs/core/common/osaf_utility.c > >> +++ b/osaf/libs/core/common/osaf_utility.c > >> @@ -16,9 +16,12 @@ > >> */ > >> > >> #include "osaf_utility.h" > >> +#include "ncssysf_def.h" > >> +#include "configmake.h" > >> #include <stdlib.h> > >> #include <errno.h> > >> #include <syslog.h> > >> +#include <stdio.h> > >> > >> void osaf_abort(long i_cause) > >> { > >> @@ -26,3 +29,22 @@ void osaf_abort(long i_cause) > >> i_cause, __builtin_return_address(0), errno); > >> abort(); > >> } > >> + > >> +void osaf_safe_reboot() > >> +{ > >> + char str[256]; > >> + > >> + snprintf(str, sizeof(str), PKGLIBDIR "/opensaf_reboot %u %s %u", 0, > >> "not_used", USE_SAFE_REBOOT); > >> + syslog(LOG_NOTICE, "Reboot ordered using command: %s", str); > >> + > >> + 
int rc = system(str); > >> + if (rc < 0) { > >> + syslog(LOG_CRIT, "Node reboot failure: exit code %d", > >> WEXITSTATUS(rc)); > >> + } else { > >> + if (WIFEXITED(rc) && WEXITSTATUS(rc) == 0) { > >> + syslog(LOG_NOTICE, "Command: %s successfully > >> executed", str); > >> + } else { > >> + syslog(LOG_CRIT, "Command: %s failed with exit > >> code %d", str, WEXITSTATUS(rc)); > >> + } > >> + } > >> +} > >> diff --git a/osaf/services/saf/clmsv/clms/clms.h > >> b/osaf/services/saf/clmsv/clms/clms.h > >> --- a/osaf/services/saf/clmsv/clms/clms.h > >> +++ b/osaf/services/saf/clmsv/clms/clms.h > >> @@ -99,6 +99,7 @@ extern uint32_t clms_mds_msg_send(CLMS_C > >> MDS_DEST *dest, > >> MDS_SYNC_SND_CTXT *mds_ctxt, > >> MDS_SEND_PRIORITY_TYPE prio, NCSMDS_SVC_ID svc_id); > >> > >> +extern uint32_t clms_mds_msg_bcast(CLMS_CB *cb, CLMSV_MSG > >> *bcast_msg); > >> extern SaAisErrorT clms_imm_activate(CLMS_CB * cb); extern > >> uint32_t clms_node_trackresplist_empty(CLMS_CLUSTER_NODE * > op_node); > >> extern uint32_t clms_send_cbk_start_sub(CLMS_CB * cb, > >> CLMS_CLUSTER_NODE * node); @@ -125,5 +126,5 @@ extern void > >> clms_cb_dump(void); extern uint32_t > clms_send_is_member_info(CLMS_CB > >> * cb, SaClmNodeIdT node_id, SaBoolT member, SaBoolT is_configured); > >> extern void clm_imm_reinit_bg(CLMS_CB * cb); extern void > >> proc_downs_during_rolechange (void); > >> - > >> +extern void clms_cluster_reboot(); > >> #endif /* ifndef CLMS_H */ > >> diff --git a/osaf/services/saf/clmsv/clms/clms_imm.c > >> b/osaf/services/saf/clmsv/clms/clms_imm.c > >> --- a/osaf/services/saf/clmsv/clms/clms_imm.c > >> +++ b/osaf/services/saf/clmsv/clms/clms_imm.c > >> @@ -19,6 +19,7 @@ > >> > >> #include "clms.h" > >> #include "osaf_extended_name.h" > >> +#include "osaf_utility.h" > >> > >> extern struct ImmutilWrapperProfile immutilWrapperProfile; > >> > >> @@ -886,6 +887,23 @@ static void clms_imm_admin_op_callback(S > >> > >> TRACE_ENTER2("Admin callback for nodename:%s, opId:%llu", > >> 
objectName->value, opId); > >> > >> + // E.g. immadm -o 1 safCluster=myClmCluster > >> + if (strncmp(osaf_extended_name_borrow(objectName), > >> + osaf_extended_name_borrow(&osaf_cluster->name), > >> + osaf_extended_name_length(objectName)) == 0) { > >> + if (opId == 1) { > >> + LOG_WA("Cluster reboot requested. Ordering > >> cluster reboot"); > >> + // MDS broadcast/multi cast call is synchronous > >> + clms_cluster_reboot(); > >> + sleep(1); > >> + osaf_safe_reboot(); > >> + } else { > >> + LOG_ER("Admin Operation not supported for %s", > >> osaf_extended_name_borrow(objectName)); > >> + > >> immutil_saImmOiAdminOperationResult(immOiHandle, invocation, > >> SA_AIS_ERR_INVALID_PARAM); > >> + } > >> + goto done; > >> + } > >> + > >> /*Lookup by the node_name and get the cluster node for CLM > Admin > >> oper */ > >> nodeop = clms_node_get_by_name(objectName); > >> if (nodeop == NULL) { > >> diff --git a/osaf/services/saf/clmsv/clms/clms_mds.c > >> b/osaf/services/saf/clmsv/clms/clms_mds.c > >> --- a/osaf/services/saf/clmsv/clms/clms_mds.c > >> +++ b/osaf/services/saf/clmsv/clms/clms_mds.c > >> @@ -659,7 +659,17 @@ uint32_t clms_mds_enc(struct ncsmds_call > >> ncs_enc_claim_space(uba, 4); > >> total_bytes += 4; > >> > >> - if (CLMSV_CLMS_TO_CLMA_API_RESP_MSG == msg->evt_type) { > >> + if (CLMSV_CLMS_TO_CLMNA_REBOOT_MSG == msg->evt_type) { > >> + /* encode the reboot msg **/ > >> + p8 = ncs_enc_reserve_space(uba, 4); > >> + if (!p8) { > >> + TRACE("ncs_enc_reserve_space failed"); > >> + goto err; > >> + } > >> + ncs_encode_32bit(&p8, msg->info.reboot_info.node_id); > >> + ncs_enc_claim_space(uba, 4); > >> + total_bytes += 4; > >> + } else if (CLMSV_CLMS_TO_CLMA_API_RESP_MSG == msg- > >>> evt_type) { > >> /** encode the API RSP msg subtype **/ > >> p8 = ncs_enc_reserve_space(uba, 4); > >> if (!p8) { > >> @@ -1517,3 +1527,37 @@ uint32_t clms_mds_msg_send(CLMS_CB * cb, > >> TRACE_LEAVE(); > >> return rc; > >> } > >> + > >> > 
+/********************************************************* > >> ******************* > >> + Name : clms_mds_msg_bcast > >> + > >> + Description : This routine sends a broadcast message to CLMNA. > >> + > >> + Arguments : cb - ptr to the CLMA CB > >> + bcast_msg - ptr to the CLMSv broadcast message > >> + > >> + Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE > >> + > >> + Notes : None. > >> > +********************************************************* > >> ************** > >> +*******/ uint32_t clms_mds_msg_bcast(CLMS_CB *cb, CLMSV_MSG > >> *bcast_msg) > >> +{ > >> + NCSMDS_INFO snd_mds = {0}; > >> + uint32_t rc; > >> + > >> + snd_mds.i_mds_hdl = cb->mds_hdl; > >> + snd_mds.i_svc_id = NCSMDS_SVC_ID_CLMS; > >> + snd_mds.i_op = MDS_SEND; > >> + snd_mds.info.svc_send.i_msg = (NCSCONTEXT)bcast_msg; > >> + snd_mds.info.svc_send.i_to_svc = NCSMDS_SVC_ID_CLMNA; > >> + snd_mds.info.svc_send.i_priority = MDS_SEND_PRIORITY_HIGH; > >> + snd_mds.info.svc_send.i_sendtype = MDS_SENDTYPE_BCAST; > >> + snd_mds.info.svc_send.info.bcast.i_bcast_scope = > >> NCSMDS_SCOPE_NONE; > >> + > >> + if ((rc = ncsmds_api(&snd_mds)) != NCSCC_RC_SUCCESS) { > >> + LOG_ER("%s: ncsmds_api MDS_SEND failed %u", > >> __FUNCTION__ ,rc); > >> + return rc; > >> + } > >> + > >> + return NCSCC_RC_SUCCESS; > >> +} > >> \ No newline at end of file > >> diff --git a/osaf/services/saf/clmsv/clms/clms_util.c > >> b/osaf/services/saf/clmsv/clms/clms_util.c > >> --- a/osaf/services/saf/clmsv/clms/clms_util.c > >> +++ b/osaf/services/saf/clmsv/clms/clms_util.c > >> @@ -1200,3 +1200,15 @@ bool ip_matched(uint16_t family1, uint8_ > >> return true; > >> } > >> > >> +// > >> +void clms_cluster_reboot() > >> +{ > >> + CLMSV_MSG bcast_msg; > >> + bcast_msg.evt_type = CLMSV_CLMS_TO_CLMNA_REBOOT_MSG; > >> + bcast_msg.info.reboot_info.node_id = clms_cb->node_id; > >> + if (clms_mds_msg_bcast(clms_cb, &bcast_msg) == > >> NCSCC_RC_SUCCESS) { > >> + LOG_NO("Sending cluster reboot broadcast message > >> succeeded"); > >> + } 
else { > >> + LOG_ER("Sending cluster reboot broadcast message failed"); > >> + } > >> +} > >> diff --git a/osaf/services/saf/clmsv/nodeagent/main.c > >> b/osaf/services/saf/clmsv/nodeagent/main.c > >> --- a/osaf/services/saf/clmsv/nodeagent/main.c > >> +++ b/osaf/services/saf/clmsv/nodeagent/main.c > >> @@ -114,6 +114,18 @@ static uint32_t clmna_mds_dec(struct ncs > >> total_bytes += 4; > >> > >> switch (msg->evt_type) { > >> + case CLMSV_CLMS_TO_CLMNA_REBOOT_MSG: > >> + { > >> + p8 = ncs_dec_flatten_space(uba, local_data, 4); > >> + msg->info.reboot_info.node_id = > >> ncs_decode_32bit(&p8); > >> + ncs_dec_skip_space(uba, 4); > >> + total_bytes += 4; > >> + // Reboot will be performed by CLMS for this node. > >> + if (clmna_cb->node_info.node_id != msg- > >>> info.reboot_info.node_id) { > >> + osaf_safe_reboot(); > >> + } > >> + break; > >> + } > >> case CLMSV_CLMS_TO_CLMA_API_RESP_MSG: > >> { > >> p8 = ncs_dec_flatten_space(uba, local_data, 8); diff - > -git > >> a/scripts/opensaf_reboot b/scripts/opensaf_reboot > >> --- a/scripts/opensaf_reboot > >> +++ b/scripts/opensaf_reboot > >> @@ -40,10 +40,17 @@ NODE_ID_FILE=$pkglocalstatedir/node_id > >> > >> node_id=$1 > >> ee_name=$2 > >> +safe_reboot=$3 > >> > >> # Run commands through sudo when not superuser test $(id -u) -ne 0 > >> && icmd=$(which sudo 2> /dev/null) > >> > >> +opensaf_safe_reboot() > >> +{ > >> + logger -t "opensaf_reboot" "Rebooting local node using shutdown" > >> + $icmd /sbin/shutdown -r now > >> +} > >> + > >> ## Use stonith for remote fencing > >> opensaf_reboot_with_remote_fencing() > >> { > >> @@ -91,8 +98,12 @@ temp_node_id=`cat "$NODE_ID_FILE"` > >> temp_node_id=`echo "$temp_node_id" |sed -e 's:^0[bBxX]::'| sed -e > >> 's:^:0x:'` self_node_id=`printf "%d" $temp_node_id` > >> > >> -# A node ID of zero(0) means an order to reboot the local node -if [ > >> "$self_node_id" = "$node_id" ] || [ $node_id = 0 ]; then > >> + > >> +if [ "$safe_reboot" = 1 ]; then > >> + opensaf_safe_reboot > >> 
+else > >> + # A node ID of zero(0) means an order to reboot the local node > >> + if [ "$self_node_id" = "$node_id" ] || [ $node_id = 0 ]; then > >> # uncomment the following line if debugging errors that keep > >> restarting the node > >> # exit 0 > >> > >> @@ -114,8 +125,8 @@ if [ "$self_node_id" = "$node_id" ] || [ > >> > >> # Reboot (not shutdown) system WITH file system sync > >> $icmd /sbin/reboot -f > >> -else > >> - if [ "$FMS_USE_REMOTE_FENCING" = "1" ]; then > >> + else > >> + if [ "$FMS_USE_REMOTE_FENCING" = 1 ]; then > >> opensaf_reboot_with_remote_fencing > >> else > >> if [ ":$ee_name" != ":" ]; then > >> @@ -133,4 +144,5 @@ else > >> logger -t "opensaf_reboot" "Rebooting remote node > in the > >> absence of PLM is outside the scope of OpenSAF" > >> fi > >> fi > >> -fi > >> + fi > >> +fi > >> \ No newline at end of file > ------------------------------------------------------------------------------ _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel