00-README.conf                                  |   47 +++++++++
 osaf/services/infrastructure/fm/config/fmd.conf |    9 +-
 osaf/services/infrastructure/fm/fms/Makefile.am |    3 +-
 osaf/services/infrastructure/fm/fms/fm_cb.h     |    4 +
 osaf/services/infrastructure/fm/fms/fm_main.c   |  118 +++++++++++++++++++++++-
 scripts/opensaf_reboot                          |   47 +++++++--
 6 files changed, 210 insertions(+), 18 deletions(-)


diff --git a/00-README.conf b/00-README.conf
--- a/00-README.conf
+++ b/00-README.conf
@@ -530,3 +530,50 @@ and not access any of its members direct
 saAisNameBorrow() access functions shall be used. The
 SA_MAX_UNEXTENDED_NAME_LENGTH constant can be used to refer to the maximum
 string length that can be stored in the unextended SaNameT type.
+
+Configuring remote fencing support using STONITH
+================================================
+
+In a virtualized environment STONITH can be used for remote fencing of the other
+system controller in case of "link loss" or when the peer system controller is
+"live hanging", in order to avoid split-brain.
+Node self-fencing will also be used if e.g. the active controller loses
+connectivity to all other nodes in the cluster.
+
+Example installation on Ubuntu 14.04:
+
+On each virtual node install stonith package:
+
+  sudo apt-get install cluster-glue
+
+The name of each virtual node should be the same as the clm node name,
+e.g. safNode=SC-2,safCluster=myClmCluster the virtual node name should be SC-2.
+
+If a firewall is used on the "hypervisor" host, TCP port 16509 has to be
+opened. If ssh is used, use ssh-keygen to generate ssh keys for each
+virtual node.
+
+To verify the installation virsh can be used, e.g:
+virsh --connect=qemu+tcp://192.168.122.1/system list --all
+
+Example of output:
+Id    Name                           State
+----------------------------------------------------
+ 2     SC-1                           running
+ 3     SC-2                           running
+ 4     PL-3                           running
+
+Update the fmd.conf file:
+
+# The promote active timer delays the standby controller's reboot request,
+# as the active controller is probably also requesting a reboot of the standby.
+# The resolution is in 10 ms units.
+export FMS_PROMOTE_ACTIVE_TIMER=300
+
+# Uncomment the next 5 lines and update accordingly to enable remote fencing
+# See also documentation for STONITH
+export FMS_USE_REMOTE_FENCING=1
+export FMS_FENCE_CMD="stonith"
+export FMS_DEVICE_TYPE="external/libvirt"
+export FMS_HYPERVISOR_URI="qemu+tcp://192.168.122.1/system"
+export FMS_FENCE_ACTION="reset"
diff --git a/osaf/services/infrastructure/fm/config/fmd.conf 
b/osaf/services/infrastructure/fm/config/fmd.conf
--- a/osaf/services/infrastructure/fm/config/fmd.conf
+++ b/osaf/services/infrastructure/fm/config/fmd.conf
@@ -17,7 +17,14 @@ export FM_CONTROLLER2_SUBSLOT=15
 export FMS_HA_ENV_HEALTHCHECK_KEY="Default"
 
 # Promote active timer
-export FMS_PROMOTE_ACTIVE_TIMER=0
+export FMS_PROMOTE_ACTIVE_TIMER=500
+
+# Uncomment the next 5 lines and update accordingly to enable remote fencing
+export FMS_USE_REMOTE_FENCING=1
+export FMS_FENCE_CMD="stonith"
+export FMS_DEVICE_TYPE="external/libvirt"
+export FMS_HYPERVISOR_URI="qemu+tcp://192.168.122.1/system"
+export FMS_FENCE_ACTION="reset"
 
 # FM will supervise transitions to the ACTIVE role when this variable is set to
 # a non-zero value. The value is the time in the unit of 10 ms to wait for a
diff --git a/osaf/services/infrastructure/fm/fms/Makefile.am 
b/osaf/services/infrastructure/fm/fms/Makefile.am
--- a/osaf/services/infrastructure/fm/fms/Makefile.am
+++ b/osaf/services/infrastructure/fm/fms/Makefile.am
@@ -46,4 +46,5 @@ osaffmd_SOURCES = \
 osaffmd_LDADD = \
        $(top_builddir)/osaf/libs/core/libopensaf_core.la \
        $(top_builddir)/osaf/libs/saf/libSaAmf/libSaAmf.la \
-       $(top_builddir)/osaf/libs/agents/infrastructure/rda/librda.la
+       $(top_builddir)/osaf/libs/agents/infrastructure/rda/librda.la \
+       $(top_builddir)/osaf/libs/saf/libSaClm/libSaClm.la
diff --git a/osaf/services/infrastructure/fm/fms/fm_cb.h 
b/osaf/services/infrastructure/fm/fms/fm_cb.h
--- a/osaf/services/infrastructure/fm/fms/fm_cb.h
+++ b/osaf/services/infrastructure/fm/fms/fm_cb.h
@@ -26,6 +26,7 @@
 #include "mds_papi.h"
 #include "rda_papi.h"
 #include "fm_amf.h"
+#include "saClm.h"
 
 #include <stdbool.h>
 #include <stdint.h>
@@ -102,6 +103,9 @@ typedef struct fm_cb {
        uint64_t cluster_size;
        struct timespec last_well_connected;
        struct timespec node_isolation_timeout;
+       SaClmHandleT clm_hdl;
+       bool use_remote_fencing;
+       SaNameT peer_clm_node_name;
 } FM_CB;
 
 extern char *role_string[];
diff --git a/osaf/services/infrastructure/fm/fms/fm_main.c 
b/osaf/services/infrastructure/fm/fms/fm_main.c
--- a/osaf/services/infrastructure/fm/fms/fm_main.c
+++ b/osaf/services/infrastructure/fm/fms/fm_main.c
@@ -32,6 +32,13 @@ This file contains the main() routine fo
 #include "fm.h"
 #include "osaf_time.h"
 
+#define FM_CLM_API_TIMEOUT 10000000000LL /* timeout argument for saClmClusterNodeGet_4; presumably SaTimeT nanoseconds (10 s) - TODO confirm */
+
+static         SaVersionT clm_version = { 'B', 4, 1 }; /* version passed to saClmInitialize_4 */
+static const SaClmCallbacksT_4 clm_callbacks = { /* callback table passed to saClmInitialize_4; both members are zero */
+       0, 0
+};
+
 enum {
        FD_TERM = 0,
        FD_AMF = 1,
@@ -54,6 +61,8 @@ static uint32_t fm_get_args(FM_CB *);
 static uint32_t fms_fms_exchange_node_info(FM_CB *);
 static uint32_t fm_nid_notify(uint32_t);
 static uint32_t fm_tmr_start(FM_TMR *, SaTimeT);
+static SaAisErrorT get_peer_clm_node_name(NODE_ID);
+static SaAisErrorT fm_clm_init();
 static void fm_mbx_msg_handler(FM_CB *, FM_EVT *);
 static void fm_evt_proc_rda_callback(FM_CB*, FM_EVT*);
 static void fm_tmr_exp(void *);
@@ -313,6 +322,8 @@ uint32_t initialize_for_assignment(FM_CB
                LOG_ER("immd_mds_register FAILED %d", rc);
                goto done;
        }
+
+       cb->clm_hdl = 0;
        cb->fully_initialized = true;
 done:
        TRACE_LEAVE2("rc = %u", rc);
@@ -383,8 +394,17 @@ static uint32_t fm_agents_startup(void)
 *****************************************************************************/
 static uint32_t fm_get_args(FM_CB *fm_cb)
 {
+       char *use_remote_fencing = NULL;
        char *value;
        TRACE_ENTER();
+
+       fm_cb->use_remote_fencing = false;
+       use_remote_fencing = getenv("FMS_USE_REMOTE_FENCING");
+       if (use_remote_fencing != NULL) {
+               fm_cb->use_remote_fencing = true;
+               LOG_NO("Remote fencing is enabled");
+       }
+
        value = getenv("EE_ID");
        if (value != NULL) {
                fm_cb->node_name.length = strlen(value);
@@ -474,6 +494,81 @@ void fm_proc_svc_down(FM_CB *cb, FM_EVT 
 }
 
 /****************************************************************************
+* Name          : fm_clm_init
+*
+* Description   : Initialize CLM. 
+*
+* Arguments     : None. 
+*
+* Return Values : None.
+* 
+* Notes         : None. 
+*****************************************************************************/
+static SaAisErrorT get_peer_clm_node_name(NODE_ID node_id)
+{
+       SaAisErrorT rc = SA_AIS_OK;
+       char *node;
+       SaClmClusterNodeT_4 cluster_node;
+
+       if ((rc = fm_clm_init()) != SA_AIS_OK) {
+               LOG_ER("clm init FAILED %d", rc);
+       } else {
+               LOG_NO("clm init OK");
+       }
+
+       if ((rc = saClmClusterNodeGet_4(fm_cb->clm_hdl, node_id, 
FM_CLM_API_TIMEOUT, &cluster_node)) == SA_AIS_OK) {
+               // Extract peer clm node name, e.g SC-2 from 
"safNode=SC-2,safCluster=myClmCluster"
+               // The peer clm node name will be passed to opensaf_reboot 
script to support remote fencing.
+               // The peer clm node name should correspond to the name of the 
virtual machine for that node.
+
+               node = strtok((char*) cluster_node.nodeName.value, "=");
+               node = strtok(NULL, ",");
+               strncpy((char*) fm_cb->peer_clm_node_name.value, node, 
cluster_node.nodeName.length);
+               LOG_NO("Peer clm node name: %s", 
fm_cb->peer_clm_node_name.value);
+       } else {
+               LOG_WA("saClmClusterNodeGet_4 returned %u", (unsigned) rc);
+       }
+       return rc;
+}
+
+/****************************************************************************
+* Name          : fm_clm_init
+*
+* Description   : Initializes a CLM handle with saClmInitialize_4 and stores
+*                 it in fm_cb->clm_hdl. The call is retried, sleeping
+*                 kHundredMilliseconds between attempts, for as long as it
+*                 returns SA_AIS_ERR_TRY_AGAIN, SA_AIS_ERR_TIMEOUT or
+*                 SA_AIS_ERR_UNAVAILABLE.
+*
+* Arguments     : None.
+*
+* Return Values : SA_AIS_OK on success, otherwise the first non-retryable
+*                 error returned by saClmInitialize_4.
+*
+* Notes         : NOTE(review): the retry loop has no upper bound, so the
+*                 caller blocks until CLM answers with something other than
+*                 the three retryable codes.
+*****************************************************************************/
+static SaAisErrorT fm_clm_init(void)
+{
+       SaAisErrorT rc;
+
+       for (;;) {
+               /*
+                * The version argument is in/out in the SA Forum API, so
+                * hand over a fresh copy on every attempt and keep the
+                * requested version in clm_version unmodified.
+                */
+               SaVersionT version = clm_version;
+
+               rc = saClmInitialize_4(&fm_cb->clm_hdl, &clm_callbacks, &version);
+               if (rc != SA_AIS_ERR_TRY_AGAIN &&
+                   rc != SA_AIS_ERR_TIMEOUT &&
+                   rc != SA_AIS_ERR_UNAVAILABLE)
+                       break;
+
+               /* Transient condition: log once per attempt and retry. */
+               LOG_WA("saClmInitialize_4 returned %u", (unsigned) rc);
+               osaf_nanosleep(&kHundredMilliseconds);
+       }
+
+       if (rc != SA_AIS_OK)
+               LOG_ER("Failed to Initialize with CLM: %u", rc);
+       return rc;
+}
+
+/****************************************************************************
 * Name          : fm_mbx_msg_handler
 *
 * Description   : Processes Mail box messages between FM. 
@@ -517,8 +612,13 @@ static void fm_mbx_msg_handler(FM_CB *fm
                                         * but just that failover has been 
trigerred quicker than the
                                         * node_down event has been received.
                                         */
-                               opensaf_reboot(fm_cb->peer_node_id, (char 
*)fm_cb->peer_node_name.value,
-                                               "Received Node Down for peer 
controller");
+                               if (fm_cb->use_remote_fencing) {
+                                       opensaf_reboot(fm_cb->peer_node_id, 
(char *)fm_cb->peer_clm_node_name.value,
+                                                       "Received Node Down for 
peer controller");
+                               } else {
+                                       opensaf_reboot(fm_cb->peer_node_id, 
(char *)fm_cb->peer_node_name.value,
+                                                       "Received Node Down for 
peer controller");
+                               }
                                if (!((fm_cb->role == PCS_RDA_ACTIVE) && 
(fm_cb->amf_state == (SaAmfHAStateT)PCS_RDA_ACTIVE))) {
                                        fm_cb->role = PCS_RDA_ACTIVE;
                                        LOG_NO("Controller Failover: Setting 
role to ACTIVE");
@@ -534,6 +634,8 @@ static void fm_mbx_msg_handler(FM_CB *fm
 /* Peer fm came up so sending ee_id of this node */
                if (fm_cb->node_name.length != 0)
                        fms_fms_exchange_node_info(fm_cb);
+
+               get_peer_clm_node_name(fm_mbx_evt->node_id);
                break;
        case FM_EVT_TMR_EXP:
 /* Timer Expiry event posted */
@@ -547,8 +649,16 @@ static void fm_mbx_msg_handler(FM_CB *fm
                        fm_cb->role = PCS_RDA_ACTIVE;
 
                        LOG_NO("Reseting peer controller node id: %x", 
fm_cb->peer_node_id);
-                       opensaf_reboot(fm_cb->peer_node_id, (char 
*)fm_cb->peer_node_name.value,
-                                      "Received Node Down for Active peer");
+                       if (fm_cb->use_remote_fencing) {
+                               LOG_NO("saClmClusterNodeGet succeeded node_id 
0x%X, clm peer node name %s",
+                                       fm_mbx_evt->node_id, 
fm_cb->peer_clm_node_name.value);
+
+                               opensaf_reboot(fm_cb->peer_node_id, (char 
*)fm_cb->peer_clm_node_name.value,
+                                               "Received Node Down for peer 
controller");
+                       } else {
+                               opensaf_reboot(fm_cb->peer_node_id, (char 
*)fm_cb->peer_node_name.value,
+                                              "Received Node Down for Active 
peer");
+                       }
                        fm_rda_set_role(fm_cb, PCS_RDA_ACTIVE);
                } else if (fm_mbx_evt->info.fm_tmr->type == 
FM_TMR_ACTIVATION_SUPERVISION) {
                        opensaf_reboot(0, NULL, "Activation timer supervision "
diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot
--- a/scripts/opensaf_reboot
+++ b/scripts/opensaf_reboot
@@ -26,13 +26,31 @@
 # through proprietary mechanisms, i.e. not through PLM. Node_id is (the only 
 # entity) at the disposal of such a mechanism.
 
+if [ -f "$pkgsysconfdir/fmd.conf" ]; then
+  . "$pkgsysconfdir/fmd.conf"
+fi
+
 NODE_ID_FILE=$pkglocalstatedir/node_id
+
 node_id=$1
 ee_name=$2
 
 # Run commands through sudo when not superuser
 test $(id -u) -ne 0 && icmd=$(which sudo 2> /dev/null)
 
+## Use stonith for remote fencing
+opensaf_reboot_with_remote_fencing()
+{
+       "$FMS_FENCE_CMD" -t "$FMS_DEVICE_TYPE" hostlist="node:$ee_name" 
hypervisor_uri="$FMS_HYPERVISOR_URI" -T "$FMS_FENCE_ACTION" node
+
+       retval=$?
+       if [ $retval != 0 ]; then
+               logger -t "opensaf_reboot" "Rebooting remote node $ee_name 
using $FMS_FENCE_CMD failed, rc: $retval"
+       exit 1
+       fi
+}
+
+
 #if plm exists in the system,then the reboot is performed using the eename.
 opensaf_reboot_with_plm()
 {
@@ -86,17 +104,22 @@ if [ "$self_node_id" = "$node_id" ] || [
        # Reboot (not shutdown) system WITH file system sync
        $icmd /sbin/reboot -f
 else
-       if [ ":$ee_name" != ":" ]; then
-               plm_node_presence_state=`immlist $ee_name |grep 
saPlmEEPresenceState|awk '{print $3}'`
-               plm_node_state=`immlist $ee_name |grep saPlmEEAdminState|awk 
'{print $3}'` 
-               if [ "$plm_node_presence_state" != 3 ] ; then
-                       logger -t "opensaf_reboot" "Not rebooting remote node 
$ee_name as it is not in INSTANTIATED state"
-               elif [ $plm_node_state != 2 ]; then
-                       opensaf_reboot_with_plm
-               else    
-                       logger -t "opensaf_reboot" "Not rebooting remote node 
$ee_name as it is already in locked state"
+       if [ "$FMS_USE_REMOTE_FENCING" = "1" ]; then
+               opensaf_reboot_with_remote_fencing
+       else
+               if [ ":$ee_name" != ":" ]; then
+
+                       plm_node_presence_state=`immlist $ee_name |grep 
saPlmEEPresenceState|awk '{print $3}'`
+                       plm_node_state=`immlist $ee_name |grep 
saPlmEEAdminState|awk '{print $3}'`
+                       if [ "$plm_node_presence_state" != 3 ] ; then
+                               logger -t "opensaf_reboot" "Not rebooting 
remote node $ee_name as it is not in INSTANTIATED state"
+                       elif [ $plm_node_state != 2 ]; then
+                               opensaf_reboot_with_plm
+                       else    
+                               logger -t "opensaf_reboot" "Not rebooting 
remote node $ee_name as it is already in locked state"
+                       fi
+               else
+                       logger -t "opensaf_reboot" "Rebooting remote node in 
the absence of PLM is outside the scope of OpenSAF"
                fi
-       else
-               logger -t "opensaf_reboot" "Rebooting remote node in the 
absence of PLM is outside the scope of OpenSAF"
-       fi      
+       fi
 fi     

------------------------------------------------------------------------------
Attend Shape: An AT&T Tech Expo July 15-16. Meet us at AT&T Park in San
Francisco, CA to explore cutting-edge tech and listen to tech luminaries
present their vision of the future. This family event has something for
everyone, including kids. Get more information and register today.
http://sdm.link/attshape
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to