Hi all,

I've written a small patch for external/ipmi that makes it possible to
configure it to trigger a crashdump via NMI instead of resetting a node.

If a node becomes unavailable, for whatever reason, it will be fenced, but
this makes investigating the root cause of the node's unavailability very
difficult; if you have a crashdump you can reconstruct the root cause.

For this I added 3 new options:

crashdump       -> set this to true to enable crashdump.
sshcheck        -> if this is true, an ssh connection will be
                   established to $sshipaddr or, if that is not
                   set, to $hostname as the remote address.
sshipaddr       -> for the case that ssh is listening on another
                   interface whose DNS name differs from $hostname.

Maybe it could be useful for others too.

I would be glad to receive any comments or suggestions.


Tobias D. Oestreicher

-- 
Tobias D. Oestreicher
Linux Consultant & Trainer
Tel.: +49-160-5329935
Mail: oestreic...@b1-systems.de

B1 Systems GmbH
Osterfeldstraße 7 / 85088 Vohburg / http://www.b1-systems.de
GF: Ralph Dehner / Unternehmenssitz: Vohburg / AG: Ingolstadt,HRB 3537
diff -r da5832ae23dd lib/plugins/stonith/external/ipmi
--- a/lib/plugins/stonith/external/ipmi	Sun Dec 23 16:05:11 2012 +0100
+++ b/lib/plugins/stonith/external/ipmi	Mon Jan 14 22:01:57 2013 +0100
@@ -36,7 +36,11 @@
 POWEROFF="power off"
 POWERON="power on"
 STATUS="power status"
+CRASHDUMP="chassis power diag"
+
 IPMITOOL=${ipmitool:-"`which ipmitool 2>/dev/null`"}
+SYSCTL=`which sysctl 2>/dev/null`
+SSH_OPTS="-q -o PasswordAuthentication=no -o StrictHostKeyChecking=no"
 
 have_ipmi() {
 	test -x "${IPMITOOL}"
@@ -138,7 +142,11 @@
 	;;
 reset)
 	if ipmi_is_power_on; then
-		do_ipmi "${RESET}"
+		if [ "${crashdump}" == "true" ]; then
+			do_ipmi "${CRASHDUMP}"
+		else
+			do_ipmi "${RESET}"
+		fi
 	else
 		do_ipmi "${POWERON}"
 	fi
@@ -149,11 +157,40 @@
 	# the managed node. Hence, only check if we can contact the
 	# IPMI device with "power status" command, don't pay attention
 	# to whether the node is in fact powered on or off.
+	if [ "${crashdump}" == "true" ]; then
+		if [ "${sshcheck}" == "true" ];then
+			if [ -z "${hostname}" -a -z "${sshipaddr}" ]; then
+				ha_log.sh err "Neigther hostname nor sshipaddr is set, crashdump testing not possible" 
+			elif [ -z "${sshipaddr}" ]; then
+				REMOTESSHHOST="${hostname}"
+			else
+				REMOTESSHHOST="${sshipaddr}"
+			fi
+			SSH_BIN=`which ssh 2>/dev/null`
+			SSH_COMMAND="${SSH_BIN} ${REMOTESSHHOST} ${SSH_OPTS}"
+			remote_crashdump_state=`${SSH_COMMAND} "grep -c crashkernel /proc/cmdline;${SYSCTL} -n kernel.unknown_nmi_panic kernel.panic_on_unrecovered_nmi"`
+			if [ $? -ne 0 ];then
+				ha_log.sh err "Not possible to connect via ssh to ${REMOTESSHHOST}"
+				exit 1
+			fi
+			unknown_nmi=`echo ${remote_crashdump_state}|awk '{print $2}'`
+			unrecovered_nmi=`echo ${remote_crashdump_state}|awk '{print $3}'`
+			crashdump_kernel_option=`echo ${remote_crashdump_state}|awk '{print $1}'`
+			if [ ${crashdump_kernel_option} -ne 1 ];then
+				ha_log.sh err "Crashdump seems not to be configured on host ${REMOTESSHHOST}"
+				exit 1
+			fi
+			if [ ${unknown_nmi} -eq 0 -o ${unrecovered_nmi} -eq 0 ]; then
+                                ha_log.sh err "Non Maskerable Interupts do not trigger a reset. Set \"kernel.unknown_nmi_panic\" and \"kernel.panic_on_unrecovered_nmi\" to \"1\""
+				exit 1
+			fi
+		fi
+	fi
 	do_ipmi "${STATUS}"
 	exit $?
 	;;
 getconfignames)
-	for i in hostname ipaddr userid passwd interface; do
+	for i in hostname ipaddr userid passwd interface crashdump sshipaddr sshcheck; do
 		echo $i
 	done
 	exit 0
@@ -266,6 +303,39 @@
 </longdesc>
 </parameter>
 
+<parameter name="crashdump" unique="0" required="0">
+<content type="string" default="false"/>
+<shortdesc lang="en">
+Trigger Crahdump
+</shortdesc>
+<longdesc lang="en">
+Instead of sending a reset to the IPMI board, submit a NMI signal to trigger a crashdump.
+
+!!! ATTENTION USE ONLY FOR DEBUGGING PURPOSES. NMI MUST BE TESTED PRIOR TO USE !!!
+</longdesc>
+</parameter>
+
+<parameter name="sshipaddr" unique="0">
+<content type="string" />
+<shortdesc lang="en">
+IP Address of the node to stonith.
+</shortdesc>
+<longdesc lang="en">
+The IP address of the node to contact via ssh in case it differs from hostname to perform checks regarding crashdump and NMI configuration.
+</longdesc>
+</parameter>
+
+<parameter name="sshcheck" unique="0">
+<content type="string" default="false"/>
+<shortdesc lang="en">
+Checks whether node is configured for crashdump.
+</shortdesc>
+<longdesc lang="en">
+This will be done via ssh and requires a password-less ssh connection.
+Enable Crashdump Checks. (true|false)
+</longdesc>
+</parameter>
+
 </parameters>
 IPMIXML
 	exit 0



<<attachment: oestreicher.vcf>>

Attachment: signature.asc
Description: OpenPGP digital signature

_______________________________________________________
Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org
http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev
Home Page: http://linux-ha.org/

Reply via email to