This patch fixes several problems that lead to spurious stop failures: Don't fail on stop just because a single "virsh domstate" call returned "no state". Don't fail a "forced stop" of an already stopped resource. Don't time out in stop before escalating to "forced stop".
Combination of the first two bugs frequently led to "failed stop", where the stop in fact succeeded just fine. diff -r 8cb5ba3e1d97 heartbeat/VirtualDomain --- a/heartbeat/VirtualDomain Fri Jun 25 19:54:24 2010 +0200 +++ b/heartbeat/VirtualDomain Mon Jun 28 11:20:42 2010 +0200 @@ -24,6 +24,11 @@ : ${OCF_RESKEY_hypervisor=${OCF_RESKEY_hypervisor_default}} ####################################################################### +## I'd very much suggest to make this RA use bash, +## and then use magic $SECONDS. +## But for now: +NOW=$(date +%s) + usage() { echo "usage: $0 {start|stop|status|monitor|migrate_to|migrate_from|meta-data|validate-all}" } @@ -149,9 +154,11 @@ } VirtualDomain_Status() { + local try=0 rc=$OCF_ERR_GENERIC status="no state" while [ "$status" = "no state" ]; do + try=$(($try + 1 )) status="`virsh $VIRSH_OPTIONS domstate $DOMAIN_NAME`" case "$status" in "shut off") @@ -175,7 +182,7 @@ # whenever virsh can't reliably obtain the domain # state. status="no state" - if [ "$__OCF_ACTION" = "stop" ]; then + if [ "$__OCF_ACTION" = "stop" ] && [ $try -ge 3 ]; then # During the stop operation, we want to bail out # quickly, so as to be able to force-stop (destroy) # the domain if necessary. @@ -221,6 +228,7 @@ local i local status local shutdown_timeout + local out ex VirtualDomain_Status status=$? @@ -233,9 +241,9 @@ virsh $VIRSH_OPTIONS shutdown ${DOMAIN_NAME} # The "shutdown_timeout" we use here is the operation # timeout specified in the CIB, minus 5 seconds - shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5)) - # Loop on status for $shutdown_timeout seconds - for i in `seq $shutdown_timeout`; do + shutdown_timeout=$(( $NOW + ($OCF_RESKEY_CRM_meta_timeout/1000) -5 )) + # Loop on status until we reach $shutdown_timeout + while [ $NOW -lt $shutdown_timeout ]; do VirtualDomain_Status status=$? case $status in @@ -256,6 +264,7 @@ # resort to forced stop (destroy). 
break; esac + NOW=$(date +%s) done fi ;; @@ -264,11 +273,24 @@ return $OCF_SUCCESS esac # OK. Now if the above graceful shutdown hasn't worked, kill - # off the domain with destroy. If that too does not work, give - # up and have the LRM time us out. + # off the domain with destroy. If that too does not work, + # have the LRM time us out. ocf_log info "Issuing forced shutdown (destroy) request for domain ${DOMAIN_NAME}." - virsh $VIRSH_OPTIONS destroy ${DOMAIN_NAME} || return $OCF_ERR_GENERIC + out=$(virsh $VIRSH_OPTIONS destroy ${DOMAIN_NAME} 2>&1) ; ex=$? + echo >&2 "$out" + # unconditionally clean up. VirtualDomain_Cleanup_Statefile + case $ex$out in + *"error: Requested operation is not valid: domain is not running"*) + : ;; # unexpected path to the intended outcome, all is well + [!0]*) + return $OCF_ERR_GENERIC ;; + 0*) + while [ $status != $OCF_NOT_RUNNING ]; do + VirtualDomain_Status + status=$? + done ;; + esac return $OCF_SUCCESS } -- : Lars Ellenberg : LINBIT | Your Way to High Availability : DRBD/HA support and consulting http://www.linbit.com DRBD® and LINBIT® are registered trademarks of LINBIT, Austria. _______________________________________________________ Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev Home Page: http://linux-ha.org/