Hello all, I got a report recently that a Raid1 resource failed to stop. It turned out that some web management daemon called amDaemon kept the MD devices open. After commit 2f8ec082408fb5c825a5fe30ec436c7e5208aa0a (attached), there is now code which stops such processes.
Do you have objections to this, or do you think it should be done in a different way? Note that it won't help with things like filesystems mounted on, or VGs active on top of, an MD device. And did you ever see such a process holding an MD device open? I'm worried a bit about killing processes which are not supposed to be killed. Cheers, Dejan
commit 2f8ec082408fb5c825a5fe30ec436c7e5208aa0a Author: Dejan Muhamedagic <de...@suse.de> Date: Sat Sep 15 20:43:55 2012 +0200 Medium: Raid1: stop processes using raiddev If one or more processes have the raiddev open, the stop will fail. It seems like some RAID management solutions have such processes (amDaemon by Fujitsu). Of course, this won't help if there's a mounted filesystem or open VG on the device. diff --git a/heartbeat/Raid1 b/heartbeat/Raid1 index f85f55a..4043b97 100755 --- a/heartbeat/Raid1 +++ b/heartbeat/Raid1 @@ -89,6 +89,21 @@ supposed to own them. <shortdesc lang="en">Homehost for mdadm</shortdesc> <content type="string" default="" /> </parameter> + +<parameter name="force_stop" unique="0" required="0"> +<longdesc lang="en"> +If processes or kernel threads are using the array, it cannot be +stopped. We will try to stop processes, first by sending TERM and +then, if that doesn't help in $PROC_CLEANUP_TIME seconds, using KILL. +The lsof(8) program is required to get the list of array users. +Of course, the kernel threads cannot be stopped this way. +If the processes are critical for data integrity, then set this +parameter to false. Note that in that case the stop operation +will fail and the node will be fenced. +</longdesc> +<shortdesc lang="en">force stop processes using the array</shortdesc> +<content type="boolean" default="true" /> +</parameter> </parameters> <actions> @@ -122,6 +137,14 @@ forall() { done return $rc } +do_func() { + local func=$1 + if [ "$MDDEV" = auto ]; then + forall $func all + else + $func $MDDEV + fi +} # # START: Start up the RAID device @@ -191,6 +214,37 @@ raid1_stop_one() { ocf_log info "Stopping array $1" $MDADM --stop $1 --config=$RAIDCONF --wait-clean -W } +get_users_pids() { + local mddev=$1 + local outp l + ocf_log debug "running lsof to list $mddev users..." 
+ outp=`lsof $mddev | tail -n +2` + echo "$outp" | awk '{print $2}' | sort -u + echo "$outp" | while read l; do + ocf_log warn "$l" + done +} +stop_raid_users() { + local pids + pids=`do_func get_users_pids | sort -u` + if [ -z "$pids" ]; then + ocf_log warn "lsof reported no users holding arrays" + return 2 + else + ocf_stop_processes TERM $PROC_CLEANUP_TIME $pids + fi +} +stop_arrays() { + if [ $HAVE_RAIDTOOLS = "true" ]; then + $RAIDSTOP --configfile $RAIDCONF $MDDEV + else + if [ "$MDDEV" = auto ]; then + forall raid1_stop_one all + else + raid1_stop_one $MDDEV + fi + fi +} raid1_stop() { local rc # See if the MD device is already cleanly stopped: @@ -202,19 +256,26 @@ raid1_stop() { fi # Turn off raid - if [ $HAVE_RAIDTOOLS = "true" ]; then - $RAIDSTOP --configfile $RAIDCONF $MDDEV - else - if [ "$MDDEV" = auto ]; then - forall raid1_stop_one all + if ! stop_arrays; then + if ocf_is_true $FORCESTOP; then + if have_binary lsof; then + stop_raid_users + case $? in + 2) false;; + *) stop_arrays;; + esac + else + ocf_log warn "install lsof(8) to list users holding the disk" + false + fi else - raid1_stop_one $MDDEV + false fi fi rc=$? if [ $rc -ne 0 ]; then - ocf_log err "Couldn't stop RAID for $MDDEV (rc=$rc)" + ocf_log warn "Couldn't stop RAID for $MDDEV (rc=$rc)" if [ $HAVE_RAIDTOOLS != "true" ]; then if [ "$MDDEV" = auto ]; then forall mark_readonly all @@ -325,7 +386,8 @@ raid1_status() { raid1_validate_all() { return $OCF_SUCCESS } - + +PROC_CLEANUP_TIME=3 if ( [ $# -ne 1 ] ) @@ -349,6 +411,7 @@ esac RAIDCONF="$OCF_RESKEY_raidconf" MDDEV="$OCF_RESKEY_raiddev" +FORCESTOP="${OCF_RESKEY_force_stop:-1}" if [ -z "$RAIDCONF" ] ; then ocf_log err "Please set OCF_RESKEY_raidconf!" @@ -365,6 +428,10 @@ if [ -z "$MDDEV" ] ; then exit $OCF_ERR_CONFIGURED fi +if ocf_is_true $FORCESTOP && ! have_binary lsof; then + ocf_log warn "Please install lsof(8), we may need it when stopping Raid device! Now continuing anyway ..." 
+fi + HAVE_RAIDTOOLS=false if have_binary $MDADM >/dev/null 2>&1 ; then if [ -n "$OCF_RESKEY_homehost" ]; then
_______________________________________________________ Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev Home Page: http://linux-ha.org/