Hi, I fixed most of the things Lars mentioned in http://hg.linux-ha.org/dev/rev/15bcf3491f9c and will explain why I did not fix some of them. ocf-tester runs fine with the RA.
> # FIXME: This should use pidofproc
pidofproc is not available everywhere and is not able to get down to command line options, e.g. it could not tell the difference between "$process $option_a" and "$process $option_b", which I wanted to support with this agent. Example:
dktest3:~/src/linuxha/hg/dev # sleep 200 &
[1] 5799
dktest3:~/src/linuxha/hg/dev # sleep 300 &
[2] 5801
dktest3:~/src/linuxha/hg/dev # pidofproc sleep
5801 5799
dktest3:~/src/linuxha/hg/dev # pidofproc "sleep 300"
> # FIXME: use start_daemon
start_daemon is not available everywhere either.
> # FIXME: What about daemons which can manage their own pidfiles?
This agent is meant to be used for programs that are not actually daemons by design. It is meant to be able to run something "stupid" in the cluster, even something like "/bin/sleep 10000000000000000000".
> # FIXME: use killproc
This is also a problem with "$process $option_a" and "$process $option_b". You can't just "killproc $process" then.
> # FIXME: Attributes special meaning to the resource id
I tried to, but couldn't understand what you meant here.
I also talked to Dejan on IRC and we agreed that "anything" is a bad name for the RA and that the changeset description was probably bad, too. This RA is not for (as the changeset stated) "arbitrary daemons"; it is more for daemonizing programs which were not meant to be daemons. If a proper name comes to anyone's mind - please share. Hopefully, it is now a bit clearer what I wanted to be able to do with this RA.
I agree the "cmd=" lines and the pid file creation are very, very ugly, but I could not yet find a better way. Not that much of a shell genius, I guess :( Please share if you can improve things.
Regards
Dominik
exporting patch: # HG changeset patch # User Dominik Klein <d...@in-telegence.net> # Date 1234350091 -3600 # Node ID 04533b37813c8be009814f52de7b14ff65bf9862 # Parent 90ff997faa7288248ac57583b0c03df4c8e41bda RA: anything. Implement most of lmbs suggestions. diff -r 90ff997faa72 -r 04533b37813c resources/OCF/anything --- a/resources/OCF/anything Wed Feb 11 11:31:02 2009 +0100 +++ b/resources/OCF/anything Wed Feb 11 12:01:31 2009 +0100 @@ -32,6 +32,7 @@ # OCF_RESKEY_errlogfile # OCF_RESKEY_user # OCF_RESKEY_monitor_hook +# OCF_RESKEY_stop_timeout # # This RA starts $binfile with $cmdline_options as $user and writes a $pidfile from that. # If you want it to, it logs: @@ -47,18 +48,20 @@ # Initialization: . ${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs -getpid() { # make sure that the file contains a number - # FIXME: pidfiles could contain spaces - grep '^[0-9][0-9]*$' $1 +getpid() { + grep -o '[0-9]*' $1 } anything_status() { - # FIXME: This should use pidofproc - # FIXME: pidfile w/o process means the process died, so should - # be ERR_GENERIC - if test -f "$pidfile" && pid=`getpid $pidfile` && kill -0 $pid + if test -f "$pidfile" then - return $OCF_RUNNING + if pid=`getpid $pidfile` && kill -0 $pid + then + return $OCF_RUNNING + else + # pidfile w/o process means the process died + return $OCF_ERR_GENERIC + fi else return $OCF_NOT_RUNNING fi @@ -66,8 +69,6 @@ anything_start() { if ! anything_status - # FIXME: use start_daemon - # FIXME: What about daemons which can manage their own pidfiles? 
then if [ -n "$logfile" -a -n "$errlogfile" ] then @@ -101,29 +102,48 @@ } anything_stop() { - # FIXME: use killproc + if [ -n "$OCF_RESKEY_stop_timeout" ] + then + stop_timeout=$OCF_RESKEY_stop_timeout + elif [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + # Allow 2/3 of the action timeout for the orderly shutdown + # (The origin unit is ms, hence the conversion) + stop_timeout=$((OCF_RESKEY_CRM_meta_timeout/1500)) + else + stop_timeout=10 + fi if anything_status then - pid=`getpid $pidfile` - kill $pid - i=0 - # FIXME: escalate to kill -9 before timeout - while sleep 1 - do - if ! anything_status - then - rm -f $pidfile > /dev/null 2>&1 - return $OCF_SUCCESS - fi - let "i++" - done + pid=`getpid $pidfile` + kill $pid + rm -f $pidfile + i=0 + while [ $i -lt $stop_timeout ] + do + while sleep 1 + do + if ! anything_status + then + return $OCF_SUCCESS + fi + let "i++" + done + done + ocf_log warn "Stop with SIGTERM failed/timed out, now sending SIGKILL." + kill -9 $pid + if ! anything_status + then + ocf_log warn "SIGKILL did the job." + return $OCF_SUCCESS + else + ocf_log err "Failed to stop - even with SIGKILL." + return $OCF_ERR_GENERIC + fi else # was not running, so stop can be considered successful rm -f $pidfile return $OCF_SUCCESS fi - # FIXME: Never reached. - return $OCF_ERR_GENERIC } anything_monitor() { @@ -131,12 +151,12 @@ ret=$? if [ $ret -eq $OCF_SUCCESS ] then - # implement your deeper monitor operation here if [ -n "$OCF_RESKEY_monitor_hook" ]; then eval "$OCF_RESKEY_monitor_hook" - # FIXME: Implement a check that this doesn't - # accidentially return NOT_RUNNING? - return + if [ $? -ne $OCF_SUCCESS ]; then + return ${OCF_ERR_GENERIC} + fi + return $OCF_SUCCESS else true fi @@ -150,19 +170,16 @@ binfile="$OCF_RESKEY_binfile" cmdline_options="$OCF_RESKEY_cmdline_options" pidfile="$OCF_RESKEY_pidfile" -# FIXME: Why test for $binfile here? 
-[ -z "$pidfile" -a -n "$binfile" ] && pidfile=${HA_VARRUN}/anything_${process}.pid +[ -z "$pidfile" ] && pidfile=${HA_VARRUN}/anything_${process}.pid logfile="$OCF_RESKEY_logfile" errlogfile="$OCF_RESKEY_errlogfile" user="$OCF_RESKEY_user" [ -z "$user" ] && user=root anything_validate() { - # FIXME: Actually this needs to test from the point of view of - # the user. - if [ ! -x "$binfile" ] + if ! su - $user -c "test -x $binfile" then - ocf_log err "binfile $binfile does not exist or is not executable." + ocf_log err "binfile $binfile does not exist or is not executable by $user." exit $OCF_ERR_INSTALLED fi if ! getent passwd $user >/dev/null 2>&1 @@ -243,6 +260,14 @@ <shortdesc lang="en">Command to run in monitor operation</shortdesc> <content type="string"/> </parameter> +<parameter name="stop_timeout"> +<longdesc lang="en"> +In the stop operation: Seconds to wait for kill -TERM to succeed +before sending kill -SIGKILL. Defaults to 2/3 of the stop operation timeout. +</longdesc> +<shortdesc lang="en">Seconds to wait after having sent SIGTERM before sending SIGKILL in stop operation</shortdesc> +<content type="string" default=""/> +</parameter> </parameters> <actions> <action name="start" timeout="90" />
_______________________________________________________ Linux-HA-Dev: Linux-HA-Dev@lists.linux-ha.org http://lists.linux-ha.org/mailman/listinfo/linux-ha-dev Home Page: http://linux-ha.org/