The branch, master has been updated via f59b40b3f8ea3da8ffb8601bc025e83c237072d5 (commit) via f23b5a160184db8c92f8c69307dc4a64adae839d (commit) via 637cab6304dae66b85668506028c76ea1ee88980 (commit) via 13acd58c41fba1a33894fbd654fed69ea0eac322 (commit) via 92f74fd589467b46c758e116e97417edfe8773d7 (commit) from 6e68797af67bee36f2bad045f94806e7e98f27e9 (commit)
http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=master - Log ----------------------------------------------------------------- commit f59b40b3f8ea3da8ffb8601bc025e83c237072d5 Author: Ronnie Sahlberg <ronniesahlb...@gmail.com> Date: Thu May 17 11:16:57 2012 +1000 GANESHA: make the ganesha script executable by default commit f23b5a160184db8c92f8c69307dc4a64adae839d Merge: 6e68797af67bee36f2bad045f94806e7e98f27e9 637cab6304dae66b85668506028c76ea1ee88980 Author: Ronnie Sahlberg <ronniesahlb...@gmail.com> Date: Thu May 17 11:48:07 2012 +1000 Merge remote branch 'martins/ganesha' commit 637cab6304dae66b85668506028c76ea1ee88980 Author: Martin Schwenke <mar...@meltin.net> Date: Wed May 16 17:24:21 2012 +1000 Eventscripts: Modernise 60.ganesha to match 60.nfs Originally from Srikrishan Malik <srikrishan.ma...@in.ibm.com> with some style changes by me. Signed-off-by: Martin Schwenke <mar...@meltin.net> commit 13acd58c41fba1a33894fbd654fed69ea0eac322 Author: Martin Schwenke <mar...@meltin.net> Date: Wed May 16 13:29:58 2012 +1000 Eventscripts: restart lockd in the background when going unhealthy Sometimes the restart can hang when there are I/O problems. Then the eventscript times out and gets killed so the node is never marked as unhealthy. Restarting in the background avoids this. Signed-off-by: Martin Schwenke <mar...@meltin.net> commit 92f74fd589467b46c758e116e97417edfe8773d7 Author: Martin Schwenke <mar...@meltin.net> Date: Tue May 8 14:53:58 2012 +1000 Eventscript functions: add optional version to nfs_check_rpc_service() This can be optional because the 1st item of each action-triple is a test comparison that starts with '-'. 
Signed-off-by: Martin Schwenke <mar...@meltin.net> ----------------------------------------------------------------------- Summary of changes: Makefile.in | 2 +- config/events.d/60.ganesha | 207 ++++++++++++++++++++------------------------ config/events.d/60.nfs | 4 +- config/functions | 31 +++++++ config/statd-callout | 14 ++- 5 files changed, 137 insertions(+), 121 deletions(-) Changeset truncated at 500 lines: diff --git a/Makefile.in b/Makefile.in index cdebbd7..cbb987a 100755 --- a/Makefile.in +++ b/Makefile.in @@ -343,7 +343,7 @@ install: all $(PMDA_INSTALL) ${INSTALLCMD} -m 755 config/events.d/41.httpd $(DESTDIR)$(etcdir)/ctdb/events.d ${INSTALLCMD} -m 755 config/events.d/50.samba $(DESTDIR)$(etcdir)/ctdb/events.d ${INSTALLCMD} -m 755 config/events.d/60.nfs $(DESTDIR)$(etcdir)/ctdb/events.d - ${INSTALLCMD} -m 644 config/events.d/60.ganesha $(DESTDIR)$(etcdir)/ctdb/events.d + ${INSTALLCMD} -m 755 config/events.d/60.ganesha $(DESTDIR)$(etcdir)/ctdb/events.d ${INSTALLCMD} -m 755 config/events.d/62.cnfs $(DESTDIR)$(etcdir)/ctdb/events.d ${INSTALLCMD} -m 755 config/events.d/70.iscsi $(DESTDIR)$(etcdir)/ctdb/events.d ${INSTALLCMD} -m 755 config/events.d/91.lvs $(DESTDIR)$(etcdir)/ctdb/events.d diff --git a/config/events.d/60.ganesha b/config/events.d/60.ganesha index fb3b7c2..cee7792 100755 --- a/config/events.d/60.ganesha +++ b/config/events.d/60.ganesha @@ -1,34 +1,58 @@ #!/bin/sh # script to manage nfs in a clustered environment -start_nfs() { - mkdir -p $CTDB_VARDIR/state/nfs - mkdir -p $CTDB_VARDIR/state/statd/ip - ctdb_service_stop - ctdb_service_start - echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle +. $CTDB_BASE/functions + +service_name="nfs-ganesha-gpfs" + + +service_start () +{ + startstop_ganesha stop + startstop_ganesha start + set_proc "sys/net/ipv4/tcp_tw_recycle" 1 } -. 
$CTDB_BASE/functions +service_stop () +{ + startstop_ganesha stop +} + +service_reconfigure () +{ + # if the ips have been reallocated, we must restart ganesha + # across all nodes and ping all statd listeners + [ -x $CTDB_BASE/statd-callout ] && { + $CTDB_BASE/statd-callout notify & + } >/dev/null 2>&1 +} +loadconfig "nfs" -loadconfig nfs +[ "$NFS_SERVER_MODE" == "ganesha" ] || exit 0 -[ "$NFS_SERVER_MODE" = "GANESHA" ] || exit 0 +ctdb_setup_service_state_dir + +statd_update_trigger="$service_state_dir/update-trigger" +# We want this file to always exist. The corner case is when +# auto-start/stop is switched off, NFS is added as a managed service +# some time after ctdbd is started and someone else starts the NFS +# service for us. In this case this file might not otherwise exist +# when we get to a monitor event. +touch "$statd_update_trigger" -service_name="nfs-ganesha-gpfs" ctdb_start_stop_service is_ctdb_managed_service || exit 0 +ctdb_service_check_reconfigure + case "$1" in init) # read statd from persistent database ;; startup) ctdb_service_start - mkdir -p $CTDB_VARDIR/state/statd - touch $CTDB_VARDIR/state/statd/update-trigger ;; shutdown) @@ -44,111 +68,68 @@ case "$1" in ;; monitor) - if ctdb_service_needs_reconfigure ; then - ctdb_service_reconfigure - exit 0 - fi update_tickles 2049 - - # check that statd responds to rpc requests - # if statd is not running we try to restart it - if ctdb_check_rpc "STATD" status 1 >/dev/null ; then - (service_name="nfs_statd"; ctdb_counter_init) - else - p="rpc.statd" ; cmd="$p" - cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}" - cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}" - cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}" - ( - service_name="nfs_statd" - ctdb_counter_incr - ctdb_check_counter_limit 10 quiet >/dev/null - ) || { - echo "$ctdb_check_rpc_out" - echo "Trying to restart STATD [$cmd]" - } - $cmd - fi - - # check that NFS responds to rpc requests - [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = 
"yes" ] || { - if ctdb_check_rpc "NFS" nfs 3 >/dev/null ; then - (service_name="nfs_knfsd"; ctdb_counter_init) - else - ( - service_name="nfs_knfsd" - ctdb_counter_incr - - ctdb_check_counter_equal 10 || { - echo "Trying to restart NFS service" - ctdb_service_stop - ctdb_service_start - exit 0 - } - - ctdb_check_counter_limit 15 quiet >/dev/null - ) || { - echo "$ctdb_check_rpc_out" - echo "Trying to restart NFS service" - ctdb_service_stop - ctdb_service_start - exit 1 - } - fi - } - - # and that its directories are available - [ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || { - grep Path /etc/ganesha/gpfs.ganesha.exports.conf | - cut -f2 -d\" | ctdb_check_directories - } || exit $? - - # check that lockd responds to rpc requests - ctdb_check_rpc "LOCKD" nlockmgr 4 || { - echo "Trying to restart lock manager service" - ctdb_service_stop - ctdb_service_start - exit 1 - } - - # check mounts responds to rpc requests - ctdb_check_rpc "MOUNTD" mountd 1 >/dev/null || { - echo "Trying to restart mountd service" - ctdb_service_stop - ctdb_service_start - exit 1 - } - - # rquotad needs special handling since it is sometimes not started - # correctly on RHEL5 - # this is not a critical service so we dont flag the node as unhealthy - ctdb_check_rpc "RQUOTAD" rquotad 1 || { - p="rpc.rquotad" - cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}" - echo "Trying to restart RQUOTAD [${cmd}]" - killall -q -9 $p - $cmd & - } - - # once every 60 seconds, update the statd state database for which - # clients need notifications - LAST_UPDATE=`stat --printf="%Y" $CTDB_VARDIR/state/statd/update-trigger 2>/dev/null` - CURRENT_TIME=`date +"%s"` - [ $CURRENT_TIME -ge $(($LAST_UPDATE + 60)) ] && { - mkdir -p $CTDB_VARDIR/state/statd - touch $CTDB_VARDIR/state/statd/update-trigger - $CTDB_BASE/statd-callout updatelocal & - $CTDB_BASE/statd-callout updateremote & - } - ;; + # check that statd responds to rpc requests + # if statd is not running we try to restart it + # we only do this IF we have a 
rpc.statd command. + # For platforms where rpc.statd does not exist, we skip + # the check completely + p="rpc.statd" + which $p >/dev/null 2>/dev/null && \ + nfs_check_rpc_service "statd" 1 \ + -ge 6 "verbose unhealthy" \ + -eq 4 "verbose restart" \ + -eq 2 "restart:bs" + + PIDFILE="/var/run/ganesha.pid" + RUNNING=0 + if [ -e $PIDFILE ] + then + PID=`cat $PIDFILE` + GANESHA="/usr/bin/gpfs.ganesha.nfsd" + RUNNING=`cat /proc/$PID/cmdline | grep $GANESHA | wc -l` + fi + if [ $RUNNING != 1 ] + then + echo "Trying fast restart of NFS service" + startstop_ganesha restart + fi + + # check that NFS responds to rpc requests + if [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" != "yes" ] ; then + nfs_check_rpc_service "ganesha" \ + -ge 6 "verbose unhealthy" \ + -eq 4 "verbose restart" \ + -eq 2 "restart:bs" + fi + + + # rquotad is sometimes not started correctly on RHEL5 + # not a critical service so we dont flag the node as unhealthy + nfs_check_rpc_service "rquotad" 1\ + -gt 0 "verbose restart:b" + + + # Check that directories for shares actually exist. + [ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || { + grep Path /etc/ganesha/gpfs.ganesha.exports.conf | + cut -f2 -d\" | ctdb_check_directories + } || exit $? 
+ + # once every 60 seconds, update the statd state database for which + # clients need notifications + LAST_UPDATE=`stat --printf="%Y" "$statd_update_trigger" 2>/dev/null` + CURRENT_TIME=`date +"%s"` + [ $CURRENT_TIME -ge $(($LAST_UPDATE + 60)) ] && { + touch "$statd_update_trigger" + $CTDB_BASE/statd-callout updatelocal & + $CTDB_BASE/statd-callout updateremote & + } + ;; ipreallocated) - # if the ips have been reallocated, we must restart the lockmanager - # across all nodes and ping all statd listeners - [ -x $CTDB_BASE/statd-callout ] && { - $CTDB_BASE/statd-callout notify & - } >/dev/null 2>&1 + ctdb_service_set_reconfigure ;; *) ctdb_standard_event_handler "$@" diff --git a/config/events.d/60.nfs b/config/events.d/60.nfs index e8ac61f..ef2c1f7 100755 --- a/config/events.d/60.nfs +++ b/config/events.d/60.nfs @@ -27,7 +27,7 @@ service_reconfigure () loadconfig -[ "$NFS_SERVER_MODE" != "GANESHA" ] || exit 0 +[ "$NFS_SERVER_MODE" != "ganesha" ] || exit 0 ctdb_setup_service_state_dir @@ -98,7 +98,7 @@ case "$1" in # check that lockd responds to rpc requests nfs_check_rpc_service "lockd" \ - -ge 15 "verbose restart unhealthy" \ + -ge 15 "verbose restart:b unhealthy" \ -eq 10 "restart:bs" # mountd is sometimes not started correctly on RHEL5 diff --git a/config/functions b/config/functions index 9c2898c..a07aa8e 100755 --- a/config/functions +++ b/config/functions @@ -228,6 +228,12 @@ nfs_check_rpc_service () _restart="echo 'Trying to restart NFS service'" _restart="${_restart}; startstop_nfs restart" ;; + ganesha) + _rpc_prog=nfs + _version=${_v:-3} + _restart="echo 'Trying to restart Ganesha NFS service'" + _restart="${_restart}; startstop_ganesha restart" + ;; mountd) _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}" ;; @@ -678,6 +684,31 @@ tickle_tcp_connections() { } ######################################################## +# start/stop the Ganesha nfs service +######################################################## +startstop_ganesha() +{ + _ganesha_fsal_list="gpfs" 
+ for _fsal in $_ganesha_fsal_list ; do + _service_name="nfs-ganesha-${_fsal}" + if [ -x /etc/init.d/$_service_name ] ; then + break + fi + done + case "$1" in + start) + service "$_service_name" start + ;; + stop) + service "$_service_name" stop + ;; + restart) + service "$_service_name" restart + ;; + esac +} + +######################################################## # start/stop the nfs service on different platforms ######################################################## startstop_nfs() { diff --git a/config/statd-callout b/config/statd-callout index 3078ef2..63fee09 100755 --- a/config/statd-callout +++ b/config/statd-callout @@ -138,11 +138,15 @@ case "$1" in # will respond "strangely" immediately after restarting it, which # causes clients to fail to reclaim the locks. # - startstop_nfslock stop > /dev/null 2>&1 - sleep 2 - - # now start lockmanager again with the new state directory. - startstop_nfslock start > /dev/null 2>&1 + if [ "$NFS_SERVER_MODE" = "ganesha" ] ; then + startstop_ganesha stop >/dev/null 2>&1 + sleep 2 + startstop_ganesha start >/dev/null 2>&1 + else + startstop_nfslock stop >/dev/null 2>&1 + sleep 2 + startstop_nfslock start >/dev/null 2>&1 + fi # we now need to send out additional statd notifications to ensure # that clients understand that the lockmanager has restarted. -- CTDB repository