The branch, 1.2-nodeflags has been updated via ea77f1b2a161f194b8b152da3cd94c67cbdd7ed9 (commit) via 6d118faada8ffbde7f35e1a0b4d2df21ac67dec2 (commit) via 7b91e4fa1d00bcba860dfe502e83161a6d06de6b (commit) via 9f0e1e5e0b100c3ccfaa2e459b3249386ec27b44 (commit) via d887c991d48f010b4512ada24b059161614bb168 (commit) via 10e005cd0eb5b350a22377629ecea87e876f6bfb (commit) via 138e2c912ae718ce024d7e342bc7808416aeec25 (commit) from 256b4f44d9de1b394d4f260f26fd11e2ff6adf8f (commit)
http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=1.2-nodeflags - Log ----------------------------------------------------------------- commit ea77f1b2a161f194b8b152da3cd94c67cbdd7ed9 Author: Martin Schwenke <mar...@meltin.net> Date: Tue Aug 31 17:40:40 2010 +1000 Eventscripts: make loadconfig() function hookable by the test suite. Rename loadconfig() to _loadconfig(). Add a new loadconfig() that simply calls _loadconfig(). This makes it easy for the test suite to override loadconfig(). Signed-off-by: Martin Schwenke <mar...@meltin.net> commit 6d118faada8ffbde7f35e1a0b4d2df21ac67dec2 Author: Martin Schwenke <mar...@meltin.net> Date: Tue Nov 16 19:42:31 2010 +1100 Make a time comparison in 60.nfs eventscript more readable. Signed-off-by: Martin Schwenke <mar...@meltin.net> commit 7b91e4fa1d00bcba860dfe502e83161a6d06de6b Author: Martin Schwenke <mar...@meltin.net> Date: Tue Nov 16 19:31:18 2010 +1100 60.nfs only fails or warns after 10 consecutive nfsd/statd failures. These failures are sometimes the result of slow restarts so we want to avoid dirtying the logs or marking a node unhealthy because of them, unless they are excessive. For these 2 cases we use the existing fail counting code but hack a temporary service_name in a subshell to allow separate fail counts. We also update ctdb_check_rpc() so that it captures the error output from rpcinfo and we add a message including the service name to the beginning. The error is printed to stdout but is also stored in ctdb_check_rpc_out to allow it to be conditionally used by the caller. This function also now returns non-zero rather than exiting on failure. Other direct rpcinfo calls are replaced by calls to ctdb_check_rpc() for consistency. Option handling code for service restarts is cleaned up so that it fits in 80 columns. A more informative restart message is now used in all cases, printing the exact command being used to start a service.
Signed-off-by: Martin Schwenke <mar...@meltin.net> commit 9f0e1e5e0b100c3ccfaa2e459b3249386ec27b44 Author: Martin Schwenke <mar...@meltin.net> Date: Tue Oct 12 11:10:38 2010 +1100 Test suite: fix typo in ctdb ping test grep pattern. Signed-off-by: Martin Schwenke <mar...@meltin.net> commit d887c991d48f010b4512ada24b059161614bb168 Author: Martin Schwenke <mar...@meltin.net> Date: Wed Oct 6 16:32:22 2010 +1100 Test suite: match changed output for ctdb ping to disconnected node. Signed-off-by: Martin Schwenke <mar...@meltin.net> commit 10e005cd0eb5b350a22377629ecea87e876f6bfb Author: Martin Schwenke <mar...@meltin.net> Date: Fri Oct 15 15:09:08 2010 +1100 Test suite: make statistics test cope with changes to statistics output. Signed-off-by: Martin Schwenke <mar...@meltin.net> commit 138e2c912ae718ce024d7e342bc7808416aeec25 Author: Ronnie Sahlberg <ronniesahlb...@gmail.com> Date: Mon Nov 15 16:30:44 2010 +1100 initialize the statistics to the current time, not start of epoch this makes "ctdb statistics" show correct "start of starts collection" ----------------------------------------------------------------------- Summary of changes: config/events.d/60.nfs | 77 ++++++++++++++++++++++-------------- config/functions | 18 ++++++-- server/ctdb_statistics.c | 1 + tests/simple/09_ctdb_ping.sh | 2 +- tests/simple/14_ctdb_statistics.sh | 2 +- 5 files changed, 63 insertions(+), 37 deletions(-) Changeset truncated at 500 lines: diff --git a/config/events.d/60.nfs b/config/events.d/60.nfs index 57c81d3..038adbb 100755 --- a/config/events.d/60.nfs +++ b/config/events.d/60.nfs @@ -51,24 +51,43 @@ case "$1" in # check that statd responds to rpc requests # if statd is not running we try to restart it - rpcinfo -u localhost 100024 1 > /dev/null || { - RPCSTATDOPTS="" - [ -n "$STATD_HOSTNAME" ] && RPCSTATDOPTS="$RPCSTATDOPTS -n $STATD_HOSTNAME" - [ -n "$STATD_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -p $STATD_PORT" - [ -n "$STATD_OUTGOING_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -o 
$STATD_OUTGOING_PORT" - rpc.statd $RPCSTATDOPTS - echo "ERROR: STATD is not responding. Trying to restart it. [rpc.statd $RPCSTATDOPTS]" - } + if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then + (service_name="nfs_statd"; ctdb_counter_init) + else + p="rpc.statd" ; cmd="$p" + cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}" + cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}" + cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}" + ( + service_name="nfs_statd" + ctdb_counter_incr + ctdb_check_counter_limit 10 quiet >/dev/null + ) || { + echo "$ctdb_check_rpc_out" + echo "Trying to restart STATD [$cmd]" + } + $cmd + fi # check that NFS responds to rpc requests [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || { - (ctdb_check_rpc "NFS" 100003 3) - [ $? = "0" ] || { - echo "Trying to restart NFS service" - startstop_nfs restart - exit 1 - } + if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then + (service_name="nfs_knfsd"; ctdb_counter_init) + else + ( + service_name="nfs_knfsd" + ctdb_counter_incr + ctdb_check_counter_limit 10 quiet >/dev/null + ) || { + echo "$ctdb_check_rpc_out" + echo "Trying to restart NFS service" + startstop_nfs restart + exit 1 + } + # we haven't hit the failure limit so restart quietly + startstop_nfs restart >/dev/null 2>&1 & + fi } # and that its directories are available @@ -79,8 +98,7 @@ case "$1" in } || exit $? # check that lockd responds to rpc requests - (ctdb_check_rpc "lockd" 100021 1) - [ $? = "0" ] || { + ctdb_check_rpc "LOCKD" 100021 1 || { echo "Trying to restart lock manager service" startstop_nfs restart startstop_nfslock restart @@ -89,31 +107,30 @@ case "$1" in # mount needs special handling since it is sometimes not started # correctly on RHEL5 - rpcinfo -u localhost 100005 1 > /dev/null || { - echo "ERROR: MOUNTD is not running. Trying to restart it." 
- RPCMOUNTDOPTS="" - [ -n "$MOUNTD_PORT" ] && RPCMOUNTDOPTS="$RPCMOUNTDOPTS -p $MOUNTD_PORT" - killall -q -9 rpc.mountd - rpc.mountd $RPCMOUNTDOPTS & + ctdb_check_rpc "MOUNTD" 100005 1 || { + p="rpc.mountd" + cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}" + echo "Trying to restart MOUNTD [${cmd}]" + killall -q -9 $p + $cmd & exit 1 } # rquotad needs special handling since it is sometimes not started # correctly on RHEL5 # this is not a critical service so we dont flag the node as unhealthy - rpcinfo -u localhost 100011 1 > /dev/null || { - echo "ERROR: RQUOTAD is not running. Trying to restart it." - RPCRQUOTADOPTS="" - [ -n "$RQUOTAD_PORT" ] && RPCRQUOTADOPTS="$RPCRQUOTADOPTS -p $RQUOTAD_PORT" - killall -q -9 rpc.rquotad - rpc.rquotad $RPCRQUOTADOPTS & + ctdb_check_rpc "RQUOTAD" 100011 1 || { + p="rpc.rquotad" + cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}" + echo "Trying to restart RQUOTAD [${cmd}]" + killall -q -9 $p + $cmd & } # once every 60 seconds, update the statd state database for which # clients need notifications LAST_UPDATE=`stat --printf="%Y" $CTDB_VARDIR/state/statd/update-trigger` CURRENT_TIME=`date +"%s"` - expr "$CURRENT_TIME" ">" "(" "$LAST_UPDATE" "+" "60" ")" >/dev/null 2>/dev/null - [ $? 
= "0" ] && { + [ $CURRENT_TIME -ge $(($LAST_UPDATE + 60)) ] && { mkdir -p $CTDB_VARDIR/state/statd touch $CTDB_VARDIR/state/statd/update-trigger $CTDB_BASE/statd-callout updatelocal & diff --git a/config/functions b/config/functions index 9659d48..610085b 100755 --- a/config/functions +++ b/config/functions @@ -4,7 +4,7 @@ PATH=/bin:/usr/bin:/usr/sbin:/sbin:$PATH ####################################### # pull in a system config file, if any -loadconfig() { +_loadconfig() { if [ -z "$1" ] ; then foo="${service_config:-${service_name}}" @@ -25,6 +25,10 @@ loadconfig() { fi } +loadconfig () { + _loadconfig "$@" +} + ############################################################## # determine on what type of system (init style) we are running detect_init_style() { @@ -144,10 +148,14 @@ ctdb_check_rpc() { progname="$1" prognum="$2" version="$3" - rpcinfo -u localhost $prognum $version > /dev/null || { - echo "ERROR: $progname not responding to rpc requests" - exit 1 - } + + ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1) + if [ $? 
-ne 0 ] ; then + ctdb_check_rpc_out="ERROR: $progname failed RPC check: +$ctdb_check_rpc_out" + echo "$ctdb_check_rpc_out" + return 1 + fi } ###################################################### diff --git a/server/ctdb_statistics.c b/server/ctdb_statistics.c index aec17ad..29e6d6a 100644 --- a/server/ctdb_statistics.c +++ b/server/ctdb_statistics.c @@ -42,6 +42,7 @@ static void ctdb_statistics_update(struct event_context *ev, struct timed_event int ctdb_statistics_init(struct ctdb_context *ctdb) { bzero(&ctdb->statistics, sizeof(struct ctdb_statistics)); + ctdb->statistics.statistics_start_time = timeval_current(); bzero(&ctdb->statistics_current, sizeof(struct ctdb_statistics)); ctdb->statistics_current.statistics_start_time = timeval_current(); diff --git a/tests/simple/09_ctdb_ping.sh b/tests/simple/09_ctdb_ping.sh index 6ca50d4..6610431 100755 --- a/tests/simple/09_ctdb_ping.sh +++ b/tests/simple/09_ctdb_ping.sh @@ -53,5 +53,5 @@ try_command_on_node -v 0 "! $CTDB ping -n 1" sanity_check_output \ 1 \ - "(: ctdb_control error: 'ctdb_control to disconnected node'|Unable to get ping response from node 1|Node 1 is DISCONNECTED)" \ + "(: ctdb_control error: ('ctdb_control to disconnected node'|'node is disconnected')|Unable to get ping response from node 1|Node 1 is DISCONNECTED|ctdb_control for getpnn failed|: Can not access node. 
Node is not operational\.)" \ "$out" diff --git a/tests/simple/14_ctdb_statistics.sh b/tests/simple/14_ctdb_statistics.sh index 9a95a83..e9ecce5 100755 --- a/tests/simple/14_ctdb_statistics.sh +++ b/tests/simple/14_ctdb_statistics.sh @@ -33,7 +33,7 @@ set -e cluster_is_healthy -pattern='^(CTDB version 1|Current time of statistics[[:space:]]*:.*|Statistics collected since[[:space:]]*:.*|Gathered statistics for [[:digit:]]+ nodes|[[:space:]]+[[:alpha:]_]+[[:space:]]+[[:digit:]]+|[[:space:]]+(node|client|timeouts)|[[:space:]]+([[:alpha:]_]+_latency|max_reclock_[[:alpha:]]+)[[:space:]]+[[:digit:]-]+\.[[:digit:]]+[[:space:]]sec)$' +pattern='^(CTDB version 1|Current time of statistics[[:space:]]*:.*|Statistics collected since[[:space:]]*:.*|Gathered statistics for [[:digit:]]+ nodes|[[:space:]]+[[:alpha:]_]+[[:space:]]+[[:digit:]]+|[[:space:]]+(node|client|timeouts)|[[:space:]]+([[:alpha:]_]+_latency|max_reclock_[[:alpha:]]+)[[:space:]]+[[:digit:]-]+\.[[:digit:]]+[[:space:]]sec|[[:space:]]*(reclock_ctdbd|reclock_recd|call_latency|lockwait_latency|childwrite_latency)[[:space:]]+MIN/AVG/MAX[[:space:]]+[-.[:digit:]]+/[-.[:digit:]]+/[-.[:digit:]]+ sec out of [[:digit:]]+)$' try_command_on_node -v 1 "$CTDB statistics" -- CTDB repository