[SCM] CTDB repository - branch 1.2-nodeflags updated - ctdb-1.9.1-232-gea77f1b

Ronnie Sahlberg Tue, 16 Nov 2010 17:06:41 -0800

The branch, 1.2-nodeflags has been updated
       via  ea77f1b2a161f194b8b152da3cd94c67cbdd7ed9 (commit)
       via  6d118faada8ffbde7f35e1a0b4d2df21ac67dec2 (commit)
       via  7b91e4fa1d00bcba860dfe502e83161a6d06de6b (commit)
       via  9f0e1e5e0b100c3ccfaa2e459b3249386ec27b44 (commit)
       via  d887c991d48f010b4512ada24b059161614bb168 (commit)
       via  10e005cd0eb5b350a22377629ecea87e876f6bfb (commit)
       via  138e2c912ae718ce024d7e342bc7808416aeec25 (commit)
      from  256b4f44d9de1b394d4f260f26fd11e2ff6adf8f (commit)


http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=1.2-nodeflags


- Log -----------------------------------------------------------------
commit ea77f1b2a161f194b8b152da3cd94c67cbdd7ed9
Author: Martin Schwenke <mar...@meltin.net>
Date:   Tue Aug 31 17:40:40 2010 +1000

    Eventscripts: make loadconfig() function hookable by the test suite.
    
    Rename loadconfig() to _loadconfig().  Add a new loadconfig() that
    simply calls _loadconfig().
    
    This makes it easy for the test suite to override loadconfig().
    
    Signed-off-by: Martin Schwenke <mar...@meltin.net>

commit 6d118faada8ffbde7f35e1a0b4d2df21ac67dec2
Author: Martin Schwenke <mar...@meltin.net>
Date:   Tue Nov 16 19:42:31 2010 +1100

    Make a time comparison in 60.nfs eventscript more readable.
    
    Signed-off-by: Martin Schwenke <mar...@meltin.net>

commit 7b91e4fa1d00bcba860dfe502e83161a6d06de6b
Author: Martin Schwenke <mar...@meltin.net>
Date:   Tue Nov 16 19:31:18 2010 +1100

    60.nfs only fails or warns after 10 consecutive nfsd/statd failures.
    
    These failures are sometimes the result of slow restarts so we want to
    avoid dirtying the logs or marking a node unhealthy because of them,
    unless they are excessive.
    
    For these 2 cases we use the existing fail counting code but hack a
    temporary service_name in a subshell to allow separate fail counts.
    
    We also update ctdb_check_rpc() so that it captures the error output
    from rpcinfo and we add a message including the service name to the
    beginning.  The error is printed to stdout but is also stored in
    ctdb_check_rpc_out to allow it to be conditionally used by the caller.
    This function also now returns non-zero rather than exiting on
    failure.
    
    Other direct rpcinfo calls are replaced by calls to ctdb_check_rpc()
    for consistency.
    
    Option handling code for service restarts is cleaned up so that it fits
    in 80 columns.  A more informative restart message is now used in all
    cases, printing the exact command being used to start a service.
    
    Signed-off-by: Martin Schwenke <mar...@meltin.net>

commit 9f0e1e5e0b100c3ccfaa2e459b3249386ec27b44
Author: Martin Schwenke <mar...@meltin.net>
Date:   Tue Oct 12 11:10:38 2010 +1100

    Test suite: fix typo in ctdb ping test grep pattern.
    
    Signed-off-by: Martin Schwenke <mar...@meltin.net>

commit d887c991d48f010b4512ada24b059161614bb168
Author: Martin Schwenke <mar...@meltin.net>
Date:   Wed Oct 6 16:32:22 2010 +1100

    Test suite: match changed output for ctdb ping to disconnected node.
    
    Signed-off-by: Martin Schwenke <mar...@meltin.net>

commit 10e005cd0eb5b350a22377629ecea87e876f6bfb
Author: Martin Schwenke <mar...@meltin.net>
Date:   Fri Oct 15 15:09:08 2010 +1100

    Test suite: make statistics test cope with changes to statistics output.
    
    Signed-off-by: Martin Schwenke <mar...@meltin.net>

commit 138e2c912ae718ce024d7e342bc7808416aeec25
Author: Ronnie Sahlberg <ronniesahlb...@gmail.com>
Date:   Mon Nov 15 16:30:44 2010 +1100

    initialize the statistics to the current time, not start of epoch.
    this makes "ctdb statistics" show correct "start of stats collection"

-----------------------------------------------------------------------

Summary of changes:
 config/events.d/60.nfs             |   77 ++++++++++++++++++++++--------------
 config/functions                   |   18 ++++++--
 server/ctdb_statistics.c           |    1 +
 tests/simple/09_ctdb_ping.sh       |    2 +-
 tests/simple/14_ctdb_statistics.sh |    2 +-
 5 files changed, 63 insertions(+), 37 deletions(-)


Changeset truncated at 500 lines:

diff --git a/config/events.d/60.nfs b/config/events.d/60.nfs
index 57c81d3..038adbb 100755
--- a/config/events.d/60.nfs
+++ b/config/events.d/60.nfs
@@ -51,24 +51,43 @@ case "$1" in
 
        # check that statd responds to rpc requests
        # if statd is not running we try to restart it
-       rpcinfo -u localhost 100024 1 > /dev/null || {
-               RPCSTATDOPTS=""
-               [ -n "$STATD_HOSTNAME" ] && RPCSTATDOPTS="$RPCSTATDOPTS -n 
$STATD_HOSTNAME"
-               [ -n "$STATD_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -p 
$STATD_PORT"
-               [ -n "$STATD_OUTGOING_PORT" ] && RPCSTATDOPTS="$RPCSTATDOPTS -o 
$STATD_OUTGOING_PORT"
-               rpc.statd $RPCSTATDOPTS 
-               echo "ERROR: STATD is not responding. Trying to restart it. 
[rpc.statd $RPCSTATDOPTS]"
-       }
+       if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then
+               (service_name="nfs_statd"; ctdb_counter_init)
+       else
+               p="rpc.statd" ; cmd="$p"
+               cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+               cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}"
+               cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+               (
+                       service_name="nfs_statd"
+                       ctdb_counter_incr
+                       ctdb_check_counter_limit 10 quiet >/dev/null
+               ) || {
+                       echo "$ctdb_check_rpc_out"
+                       echo "Trying to restart STATD [$cmd]"
+               }
+               $cmd
+       fi
 
 
        # check that NFS responds to rpc requests
        [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || {
-           (ctdb_check_rpc "NFS" 100003 3)
-           [ $? = "0" ] || {
-               echo "Trying to restart NFS service"
-               startstop_nfs restart
-               exit 1
-           }
+           if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then
+               (service_name="nfs_knfsd"; ctdb_counter_init)
+           else
+               (
+                       service_name="nfs_knfsd"
+                       ctdb_counter_incr
+                       ctdb_check_counter_limit 10 quiet >/dev/null
+               ) || {
+                       echo "$ctdb_check_rpc_out"
+                       echo "Trying to restart NFS service"
+                       startstop_nfs restart
+                       exit 1
+               }
+               # we haven't hit the failure limit so restart quietly
+               startstop_nfs restart >/dev/null 2>&1 &
+           fi
        }
 
        # and that its directories are available
@@ -79,8 +98,7 @@ case "$1" in
        } || exit $?
 
        # check that lockd responds to rpc requests
-       (ctdb_check_rpc "lockd" 100021 1)
-       [ $? = "0" ] || {
+       ctdb_check_rpc "LOCKD" 100021 1 || {
                echo "Trying to restart lock manager service"
                startstop_nfs restart
                startstop_nfslock restart
@@ -89,31 +107,30 @@ case "$1" in
 
        # mount needs special handling since it is sometimes not started
        # correctly on RHEL5
-       rpcinfo -u localhost 100005 1 > /dev/null || {
-               echo "ERROR: MOUNTD is not running. Trying to restart it."
-               RPCMOUNTDOPTS=""
-               [ -n "$MOUNTD_PORT" ] && RPCMOUNTDOPTS="$RPCMOUNTDOPTS -p 
$MOUNTD_PORT"
-               killall -q -9 rpc.mountd
-               rpc.mountd $RPCMOUNTDOPTS &
+       ctdb_check_rpc "MOUNTD" 100005 1 || {
+               p="rpc.mountd"
+               cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+               echo "Trying to restart MOUNTD [${cmd}]"
+               killall -q -9 $p
+               $cmd &
                exit 1
        }
        # rquotad needs special handling since it is sometimes not started
        # correctly on RHEL5
        # this is not a critical service so we dont flag the node as unhealthy
-       rpcinfo -u localhost 100011 1 > /dev/null || {
-               echo "ERROR: RQUOTAD is not running. Trying to restart it."
-               RPCRQUOTADOPTS=""
-               [ -n "$RQUOTAD_PORT" ] && RPCRQUOTADOPTS="$RPCRQUOTADOPTS -p 
$RQUOTAD_PORT"
-               killall -q -9 rpc.rquotad
-               rpc.rquotad $RPCRQUOTADOPTS &
+       ctdb_check_rpc "RQUOTAD" 100011 1 || {
+               p="rpc.rquotad"
+               cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+               echo "Trying to restart RQUOTAD [${cmd}]"
+               killall -q -9 $p
+               $cmd &
        }
 
        # once every 60 seconds, update the statd state database for which
        # clients need notifications
        LAST_UPDATE=`stat --printf="%Y" $CTDB_VARDIR/state/statd/update-trigger`
        CURRENT_TIME=`date +"%s"`
-       expr "$CURRENT_TIME" ">" "(" "$LAST_UPDATE" "+" "60" ")" >/dev/null 
2>/dev/null
-       [ $? = "0" ] && {
+       [ $CURRENT_TIME -ge $(($LAST_UPDATE + 60)) ] && {
            mkdir -p $CTDB_VARDIR/state/statd
            touch $CTDB_VARDIR/state/statd/update-trigger
            $CTDB_BASE/statd-callout updatelocal &
diff --git a/config/functions b/config/functions
index 9659d48..610085b 100755
--- a/config/functions
+++ b/config/functions
@@ -4,7 +4,7 @@ PATH=/bin:/usr/bin:/usr/sbin:/sbin:$PATH
 
 #######################################
 # pull in a system config file, if any
-loadconfig() {
+_loadconfig() {
 
     if [ -z "$1" ] ; then
        foo="${service_config:-${service_name}}"
@@ -25,6 +25,10 @@ loadconfig() {
     fi
 }
 
+loadconfig () {
+    _loadconfig "$@"
+}
+
 ##############################################################
 # determine on what type of system (init style) we are running
 detect_init_style() {
@@ -144,10 +148,14 @@ ctdb_check_rpc() {
     progname="$1"
     prognum="$2"
     version="$3"
-    rpcinfo -u localhost $prognum $version > /dev/null || {
-           echo "ERROR: $progname not responding to rpc requests"
-           exit 1
-    }
+
+    ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
+    if [ $? -ne 0 ] ; then
+       ctdb_check_rpc_out="ERROR: $progname failed RPC check:
+$ctdb_check_rpc_out"
+       echo "$ctdb_check_rpc_out"
+       return 1
+    fi
 }
 
 ######################################################
diff --git a/server/ctdb_statistics.c b/server/ctdb_statistics.c
index aec17ad..29e6d6a 100644
--- a/server/ctdb_statistics.c
+++ b/server/ctdb_statistics.c
@@ -42,6 +42,7 @@ static void ctdb_statistics_update(struct event_context *ev, 
struct timed_event
 int ctdb_statistics_init(struct ctdb_context *ctdb)
 {
        bzero(&ctdb->statistics, sizeof(struct ctdb_statistics));
+       ctdb->statistics.statistics_start_time = timeval_current();
 
        bzero(&ctdb->statistics_current, sizeof(struct ctdb_statistics));
        ctdb->statistics_current.statistics_start_time = timeval_current();
diff --git a/tests/simple/09_ctdb_ping.sh b/tests/simple/09_ctdb_ping.sh
index 6ca50d4..6610431 100755
--- a/tests/simple/09_ctdb_ping.sh
+++ b/tests/simple/09_ctdb_ping.sh
@@ -53,5 +53,5 @@ try_command_on_node -v 0 "! $CTDB ping -n 1"
 
 sanity_check_output \
     1 \
-    "(: ctdb_control error: 'ctdb_control to disconnected node'|Unable to get 
ping response from node 1|Node 1 is DISCONNECTED)" \
+    "(: ctdb_control error: ('ctdb_control to disconnected node'|'node is 
disconnected')|Unable to get ping response from node 1|Node 1 is 
DISCONNECTED|ctdb_control for getpnn failed|: Can not access node. Node is not 
operational\.)" \
     "$out"
diff --git a/tests/simple/14_ctdb_statistics.sh 
b/tests/simple/14_ctdb_statistics.sh
index 9a95a83..e9ecce5 100755
--- a/tests/simple/14_ctdb_statistics.sh
+++ b/tests/simple/14_ctdb_statistics.sh
@@ -33,7 +33,7 @@ set -e
 
 cluster_is_healthy
 
-pattern='^(CTDB version 1|Current time of statistics[[:space:]]*:.*|Statistics 
collected since[[:space:]]*:.*|Gathered statistics for [[:digit:]]+ 
nodes|[[:space:]]+[[:alpha:]_]+[[:space:]]+[[:digit:]]+|[[:space:]]+(node|client|timeouts)|[[:space:]]+([[:alpha:]_]+_latency|max_reclock_[[:alpha:]]+)[[:space:]]+[[:digit:]-]+\.[[:digit:]]+[[:space:]]sec)$'
+pattern='^(CTDB version 1|Current time of statistics[[:space:]]*:.*|Statistics 
collected since[[:space:]]*:.*|Gathered statistics for [[:digit:]]+ 
nodes|[[:space:]]+[[:alpha:]_]+[[:space:]]+[[:digit:]]+|[[:space:]]+(node|client|timeouts)|[[:space:]]+([[:alpha:]_]+_latency|max_reclock_[[:alpha:]]+)[[:space:]]+[[:digit:]-]+\.[[:digit:]]+[[:space:]]sec|[[:space:]]*(reclock_ctdbd|reclock_recd|call_latency|lockwait_latency|childwrite_latency)[[:space:]]+MIN/AVG/MAX[[:space:]]+[-.[:digit:]]+/[-.[:digit:]]+/[-.[:digit:]]+
 sec out of [[:digit:]]+)$'
 
 try_command_on_node -v 1 "$CTDB statistics"
 


-- 
CTDB repository

[SCM] CTDB repository - branch 1.2-nodeflags updated - ctdb-1.9.1-232-gea77f1b

Reply via email to