The branch, master has been updated
       via  6f43896 ctdb-daemon: Debugging for tickle updates
       via  6a552f1 ctdb-tests: Try harder to avoid failures due to repeated recoveries
      from  364bdad messaging3: Make messaging_dgm_init return 0/errno
http://gitweb.samba.org/?p=samba.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 6f43896e1258c4cf43401cbfeba24a50de3c3140
Author: Martin Schwenke <mar...@meltin.net>
Date:   Wed Mar 5 16:21:45 2014 +1100

    ctdb-daemon: Debugging for tickle updates

    This was useful for debugging the race fixed by commit
    4f79fa6c7c843502fcdaa2dead534ea3719b9f69.  It might be useful again.

    Also fix a nearby comment typo.

    Signed-off-by: Martin Schwenke <mar...@meltin.net>
    Reviewed-by: Amitay Isaacs <ami...@gmail.com>

    Autobuild-User(master): Amitay Isaacs <ami...@samba.org>
    Autobuild-Date(master): Fri Jun 20 02:07:48 CEST 2014 on sn-devel-104

commit 6a552f1a12ebe43f946bbbee2a3846b5a640ae4f
Author: Martin Schwenke <mar...@meltin.net>
Date:   Tue Jun 10 15:16:44 2014 +1000

    ctdb-tests: Try harder to avoid failures due to repeated recoveries

    About a year ago a check was added to _cluster_is_healthy() to make
    sure that node 0 isn't in recovery.  This was meant to stop
    unexpected recoveries from causing test failures.  However, it was
    misguided: each test initially calls cluster_is_healthy(), so a test
    will now fail whenever an unexpected recovery occurs.

    Instead, have cluster_is_healthy() warn if the cluster is in recovery.

    Also:

    * Rename wait_until_healthy() to wait_until_ready(), because it
      waits until the cluster is both healthy and out of recovery.

    * Change the post-recovery sleep in restart_ctdb() to 2 seconds and
      add a loop that keeps waiting (2 seconds at a time) if the cluster
      goes back into recovery.  The logic here is that the re-recovery
      timeout has been set to 1 second, so sleeping for just 1 second
      might race against the next recovery.

    * Use reverse logic in node_has_status() so that it works for "all"
      (see the illustration before the node_has_status() hunks below).

    * Tweak wait_until() so that it can handle a timeout with a recheck
      interval specified (see the usage sketch after this log).

    Signed-off-by: Martin Schwenke <mar...@meltin.net>
    Reviewed-by: Amitay Isaacs <ami...@gmail.com>

-----------------------------------------------------------------------
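[Editor's note: the new T/I timeout format for wait_until() is easiest to
see with a short usage sketch.  This is illustrative only and not part of
the changeset; "some_check" is a hypothetical command.]

    wait_until 60 some_check      # plain timeout: up to 60s, recheck every 1s
    wait_until 30/2 some_check    # T/I format: up to 30s, recheck every 2s

The restart_ctdb() hunk at the end of the changeset uses exactly this form
to poll for the end of a repeated recovery:

    wait_until 30/2 onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered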

Summary of changes:
 ctdb/server/ctdb_takeover.c                 |   11 +++++-
 ctdb/tests/complex/34_nfs_tickle_restart.sh |    2 +-
 ctdb/tests/scripts/integration.bash         |   57 ++++++++++++++++++++-------
 3 files changed, 54 insertions(+), 16 deletions(-)


Changeset truncated at 500 lines:

diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c
index 6c21e2b..8449288 100644
--- a/ctdb/server/ctdb_takeover.c
+++ b/ctdb/server/ctdb_takeover.c
@@ -3230,7 +3230,7 @@ int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
 
 /*
-  Called when another daemon starts - caises all tickles for all
+  Called when another daemon starts - causes all tickles for all
   public addresses we are serving to be sent to the new node on the
   next check.  This actually causes the next scheduled call to
   tdb_update_tcp_tickles() to update all nodes.  This is simple and
@@ -3240,6 +3240,9 @@ int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
 {
 	struct ctdb_vnn *vnn;
 
+	DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
+			   (unsigned long) pnn));
+
 	for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
 		vnn->tcp_update_needed = true;
 	}
@@ -3908,6 +3911,9 @@ int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA ind
 		return -1;
 	}
 
+	DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
+			   ctdb_addr_to_str(&list->addr)));
+
 	vnn = find_public_ip_vnn(ctdb, &list->addr);
 	if (vnn == NULL) {
 		DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
@@ -4056,6 +4062,9 @@ static void ctdb_update_tcp_tickles(struct event_context *ev,
 			DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
 				ctdb_addr_to_str(&vnn->public_address)));
 		} else {
+			DEBUG(DEBUG_INFO,
+			      ("Sent tickle update for public address %s\n",
+			       ctdb_addr_to_str(&vnn->public_address)));
 			vnn->tcp_update_needed = false;
 		}
 	}
diff --git a/ctdb/tests/complex/34_nfs_tickle_restart.sh b/ctdb/tests/complex/34_nfs_tickle_restart.sh
index 93587e2..b7eea4c 100755
--- a/ctdb/tests/complex/34_nfs_tickle_restart.sh
+++ b/ctdb/tests/complex/34_nfs_tickle_restart.sh
@@ -79,7 +79,7 @@ try_command_on_node $rn $CTDB_TEST_WRAPPER restart_ctdb_1
 echo "Setting NoIPTakeover on node ${rn}"
 try_command_on_node $rn $CTDB setvar NoIPTakeover 1
 
-wait_until_healthy
+wait_until_ready
 
 echo "Getting TickleUpdateInterval..."
 try_command_on_node $test_node $CTDB getvar TickleUpdateInterval
diff --git a/ctdb/tests/scripts/integration.bash b/ctdb/tests/scripts/integration.bash
index 1ff02d5..dec60a2 100644
--- a/ctdb/tests/scripts/integration.bash
+++ b/ctdb/tests/scripts/integration.bash
@@ -205,11 +205,19 @@ select_test_node_and_ips ()
 #######################################
 
 # Wait until either timeout expires or command succeeds.  The command
-# will be tried once per second.
+# will be tried once per second, unless timeout has format T/I, where
+# I is the recheck interval.
 wait_until ()
 {
     local timeout="$1" ; shift # "$@" is the command...
 
+    local interval=1
+    case "$timeout" in
+	*/*)
+	    interval="${timeout#*/}"
+	    timeout="${timeout%/*}"
+    esac
+
     local negate=false
     if [ "$1" = "!" ] ; then
 	negate=true
@@ -227,9 +235,12 @@ wait_until ()
 	    echo "OK"
 	    return 0
 	fi
-	echo -n .
-	t=$(($t - 1))
-	sleep 1
+	local i
+	for i in $(seq 1 $interval) ; do
+	    echo -n .
+	done
+	t=$(($t - $interval))
+	sleep $interval
     done
 
     echo "*TIMEOUT*"
@@ -249,14 +260,26 @@ sleep_for ()
 
 _cluster_is_healthy ()
 {
-    $CTDB nodestatus all >/dev/null && \
-	node_has_status 0 recovered
+    $CTDB nodestatus all >/dev/null
+}
+
+_cluster_is_recovered ()
+{
+    node_has_status all recovered
+}
+
+_cluster_is_ready ()
+{
+    _cluster_is_healthy && _cluster_is_recovered
 }
 
 cluster_is_healthy ()
 {
     if onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
 	echo "Cluster is HEALTHY"
+	if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_recovered ; then
+	    echo "WARNING: cluster in recovery mode!"
+	fi
 	return 0
     else
 	echo "Cluster is UNHEALTHY"
@@ -272,13 +295,13 @@ cluster_is_healthy ()
     fi
 }
 
-wait_until_healthy ()
+wait_until_ready ()
 {
     local timeout="${1:-120}"
 
-    echo "Waiting for cluster to become healthy..."
+    echo "Waiting for cluster to become ready..."
 
-    wait_until $timeout onnode -q any $CTDB_TEST_WRAPPER _cluster_is_healthy
+    wait_until $timeout onnode -q any $CTDB_TEST_WRAPPER _cluster_is_ready
 }
 
 # This function is becoming nicely overloaded.  Soon it will collapse!  :-)
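[Editor's note: the next two hunks implement the "reverse logic" bullet
from the commit message.  A brief illustration, based only on the patterns
visible in these hunks and assuming "ctdb status -n all" prints one
"Recovery mode:..." line per queried node.  The old test

    $CTDB status -n all | egrep -q '^Recovery mode:NORMAL \(0\)$'

succeeds if ANY node reports NORMAL, so "node_has_status all recovered"
could pass during a partial recovery.  Matching the RECOVERY pattern and
negating the result

    ! $CTDB status -n all | egrep -q '^Recovery mode:RECOVERY \(1\)$'

succeeds only when NO node is in recovery, which gives the intended "all"
semantics.]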
@@ -303,7 +326,7 @@ node_has_status ()
 	(unfrozen)     fpat='^[[:space:]]+frozen[[:space:]]+0$' ;;
 	(monon)        mpat='^Monitoring mode:ACTIVE \(0\)$' ;;
 	(monoff)       mpat='^Monitoring mode:DISABLED \(1\)$' ;;
-	(recovered)    rpat='^Recovery mode:NORMAL \(0\)$' ;;
+	(recovered)    rpat='^Recovery mode:RECOVERY \(1\)$' ;;
 	*)
 	    echo "node_has_status: unknown status \"$status\""
 	    return 1
@@ -329,7 +352,7 @@ node_has_status ()
     elif [ -n "$mpat" ] ; then
 	$CTDB getmonmode -n "$pnn" | egrep -q "$mpat"
     elif [ -n "$rpat" ] ; then
-	$CTDB status -n "$pnn" | egrep -q "$rpat"
+	! $CTDB status -n "$pnn" | egrep -q "$rpat"
     else
 	echo 'node_has_status: unknown mode, neither $bits nor $fpat is set'
 	return 1
@@ -479,8 +502,8 @@ restart_ctdb ()
 	    continue
 	}
 
-	wait_until_healthy || {
-	    echo "Cluster didn't become healthy.  Restarting..."
+	wait_until_ready || {
+	    echo "Cluster didn't become ready.  Restarting..."
 	    continue
 	}
 
@@ -492,7 +515,13 @@ restart_ctdb ()
 	# help the cluster to stabilise before a subsequent test.
 	echo "Forcing a recovery..."
 	onnode -q 0 $CTDB recover
-	sleep_for 1
+	sleep_for 2
+
+	if ! onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered ; then
+	    echo "Cluster has gone into recovery again, waiting..."
+	    wait_until 30/2 onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered
+	fi
 
 	# Cluster is still healthy.  Good, we're done!
 	if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then


-- 
Samba Shared Repository