[SCM] CTDB repository - branch 1.0.69 updated - ctdb-1.0.69-8-gac55576

Ronnie Sahlberg Thu, 23 Apr 2009 21:47:35 -0700

The branch, 1.0.69 has been updated
       via  ac5557659e667da5f3a33cc612e06a21396fce2d (commit)
       via  968739ea9af2aaba90e6bb4569ae4d9fb60b79aa (commit)
       via  2031fd8b83bd832d1104128c3faddf209e6bb23e (commit)
       via  0563b3c6d9787164475538fbc1342f8c6be62eac (commit)
       via  3aba067b7fa7d77836901f82deee441a07b3a15d (commit)
       via  0a7dbe56d1b4545e9f45c29ea719d24b3b3ada3e (commit)
      from  767e92f64b63a6bcaa36efe633e8a3e55a803e79 (commit)


http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=1.0.69


- Log -----------------------------------------------------------------
commit ac5557659e667da5f3a33cc612e06a21396fce2d
Author: Ronnie Sahlberg <ronniesahlb...@gmail.com>
Date:   Fri Apr 24 14:43:48 2009 +1000

    new version 1.0.69-3

commit 968739ea9af2aaba90e6bb4569ae4d9fb60b79aa
Author: Ronnie Sahlberg <ronniesahlb...@gmail.com>
Date:   Fri Apr 24 14:41:21 2009 +1000

    Tweak some timeouts so that we do trigger a ban even if the control
    hangs or times out.

commit 2031fd8b83bd832d1104128c3faddf209e6bb23e
Author: Ronnie Sahlberg <ronniesahlb...@gmail.com>
Date:   Fri Apr 24 13:58:32 2009 +1000

    If we cannot pull a database from a node during recovery, mark this node
    as a "culprit" so that it will eventually become banned.

commit 0563b3c6d9787164475538fbc1342f8c6be62eac
Author: Ronnie Sahlberg <ronniesahlb...@gmail.com>
Date:   Thu Apr 23 09:31:44 2009 +1000

    new version 1.0.69-2

commit 3aba067b7fa7d77836901f82deee441a07b3a15d
Author: Ronnie Sahlberg <ronniesahlb...@gmail.com>
Date:   Mon Apr 6 12:00:22 2009 +1000

    We don't need to verify the nodemap on remote nodes that are banned.

commit 0a7dbe56d1b4545e9f45c29ea719d24b3b3ada3e
Author: Ronnie Sahlberg <ronniesahlb...@gmail.com>
Date:   Thu Apr 2 14:50:43 2009 +1100

    If we can't pull the remote nodemap off a node, we should mark it as a
    culprit so it eventually becomes banned.

-----------------------------------------------------------------------

Summary of changes:
 packaging/RPM/ctdb.spec |    8 +++++++-
 server/ctdb_recoverd.c  |   32 +++++++++++++++++++++++++++-----
 server/ctdb_tunables.c  |    4 ++--
 3 files changed, 36 insertions(+), 8 deletions(-)


Changeset truncated at 500 lines:

diff --git a/packaging/RPM/ctdb.spec b/packaging/RPM/ctdb.spec
index aaa8a07..8572ab0 100644
--- a/packaging/RPM/ctdb.spec
+++ b/packaging/RPM/ctdb.spec
@@ -5,7 +5,7 @@ Vendor: Samba Team
 Packager: Samba Team <sa...@samba.org>
 Name: ctdb
 Version: 1.0
-Release: 69_1
+Release: 69_3
 Epoch: 0
 License: GNU GPL version 3
 Group: System Environment/Daemons
@@ -121,6 +121,12 @@ fi
 %{_includedir}/ctdb_private.h
 
 %changelog
+* Fri Apr 24 2009 : Version 1.0.69_3
+ - Make sure that if during recovery a node is stuck and does not reply to
+   pull_db requests that we eventually ban this node from the recovery master.
+* Thu Apr 23 2009 : Version 1.0.69_2
+ - In the recovery daemon we dont need to check the nodemap status
+   of banned nodes.
 * Wed Feb 5 2009 : Version 1.0.69_1
  - Dont check the result of the modflags control, to allow compatibility
    with earlier versions of ctdb
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 540749d..2e460dd 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -244,6 +244,23 @@ static void ctdb_set_culprit(struct ctdb_recoverd *rec, 
uint32_t culprit)
        rec->culprit_counter++;
 }
 
+/*
+  remember the trouble maker
+ */
+static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t 
culprit, uint32_t count)
+{
+       struct ctdb_context *ctdb = rec->ctdb;
+
+       if (rec->last_culprit != culprit ||
+           timeval_elapsed(&rec->first_recover_time) > 
ctdb->tunable.recovery_grace_period) {
+               DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit));
+               /* either a new node is the culprit, or we've decided to 
forgive them */
+               rec->last_culprit = culprit;
+               rec->first_recover_time = timeval_current();
+               rec->culprit_counter = 0;
+       }
+       rec->culprit_counter += count;
+}
 
 /* this callback is called for every node that failed to execute the
    start recovery event
@@ -612,7 +629,9 @@ static int pull_one_remote_database(struct ctdb_context 
*ctdb, uint32_t srcnode,
 /*
   pull all the remote database contents into the recdb
  */
-static int pull_remote_database(struct ctdb_context *ctdb, struct 
ctdb_node_map *nodemap, 
+static int pull_remote_database(struct ctdb_context *ctdb,
+                               struct ctdb_recoverd *rec, 
+                               struct ctdb_node_map *nodemap, 
                                struct tdb_wrap *recdb, uint32_t dbid)
 {
        int j;
@@ -628,6 +647,7 @@ static int pull_remote_database(struct ctdb_context *ctdb, 
struct ctdb_node_map
                if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, 
recdb, dbid) != 0) {
                        DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote 
database from node %u\n", 
                                 nodemap->nodes[j].pnn));
+                       ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, 
nodemap->num);
                        return -1;
                }
        }
@@ -1244,7 +1264,7 @@ static int recover_database(struct ctdb_recoverd *rec,
        }
 
        /* pull all remote databases onto the recdb */
-       ret = pull_remote_database(ctdb, nodemap, recdb, dbid);
+       ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid);
        if (ret != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 
0x%x\n", dbid));
                return -1;
@@ -1326,9 +1346,9 @@ static int do_recovery(struct ctdb_recoverd *rec,
 
        if (rec->culprit_counter > 2*nodemap->num) {
                DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries in %.0f 
seconds - banning it for %u seconds\n",
-                        culprit, rec->culprit_counter, 
timeval_elapsed(&rec->first_recover_time),
+                        rec->last_culprit, rec->culprit_counter, 
timeval_elapsed(&rec->first_recover_time),
                         ctdb->tunable.recovery_ban_period));
-               ctdb_ban_node(rec, culprit, ctdb->tunable.recovery_ban_period);
+               ctdb_ban_node(rec, rec->last_culprit, 
ctdb->tunable.recovery_ban_period);
        }
 
        if (!ctdb_recovery_lock(ctdb, true)) {
@@ -2657,12 +2677,14 @@ again:
        /* verify that all other nodes have the same nodemap as we have
        */
        for (j=0; j<nodemap->num; j++) {
-               if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
                        continue;
                }
 
                if (remote_nodemaps[j] == NULL) {
                        DEBUG(DEBUG_ERR,(__location__ " Did not get a remote 
nodemap for node %d, restarting monitoring\n", j));
+                       ctdb_set_culprit(rec, j);
+
                        goto again;
                }
 
diff --git a/server/ctdb_tunables.c b/server/ctdb_tunables.c
index f758c2c..365865e 100644
--- a/server/ctdb_tunables.c
+++ b/server/ctdb_tunables.c
@@ -31,7 +31,7 @@ static const struct {
        { "KeepaliveInterval",    5,  offsetof(struct ctdb_tunable, 
keepalive_interval) },
        { "KeepaliveLimit",       5,  offsetof(struct ctdb_tunable, 
keepalive_limit) },
        { "MaxLACount",           7,  offsetof(struct ctdb_tunable, 
max_lacount) },
-       { "RecoverTimeout",      30,  offsetof(struct ctdb_tunable, 
recover_timeout) },
+       { "RecoverTimeout",      20,  offsetof(struct ctdb_tunable, 
recover_timeout) },
        { "RecoverInterval",      1,  offsetof(struct ctdb_tunable, 
recover_interval) },
        { "ElectionTimeout",      3,  offsetof(struct ctdb_tunable, 
election_timeout) },
        { "TakeoverTimeout",      5,  offsetof(struct ctdb_tunable, 
takeover_timeout) },
@@ -39,7 +39,7 @@ static const struct {
        { "TickleUpdateInterval",20,  offsetof(struct ctdb_tunable, 
tickle_update_interval) },
        { "EventScriptTimeout",  20,  offsetof(struct ctdb_tunable, 
script_timeout) },
        { "EventScriptBanCount",  5,  offsetof(struct ctdb_tunable, 
script_ban_count) },
-       { "RecoveryGracePeriod", 60,  offsetof(struct ctdb_tunable, 
recovery_grace_period) },
+       { "RecoveryGracePeriod", 120,  offsetof(struct ctdb_tunable, 
recovery_grace_period) },
        { "RecoveryBanPeriod",  300,  offsetof(struct ctdb_tunable, 
recovery_ban_period) },
        { "DatabaseHashSize", 10000,  offsetof(struct ctdb_tunable, 
database_hash_size) },
        { "DatabaseMaxDead",      5,  offsetof(struct ctdb_tunable, 
database_max_dead) },


-- 
CTDB repository

[SCM] CTDB repository - branch 1.0.69 updated - ctdb-1.0.69-8-gac55576

Reply via email to