The branch, 1.0.69, has been updated via ac5557659e667da5f3a33cc612e06a21396fce2d (commit) via 968739ea9af2aaba90e6bb4569ae4d9fb60b79aa (commit) via 2031fd8b83bd832d1104128c3faddf209e6bb23e (commit) via 0563b3c6d9787164475538fbc1342f8c6be62eac (commit) via 3aba067b7fa7d77836901f82deee441a07b3a15d (commit) via 0a7dbe56d1b4545e9f45c29ea719d24b3b3ada3e (commit) from 767e92f64b63a6bcaa36efe633e8a3e55a803e79 (commit)
http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=1.0.69 - Log ----------------------------------------------------------------- commit ac5557659e667da5f3a33cc612e06a21396fce2d Author: Ronnie Sahlberg <ronniesahlb...@gmail.com> Date: Fri Apr 24 14:43:48 2009 +1000 new version 1.0.69-3 commit 968739ea9af2aaba90e6bb4569ae4d9fb60b79aa Author: Ronnie Sahlberg <ronniesahlb...@gmail.com> Date: Fri Apr 24 14:41:21 2009 +1000 tweak some timeouts so that we do trigger a banning even if the control hangs/times out commit 2031fd8b83bd832d1104128c3faddf209e6bb23e Author: Ronnie Sahlberg <ronniesahlb...@gmail.com> Date: Fri Apr 24 13:58:32 2009 +1000 If we cannot pull a database from a node during recovery, mark this node as a "culprit" so that it will eventually become banned. commit 0563b3c6d9787164475538fbc1342f8c6be62eac Author: Ronnie Sahlberg <ronniesahlb...@gmail.com> Date: Thu Apr 23 09:31:44 2009 +1000 new version 1.0.69-2 commit 3aba067b7fa7d77836901f82deee441a07b3a15d Author: Ronnie Sahlberg <ronniesahlb...@gmail.com> Date: Mon Apr 6 12:00:22 2009 +1000 We don't need to verify the nodemap on remote nodes that are banned commit 0a7dbe56d1b4545e9f45c29ea719d24b3b3ada3e Author: Ronnie Sahlberg <ronniesahlb...@gmail.com> Date: Thu Apr 2 14:50:43 2009 +1100 If we can't pull the remote nodemap off a node we should mark it as a culprit so it eventually becomes banned. 
----------------------------------------------------------------------- Summary of changes: packaging/RPM/ctdb.spec | 8 +++++++- server/ctdb_recoverd.c | 32 +++++++++++++++++++++++++++----- server/ctdb_tunables.c | 4 ++-- 3 files changed, 36 insertions(+), 8 deletions(-) Changeset truncated at 500 lines: diff --git a/packaging/RPM/ctdb.spec b/packaging/RPM/ctdb.spec index aaa8a07..8572ab0 100644 --- a/packaging/RPM/ctdb.spec +++ b/packaging/RPM/ctdb.spec @@ -5,7 +5,7 @@ Vendor: Samba Team Packager: Samba Team <sa...@samba.org> Name: ctdb Version: 1.0 -Release: 69_1 +Release: 69_3 Epoch: 0 License: GNU GPL version 3 Group: System Environment/Daemons @@ -121,6 +121,12 @@ fi %{_includedir}/ctdb_private.h %changelog +* Fri Apr 24 2009 : Version 1.0.69_3 + - Make sure that if during recovery a node is stuck and does not reply to + pull_db requests that we eventually ban this node from the recovery master. +* Thu Apr 23 2009 : Version 1.0.69_2 + - In the recovery daemon we dont need to check the nodemap status + of banned nodes. 
* Wed Feb 5 2009 : Version 1.0.69_1 - Dont check the result of the modflags control, to allow compatibility with earlier versions of ctdb diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c index 540749d..2e460dd 100644 --- a/server/ctdb_recoverd.c +++ b/server/ctdb_recoverd.c @@ -244,6 +244,23 @@ static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit) rec->culprit_counter++; } +/* + remember the trouble maker + */ +static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count) +{ + struct ctdb_context *ctdb = rec->ctdb; + + if (rec->last_culprit != culprit || + timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) { + DEBUG(DEBUG_NOTICE,("New recovery culprit %u\n", culprit)); + /* either a new node is the culprit, or we've decided to forgive them */ + rec->last_culprit = culprit; + rec->first_recover_time = timeval_current(); + rec->culprit_counter = 0; + } + rec->culprit_counter += count; +} /* this callback is called for every node that failed to execute the start recovery event @@ -612,7 +629,9 @@ static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode, /* pull all the remote database contents into the recdb */ -static int pull_remote_database(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, +static int pull_remote_database(struct ctdb_context *ctdb, + struct ctdb_recoverd *rec, + struct ctdb_node_map *nodemap, struct tdb_wrap *recdb, uint32_t dbid) { int j; @@ -628,6 +647,7 @@ static int pull_remote_database(struct ctdb_context *ctdb, struct ctdb_node_map if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) { DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n", nodemap->nodes[j].pnn)); + ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num); return -1; } } @@ -1244,7 +1264,7 @@ static int recover_database(struct ctdb_recoverd *rec, } /* pull all remote databases onto the recdb */ 
- ret = pull_remote_database(ctdb, nodemap, recdb, dbid); + ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid); if (ret != 0) { DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid)); return -1; @@ -1326,9 +1346,9 @@ static int do_recovery(struct ctdb_recoverd *rec, if (rec->culprit_counter > 2*nodemap->num) { DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds\n", - culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time), + rec->last_culprit, rec->culprit_counter, timeval_elapsed(&rec->first_recover_time), ctdb->tunable.recovery_ban_period)); - ctdb_ban_node(rec, culprit, ctdb->tunable.recovery_ban_period); + ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period); } if (!ctdb_recovery_lock(ctdb, true)) { @@ -2657,12 +2677,14 @@ again: /* verify that all other nodes have the same nodemap as we have */ for (j=0; j<nodemap->num; j++) { - if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) { + if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) { continue; } if (remote_nodemaps[j] == NULL) { DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j)); + ctdb_set_culprit(rec, j); + goto again; } diff --git a/server/ctdb_tunables.c b/server/ctdb_tunables.c index f758c2c..365865e 100644 --- a/server/ctdb_tunables.c +++ b/server/ctdb_tunables.c @@ -31,7 +31,7 @@ static const struct { { "KeepaliveInterval", 5, offsetof(struct ctdb_tunable, keepalive_interval) }, { "KeepaliveLimit", 5, offsetof(struct ctdb_tunable, keepalive_limit) }, { "MaxLACount", 7, offsetof(struct ctdb_tunable, max_lacount) }, - { "RecoverTimeout", 30, offsetof(struct ctdb_tunable, recover_timeout) }, + { "RecoverTimeout", 20, offsetof(struct ctdb_tunable, recover_timeout) }, { "RecoverInterval", 1, offsetof(struct ctdb_tunable, recover_interval) }, { "ElectionTimeout", 3, offsetof(struct ctdb_tunable, election_timeout) }, { "TakeoverTimeout", 
5, offsetof(struct ctdb_tunable, takeover_timeout) }, @@ -39,7 +39,7 @@ static const struct { { "TickleUpdateInterval",20, offsetof(struct ctdb_tunable, tickle_update_interval) }, { "EventScriptTimeout", 20, offsetof(struct ctdb_tunable, script_timeout) }, { "EventScriptBanCount", 5, offsetof(struct ctdb_tunable, script_ban_count) }, - { "RecoveryGracePeriod", 60, offsetof(struct ctdb_tunable, recovery_grace_period) }, + { "RecoveryGracePeriod", 120, offsetof(struct ctdb_tunable, recovery_grace_period) }, { "RecoveryBanPeriod", 300, offsetof(struct ctdb_tunable, recovery_ban_period) }, { "DatabaseHashSize", 10000, offsetof(struct ctdb_tunable, database_hash_size) }, { "DatabaseMaxDead", 5, offsetof(struct ctdb_tunable, database_max_dead) }, -- CTDB repository