The branch, master has been updated
  via c386f2c62f06f1c60047b7d4b1ec7a9eec11873c (commit)
  via 80b8889267339b870868841ff077e850bc5b52e2 (commit)
  via 93df096773c89f21f77b3bcf9aa90bf28881b852 (commit)
  via 942f44123350d4d0c4ad7f3fcd5ff2d0d175739b (commit)
  from 1261f3d9702800a4e59550c881350daf479f00ef (commit)

http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master

- Log -----------------------------------------------------------------
commit c386f2c62f06f1c60047b7d4b1ec7a9eec11873c
Author: Stefan Metzmacher <me...@samba.org>
Date: Tue Aug 31 09:28:34 2010 +0200

    server/banning: also release all ips if we're banning ourself

    metze

commit 80b8889267339b870868841ff077e850bc5b52e2
Author: Stefan Metzmacher <me...@samba.org>
Date: Mon Aug 30 18:25:28 2010 +0200

    server/recoverd: if we can't get the recovery lock, ban ourself

    metze

commit 93df096773c89f21f77b3bcf9aa90bf28881b852
Author: Stefan Metzmacher <me...@samba.org>
Date: Tue Aug 31 08:42:32 2010 +0200

    server/recoverd: do takeover_run after verifying the reclock file

    metze

commit 942f44123350d4d0c4ad7f3fcd5ff2d0d175739b
Author: Stefan Metzmacher <me...@samba.org>
Date: Tue Aug 24 09:22:49 2010 +0200

    server/monitor: ask for a takeoverrun after propagating our new flags

    metze

-----------------------------------------------------------------------

Summary of changes:
 include/ctdb_private.h | 1 +
 server/ctdb_banning.c | 30 +++++++++++++++++++++++++++++-
 server/ctdb_monitor.c | 48 +++++++++++++++---------------------------------
 server/ctdb_recoverd.c | 15 +++++++++------
 4 files changed, 54 insertions(+), 40 deletions(-)

Changeset truncated at 500 lines:

diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index b707afd..89b8f08 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -1291,6 +1291,7 @@ int ctdb_vacuum_init(struct ctdb_db_context *ctdb_db); int32_t ctdb_control_enable_script(struct ctdb_context *ctdb, TDB_DATA indata); int32_t ctdb_control_disable_script(struct ctdb_context *ctdb, TDB_DATA indata); +int32_t ctdb_local_node_got_banned(struct ctdb_context *ctdb); int32_t ctdb_control_set_ban_state(struct ctdb_context *ctdb, TDB_DATA indata); int32_t ctdb_control_get_ban_state(struct ctdb_context *ctdb, TDB_DATA *outdata); int32_t ctdb_control_set_db_priority(struct ctdb_context *ctdb, 
TDB_DATA indata); diff --git a/server/ctdb_banning.c b/server/ctdb_banning.c index 3d5f216..5684907 100644 --- a/server/ctdb_banning.c +++ b/server/ctdb_banning.c @@ -42,6 +42,31 @@ ctdb_ban_node_event(struct event_context *ev, struct timed_event *te, } } +int32_t ctdb_local_node_got_banned(struct ctdb_context *ctdb) +{ + uint32_t i; + + /* make sure we are frozen */ + DEBUG(DEBUG_NOTICE,("This node has been banned - forcing freeze and recovery\n")); + + /* Reset the generation id to 1 to make us ignore any + REQ/REPLY CALL/DMASTER someone sends to us. + We are now banned so we shouldnt service database calls + anymore. + */ + ctdb->vnn_map->generation = INVALID_GENERATION; + + for (i=1; i<=NUM_DB_PRIORITIES; i++) { + if (ctdb_start_freeze(ctdb, i) != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to freeze db priority %u\n", i)); + } + } + ctdb_release_all_ips(ctdb); + ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; + + return 0; +} + int32_t ctdb_control_set_ban_state(struct ctdb_context *ctdb, TDB_DATA indata) { struct ctdb_ban_time *bantime = (struct ctdb_ban_time *)indata.dptr; @@ -96,7 +121,10 @@ int32_t ctdb_control_set_ban_state(struct ctdb_context *ctdb, TDB_DATA indata) ctdb->nodes[bantime->pnn]->flags |= NODE_FLAGS_BANNED; event_add_timed(ctdb->ev, ctdb->banning_ctx, timeval_current_ofs(bantime->time,0), ctdb_ban_node_event, ctdb); - + if (bantime->pnn == ctdb->pnn) { + return ctdb_local_node_got_banned(ctdb); + } + return 0; } diff --git a/server/ctdb_monitor.c b/server/ctdb_monitor.c index 7f5da5c..dff6f42 100644 --- a/server/ctdb_monitor.c +++ b/server/ctdb_monitor.c @@ -114,6 +114,7 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p) int ret; TDB_DATA rddata; struct takeover_run_reply rd; + const char *state_str = NULL; c.pnn = ctdb->pnn; c.old_flags = node->flags; @@ -141,28 +142,12 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p) ctdb->monitor->next_interval = 5; 
ctdb_run_notification_script(ctdb, "unhealthy"); - - /* ask the recmaster to reallocate all addresses */ - DEBUG(DEBUG_ERR,("Node became UNHEALTHY. Ask recovery master %u to perform ip reallocation\n", ctdb->recovery_master)); - ret = ctdb_daemon_send_message(ctdb, ctdb->recovery_master, CTDB_SRVID_TAKEOVER_RUN, rddata); - if (ret != 0) { - DEBUG(DEBUG_ERR,(__location__ " Failed to send ip takeover run request message to %u\n", ctdb->recovery_master)); - } - } else if (status == 0 && (node->flags & NODE_FLAGS_UNHEALTHY)) { DEBUG(DEBUG_NOTICE,("monitor event OK - node re-enabled\n")); node->flags &= ~NODE_FLAGS_UNHEALTHY; ctdb->monitor->next_interval = 5; ctdb_run_notification_script(ctdb, "healthy"); - - /* ask the recmaster to reallocate all addresses */ - DEBUG(DEBUG_ERR,("Node became HEALTHY. Ask recovery master %u to perform ip reallocation\n", ctdb->recovery_master)); - ret = ctdb_daemon_send_message(ctdb, ctdb->recovery_master, CTDB_SRVID_TAKEOVER_RUN, rddata); - if (ret != 0) { - DEBUG(DEBUG_ERR,(__location__ " Failed to send ip takeover run request message to %u\n", ctdb->recovery_master)); - } - } after_change_status: @@ -190,6 +175,19 @@ after_change_status: ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_PUSH_NODE_FLAGS, data); + if (c.new_flags & NODE_FLAGS_UNHEALTHY) { + state_str = "UNHEALTHY"; + } else { + state_str = "HEALTHY"; + } + + /* ask the recmaster to reallocate all addresses */ + DEBUG(DEBUG_ERR,("Node became %s. 
Ask recovery master %u to perform ip reallocation\n", + state_str, ctdb->recovery_master)); + ret = ctdb_daemon_send_message(ctdb, ctdb->recovery_master, CTDB_SRVID_TAKEOVER_RUN, rddata); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to send ip takeover run request message to %u\n", ctdb->recovery_master)); + } } @@ -433,7 +431,6 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)indata.dptr; struct ctdb_node *node; uint32_t old_flags; - int i; if (c->pnn >= ctdb->num_nodes) { DEBUG(DEBUG_ERR,(__location__ " Node %d is invalid, num_nodes :%d\n", c->pnn, ctdb->num_nodes)); @@ -483,22 +480,7 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) /* if we have become banned, we should go into recovery mode */ if ((node->flags & NODE_FLAGS_BANNED) && !(c->old_flags & NODE_FLAGS_BANNED) && (node->pnn == ctdb->pnn)) { - /* make sure we are frozen */ - DEBUG(DEBUG_NOTICE,("This node has been banned - forcing freeze and recovery\n")); - /* Reset the generation id to 1 to make us ignore any - REQ/REPLY CALL/DMASTER someone sends to us. - We are now banned so we shouldnt service database calls - anymore. 
- */ - ctdb->vnn_map->generation = INVALID_GENERATION; - - for (i=1; i<=NUM_DB_PRIORITIES; i++) { - if (ctdb_start_freeze(ctdb, i) != 0) { - DEBUG(DEBUG_ERR,(__location__ " Failed to freeze db priority %u\n", i)); - } - } - ctdb_release_all_ips(ctdb); - ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE; + return ctdb_local_node_got_banned(ctdb); } return 0; diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c index 437e4cb..30c34b3 100644 --- a/server/ctdb_recoverd.c +++ b/server/ctdb_recoverd.c @@ -1382,8 +1382,10 @@ static int do_recovery(struct ctdb_recoverd *rec, DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n")); start_time = timeval_current(); if (!ctdb_recovery_lock(ctdb, true)) { - ctdb_set_culprit(rec, pnn); - DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n")); + DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery " + "and ban ourself for %u seconds\n", + ctdb->tunable.recovery_ban_period)); + ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period); return -1; } ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time)); @@ -3009,10 +3011,6 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, rec->reallocate_callers = NULL; } } - /* if there are takeovers requested, perform it and notify the waiters */ - if (rec->reallocate_callers) { - process_ipreallocate_requests(ctdb, rec); - } if (rec->recmaster == (uint32_t)-1) { DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n")); @@ -3199,6 +3197,11 @@ static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, } } + /* if there are takeovers requested, perform it and notify the waiters */ + if (rec->reallocate_callers) { + process_ipreallocate_requests(ctdb, rec); + } + /* get the nodemap for all active remote nodes */ remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num); -- CTDB repository