The branch, master has been updated via 05f785b51cfd8b22b3ae35bf034127fbc07005be (commit) via 0b7257642f62ebd83c05b6e2922f0dc2737f175c (commit) from b5a8791268e938d7e017056e0e2bd2cbec1fa690 (commit)
http://gitweb.samba.org/?p=ctdb.git;a=shortlog;h=master - Log ----------------------------------------------------------------- commit 05f785b51cfd8b22b3ae35bf034127fbc07005be Author: Martin Schwenke <mar...@meltin.net> Date: Tue Apr 30 17:22:23 2013 +1000 ctdbd: Avoid freeing non-monitor event callback when monitoring is disabled When running a non-monitor event, check is made for any active monitor events. If there is an active monitor event, then the active monitor event is cancelled. This is done by freeing state->callback which is allocated from monitor_context. When CTDB is stopped or shutdown, monitoring is disabled by freeing monitor_context, which frees callback and then stopped or shutdown event is run. This creates a new callback structure which is allocated at the exact same memory location as the monitor callback which was freed. So in the check for active monitor events, it frees the new callback for non-monitor event. Since the callback function flags successful completion of that event, it is never marked complete and CTDB is stuck in a loop waiting for completion. Move the monitor cancellation to the top of the function so that this can't happen. Follow log snippest highlights the problem. 2013/04/30 16:54:10.673807 [21505]: Received SHUTDOWN command. Stopping CTDB daemon. 2013/04/30 16:54:10.673814 [21505]: Shutting down recovery daemon 2013/04/30 16:54:10.673852 [21505]: server/eventscript.c:696 in remove_callback 0x1c6d5c0 2013/04/30 16:54:10.673858 [21505]: Monitoring has been stopped 2013/04/30 16:54:10.673899 [21505]: server/eventscript.c:594 Sending SIGTERM to child pid:23847 2013/04/30 16:54:10.673913 [21505]: server/eventscript.c:629 searching for callback 0x1c6d5c0 2013/04/30 16:54:10.673932 [21505]: server/eventscript.c:641 running callback 2013/04/30 16:54:10.673939 [21505]: server/eventscript.c:866 in event_script_callback 2013/04/30 16:54:10.673946 [21505]: server/eventscript.c:696 in remove_callback 0x1c6d5c0 Signed-off-by: Martin Schwenke <mar...@meltin.net> Pair-programmed-with: Amitay Isaacs <ami...@gmail.com> commit 0b7257642f62ebd83c05b6e2922f0dc2737f175c Author: Martin Schwenke <mar...@meltin.net> Date: Thu Feb 21 10:43:35 2013 +1100 recoverd: Interface reference count changes should not cause takeover runs At the moment a naive compare of the all the interface data is done. So, if any IPs move then the reference counts for the the relevant interfaces change, interfaces appear to have changed and another takeover run is initiated by each node that took/released IPs. This change stops the spurious takeover runs by changing the interface comparison to ignore the reference counts. Signed-off-by: Martin Schwenke <mar...@meltin.net> ----------------------------------------------------------------------- Summary of changes: server/ctdb_recoverd.c | 70 ++++++++++++++++++++++++++++++++--------------- server/eventscript.c | 61 ++++++++++++++++++++--------------------- 2 files changed, 77 insertions(+), 54 deletions(-) Changeset truncated at 500 lines: diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c index 2cfa9b9..c3a1852 100644 --- a/server/ctdb_recoverd.c +++ b/server/ctdb_recoverd.c @@ -2855,17 +2855,61 @@ static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ct return status; } +static bool interfaces_have_changed(struct ctdb_context *ctdb, + struct ctdb_recoverd *rec) +{ + struct ctdb_control_get_ifaces *ifaces = NULL; + TALLOC_CTX *mem_ctx; + bool ret = false; + + mem_ctx = talloc_new(NULL); + + /* Read the interfaces from the local node */ + if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), + CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) { + DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn)); + /* We could return an error. However, this will be + * rare so we'll decide that the interfaces have + * actually changed, just in case. + */ + talloc_free(mem_ctx); + return true; + } + + if (!rec->ifaces) { + /* We haven't been here before so things have changed */ + ret = true; + } else if (rec->ifaces->num != ifaces->num) { + /* Number of interfaces has changed */ + ret = true; + } else { + /* See if interface names or link states have changed */ + int i; + for (i = 0; i < rec->ifaces->num; i++) { + struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i]; + if (strcmp(iface->name, ifaces->ifaces[i].name) != 0 || + iface->link_state != ifaces->ifaces[i].link_state) { + ret = true; + break; + } + } + } + + talloc_free(rec->ifaces); + rec->ifaces = talloc_steal(rec, ifaces); + + talloc_free(mem_ctx); + return ret; +} /* called to check that the local allocation of public ip addresses is ok. */ static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap) { TALLOC_CTX *mem_ctx = talloc_new(NULL); - struct ctdb_control_get_ifaces *ifaces = NULL; struct ctdb_uptime *uptime1 = NULL; struct ctdb_uptime *uptime2 = NULL; int ret, j; - bool need_iface_check = false; bool need_takeover_run = false; ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(), @@ -2876,27 +2920,7 @@ static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_rec return -1; } - - /* read the interfaces from the local node */ - ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces); - if (ret != 0) { - DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn)); - talloc_free(mem_ctx); - return -1; - } - - if (!rec->ifaces) { - need_iface_check = true; - } else if (rec->ifaces->num != ifaces->num) { - need_iface_check = true; - } else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) { - need_iface_check = true; - } - - talloc_free(rec->ifaces); - rec->ifaces = talloc_steal(rec, ifaces); - - if (need_iface_check) { + if (interfaces_have_changed(ctdb, rec)) { DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on " "local node %u - force takeover run\n", pnn)); diff --git a/server/eventscript.c b/server/eventscript.c index 817cb0c..41bf21d 100644 --- a/server/eventscript.c +++ b/server/eventscript.c @@ -707,36 +707,6 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb, { struct ctdb_event_script_state *state; - state = talloc(ctdb->event_script_ctx, struct ctdb_event_script_state); - CTDB_NO_MEMORY(ctdb, state); - - /* The callback isn't done if the context is freed. */ - state->callback = talloc(mem_ctx, struct event_script_callback); - CTDB_NO_MEMORY(ctdb, state->callback); - DLIST_ADD(ctdb->script_callbacks, state->callback); - talloc_set_destructor(state->callback, remove_callback); - state->callback->ctdb = ctdb; - state->callback->fn = callback; - state->callback->private_data = private_data; - - state->ctdb = ctdb; - state->from_user = from_user; - state->call = call; - state->options = talloc_vasprintf(state, fmt, ap); - state->timeout = timeval_set(ctdb->tunable.script_timeout, 0); - state->scripts = NULL; - if (state->options == NULL) { - DEBUG(DEBUG_ERR, (__location__ " could not allocate state->options\n")); - talloc_free(state); - return -1; - } - if (!check_options(state->call, state->options)) { - DEBUG(DEBUG_ERR, ("Bad eventscript options '%s' for %s\n", - ctdb_eventscript_call_names[state->call], state->options)); - talloc_free(state); - return -1; - } - if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) { /* we guarantee that only some specifically allowed event scripts are run while in recovery */ @@ -755,7 +725,6 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb, if (i == ARRAY_SIZE(allowed_calls)) { DEBUG(DEBUG_ERR,("Refusing to run event scripts call '%s' while in recovery\n", ctdb_eventscript_call_names[call])); - talloc_free(state); return -1; } } @@ -778,6 +747,36 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb, ctdb->current_monitor = NULL; } + state = talloc(ctdb->event_script_ctx, struct ctdb_event_script_state); + CTDB_NO_MEMORY(ctdb, state); + + /* The callback isn't done if the context is freed. */ + state->callback = talloc(mem_ctx, struct event_script_callback); + CTDB_NO_MEMORY(ctdb, state->callback); + DLIST_ADD(ctdb->script_callbacks, state->callback); + talloc_set_destructor(state->callback, remove_callback); + state->callback->ctdb = ctdb; + state->callback->fn = callback; + state->callback->private_data = private_data; + + state->ctdb = ctdb; + state->from_user = from_user; + state->call = call; + state->options = talloc_vasprintf(state, fmt, ap); + state->timeout = timeval_set(ctdb->tunable.script_timeout, 0); + state->scripts = NULL; + if (state->options == NULL) { + DEBUG(DEBUG_ERR, (__location__ " could not allocate state->options\n")); + talloc_free(state); + return -1; + } + if (!check_options(state->call, state->options)) { + DEBUG(DEBUG_ERR, ("Bad eventscript options '%s' for %s\n", + ctdb_eventscript_call_names[state->call], state->options)); + talloc_free(state); + return -1; + } + DEBUG(DEBUG_INFO,(__location__ " Starting eventscript %s %s\n", ctdb_eventscript_call_names[state->call], state->options)); -- CTDB repository