The branch, master has been updated via afdbf3c0df02decd823615134294abf2c8a8a5f3 (commit) via 722171fc94a36ffe9e0a5c64502b916fde0a13a4 (commit) via 042377ed803bb8f7ca9d6ea1a387427b7b8ba45a (commit) from 82ee458329968001bb03b2aec42e65f532f007b3 (commit)
http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master - Log ----------------------------------------------------------------- commit afdbf3c0df02decd823615134294abf2c8a8a5f3 Author: Christian Ambach <christian.amb...@de.ibm.com> Date: Wed May 6 19:01:58 2009 +0200 Remove error messages about a non-existing /var/log/log.ctdb when running ctdb with logging to syslog commit 722171fc94a36ffe9e0a5c64502b916fde0a13a4 Author: Ronnie Sahlberg <ronniesahlb...@gmail.com> Date: Thu May 14 18:25:00 2009 +1000 add additional log info to track if/why we cant switch to client mode. commit 042377ed803bb8f7ca9d6ea1a387427b7b8ba45a Author: Ronnie Sahlberg <ronniesahlb...@gmail.com> Date: Thu May 14 10:33:25 2009 +1000 Track how long it takes to take out the recovery lock from both the main dameon and also from the recovery daemon. Log this in "ctdb statistics". Also add a varaible "RecLockLatencyMs" that will log an error everytime it takes longer than this to access the reclock file. ----------------------------------------------------------------------- Summary of changes: client/ctdb_client.c | 23 +++++++++++++++++++++++ common/ctdb_util.c | 16 ++++++++++++++++ include/ctdb_private.h | 8 ++++++++ server/ctdb_control.c | 4 ++++ server/ctdb_recover.c | 6 ++++++ server/ctdb_recoverd.c | 5 +++++ server/ctdb_tunables.c | 1 + tools/ctdb.c | 3 +++ tools/ctdb_diagnostics | 4 ++-- 9 files changed, 68 insertions(+), 2 deletions(-) Changeset truncated at 500 lines: diff --git a/client/ctdb_client.c b/client/ctdb_client.c index 3bdb4b2..df9fa06 100644 --- a/client/ctdb_client.c +++ b/client/ctdb_client.c @@ -263,6 +263,7 @@ int ctdb_socket_connect(struct ctdb_context *ctdb) ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0); if (ctdb->daemon.sd == -1) { + DEBUG(DEBUG_ERR,(__location__ " Failed to open client socket. Errno:%s(%d)\n", strerror(errno), errno)); return -1; } @@ -272,6 +273,7 @@ int ctdb_socket_connect(struct ctdb_context *ctdb) if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { close(ctdb->daemon.sd); ctdb->daemon.sd = -1; + DEBUG(DEBUG_ERR,(__location__ " Failed to connect client socket to daemon. Errno:%s(%d)\n", strerror(errno), errno)); return -1; } @@ -3615,3 +3617,24 @@ int ctdb_ctrl_getscriptstatus(struct ctdb_context *ctdb, return 0; } +/* + tell the main daemon how long it took to lock the reclock file + */ +int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency) +{ + int ret; + int32_t res; + TDB_DATA data; + + data.dptr = (uint8_t *)&latency; + data.dsize = sizeof(latency); + + ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_RECLOCK_LATENCY, 0, data, + ctdb, NULL, &res, NULL, NULL); + if (ret != 0 || res != 0) { + DEBUG(DEBUG_ERR,("Failed to send recd reclock latency\n")); + return -1; + } + + return 0; +} diff --git a/common/ctdb_util.c b/common/ctdb_util.c index a3f1731..3604cc8 100644 --- a/common/ctdb_util.c +++ b/common/ctdb_util.c @@ -138,6 +138,22 @@ void ctdb_latency(struct ctdb_db_context *ctdb_db, const char *name, double *lat } } +/* + update a reclock latency number + */ +void ctdb_reclock_latency(struct ctdb_context *ctdb, const char *name, double *latency, double l) +{ + if (l > *latency) { + *latency = l; + } + + if (ctdb->tunable.reclock_latency_ms !=0) { + if (l*1000 > ctdb->tunable.reclock_latency_ms) { + DEBUG(DEBUG_ERR, ("High RECLOCK latency %fs for operation %s\n", l, name)); + } + } +} + uint32_t ctdb_reqid_new(struct ctdb_context *ctdb, void *state) { uint32_t id; diff --git a/include/ctdb_private.h b/include/ctdb_private.h index f37560a..5ecf8a6 100644 --- a/include/ctdb_private.h +++ b/include/ctdb_private.h @@ -105,6 +105,7 @@ struct ctdb_tunable { uint32_t recd_ping_timeout; uint32_t recd_ping_failcount; uint32_t log_latency_ms; + uint32_t reclock_latency_ms; uint32_t recovery_drop_all_ips; uint32_t verify_recovery_lock; }; @@ -299,6 +300,10 @@ struct ctdb_statistics { uint32_t control; uint32_t traverse; } timeouts; + struct { + double ctdbd; + double recd; + } reclock; uint32_t total_calls; uint32_t pending_calls; uint32_t lockwait_calls; @@ -562,6 +567,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0, CTDB_CONTROL_EVENT_SCRIPT_FINISHED = 95, CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS = 96, CTDB_CONTROL_TRAVERSE_KILL = 97, + CTDB_CONTROL_RECD_RECLOCK_LATENCY = 98, }; /* @@ -954,6 +960,7 @@ void ctdb_recv_raw_pkt(void *p, uint8_t *data, uint32_t length); int ctdb_socket_connect(struct ctdb_context *ctdb); void ctdb_latency(struct ctdb_db_context *ctdb_db, const char *name, double *latency, struct timeval t); +void ctdb_reclock_latency(struct ctdb_context *ctdb, const char *name, double *latency, double l); uint32_t ctdb_reqid_new(struct ctdb_context *ctdb, void *state); void *_ctdb_reqid_find(struct ctdb_context *ctdb, uint32_t reqid, const char *type, const char *location); @@ -1431,5 +1438,6 @@ int32_t ctdb_control_event_script_finished(struct ctdb_context *ctdb); int32_t ctdb_control_get_event_script_status(struct ctdb_context *ctdb, TDB_DATA *outdata); int ctdb_log_event_script_output(struct ctdb_context *ctdb, char *str, uint16_t len); +int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency); #endif diff --git a/server/ctdb_control.c b/server/ctdb_control.c index d5d4766..72e60c5 100644 --- a/server/ctdb_control.c +++ b/server/ctdb_control.c @@ -441,6 +441,10 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb, CHECK_CONTROL_DATA_SIZE(0); return ctdb_control_get_event_script_status(ctdb, outdata); + case CTDB_CONTROL_RECD_RECLOCK_LATENCY: + CHECK_CONTROL_DATA_SIZE(sizeof(double)); + ctdb_reclock_latency(ctdb, "recd reclock", &ctdb->statistics.reclock.recd, *((double *)indata.dptr)); + return 0; default: DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode)); return -1; diff --git a/server/ctdb_recover.c b/server/ctdb_recover.c index c979398..b4428fa 100644 --- a/server/ctdb_recover.c +++ b/server/ctdb_recover.c @@ -492,6 +492,7 @@ struct ctdb_set_recmode_state { struct timed_event *te; struct fd_event *fde; pid_t child; + struct timeval start_time; }; /* @@ -520,6 +521,10 @@ static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_even */ static int set_recmode_destructor(struct ctdb_set_recmode_state *state) { + double l = timeval_elapsed(&state->start_time); + + ctdb_reclock_latency(state->ctdb, "daemon reclock", &state->ctdb->statistics.reclock.ctdbd, l); + kill(state->child, SIGKILL); return 0; } @@ -632,6 +637,7 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, state = talloc(ctdb, struct ctdb_set_recmode_state); CTDB_NO_MEMORY(ctdb, state); + state->start_time = timeval_current(); if (ctdb->tunable.verify_recovery_lock == 0) { /* dont need to verify the reclock file */ diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c index c6d0a7a..995284f 100644 --- a/server/ctdb_recoverd.c +++ b/server/ctdb_recoverd.c @@ -1328,6 +1328,7 @@ static int do_recovery(struct ctdb_recoverd *rec, struct ctdb_dbid_map *dbmap; TDB_DATA data; uint32_t *nodes; + struct timeval start_time; DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n")); @@ -1351,11 +1352,15 @@ static int do_recovery(struct ctdb_recoverd *rec, ctdb_ban_node(rec, rec->last_culprit, ctdb->tunable.recovery_ban_period); } + DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n")); + start_time = timeval_current(); if (!ctdb_recovery_lock(ctdb, true)) { ctdb_set_culprit(rec, pnn); DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery\n")); return -1; } + ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time)); + DEBUG(DEBUG_ERR,("Recovery lock taken successfully by recovery daemon\n")); DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", culprit)); diff --git a/server/ctdb_tunables.c b/server/ctdb_tunables.c index bab9aef..6a8876f 100644 --- a/server/ctdb_tunables.c +++ b/server/ctdb_tunables.c @@ -53,6 +53,7 @@ static const struct { { "RecdPingTimeout", 20, offsetof(struct ctdb_tunable, recd_ping_timeout) }, { "RecdFailCount", 3, offsetof(struct ctdb_tunable, recd_ping_failcount) }, { "LogLatencyMs", 0, offsetof(struct ctdb_tunable, log_latency_ms) }, + { "RecLockLatencyMs", 1000, offsetof(struct ctdb_tunable, reclock_latency_ms) }, { "RecoveryDropAllIPs", 60, offsetof(struct ctdb_tunable, recovery_drop_all_ips) }, { "VerifyRecoveryLock", 1, offsetof(struct ctdb_tunable, verify_recovery_lock) }, }; diff --git a/tools/ctdb.c b/tools/ctdb.c index 35a4190..b85354c 100644 --- a/tools/ctdb.c +++ b/tools/ctdb.c @@ -209,6 +209,9 @@ static void show_statistics(struct ctdb_statistics *s) preflen?0:4, "", *(uint32_t *)(fields[i].offset+(uint8_t *)s)); } + printf(" %-30s %.6f sec\n", "max_reclock_ctdbd", s->reclock.ctdbd); + printf(" %-30s %.6f sec\n", "max_reclock_recd", s->reclock.recd); + printf(" %-30s %.6f sec\n", "max_call_latency", s->max_call_latency); printf(" %-30s %.6f sec\n", "max_lockwait_latency", s->max_lockwait_latency); printf(" %-30s %.6f sec\n", "max_childwrite_latency", s->max_childwrite_latency); diff --git a/tools/ctdb_diagnostics b/tools/ctdb_diagnostics index a9531d6..7b48299 100755 --- a/tools/ctdb_diagnostics +++ b/tools/ctdb_diagnostics @@ -133,10 +133,10 @@ show_all "ctdb statistics" show_all "ctdb uptime" echo "Showing log.ctdb" -show_all "tail -100 /var/log/log.ctdb" +show_all "test -f /var/log/log.ctdb && tail -100 /var/log/log.ctdb" echo "Showing log.ctdb" -show_all "tail -100 /var/log/log.ctdb" +show_all "test -f /var/log/log.ctdb && tail -100 /var/log/log.ctdb" show_all "tail -200 /var/log/messages" show_all "tail -200 /etc/ctdb/state/vacuum.log" -- CTDB repository