In the case where agent-port is used and the agent check is a secondary check to not mark a server as down if the agent becomes unavailable.
In this configuration the agent should only cause a server to be marked as down if the agent returns "fail", "stopped" or "down". Signed-off-by: Simon Horman <ho...@verge.net.au> --- v4 * Never update health of agent on failure to connect Previously the health was recharged on such cases. But this causes the health to be recharged even if the most recent result from the agent was "fail", "stopped", or "down". The result was unnecessary logging of "fail", "stopped", or "down" results from the agent if they were interleaved with timeouts. Or in other words, if the agent responded only occasionally but consistently with "fail", "stopped", or "down". v3 * No change v2 * Only log agent server status if the change in status will affect the server's state --- doc/configuration.txt | 3 ++ include/types/server.h | 5 ++-- src/checks.c | 77 +++++++++++++++++++++++++++++++++----------------- 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/doc/configuration.txt b/doc/configuration.txt index 233be4e..40675f4 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -7707,6 +7707,9 @@ agent-port <port> using the "port" parameter. A third scenario is to use "http-check agent-hdr" to run an agent and http checks simultaneously over HTTP. + When run in this manner failure to connect to the agent is not + considered an error as connectivity is tested by the primary check. + See also the "agent-inter" parameter. backup diff --git a/include/types/server.h b/include/types/server.h index dd17d61..f316e01 100644 --- a/include/types/server.h +++ b/include/types/server.h @@ -52,8 +52,9 @@ #define SRV_GOINGDOWN 0x0020 /* this server says that it's going down (404) */ #define SRV_WARMINGUP 0x0040 /* this server is warming up after a failure */ #define SRV_MAINTAIN 0x0080 /* this server is in maintenance mode */ -#define SRV_DRAIN 0x0100 /* this server has been requested to drain its connections */ -/* unused: 0x0200, 0x0400 */ +#define SRV_WILLDRAIN 0x0100 /* this server has been requested to drain its connections but has not started doing so yet */ +#define SRV_DRAIN 0x0200 /* this been requested to drain its connections */ +/* unused: 0x0400 */ #define SRV_SEND_PROXY 0x0800 /* this server talks the PROXY protocol */ #define SRV_NON_STICK 0x1000 /* never add connections allocated to this server to a stick table */ diff --git a/src/checks.c b/src/checks.c index 706ed31..88a34ea 100644 --- a/src/checks.c +++ b/src/checks.c @@ -228,11 +228,19 @@ static void set_server_check_status(struct check *check, short status, const cha tv_zero(&check->start); } + /* Failure to connect to the agent as a secondary check should not + * cause the server to be marked down. So only log status changes + * for HCHK_STATUS_* statuses */ + if (check == &s->agent && check->status < HCHK_STATUS_L7TOUT) + return; + if (s->proxy->options2 & PR_O2_LOGHCHKS && (((check->health != 0) && (check->result & SRV_CHK_FAILED)) || ((check->health != s->rise + s->fall - 1) && (check->result & SRV_CHK_PASSED)) || ((s->state & SRV_GOINGDOWN) && !(check->result & SRV_CHK_DISABLE)) || - (!(s->state & SRV_GOINGDOWN) && (check->result & SRV_CHK_DISABLE)))) { + (!(s->state & SRV_GOINGDOWN) && (check->result & SRV_CHK_DISABLE)) || + ((s->state & SRV_WILLDRAIN) && !(s->state & SRV_DRAIN)) || + (!(s->state & SRV_WILLDRAIN) && (s->state & SRV_DRAIN)))) { int health, rise, fall, state; @@ -284,7 +292,8 @@ static void set_server_check_status(struct check *check, short status, const cha chunk_appendf(&trash, ", status: %d/%d %s", (state & SRV_RUNNING) ? (health - rise + 1) : (health), (state & SRV_RUNNING) ? (fall) : (rise), - (state & SRV_RUNNING)?"UP":"DOWN"); + (state & SRV_RUNNING) ? + ((state & SRV_WILLDRAIN) ? "DRAIN" : "UP") : "DOWN"); Warning("%s.\n", trash.str); send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.str); @@ -611,6 +620,27 @@ static void set_server_enabled(struct server *s) { set_server_enabled(srv); } +static void check_failed(struct check *check) +{ + struct server *s = check->server; + + /* The agent secondary check should only cause a server to be marked + * as down if check->status is HCHK_STATUS_L7STS, which indicates + * that the agent returned "fail", "stopped" or "down". + * The implication here is that failure to connect to the agent + * as a secondary check should not cause the server to be marked + * down. */ + if (check == &s->agent && check->status != HCHK_STATUS_L7STS) + return; + + if (check->health > s->rise) { + check->health--; /* still good */ + s->counters.failed_checks++; + } + else + set_server_down(check); +} + void health_adjust(struct server *s, short status) { int failed; @@ -668,13 +698,7 @@ void health_adjust(struct server *s, short status) case HANA_ONERR_FAILCHK: /* simulate a failed health check */ set_server_check_status(&s->check, HCHK_STATUS_HANA, trash.str); - - if (s->check.health > s->rise) { - s->check.health--; /* still good */ - s->counters.failed_checks++; - } - else - set_server_down(&s->check); + check_failed(&s->check); break; @@ -854,11 +878,6 @@ static void agent_expect(struct check *check, char *data) down_cmd = "fail"; } - if (drain) - check->server->state |= SRV_DRAIN; - else - check->server->state &= ~SRV_DRAIN; - if (down_cmd) { const char *end = data + strlen(down_cmd); /* @@ -871,7 +890,23 @@ static void agent_expect(struct check *check, char *data) } } + /* Signal intended drain state. + * A miss-match between the SRV_WILLDRAIN and SRV_DRAIN bits + * is used by set_server_check_status to log changes + * in state */ + if (drain) + check->server->state |= SRV_WILLDRAIN; + else if (!down_cmd) + check->server->state &= ~SRV_WILLDRAIN; + set_server_check_status(check, status, desc); + + /* Set or clear SRV_DRAIN now that any change in drain + * state has been detected by set_server_check_status(). */ + if (drain) + check->server->state |= SRV_DRAIN; + else if (!down_cmd) + check->server->state &= ~SRV_DRAIN; } /* @@ -1315,12 +1350,7 @@ static void process_result(struct check *check) struct server *s = check->server; if (check->result & SRV_CHK_FAILED) { /* a failure or timeout detected */ - if (check->health > s->rise) { - check->health--; /* still good */ - s->counters.failed_checks++; - } - else - set_server_down(check); + check_failed(check); } else { /* check was OK */ /* we may have to add/remove this server from the LB group */ @@ -1462,12 +1492,7 @@ static struct task *process_chk(struct task *t) /* here, we have seen a synchronous error, no fd was allocated */ check->state &= ~CHK_RUNNING; - if (check->health > s->rise) { - check->health--; /* still good */ - s->counters.failed_checks++; - } - else - set_server_down(check); + check_failed(check); /* we allow up to min(inter, timeout.connect) for a connection * to establish but only when timeout.check is set -- 1.8.3.2