On Sun, 30 Aug 2009, Willy Tarreau wrote:
On Sun, Aug 30, 2009 at 04:18:58PM +0200, Krzysztof Oledzki wrote:
I think you wanted to put HCHK_STATUS_L57OK here, not OKD since we're
in the 2xx/3xx state and not 404 disable. Or maybe I misunderstood the
OKD status ?
OKD means we have Layer5-7 data available, for example the HTTP code.
Several times I found that some of my servers were misconfigured and were
returning a 3xx code redirecting to a page-not-found webpage instead of
doing a proper health-check, so I think it is good to know what was the
response, even if it was OK (2xx/3xx).
Ah OK that makes sense now. It's a good idea to note that data is
available, for later when we want to capture it whole. Indeed, I'd
like to reuse the same capture principle as is used in proxies for
errors. It does not take *that* much space and is already so useful
that we ought to implement it there soon too!
OK, I found where your confusion comes from - the diff was incomplete,
there was no include/types/checks.h file that explains how
HCHK_STATUS_L57OK differs from HCHK_STATUS_L57OKD and also makes it
possible to compile the code. :(
Dmitry, could you please use this patch instead? ;)
Best regards,
Krzysztof Olędzki
diff -Nur haproxy-1.4-dev2-orig/doc/configuration.txt
haproxy-1.4-dev2/doc/configuration.txt
--- haproxy-1.4-dev2-orig/doc/configuration.txt 2009-08-09 22:57:09.000000000
+0200
+++ haproxy-1.4-dev2/doc/configuration.txt 2009-08-30 16:44:41.000000000
+0200
@@ -6408,6 +6408,13 @@
31. tracked: id of proxy/server if tracking is enabled
32. type (0=frontend, 1=backend, 2=server)
33. rate (number of sessions per second over last elapsed second)
+ 34. check_status: status of last health check, one of:
+ UNK -> Unknown
+ SOCKERR -> Socket error
+ (...)
+ OLE - FIXME!!!
+ 35. check_code: layer5-7 code, if available
+ 36. check_duration: time in ms it took to finish the last health check
9.2. Unix Socket commands
diff -Nur haproxy-1.4-dev2-orig/include/proto/checks.h
haproxy-1.4-dev2/include/proto/checks.h
--- haproxy-1.4-dev2-orig/include/proto/checks.h 2009-08-09
22:57:09.000000000 +0200
+++ haproxy-1.4-dev2/include/proto/checks.h 2009-08-28 19:35:41.000000000
+0200
@@ -25,6 +25,8 @@
#include <types/task.h>
#include <common/config.h>
+/* Return the long, human-readable description of <check_status>; statuses
+ * without a table entry yield the HCHK_STATUS_UNKNOWN description.
+ */
+const char *get_check_status_description(short check_status);
+/* Return the short tag (e.g. "L57OK") of <check_status>; statuses without a
+ * table entry yield the HCHK_STATUS_UNKNOWN tag.
+ */
+const char *get_check_status_info(short check_status);
struct task *process_chk(struct task *t);
int start_checks();
diff -Nur haproxy-1.4-dev2-orig/include/types/checks.h
haproxy-1.4-dev2/include/types/checks.h
--- haproxy-1.4-dev2-orig/include/types/checks.h 1970-01-01
01:00:00.000000000 +0100
+++ haproxy-1.4-dev2/include/types/checks.h 2009-08-30 16:49:05.000000000
+0200
@@ -0,0 +1,31 @@
+/*
+ * Health-checks.
+ *
+ * Copyright 2008-2009 Krzysztof Piotr Oledzki <o...@ans.pl>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+/*
+ * Health-check status codes. Statuses listed after HCHK_STATUS_L57DATA are
+ * the ones for which a layer 5-7 code (check_code) is reported alongside
+ * the status.
+ */
+enum {
+ HCHK_STATUS_UNKNOWN = 0, /* Unknown */
+
+ HCHK_STATUS_SOCKERR, /* Socket error */
+
+ HCHK_STATUS_L14OK, /* L1-4 check passed, for example tcp connect */
+ HCHK_STATUS_L14TMOUT, /* L1-4 timeout */
+ HCHK_STATUS_L14UNR, /* L1-4 reset, for example "Connection refused" (tcp rst) or "No route to host" (icmp) */
+ HCHK_STATUS_L57OK, /* L5-7 check passed, no layer 5-7 code recorded */
+ HCHK_STATUS_L57TMOUT, /* L5-7 (HTTP/SMTP/SSL) timeout */
+ HCHK_STATUS_L57INVRSP, /* L5-7 invalid response - protocol error */
+
+ HCHK_STATUS_L57DATA, /* Marker: for the statuses below, layer 5-7 data is available */
+ HCHK_STATUS_L57OKD, /* L5-7 check passed, layer 5-7 code available */
+ HCHK_STATUS_L57RSPERR, /* L5-7 response error, for example HTTP 5xx */
+
+ HCHK_STATUS_SIZE /* number of statuses; used to size the status lookup tables */
+};
diff -Nur haproxy-1.4-dev2-orig/include/types/server.h
haproxy-1.4-dev2/include/types/server.h
--- haproxy-1.4-dev2-orig/include/types/server.h 2009-08-09
22:57:09.000000000 +0200
+++ haproxy-1.4-dev2/include/types/server.h 2009-08-30 16:43:43.000000000
+0200
@@ -35,6 +35,7 @@
#include <types/proxy.h>
#include <types/queue.h>
#include <types/task.h>
+#include <types/checks.h>
/* server flags */
@@ -74,7 +75,7 @@
struct server *next;
int state; /* server state (SRV_*) */
int prev_state; /* server state before last
change (SRV_*) */
- int cklen; /* the len of the cookie, to
speed up checks */
+ int cklen; /* the len of the cookie, to
speed up checks */
int rdr_len; /* the length of the
redirection prefix */
char *cookie; /* the id set in the cookie */
char *rdr_pfx; /* the redirection prefix */
@@ -121,9 +122,12 @@
long long failed_checks, down_trans; /* failed checks and up-down
transitions */
unsigned down_time; /* total time the server was
down */
time_t last_change; /* last time, when the state
was changed */
+ struct timeval check_start; /* last health check start time
*/
+ unsigned long check_duration; /* time in ms took to finish
last health check */
+ short check_status, check_code; /* check result, check code */
long long failed_conns, failed_resp; /* failed connect() and
responses */
- long long retries, redispatches; /* retried and
redispatched connections */
+ long long retries, redispatches; /* retried and redispatched
connections */
long long failed_secu; /* blocked responses because of
security concerns */
struct freq_ctr sess_per_sec; /* sessions per second on this
server */
unsigned int sps_max; /* maximum of new sessions per
second seen on this server */
diff -Nur haproxy-1.4-dev2-orig/src/checks.c haproxy-1.4-dev2/src/checks.c
--- haproxy-1.4-dev2-orig/src/checks.c 2009-08-09 22:57:09.000000000 +0200
+++ haproxy-1.4-dev2/src/checks.c 2009-08-30 16:45:15.000000000 +0200
@@ -2,7 +2,7 @@
* Health-checks functions.
*
* Copyright 2000-2009 Willy Tarreau <w...@1wt.eu>
- * Copyright 2007-2008 Krzysztof Piotr Oledzki <o...@ans.pl>
+ * Copyright 2007-2009 Krzysztof Piotr Oledzki <o...@ans.pl>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -12,6 +12,7 @@
*/
#include <assert.h>
+#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
@@ -33,6 +34,7 @@
#include <types/global.h>
#include <proto/backend.h>
+#include <proto/checks.h>
#include <proto/buffers.h>
#include <proto/fd.h>
#include <proto/log.h>
@@ -44,6 +46,85 @@
#include <proto/server.h>
#include <proto/task.h>
+/*
+ * Long, human-readable description of each check status, indexed by
+ * HCHK_STATUS_*. Statuses without an entry (such as the HCHK_STATUS_L57DATA
+ * marker) are left NULL; get_check_status_description() maps those to the
+ * HCHK_STATUS_UNKNOWN text. The table is only reached through that accessor,
+ * so it is kept private to this file and read-only.
+ */
+static const char *const check_status_description[HCHK_STATUS_SIZE] = {
+	[HCHK_STATUS_UNKNOWN] = "Unknown",
+
+	[HCHK_STATUS_SOCKERR] = "Socket error",
+
+	[HCHK_STATUS_L14OK] = "Layer1-4 check passed",
+	[HCHK_STATUS_L14TMOUT] = "Layer1-4 timeout",
+	[HCHK_STATUS_L14UNR] = "Layer1-4 unreachable",
+
+	[HCHK_STATUS_L57OK] = "Layer5-7 check passed",
+	[HCHK_STATUS_L57TMOUT] = "Layer5-7 timeout",
+	[HCHK_STATUS_L57INVRSP] = "Layer5-7 invalid response",
+
+	[HCHK_STATUS_L57OKD] = "Layer5-7 check passed",
+	[HCHK_STATUS_L57RSPERR] = "Layer5-7 response error",
+};
+
+
+/*
+ * Short tag of each check status, indexed by HCHK_STATUS_*. Statuses without
+ * an entry (such as the HCHK_STATUS_L57DATA marker) are left NULL;
+ * get_check_status_info() maps those to the HCHK_STATUS_UNKNOWN tag.
+ * HCHK_STATUS_L57OKD reuses the "L57OK" tag: callers print the layer 5-7
+ * code next to it when one is available. The table is only reached through
+ * that accessor, so it is kept private to this file and read-only.
+ */
+static const char *const check_status_info[HCHK_STATUS_SIZE] = {
+	[HCHK_STATUS_UNKNOWN] = "UNK",
+
+	[HCHK_STATUS_SOCKERR] = "SOCKERR",
+
+	[HCHK_STATUS_L14OK] = "L14OK",
+	[HCHK_STATUS_L14TMOUT] = "L14TMOUT",
+	[HCHK_STATUS_L14UNR] = "L14UNR",
+
+	[HCHK_STATUS_L57OK] = "L57OK",
+	[HCHK_STATUS_L57TMOUT] = "L57TMOUT",
+	[HCHK_STATUS_L57INVRSP] = "L57INVRSP",
+
+	[HCHK_STATUS_L57OKD] = "L57OK",
+	[HCHK_STATUS_L57RSPERR] = "L57RSPERR",
+};
+
+/*
+ * Convert a check_status code to its long description.
+ *
+ * Returns the matching entry of check_status_description[], or the
+ * HCHK_STATUS_UNKNOWN description when <check_status> is out of range
+ * (negative or >= HCHK_STATUS_SIZE) or has no non-empty entry.
+ */
+const char *get_check_status_description(short check_status) {
+
+	const char *desc = NULL;
+
+	/* check_status is signed: a negative value must be rejected too,
+	 * otherwise it would index before the start of the table.
+	 */
+	if (check_status >= 0 && check_status < HCHK_STATUS_SIZE)
+		desc = check_status_description[check_status];
+
+	if (desc && *desc)
+		return desc;
+
+	return check_status_description[HCHK_STATUS_UNKNOWN];
+}
+
+/*
+ * Convert a check_status code to its short tag.
+ *
+ * Returns the matching entry of check_status_info[], or the
+ * HCHK_STATUS_UNKNOWN tag when <check_status> is out of range (negative or
+ * >= HCHK_STATUS_SIZE) or has no non-empty entry.
+ */
+const char *get_check_status_info(short check_status) {
+
+	const char *info = NULL;
+
+	/* check_status is signed: a negative value must be rejected too,
+	 * otherwise it would index before the start of the table.
+	 */
+	if (check_status >= 0 && check_status < HCHK_STATUS_SIZE)
+		info = check_status_info[check_status];
+
+	if (info && *info)
+		return info;
+
+	return check_status_info[HCHK_STATUS_UNKNOWN];
+}
+
+/*
+ * Record a check result on server <s>: the time elapsed between
+ * s->check_start and <now> is stored in s->check_duration, and <status>
+ * is stored in s->check_status.
+ */
+static void set_server_check_status(struct server *s, short status)
+{
+	s->check_duration = tv_ms_elapsed(&s->check_start, &now);
+	s->check_status = status;
+}
+
/* sends a log message when a backend goes down, and also sets last
* change date.
*/
@@ -144,6 +225,12 @@
chunk_printf(&msg, sizeof(trash), " via %s/%s",
s->tracked->proxy->id, s->tracked->id);
+ chunk_printf(&msg, sizeof(trash), ", reason: %s(%d)",
get_check_status_description(s->check_status), s->check_status);
+ if (s->check_status >= HCHK_STATUS_L57DATA)
+ chunk_printf(&msg, sizeof(trash), ", code: %d",
s->check_code);
+
+ chunk_printf(&msg, sizeof(trash), ", check duration: %lums",
s->check_duration);
+
chunk_printf(&msg, sizeof(trash), ". %d active and %d backup
servers left.%s"
" %d sessions active, %d requeued, %d remaining in
queue.\n",
s->proxy->srv_act, s->proxy->srv_bck,
@@ -219,6 +306,10 @@
chunk_printf(&msg, sizeof(trash), " via %s/%s",
s->tracked->proxy->id, s->tracked->id);
+ chunk_printf(&msg, sizeof(trash), ", reason: %s",
get_check_status_description(s->check_status));
+ if (s->check_status >= HCHK_STATUS_L57DATA)
+ chunk_printf(&msg, sizeof(trash), ", code: %d",
s->check_code);
+
chunk_printf(&msg, sizeof(trash), ". %d active and %d backup
servers online.%s"
" %d sessions requeued, %d total in queue.\n",
s->proxy->srv_act, s->proxy->srv_bck,
@@ -338,8 +429,10 @@
struct server *s = t->context;
//fprintf(stderr, "event_srv_chk_w, state=%ld\n",
unlikely(fdtab[fd].state));
- if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev &
FD_POLL_ERR)))
+ if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev &
FD_POLL_ERR))) {
+ set_server_check_status(s, HCHK_STATUS_L14UNR);
goto out_error;
+ }
/* here, we know that the connection is established */
@@ -373,8 +466,10 @@
}
else if (ret == 0 || errno == EAGAIN)
goto out_poll;
- else
+ else {
+ set_server_check_status(s, 99); //Ole FIXME
goto out_error;
+ }
}
else {
/* We have no data to send to check the connection, and
@@ -398,8 +493,10 @@
if (errno == EALREADY || errno == EINPROGRESS)
goto out_poll;
- if (errno && errno != EISCONN)
+ if (errno && errno != EISCONN) {
+ set_server_check_status(s, 101); //Ole
FIXME
goto out_error;
+ }
/* good TCP connection is enough */
s->result |= SRV_CHK_RUNNING;
@@ -452,6 +549,7 @@
(skerr != 0))) {
/* in case of TCP only, this tells us if the connection failed
*/
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, 66); //Ole FIXME
goto out_wakeup;
}
@@ -474,40 +572,67 @@
if (s->proxy->options & PR_O_HTTP_CHK) {
/* Check if the server speaks HTTP 1.X */
if ((len < strlen("HTTP/1.0 000\r")) ||
- (memcmp(trash, "HTTP/1.", 7) != 0)) {
+ (memcmp(trash, "HTTP/1.", 7) != 0 ||
+ (trash[12] != ' ' && trash[12] != '\r')) ||
+ !isdigit(trash[9]) || !isdigit(trash[10]) ||
!isdigit(trash[11])) {
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, HCHK_STATUS_L57INVRSP);
goto out_wakeup;
}
+ s->check_code = str2uic(&trash[9]);
+
/* check the reply : HTTP/1.X 2xx and 3xx are OK */
- if (trash[9] == '2' || trash[9] == '3')
+ if (trash[9] == '2' || trash[9] == '3') {
s->result |= SRV_CHK_RUNNING;
- else if ((s->proxy->options & PR_O_DISABLE404) &&
+ set_server_check_status(s, HCHK_STATUS_L57OKD);
+ } else if ((s->proxy->options & PR_O_DISABLE404) &&
(s->state & SRV_RUNNING) &&
- (memcmp(&trash[9], "404", 3) == 0)) {
+ (s->check_code == 404)) {
/* 404 may be accepted as "stopping" only if the server
was up */
s->result |= SRV_CHK_RUNNING | SRV_CHK_DISABLE;
+ set_server_check_status(s, HCHK_STATUS_L57OKD);
}
- else
+ else {
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, HCHK_STATUS_L57RSPERR);
+ }
}
else if (s->proxy->options & PR_O_SSL3_CHK) {
/* Check for SSLv3 alert or handshake */
- if ((len >= 5) && (trash[0] == 0x15 || trash[0] == 0x16))
+ if ((len >= 5) && (trash[0] == 0x15 || trash[0] == 0x16)) {
s->result |= SRV_CHK_RUNNING;
- else
+ set_server_check_status(s, HCHK_STATUS_L57OK);
+ } else {
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, HCHK_STATUS_L57INVRSP);
+ }
}
else if (s->proxy->options & PR_O_SMTP_CHK) {
+ /* Check if the server speaks SMTP */
+ if ((len < strlen("000\r")) ||
+ (trash[3] != ' ' && trash[3] != '\r') ||
+ !isdigit(trash[0]) || !isdigit(trash[1]) ||
!isdigit(trash[2])) {
+ s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, HCHK_STATUS_L57INVRSP);
+ goto out_wakeup;
+ }
+
+ s->check_code = str2uic(&trash[0]);
+
/* Check for SMTP code 2xx (should be 250) */
- if ((len >= 3) && (trash[0] == '2'))
+ if (trash[0] == '2') {
s->result |= SRV_CHK_RUNNING;
- else
+ set_server_check_status(s, HCHK_STATUS_L57OKD);
+ } else {
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s, HCHK_STATUS_L57RSPERR);
+ }
}
else {
/* other checks are valid if the connection succeeded anyway */
s->result |= SRV_CHK_RUNNING;
+ set_server_check_status(s, HCHK_STATUS_L14OK);
}
out_wakeup:
@@ -557,6 +682,7 @@
}
/* we'll initiate a new check */
+ s->check_start = now;
s->result = SRV_CHK_UNKNOWN; /* no result yet */
if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) != -1) {
if ((fd < global.maxsock) &&
@@ -633,6 +759,7 @@
if (ret) {
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s,
HCHK_STATUS_SOCKERR);
switch (ret) {
case 1:
Alert("Cannot bind to
source address before connect() for server %s/%s. Aborting.\n",
@@ -664,6 +791,7 @@
ret = tcpv4_bind_socket(fd, flags,
&s->proxy->source_addr, remote);
if (ret) {
s->result |= SRV_CHK_ERROR;
+ set_server_check_status(s,
HCHK_STATUS_SOCKERR);
switch (ret) {
case 1:
Alert("Cannot bind to
source address before connect() for %s '%s'. Aborting.\n",
@@ -721,6 +849,7 @@
}
else if (errno != EALREADY && errno !=
EISCONN && errno != EAGAIN) {
s->result |= SRV_CHK_ERROR;
/* a real error */
+ set_server_check_status(s, 67);
//Ole FIXME
}
}
}
@@ -815,6 +944,9 @@
goto new_chk;
}
else if ((s->result & SRV_CHK_ERROR) ||
tick_is_expired(t->expire, now_ms)) {
+ if (!(s->result & SRV_CHK_ERROR))
+ set_server_check_status(s, EV_FD_ISSET(fd,
DIR_RD)?HCHK_STATUS_L57TMOUT:HCHK_STATUS_L14TMOUT);
+
//fprintf(stderr, "process_chk: 10\n");
/* failure or timeout detected */
if (s->health > s->rise) {
@@ -901,6 +1033,7 @@
t->expire = tick_add(now_ms,
MS_TO_TICKS(((mininter && mininter
>= srv_getinter(s)) ?
mininter :
srv_getinter(s)) * srvpos / nbchk));
+ s->check_start = now;
task_queue(t);
srvpos++;
diff -Nur haproxy-1.4-dev2-orig/src/dumpstats.c haproxy-1.4-dev2/src/dumpstats.c
--- haproxy-1.4-dev2-orig/src/dumpstats.c 2009-08-09 22:57:09.000000000
+0200
+++ haproxy-1.4-dev2/src/dumpstats.c 2009-08-30 14:19:56.000000000 +0200
@@ -2,6 +2,7 @@
* Functions dedicated to statistics output
*
* Copyright 2000-2009 Willy Tarreau <w...@1wt.eu>
+ * Copyright 2007-2009 Krzysztof Piotr Oledzki <o...@ans.pl>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -39,6 +40,7 @@
#include <proto/backend.h>
#include <proto/buffers.h>
+#include <proto/checks.h>
#include <proto/dumpstats.h>
#include <proto/fd.h>
#include <proto/freq_ctr.h>
@@ -177,6 +179,7 @@
"chkfail,chkdown,lastchg,downtime,qlimit,"
"pid,iid,sid,throttle,lbtot,tracked,type,"
"rate,rate_lim,rate_max,"
+ "check_status,check_code,check_duration"
"\n");
}
@@ -670,7 +673,7 @@
"<th colspan=3>Session rate</th><th
colspan=5>Sessions</th>"
"<th colspan=2>Bytes</th><th
colspan=2>Denied</th>"
"<th colspan=3>Errors</th><th
colspan=2>Warnings</th>"
- "<th colspan=8>Server</th>"
+ "<th colspan=9>Server</th>"
"</tr>\n"
"<tr align=\"center\" class=\"titre\">"
"<th>Cur</th><th>Max</th><th>Limit</th>"
@@ -678,7 +681,7 @@
"<th>Limit</th><th>Total</th><th>LbTot</th><th>In</th><th>Out</th>"
"<th>Req</th><th>Resp</th><th>Req</th><th>Conn</th>"
"<th>Resp</th><th>Retr</th><th>Redis</th>"
- "<th>Status</th><th>Wght</th><th>Act</th>"
+ "<th>Status</th><th>Last
check</th><th>Wght</th><th>Act</th>"
"<th>Bck</th><th>Chk</th><th>Dwn</th><th>Dwntme</th>"
"<th>Thrtle</th>\n"
"</tr>",
@@ -722,7 +725,7 @@
/* server status : reflect frontend status
*/
"<td align=center>%s</td>"
/* rest of server: nothing */
- "<td align=center colspan=7></td></tr>"
+ "<td align=center colspan=8></td></tr>"
"",
U2H0(px->denied_req),
U2H1(px->denied_resp),
U2H2(px->failed_req),
@@ -748,8 +751,10 @@
",,,,,,,,"
/* pid, iid, sid, throttle, lbtot,
tracked, type */
"%d,%d,0,,,,%d,"
- /* rate, rate_lim, rate_max, */
+ /* rate, rate_lim, rate_max */
"%u,%u,%u,"
+ /* check_status, check_code,
check_duration */
+ ",,,"
"\n",
px->id,
px->feconn, px->feconn_max, px->maxconn,
px->cum_feconn,
@@ -854,7 +859,7 @@
U2H3(sv->failed_conns),
U2H4(sv->failed_resp),
sv->retries, sv->redispatches);
- /* status */
+ /* status, last check */
chunk_printf(&msg, sizeof(trash), "<td
nowrap>");
if (sv->state & SRV_CHECKED)
@@ -866,6 +871,12 @@
(svs->state & SRV_RUNNING) ? (svs->health
- svs->rise + 1) : (svs->health),
(svs->state & SRV_RUNNING) ? (svs->fall) :
(svs->rise));
+ chunk_printf(&msg, sizeof(trash), "</td><td
nowrap> %s", get_check_status_info(sv->check_status));
+ if (sv->check_status >= HCHK_STATUS_L57DATA)
+ chunk_printf(&msg, sizeof(trash),
"/%d", sv->check_code);
+
+ chunk_printf(&msg, sizeof(trash), " in %lums",
sv->check_duration);
+
chunk_printf(&msg, sizeof(trash),
/* weight */
"</td><td>%d</td>"
@@ -990,6 +1001,18 @@
read_freq_ctr(&sv->sess_per_sec),
sv->sps_max);
+ /* check_status */
+ chunk_printf(&msg, sizeof(trash), "%s,",
get_check_status_info(sv->check_status));
+
+ /* check_code */
+ if (sv->check_status >= HCHK_STATUS_L57DATA)
+ chunk_printf(&msg, sizeof(trash),
"%u,", sv->check_code);
+ else
+ chunk_printf(&msg, sizeof(trash), ",");
+
+ /* check_duration */
+ chunk_printf(&msg, sizeof(trash), "%lu,",
sv->check_duration);
+
/* finish with EOL */
chunk_printf(&msg, sizeof(trash), "\n");
}
@@ -1038,7 +1061,7 @@
* if the backend has known working
servers or if it has no server at
* all (eg: for stats). Then we display
the total weight, number of
* active and backups. */
- "<td align=center nowrap>%s %s</td><td
align=center>%d</td>"
+ "<td align=center nowrap>%s %s</td><td
align=center> </td><td align=center>%d</td>"
"<td align=center>%d</td><td
align=center>%d</td>"
"",
U2H0(px->denied_req),
U2H1(px->denied_resp),
@@ -1086,6 +1109,8 @@
"%d,%d,0,,%lld,,%d,"
/* rate, rate_lim, rate_max, */
"%u,,%u,"
+ /* check_status, check_code,
check_duration */
+ ",,,"
"\n",
px->id,
px->nbpend /* or px->totpend ? */,
px->nbpend_max,