Hi all,
My scenario is Apache httpd + mod_jk + N Tomcats. The default
behaviour of load-balanced workers in mod_jk, in my testing, is that when
a client requests a page (GET / POST / whatever), the LB worker tries
the request on every AJP worker. This is in contrast with what I read here:
http://people.apache.org/~mturk/docs/article/ftwai.html
<http://people.apache.org/%7Emturk/docs/article/ftwai.html>
Especially this part:
When having multiple nodes in a cluster you can improve your application
availability by implementing failover. Failover means that if the
particular elected node cannot fulfill the request, another node
will be selected automatically. In the case of three nodes you are actually
doubling your application availability. The application response time
will be slower during failover, but none of your users will be rejected.
Inside the mod_jk configuration there is a special configuration
parameter called worker.retries that has default value of 3, but that
needs to be adjusted to the actual number of nodes in the cluster.
...
worker.list=lbworker
worker.lbworker.type=lb
# Adjust to the number of workers
worker.retries=4
worker.lbworker.balance_workers=node1,node2,node3,node4
If you add more than three workers to the load balancer, adjust the
retries parameter to reflect that number. It will ensure that even in
the worst-case scenario the request gets served if there is a single
operable node.
From that it seems that the "retries" parameter in a load balancer
worker context should mean the number of real (AJP) workers to try
(which is indeed what I need), but in my testing, that LB worker parameter is the
number of retry rounds over all the AJP workers that are part of the LB
worker. For example, having an LB worker with 4 AJP workers and setting
the LB worker's retries = 2, the behaviour I see is that the AJP workers get
called this way:
AJP1 -> timeout
[...]
AJP4 -> timeout
===> repeat again (retries == 2)
AJP1 -> timeout
[...]
AJP4 -> timeout
--> LB sends an error to the client.
Now, from the online documentation, the meaning of that parameter in a
load balancer worker context isn't that clear, but from the link I
provided it seems to be exactly what I needed: not the number of retry
rounds over all AJP workers, but the number of individual AJP workers to try.
If that is not correct I can file a bug report. If instead it's by
design, the attached patch adds a new parameter, "lb_retries", that does
what I need. Of course it's a bit rough, but it works.
Any comments? Am I getting stuff wrong?
Thanks in advance,
Frederik
diff --git a/native/.gitignore b/native/.gitignore
new file mode 100644
index 0000000..1f8d345
--- /dev/null
+++ b/native/.gitignore
@@ -0,0 +1,5 @@
+Makefile.in
+aclocal.m4
+config.log
+config.nice
+configure
diff --git a/native/common/.gitignore b/native/common/.gitignore
new file mode 100644
index 0000000..2a9005d
--- /dev/null
+++ b/native/common/.gitignore
@@ -0,0 +1 @@
+config.h.in
diff --git a/native/common/jk_ajp_common.c b/native/common/jk_ajp_common.c
index 08bcc02..9a12a89 100644
--- a/native/common/jk_ajp_common.c
+++ b/native/common/jk_ajp_common.c
@@ -2904,6 +2904,9 @@ int ajp_init(jk_worker_t *pThis,
p->retries =
jk_get_worker_retries(props, p->name,
JK_RETRIES);
+ p->lb_retries =
+ jk_get_worker_lb_retries(props, p->name,
+ JK_LB_RETRIES);
p->max_packet_size =
jk_get_max_packet_size(props, p->name);
diff --git a/native/common/jk_ajp_common.h b/native/common/jk_ajp_common.h
index 0c1636c..7b342d0 100644
--- a/native/common/jk_ajp_common.h
+++ b/native/common/jk_ajp_common.h
@@ -363,6 +363,13 @@ struct ajp_worker
*/
int retries;
+ /*
+ * Public property used in load balancer workers, meaning
+ * the maximum number of failover attempts between ajp
+ * workers of cluster.
+ */
+ int lb_retries;
+
unsigned int max_packet_size; /* Maximum AJP Packet size */
int retry_interval; /* Number of milliseconds to sleep before doing a retry */
diff --git a/native/common/jk_lb_worker.c b/native/common/jk_lb_worker.c
index a9894eb..d6e0251 100644
--- a/native/common/jk_lb_worker.c
+++ b/native/common/jk_lb_worker.c
@@ -1159,6 +1159,7 @@ static int JK_METHOD service(jk_endpoint_t *e,
if (p->worker->sequence < p->worker->s->h.sequence)
jk_lb_pull(p->worker, JK_FALSE, l);
for (i = 0; i < num_of_workers; i++) {
+ jk_log(l, JK_LOG_DEBUG, "LB - num_of_workers: %d, retry: %d, lb_retries: %d", num_of_workers, i, p->worker->lb_retries);
lb_sub_worker_t *rec = &(p->worker->lb_workers[i]);
if (rec->s->state == JK_LB_STATE_BUSY) {
if (ajp_has_endpoint(rec->worker, l)) {
@@ -1203,7 +1204,10 @@ static int JK_METHOD service(jk_endpoint_t *e,
"service sticky_session=%d id='%s'",
p->worker->sticky_session, sessionid ? sessionid : "empty");
- while (recoverable == JK_TRUE) {
+ while (recoverable == JK_TRUE && attempt <= p->worker->lb_retries) {
+ if (JK_IS_DEBUG_LEVEL(l))
+ jk_log(l, JK_LOG_DEBUG, "attempt %d, max attempts %d",
+ attempt, p->worker->lb_retries);
if (attempt >= num_of_workers) {
retry++;
if (retry >= p->worker->retries) {
@@ -1806,6 +1810,8 @@ static int JK_METHOD init(jk_worker_t *pThis,
p->worker.we = we;
p->retries = jk_get_worker_retries(props, p->name,
JK_RETRIES);
+ p->lb_retries = jk_get_worker_lb_retries(props, p->name,
+ JK_LB_RETRIES);
p->retry_interval =
jk_get_worker_retry_interval(props, p->name,
JK_SLEEP_DEF);
diff --git a/native/common/jk_lb_worker.h b/native/common/jk_lb_worker.h
index 71ee58d..d5260db 100644
--- a/native/common/jk_lb_worker.h
+++ b/native/common/jk_lb_worker.h
@@ -190,6 +190,7 @@ struct lb_worker
int error_escalation_time;
int max_reply_timeouts;
int retries;
+ int lb_retries;
int retry_interval;
int lbmethod;
int lblock;
diff --git a/native/common/jk_service.h b/native/common/jk_service.h
index 5eb6bac..a84af93 100644
--- a/native/common/jk_service.h
+++ b/native/common/jk_service.h
@@ -36,6 +36,8 @@
#include "jk_msg_buff.h"
#define JK_RETRIES 2
+/* fredi - default */
+#define JK_LB_RETRIES 2
#ifdef __cplusplus
extern "C"
diff --git a/native/common/jk_util.c b/native/common/jk_util.c
index 656af40..b309328 100644
--- a/native/common/jk_util.c
+++ b/native/common/jk_util.c
@@ -109,6 +109,7 @@
#define DEFAULT_WORKER_TYPE JK_AJP13_WORKER_NAME
#define SECRET_KEY_OF_WORKER "secretkey"
#define RETRIES_OF_WORKER "retries"
+#define LB_RETRIES_OF_WORKER "lb_retries"
#define STATUS_FAIL_OF_WORKER "fail_on_status"
#define DEFAULT_WORKER JK_AJP13_WORKER_NAME
@@ -228,6 +229,7 @@ static const char *unique_properties[] = {
STYLE_SHEET_OF_WORKER,
READ_ONLY_OF_WORKER,
RETRIES_OF_WORKER,
+ LB_RETRIES_OF_WORKER,
WORKER_MAINTAIN_PROPERTY_NAME,
NAMESPACE_OF_WORKER,
XML_NAMESPACE_OF_WORKER,
@@ -336,6 +338,7 @@ static const char *supported_properties[] = {
BAD_RATING_OF_WORKER,
SECRET_KEY_OF_WORKER,
RETRIES_OF_WORKER,
+ LB_RETRIES_OF_WORKER,
STATUS_FAIL_OF_WORKER,
LIST_PROPERTY_NAME,
MAINTAIN_PROPERTY_NAME,
@@ -1219,6 +1222,24 @@ int jk_get_worker_retries(jk_map_t *m, const char *wname, int def)
return rv;
}
+int jk_get_worker_lb_retries(jk_map_t *m, const char *wname, int def)
+{
+ char buf[1024];
+ int rv;
+ if (!m || !wname) {
+ return -1;
+ }
+
+ MAKE_WORKER_PARAM(LB_RETRIES_OF_WORKER);
+
+ rv = jk_map_get_int(m, buf, def);
+ if (rv < 1)
+ rv = 1;
+
+ return rv;
+}
+
+
int jk_get_worker_recovery_opts(jk_map_t *m, const char *wname, int def)
{
char buf[PARAM_BUFFER_SIZE];
diff --git a/native/scripts/build/unix/.gitignore b/native/scripts/build/unix/.gitignore
new file mode 100644
index 0000000..9eac9d3
--- /dev/null
+++ b/native/scripts/build/unix/.gitignore
@@ -0,0 +1,3 @@
+install-sh
+ltmain.sh
+missing
---------------------------------------------------------------------
To unsubscribe, e-mail: users-unsubscr...@tomcat.apache.org
For additional commands, e-mail: users-h...@tomcat.apache.org