Hi all,

My scenario is Apache httpd + mod_jk + N Tomcats behind it. In my testing, the default behaviour of load-balanced workers in mod_jk is that when a client requests a page (GET / POST / whatever), the LB worker tries the request on every AJP worker. This is in contrast with what I read here:


http://people.apache.org/~mturk/docs/article/ftwai.html <http://people.apache.org/%7Emturk/docs/article/ftwai.html>

Especially this part:


When having multiple nodes in a cluster you can improve your application availability by implementing failover. The failover means that if the particular elected node can not fulfill the request the another node will be selected automatically. In case of three nodes you are actually doubling your application availability. The application response time will be slower during failover, but none of your users will be rejected. Inside the mod_jk configuration there is a special configuration parameter called worker.retries that has default value of 3, but that needs to be adjusted to the actual number of nodes in the cluster.

    ...
    worker.list=lbworker
    worker.lbworker.type=lb
    # Adjust to the number of workers
    worker.retries=4
    worker.lbworker.balance_workers=node1,node2,node3,node4
If you add more then three workers to the load balancer adjust the retries parameter to reflect that number. It will ensure that even in the worse case scenario the request gets served if there is a single operable node.

From that, it seems that the "retries" parameter in a load balancer worker context should mean the number of real (AJP) workers to try (which is indeed what I need). In my testing, however, that LB worker parameter is the number of times that all the AJP workers belonging to the LB worker get a full round of attempts. For example, with an LB worker that has 4 AJP workers and the LB worker's retries = 2, the behaviour I see is that the AJP workers get called this way:

AJP1 -> timeout
[...]
AJP4 -> timeout

===> repeat again (retries == 2)

AJP1 -> timeout
[...]
AJP4 -> timeout

--> LB sends an error to the client.



Now, from the online documentation the meaning of that parameter in a load balancer worker context isn't that clear, but from the link I provided it seems it was exactly what I needed: not the number of retry rounds across all AJP workers, but the number of individual AJP workers to try.

If that is not correct, I can file a bug report. If instead it's by design, the attached patch adds a new parameter, "lb_retries", that does what I need. Of course it's a bit rough, but it works.

Any comments? Am I getting stuff wrong?


Thanks in advance,


Frederik


diff --git a/native/.gitignore b/native/.gitignore
new file mode 100644
index 0000000..1f8d345
--- /dev/null
+++ b/native/.gitignore
@@ -0,0 +1,5 @@
+Makefile.in
+aclocal.m4
+config.log
+config.nice
+configure
diff --git a/native/common/.gitignore b/native/common/.gitignore
new file mode 100644
index 0000000..2a9005d
--- /dev/null
+++ b/native/common/.gitignore
@@ -0,0 +1 @@
+config.h.in
diff --git a/native/common/jk_ajp_common.c b/native/common/jk_ajp_common.c
index 08bcc02..9a12a89 100644
--- a/native/common/jk_ajp_common.c
+++ b/native/common/jk_ajp_common.c
@@ -2904,6 +2904,9 @@ int ajp_init(jk_worker_t *pThis,
         p->retries =
             jk_get_worker_retries(props, p->name,
                                   JK_RETRIES);
+        p->lb_retries =
+            jk_get_worker_lb_retries(props, p->name,
+                                  JK_LB_RETRIES);
 
         p->max_packet_size =
             jk_get_max_packet_size(props, p->name);
diff --git a/native/common/jk_ajp_common.h b/native/common/jk_ajp_common.h
index 0c1636c..7b342d0 100644
--- a/native/common/jk_ajp_common.h
+++ b/native/common/jk_ajp_common.h
@@ -363,6 +363,13 @@ struct ajp_worker
      */
     int retries;
 
+    /*
+     * Public property used in load balancer workers, meaning
+     * the maximum number of failover attempts between ajp
+     * workers of cluster.
+     */
+    int lb_retries;
+
     unsigned int max_packet_size;  /*  Maximum AJP Packet size */
 
     int retry_interval;            /*  Number of milliseconds to sleep before doing a retry */
diff --git a/native/common/jk_lb_worker.c b/native/common/jk_lb_worker.c
index a9894eb..d6e0251 100644
--- a/native/common/jk_lb_worker.c
+++ b/native/common/jk_lb_worker.c
@@ -1159,6 +1159,7 @@ static int JK_METHOD service(jk_endpoint_t *e,
     if (p->worker->sequence < p->worker->s->h.sequence)
         jk_lb_pull(p->worker, JK_FALSE, l);
     for (i = 0; i < num_of_workers; i++) {
+        jk_log(l, JK_LOG_DEBUG, "LB - num_of_workers: %d, retry: %d, lb_retries: %d", num_of_workers, i, p->worker->lb_retries);
         lb_sub_worker_t *rec = &(p->worker->lb_workers[i]);
         if (rec->s->state == JK_LB_STATE_BUSY) {
             if (ajp_has_endpoint(rec->worker, l)) {
@@ -1203,7 +1204,10 @@ static int JK_METHOD service(jk_endpoint_t *e,
                "service sticky_session=%d id='%s'",
                p->worker->sticky_session, sessionid ? sessionid : "empty");
 
-    while (recoverable == JK_TRUE) {
+    while (recoverable == JK_TRUE && attempt <= p->worker->lb_retries) {
+        if (JK_IS_DEBUG_LEVEL(l))
+	    jk_log(l, JK_LOG_DEBUG, "attempt %d, max attempts %d",
+		      attempt, p->worker->lb_retries);
         if (attempt >= num_of_workers) {
             retry++;
             if (retry >= p->worker->retries) {
@@ -1806,6 +1810,8 @@ static int JK_METHOD init(jk_worker_t *pThis,
     p->worker.we = we;
     p->retries = jk_get_worker_retries(props, p->name,
                                        JK_RETRIES);
+    p->lb_retries = jk_get_worker_lb_retries(props, p->name,
+                                       JK_LB_RETRIES);
     p->retry_interval =
             jk_get_worker_retry_interval(props, p->name,
                                         JK_SLEEP_DEF);
diff --git a/native/common/jk_lb_worker.h b/native/common/jk_lb_worker.h
index 71ee58d..d5260db 100644
--- a/native/common/jk_lb_worker.h
+++ b/native/common/jk_lb_worker.h
@@ -190,6 +190,7 @@ struct lb_worker
     int          error_escalation_time;
     int          max_reply_timeouts;
     int          retries;
+    int          lb_retries;
     int          retry_interval;
     int          lbmethod;
     int          lblock;
diff --git a/native/common/jk_service.h b/native/common/jk_service.h
index 5eb6bac..a84af93 100644
--- a/native/common/jk_service.h
+++ b/native/common/jk_service.h
@@ -36,6 +36,8 @@
 #include "jk_msg_buff.h"
 
 #define JK_RETRIES 2
+/* fredi - default */
+#define JK_LB_RETRIES 2
 
 #ifdef __cplusplus
 extern "C"
diff --git a/native/common/jk_util.c b/native/common/jk_util.c
index 656af40..b309328 100644
--- a/native/common/jk_util.c
+++ b/native/common/jk_util.c
@@ -109,6 +109,7 @@
 #define DEFAULT_WORKER_TYPE         JK_AJP13_WORKER_NAME
 #define SECRET_KEY_OF_WORKER        "secretkey"
 #define RETRIES_OF_WORKER           "retries"
+#define LB_RETRIES_OF_WORKER        "lb_retries"
 #define STATUS_FAIL_OF_WORKER       "fail_on_status"
 
 #define DEFAULT_WORKER              JK_AJP13_WORKER_NAME
@@ -228,6 +229,7 @@ static const char *unique_properties[] = {
     STYLE_SHEET_OF_WORKER,
     READ_ONLY_OF_WORKER,
     RETRIES_OF_WORKER,
+    LB_RETRIES_OF_WORKER,
     WORKER_MAINTAIN_PROPERTY_NAME,
     NAMESPACE_OF_WORKER,
     XML_NAMESPACE_OF_WORKER,
@@ -336,6 +338,7 @@ static const char *supported_properties[] = {
     BAD_RATING_OF_WORKER,
     SECRET_KEY_OF_WORKER,
     RETRIES_OF_WORKER,
+    LB_RETRIES_OF_WORKER,
     STATUS_FAIL_OF_WORKER,
     LIST_PROPERTY_NAME,
     MAINTAIN_PROPERTY_NAME,
@@ -1219,6 +1222,24 @@ int jk_get_worker_retries(jk_map_t *m, const char *wname, int def)
     return rv;
 }
 
+int jk_get_worker_lb_retries(jk_map_t *m, const char *wname, int def)
+{
+    char buf[1024];
+    int rv;
+    if (!m || !wname) {
+        return -1;
+    }
+
+    MAKE_WORKER_PARAM(LB_RETRIES_OF_WORKER);
+
+    rv = jk_map_get_int(m, buf, def);
+    if (rv < 1)
+        rv = 1;
+
+    return rv;
+}
+
+
 int jk_get_worker_recovery_opts(jk_map_t *m, const char *wname, int def)
 {
     char buf[PARAM_BUFFER_SIZE];
diff --git a/native/scripts/build/unix/.gitignore b/native/scripts/build/unix/.gitignore
new file mode 100644
index 0000000..9eac9d3
--- /dev/null
+++ b/native/scripts/build/unix/.gitignore
@@ -0,0 +1,3 @@
+install-sh
+ltmain.sh
+missing


---------------------------------------------------------------------
To unsubscribe, e-mail: users-unsubscr...@tomcat.apache.org
For additional commands, e-mail: users-h...@tomcat.apache.org

Reply via email to