On Tue, Oct 26, 2010 at 12:26 PM, Jeff Trawick <[email protected]> wrote:
> The biggest problem here is potentially waiting a long time for a
> specific child process before moving on to the next. That's
> potentially 30 seconds * huge-number.  Also, I think it is worthwhile
> setting up the code so that it is easy to tweak the sleeps and
> actions, like the MPMs have.
>
> Look for ap_reclaim_child_processes() at
> http://svn.apache.org/viewvc/httpd/httpd/trunk/server/mpm_unix.c?view=markup

a lightly tested fix is attached
Index: modules/fcgid/fcgid_pm_main.c
===================================================================
--- modules/fcgid/fcgid_pm_main.c       (revision 1027924)
+++ modules/fcgid/fcgid_pm_main.c       (working copy)
@@ -360,49 +360,108 @@
     }
 }
 
-static void kill_all_subprocess(server_rec * main_server)
+typedef enum action_t {DO_NOTHING, KILL_GRACEFULLY, KILL_FORCEFULLY,
+                       GIVE_UP} action_t;
+
+static int reclaim_one_pid(server_rec *main_server, fcgid_procnode *proc,
+                           action_t action)
 {
-    size_t i;
     int exitcode;
     apr_exit_why_e exitwhy;
-    fcgid_procnode *proc_table = proctable_get_table_array();
 
-    /* Kill gracefully */
-    for (i = 0; i < proctable_get_table_size(); i++) {
-        if (proc_table[i].proc_pool)
-            proc_kill_gracefully(&proc_table[i], main_server);
+    if (apr_proc_wait(&proc->proc_id, &exitcode, &exitwhy,
+                      APR_NOWAIT) != APR_CHILD_NOTDONE) {
+        proc->diewhy = FCGID_DIE_SHUTDOWN;
+        proc_print_exit_info(proc, exitcode, exitwhy,
+                             main_server);
+        apr_pool_destroy(proc->proc_pool);
+        proc->proc_pool = NULL;
+        return 1;
     }
-    apr_sleep(apr_time_from_sec(1));
 
-    /* Kill with SIGKILL if it doesn't work */
-    for (i = 0; i < proctable_get_table_size(); i++) {
-        if (proc_table[i].proc_pool) {
-            if (apr_proc_wait(&(proc_table[i].proc_id), &exitcode, &exitwhy,
-                              APR_NOWAIT) != APR_CHILD_NOTDONE) {
-                proc_table[i].diewhy = FCGID_DIE_SHUTDOWN;
-                proc_print_exit_info(&proc_table[i], exitcode, exitwhy,
-                                     main_server);
-                apr_pool_destroy(proc_table[i].proc_pool);
-                proc_table[i].proc_pool = NULL;
-            }
-            else
-                proc_kill_force(&proc_table[i], main_server);
-        }
+    switch(action) {
+    case DO_NOTHING:
+        break;
+
+    case KILL_GRACEFULLY:
+        proc_kill_gracefully(proc, main_server);
+        break;
+
+    case KILL_FORCEFULLY:
+        ap_log_error(APLOG_MARK, APLOG_ERR, 0, main_server,
+                     "FastCGI process %" APR_PID_T_FMT
+                     " still did not exit, "
+                     "terminating forcefully",
+                     proc->proc_id.pid);
+        proc_kill_force(proc, main_server);
+        break;
+
+    case GIVE_UP:
+        ap_log_error(APLOG_MARK, APLOG_ERR, 0, main_server,
+                     "could not make FastCGI process %" APR_PID_T_FMT
+                     " exit",
+                     proc->proc_id.pid);
+        break;
     }
 
-    /* Wait again */
-    for (i = 0; i < proctable_get_table_size(); i++) {
-        if (proc_table[i].proc_pool) {
-            if (apr_proc_wait(&(proc_table[i].proc_id), &exitcode, &exitwhy,
-                              APR_WAIT) != APR_CHILD_NOTDONE) {
-                proc_table[i].diewhy = FCGID_DIE_SHUTDOWN;
-                proc_print_exit_info(&proc_table[i], exitcode, exitwhy,
-                                     main_server);
-                apr_pool_destroy(proc_table[i].proc_pool);
-                proc_table[i].proc_pool = NULL;
+    return 0;
+}
+
+static void kill_all_subprocess(server_rec *main_server)
+{
+    apr_time_t waittime = 1024 * 16;
+    int i;
+    int not_dead_yet;
+    int cur_action, next_action;
+    apr_time_t starttime = apr_time_now();
+    struct {
+        action_t action;
+        apr_time_t action_time;
+    } action_table[] = {
+        {DO_NOTHING,      0},  /* index 0 must be DO_NOTHING */
+        {KILL_GRACEFULLY, apr_time_from_sec(0)},
+        {KILL_GRACEFULLY, apr_time_from_sec(1)},
+        {KILL_FORCEFULLY, apr_time_from_sec(8)},
+        {GIVE_UP,         apr_time_from_sec(10)}
+    };
+    fcgid_procnode *proc_table = proctable_get_table_array();
+
+    next_action = 1;
+    do {
+        apr_sleep(waittime);
+        /* don't let waittime get longer than 1 second; otherwise, we don't
+         * react quickly to the last child exiting, and taking action can
+         * be delayed
+         */
+        waittime = waittime * 4;
+        if (waittime > apr_time_from_sec(1)) {
+            waittime = apr_time_from_sec(1);
+        }
+
+        /* see what action to take, if any */
+        if (action_table[next_action].action_time <= apr_time_now() - 
starttime) {
+            cur_action = next_action;
+            ++next_action;
+        }
+        else {
+            cur_action = 0; /* index of DO_NOTHING entry */
+        }
+
+        /* now see who is done */
+        not_dead_yet = 0;
+        for (i = 0; i < proctable_get_table_size(); i++) {
+            if (!proc_table[i].proc_pool) {
+                continue; /* unused */
             }
+
+            if (!reclaim_one_pid(main_server, &proc_table[i],
+                                 action_table[cur_action].action)) {
+                ++not_dead_yet;
+            }
         }
-    }
+        
+    } while (not_dead_yet &&
+             action_table[cur_action].action != GIVE_UP);
 }
 
 /* This should be proposed as a stand-alone improvement to the httpd module,

Reply via email to