Users find it discouraging to check BOINC and see that an application hasn't
made any progress in hours. Although the eventual cutoff based on
rsc_fpops_bound is needed, it is hardly the best we can do. I believe what I'm
suggesting will be an improvement.

The proposed change provides an option for the timer thread to check whether a
science application still seems to be doing something useful. It's based on the
assumption that an application operating correctly will update fraction_done
frequently, and that if no update happens within a reasonable time the
application should be shut down. The shutdown is handled like the no-heartbeat
case, since at least some cases can be cured by a restart. Even if it's not a
direct help, having BOINC try to correct the situation ought to be less
discouraging to users.

I've based the "reasonable time" on the rsc_fpops_est/host_info.p_fpops runtime
approximation divided by 100, i.e. 1% of the estimated runtime. Although that
estimate is not accurate in any sense, it does make allowance for old, slow
systems. If the values needed to calculate that time are not available, the
period defaults to 1800 seconds, and on the short end there's a minimum of 120
seconds. The actual count used is, of course, based on the
running_interrupt_count value, so that time when the application is suspended
is excluded.

I've defaulted the option to off, so application builds using trunk code won't
have the feature unless a project decides to use it. The changes needed are in
the attached diffs. I've done some testing with builds of the S@H v7 Beta
application that include those changes plus code to simulate an unintended
looping condition. In that testing the change built and ran as I intended.
--
                                                         Joe
Index: boinc/api/boinc_api.cpp
===================================================================
--- boinc/api/boinc_api.cpp     (revision 23362)
+++ boinc/api/boinc_api.cpp     (working copy)
@@ -137,6 +137,11 @@
 static volatile int running_interrupt_count = 0;
     // number of timer interrupts while not suspended.
     // Used to compute elapsed time
+static volatile int progress_delay = 0;
+    // running_interrupt_count at last fraction_done change
+static volatile int progress_delay_limit = 99999;
+    // limit for no fraction_done change to assume application looping
+    // init for safety only
 static double fpops_per_cpu_sec = 0;
 static double fpops_cumulative = 0;
 static double intops_per_cpu_sec = 0;
@@ -381,6 +386,7 @@
     }

     boinc_status.no_heartbeat = false;
+    boinc_status.no_progress = false;
     boinc_status.suspended = false;
     boinc_status.quit_request = false;
     boinc_status.abort_request = false;
@@ -447,11 +453,20 @@
     }
     heartbeat_giveup_time = interrupt_count + HEARTBEAT_GIVEUP_COUNT;

+    if (options.check_progress) {
+        double x;
+        if ((aid.rsc_fpops_est > 0.) && (aid.host_info.p_fpops > 0.)) {
+            x = aid.rsc_fpops_est/aid.host_info.p_fpops/100.;
+            if (x < 120.) x = 120.;
+        } else x = 1800.;
+        progress_delay_limit = (int)(x/TIMER_PERIOD);
+    }
     return 0;
 }

 int boinc_get_status(BOINC_STATUS *s) {
     s->no_heartbeat = boinc_status.no_heartbeat;
+    s->no_progress = boinc_status.no_progress;
     s->suspended = boinc_status.suspended;
     s->quit_request = boinc_status.quit_request;
     s->reread_init_data_file = boinc_status.reread_init_data_file;
@@ -1025,6 +1040,27 @@
         }
     }

+    // see if we're making any progress, else need to die
+    // (unless we're in a critical section)
+    //
+    if (in_critical_section==0 && options.check_progress) {
+        if ((running_interrupt_count - progress_delay) >= progress_delay_limit) {
+            boinc_msg_prefix(buf, sizeof(buf));
+            buf[sizeof(buf) - 1] = 0;  // paranoia
+            fputs(buf, stderr);
+            fprintf(stderr,
+                " No progress in %.0f seconds run time - exiting\n",
+                (double)progress_delay_limit*TIMER_PERIOD
+            );
+
+            if (options.direct_process_action) {
+                exit_from_timer_thread(0);
+            } else {
+                boinc_status.no_progress = true;
+            }
+        }
+    }
+
     // don't bother reporting CPU time etc. if we're suspended
     //
     if (options.send_status_msgs && !boinc_status.suspended) {
@@ -1222,7 +1258,10 @@
 }

 int boinc_fraction_done(double x) {
-    fraction_done = x;
+    if (x != fraction_done) {
+        progress_delay = running_interrupt_count;
+        fraction_done = x;
+    }
     return 0;
 }

Index: boinc/api/boinc_api.h
===================================================================
--- boinc/api/boinc_api.h       (revision 23362)
+++ boinc/api/boinc_api.h       (working copy)
@@ -47,6 +47,8 @@
     int check_heartbeat;
         // check for timeout of heartbeats from the client;
         // action is determined by direct_process_action (see below)
+    int check_progress;
+        // action is determined by direct_process_action (see below)
     int handle_trickle_ups;
         // periodically check for trickle-up msgs from the app
         // must set this to use boinc_send_trickle_up()
@@ -59,13 +61,14 @@
     int send_status_msgs;
         // whether runtime system should send CPU time / fraction done msgs
     int direct_process_action;
-        // if heartbeat fail, or get process control msg, take
-        // direction action (exit, suspend, resume).
+        // if heartbeat fail, no recent progress, or get process control msg,
+        // take direct action (exit, suspend, resume).
         // Otherwise just set flag in BOINC status
 } BOINC_OPTIONS;

 typedef struct BOINC_STATUS {
     int no_heartbeat;
+    int no_progress;
     int suspended;
     int quit_request;
     int reread_init_data_file;
@@ -148,6 +151,7 @@
 inline void boinc_options_defaults(BOINC_OPTIONS& b) {
     b.main_program = 1;
     b.check_heartbeat = 1;
+    b.check_progress = 0;
     b.handle_trickle_ups = 1;
     b.handle_trickle_downs = 1;
     b.handle_process_control = 1;
_______________________________________________
boinc_dev mailing list
[email protected]
http://lists.ssl.berkeley.edu/mailman/listinfo/boinc_dev
To unsubscribe, visit the above URL and
(near bottom of page) enter your email address.

Reply via email to