Author: challngr
Date: Wed Aug  7 15:05:38 2013
New Revision: 1511347

URL: http://svn.apache.org/r1511347
Log:
UIMA-3085 Handle race with instance death and restart, which was prematurely
          terminating dependent jobs.

Modified:
    
uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java
    
uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java

Modified: 
uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java
URL: 
http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java?rev=1511347&r1=1511346&r2=1511347&view=diff
==============================================================================
--- 
uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java
 (original)
+++ 
uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java
 Wed Aug  7 15:05:38 2013
@@ -650,15 +650,28 @@ public class ServiceHandler
                 resolveState(id, s);
             } 
 
-            // now factor in cumulative state of the implementors and manage 
the ping thread as needed
-            sset.establish(id, w.getJobState());
-
-            // State is established.  Now, if the instance died, remove it - 
OR will keep publishing it for a while and we want to ignore those
+            // See what happened to the instance ...
             if (  w.isActive() ) {
                 // Hard to know for sure, if there are a bunch of instances, 
some working and some not, how to manage this.
                 // But this is a state *change* of something, and the 
something is active, so probably the service is OK now
                 // if it hadn't been before.
-                //  sset.resetRunFailures();
+                
+                // Need to be cautious here - this will get reset if ANYthing 
is running.  So we could have a bunch
+                // of live instances and some new ones, where the live ones 
are ok but for some reason we can't start
+                // new ones, in which case this gets set too often.
+                //
+                // This seems like it would be rare and since we aren't 
actually pounding restarts (only attempts every
+                // SM cycle) maybe it's ok.  The alternative is to track state 
changes which is added complexity - for
+                // what gain, we need to determine with experience.
+                //
+                // I suppose the ServiceManagerHandler could easily track the 
per-process state change - we'd have to
+                // modify the thing in the map it passes in to show 'before' 
and 'after' states instead of just passing
+                // in the DuccWork thing.
+                //
+                JobState          state = w.getJobState();
+                if ( state == JobState.Running ) {         // only if we 
confirm it's alive
+                    sset.resetRunFailures();
+                }
             } else {
                 JobState          state = w.getJobState();
                 
@@ -686,6 +699,9 @@ public class ServiceHandler
                 }
             }
 
+            // Now factor in cumulative state of the implementors and manage 
the ping thread as needed
+            sset.establish(id, w.getJobState());
+
             if ( (sset.getServiceState() == ServiceState.NotAvailable) && 
(sset.countReferences() == 0) && (sset.countImplementors() == 0) ) {
                 // this service is now toast.  remove from our maps asap to 
avoid clashes if it gets
                 // resubmitted before the OR can purge it.

Modified: 
uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java
URL: 
http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java?rev=1511347&r1=1511346&r2=1511347&view=diff
==============================================================================
--- 
uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java
 (original)
+++ 
uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java
 Wed Aug  7 15:05:38 2013
@@ -957,6 +957,12 @@ public class ServiceSet
         if ( true ) {
             implementors.put(id, job_state);
             ServiceState cumulative = cumulativeJobState();
+            //
+            // Note on the CUMULATIVE state: this is the cumulative state as 
determined by service processes.  If they
+            // should all die at once through some temporary glitch the state 
could go to NotAvailable even though the
+            // SM would now be in active retry - the states below avoid 
state regression if CUMULATIVE goes to
+            // NotAvailable but the retry count indicates retry is still in 
progress.
+            //
 
             //
             // The ping state is pretty much always the right state.  But if 
we're
@@ -995,8 +1001,13 @@ public class ServiceSet
                             setServiceState(ServiceState.Initializing);
                             break;
                         case NotAvailable:
-                            setServiceState(ServiceState.NotAvailable);
-                            stopPingThread();
+                            if ( failure_run >= failure_max ) {
+                                setServiceState(ServiceState.NotAvailable);
+                                stopPingThread();
+                            } else {
+                                // don't regress if we're in retry
+                                logger.info(methodName, id, "RETRY RETRY RETRY 
prevents state regression from Initializing");
+                            }
                           break;
                     }
                     break;
@@ -1029,8 +1040,13 @@ public class ServiceSet
                             setServiceState(ServiceState.Initializing);
                             break;
                         case NotAvailable:
-                            stopPingThread();
-                            setServiceState(ServiceState.NotAvailable);
+                            if ( failure_run >= failure_max ) {
+                                setServiceState(ServiceState.NotAvailable);
+                                stopPingThread();
+                            } else {
+                                // don't regress if we're in retry
+                                logger.info(methodName, id, "RETRY RETRY RETRY 
prevents state regression from Available");
+                            }
                             break;
                     }
 
@@ -1049,8 +1065,13 @@ public class ServiceSet
                             break;
                         case Waiting:
                             break;
-                        case NotAvailable:                
-                            setServiceState(ServiceState.NotAvailable);
+                        case NotAvailable:   
+                            if ( failure_run >= failure_max ) {
+                                setServiceState(ServiceState.NotAvailable);
+                            } else {
+                                // don't regress if we're in retry
+                                logger.info(methodName, id, "RETRY RETRY RETRY 
prevents state regression from Waiting");
+                            }
                             stopPingThread();
                             break;
                     }
@@ -1098,7 +1119,7 @@ public class ServiceSet
     synchronized boolean excessiveRunFailures()
     {
         String methodName = "runFailures";
-        if ( (++failure_run) > failure_max ) {
+        if ( (++failure_run) >= failure_max ) {
             logger.debug(methodName, id, "RUN FAILURES EXCEEDED");
             return true;
         }


Reply via email to