Author: challngr Date: Wed Aug 7 15:05:38 2013 New Revision: 1511347 URL: http://svn.apache.org/r1511347 Log: UIMA-3085 Handle the race between instance death and restart, which was prematurely terminating dependent jobs.
Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java?rev=1511347&r1=1511346&r2=1511347&view=diff ============================================================================== --- uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java (original) +++ uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java Wed Aug 7 15:05:38 2013 @@ -650,15 +650,28 @@ public class ServiceHandler resolveState(id, s); } - // now factor in cumulative state of the implementors and manage the ping thread as needed - sset.establish(id, w.getJobState()); - - // State is established. Now, if the instance died, remove it - OR will keep publishing it for a while and we want to ignore those + // See what happened to the instance ... if ( w.isActive() ) { // Hard to know for sure, if there are a bunch of instances, some working and some not, how to manage this. // But this is a state *change* of something, and the something is active, so probably the service is OK now // if it hadn't been before. - // sset.resetRunFailures(); + + // Need to be cautious here - this will get reset if ANYthing is running. So we could have a bunch + // of live instances and some new ones, where the live ones are ok but for some reason we can't start + // new ones, in which case this gets set too often. + // + // This seems like it would be rare and since we aren't actually pounding restarts (only attempts every + // SM cycle) maybe its ok. 
The alternative is to track state changes which is added complexity - for + // what gain, we need to determine with experience. + // + // I suppose the ServiceManagerHandler could easily track the per-process state change - we'd have to + // modify the thing in the map it passes in to show 'before' and 'after' states instead of just passing + // in the DuccWork thing. + // + JobState state = w.getJobState(); + if ( state == JobState.Running ) { // only if we confirm it's alive + sset.resetRunFailures(); + } } else { JobState state = w.getJobState(); @@ -686,6 +699,9 @@ public class ServiceHandler } } + // Now factor in cumulative state of the implementors and manage the ping thread as needed + sset.establish(id, w.getJobState()); + if ( (sset.getServiceState() == ServiceState.NotAvailable) && (sset.countReferences() == 0) && (sset.countImplementors() == 0) ) { // this service is now toast. remove from our maps asap to avoid clashes if it gets // resubmitted before the OR can purge it. Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java?rev=1511347&r1=1511346&r2=1511347&view=diff ============================================================================== --- uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java (original) +++ uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java Wed Aug 7 15:05:38 2013 @@ -957,6 +957,12 @@ public class ServiceSet if ( true ) { implementors.put(id, job_state); ServiceState cumulative = cumulativeJobState(); + // + // Note on the CUMULATIVE state: this is the cumulative state as determined by service processes. 
If they + // should all die at once through some temporary glitch the state could go to Unavailable even though the + // SM would now be in active retry - the states below avoid regression state if CUMULATIVE goes to + // Unavailable but the retry count indicates retry is still in progress. + // // // The ping state is pretty much always the right state. But if we're @@ -995,8 +1001,13 @@ public class ServiceSet setServiceState(ServiceState.Initializing); break; case NotAvailable: - setServiceState(ServiceState.NotAvailable); - stopPingThread(); + if ( failure_run >= failure_max ) { + setServiceState(ServiceState.NotAvailable); + stopPingThread(); + } else { + // don't regress if we're in retry + logger.info(methodName, id, "RETRY RETRY RETRY prevents state regression from Initializing"); + } break; } break; @@ -1029,8 +1040,13 @@ public class ServiceSet setServiceState(ServiceState.Initializing); break; case NotAvailable: - stopPingThread(); - setServiceState(ServiceState.NotAvailable); + if ( failure_run >= failure_max ) { + setServiceState(ServiceState.NotAvailable); + stopPingThread(); + } else { + // don't regress if we're in retry + logger.info(methodName, id, "RETRY RETRY RETRY prevents state regression from Available"); + } break; } @@ -1049,8 +1065,13 @@ public class ServiceSet break; case Waiting: break; - case NotAvailable: - setServiceState(ServiceState.NotAvailable); + case NotAvailable: + if ( failure_run >= failure_max ) { + setServiceState(ServiceState.NotAvailable); + } else { + // don't regress if we're in retry + logger.info(methodName, id, "RETRY RETRY RETRY prevents state regression from Waiting"); + } stopPingThread(); break; } @@ -1098,7 +1119,7 @@ public class ServiceSet synchronized boolean excessiveRunFailures() { String methodName = "runFailures"; - if ( (++failure_run) > failure_max ) { + if ( (++failure_run) >= failure_max ) { logger.debug(methodName, id, "RUN FAILURES EXCEEDED"); return true; }