Author: challngr
Date: Tue Jul 16 20:29:54 2013
New Revision: 1503870

URL: http://svn.apache.org/r1503870
Log:
UIMA-3081 Verify configuration before allowing startup.

Modified:
    uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc
    uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py
    uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc
    uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java

Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc?rev=1503870&r1=1503869&r2=1503870&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc Tue Jul 16 20:29:54 2013
@@ -59,6 +59,10 @@ class CheckDucc(DuccUtil):
         print "        Check for agents on the nodes in nodefile.  This option 
may be specified multiple time"
         print "        for multiple nodefiles.  The 'local' node is always 
checked"
         print ""
+        print "    -c --configuration"
+        print "        Do basic sanity checking on the configuration only.  
Note that configuration checking is always"
+        print "        performed with most options.  The [-c, --configuration] 
option does ONLY configuration checking."
+        print ""
         print "    -u --user userid"
         print "        Userid is the user whose processes check_ducc searches 
for.  If not specified,"
         print "        the user executing check_ducc is used.  If specified as 
'all' then all ducc processes"
@@ -66,7 +70,15 @@ class CheckDucc(DuccUtil):
         print ""
         print "    -k --kill"
         print "       Force-kill any DUCC process you find on a node (if 
normal stop_ducc isn't working.  This"
-        print "       uses kill -9 and only kills processes owned by the 
invoking user."
+        print "       uses kill -KILL (-9) and only kills processes owned by 
the invoking user."
+        print "" 
+        print "    -i --int"
+        print "       Force-kill any DUCC process you find on a node (if 
normal stop_ducc isn't working.  This"
+        print "       uses kill -INT (-2) and only kills processes owned by 
the invoking user."
+        print "" 
+        print "    -q --quit"
+        print "       Force-kill any DUCC process you find on a node (if 
normal stop_ducc isn't working.  This"
+        print "       uses kill -QUIT (-3) and only kills processes owned by 
the invoking user."
         print "" 
         print "    -p --pids"
         print "       Rewrite the PID file. The PID file is always rewritten 
if any changes to processes are made.  Sometimes"
@@ -85,43 +97,55 @@ class CheckDucc(DuccUtil):
     
     def main(self, argv):
 
-        self.show_ducc_environment()
-
         try:
-            opts, args = getopt.getopt(argv, 'kn:prs:u:h?v')
+            opts, args = getopt.getopt(argv, 'cikn:pqrs:u:h?v', 
['--configuration', '--nodelist=', '--user=', '--int', '--quit', '--kill', 
'--pids', '--reap', '--version'])
         except:
             self.usage("Invalid arguments " + ' '.join(argv))
     
         nodefiles = []
         user = os.environ['LOGNAME']
         victim_user = None
-        kill = False
+        kill_signal = None
         reap = False
         redo_pids = False
         process_changes = False
         do_validate = False
         checkdate = 0
-        
+        config_only = False
+
         for ( o, a ) in opts:
-            if ( o == '-n' ) :
+            if o in ('-c', '--configuration'):
+                config_only = True
+            elif o in ('-n', '--nodelist'):
                 nodefiles.append(a)
-            elif ( o == '-k' ) :
-                kill = True
-            elif ( o == '-u' ) :
+            elif o in ('-i', '--int'):
+                if ( kill_signal != None ):
+                    print 'Conflicting kill signals: -INT and', kill_signal
+                    return
+                kill_signal = '-INT'
+            elif o in ('-q', '--quit'):
+                if ( kill_signal != None ):
+                    print 'Conflicting kill signals: -QUIT and', kill_signal
+                    return
+                kill_signal = '-QUIT'
+            elif o in ('-k', '--kill'):
+                if ( kill_signal != None ):
+                    print 'Conflicting kill signals: -KILL and', kill_signal
+                    return
+                kill_signal = '-KILL'
+            elif o in ('-u', '--user'):
                 victim_user = a
-            elif ( o == '-v'):
-                ducc_util.version()
-            elif ( o == '-r'):
+            elif o in ('-r', '--reap'):
                 reap = True
-            elif ( o == '-p'):
+            elif o in ('-p', '--pids'):
                 redo_pids = True
-            elif ( o == '-s'):
+            elif o in ('-s'):
                 # intended to be called recursively from check_ducc, NOT from 
the command line
                 do_validate = True
                 checkdate = float(a)
-            elif ( o in ('-h', '-?') ):
+            elif o in ('-h', '-?', '--help'):
                 self.usage(None)
-            elif ( o == '-v'):
+            elif o in ('-v', '--version'):
                 self.version(None)
             else:
                 print 'badarg', a
@@ -134,6 +158,13 @@ class CheckDucc(DuccUtil):
             self.validate(checkdate)
             return
 
+        os.system('cat ' + self.DUCC_HOME + '/state/duccling.version')
+        # not -s option, do this only on local node
+        env = self.show_ducc_environment()
+        for e in env:
+            print e
+
+
         if ( reap and (user == 'ducc') ):
             usage('Can only reap non-udcc users')
 
@@ -172,6 +203,13 @@ class CheckDucc(DuccUtil):
         if ( len(localnodes) > 0 ):
             nodes['local'] = localnodes
 
+        self.verify_jvm()
+        if self.verify_class_configuration(nodes):
+            print "OK: Class configuration checked"
+
+        if ( config_only ):
+            return
+
         # checking starts here        
         checked = {}
         for (nodefile, nodelist) in nodes.items():
@@ -203,14 +241,14 @@ class CheckDucc(DuccUtil):
                             continue
 
                         process_id = found_user + ' ' + component + '@' + node 
+ ' PID ' + pid 
-                        if ( kill ) :
+                        if ( kill_signal != None ) :
                             if ( user != found_user ):
                                 print spacer, "Not killing someone else's 
process.", process_id
                             elif ( component == 'unknown-java' ):
                                 print spacer, 'Not killing non-ducc process', 
process_id
                             else:
-                                print spacer, 'Killing (kill -9)', process_id
-                                self.kill_process(node, proc)
+                                print spacer, 'Killing (' +  kill_signal + 
')', process_id
+                                self.kill_process(node, proc, kill_signal)
                                 pids.delete(pid)
                                 process_changes = True
                         elif ( reap ):
@@ -236,7 +274,7 @@ class CheckDucc(DuccUtil):
                 else:
                     print 'no processes found.'
 
-                if ( not (kill or reap) ):                    
+                if ( not ((kill_signal != None) or reap) ):                    
                     lines = self.ssh(node, True, self.DUCC_HOME + 
"/admin/check_ducc", "-s", str(int(time())))
                     while 1:
                         line = lines.readline()
@@ -249,7 +287,7 @@ class CheckDucc(DuccUtil):
         if ( reap ):
             return
 
-        if ( kill ):
+        if ( kill_signal != None ):
             self.stop_broker()
             self.remove_orchestrator_lock()
                 

Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py?rev=1503870&r1=1503869&r2=1503870&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py Tue Jul 16 20:29:54 2013
@@ -235,10 +235,10 @@ class DuccUtil(DuccBase):
 
     def verify_jvm(self):
         jvm = self.java()
-        CMD = jvm + ' -fullversion > /dev/null 2>&1'
+        CMD = jvm + ' -version > /dev/null 2>&1'
         rc = os.system(CMD)
         if ( rc != 0 ):
-            print 'NOTOK', CMD, 'returns', rc, '.  Must return rc 0.  Startup 
cannot continue.'
+            print 'NOTOK', CMD, 'returns', int(rc), '.  Must return rc 0.  
Startup cannot continue.'
             return False
         return True
 
@@ -477,8 +477,8 @@ class DuccUtil(DuccBase):
         except:
             print 'Unable to remove orchestrator lock'
 
-    def kill_process(self, node, proc):
-        self.ssh(node, False, 'kill', '-KILL', proc[1])
+    def kill_process(self, node, proc, signal):
+        self.ssh(node, False, 'kill', signal, proc[1])
                 
     def clean_shutdown(self):
         DUCC_JVM_OPTS = ' -Dducc.deploy.configuration=' + self.DUCC_HOME + 
"/resources/ducc.properties "
@@ -547,7 +547,7 @@ class DuccUtil(DuccBase):
 
             manifest = DuccProperties()
             manifest.load_from_manifest(self.DUCC_HOME + '/lib/' + j)
-            response.append('ENV: %25s %18s %12s %s' % (j + ':', 
manifest.get('Ducc-Version'), 'compiled at', manifest.get('Ducc-Build-Date')))
+            response.append('ENV: %25s %18s %12s %s' % (j + ':', 
manifest.get('Ducc-Version'), 'compiled at', manifest.get('Build-Date')))
 
         return response
 
@@ -590,6 +590,147 @@ class DuccUtil(DuccBase):
         #print 'RETURN', nodefile, ret
         return ret
 
+    def compare_nodes(self, n1, n2):
+
+        if ( n1 == n2 ):             # exact match - covers both short and 
both long
+            return True
+
+        if ( n1.find('.') >= 0 ):    # shortened n1 == n2?
+            t1 = n1.split('.')
+            n1A = t1[0]
+            if ( n1A == n2 ):
+                return True
+
+        if ( n2.find('.') >= 0 ):    # n1 == shortened n2?
+            t2 = n2.split('.')
+            n2A = t2[0]
+            if ( n1 == n2A ):
+                return True
+        return False
+
+    #
+    # Make sure all the nodes in the configured nodepools are also in the 
startup list
+    #
+    def check_nodepools(self, classprops, allnodes):
+        #
+        # First make sure that all the nodepools that are declared have 
definition files
+        # and that the defined nodes are in some nodelist.
+        #
+        nodepools = classprops.get('scheduling.nodepool').split()
+        nodepools_ok = True
+        for np in nodepools:
+            npkey = 'scheduling.nodepool.' + np
+            npfilename = classprops.get(npkey)
+            if ( npfilename == None ):
+                print 'NOTOK: Missing nodepool definition file for Nodepool', 
np
+                nodepools_ok = False
+                continue
+
+            npfile = self.DUCC_HOME + '/resources/' + npfilename
+            if ( not os.path.exists(npfile) ):
+                print 'NOTOK: Cannot find nodepool file', npfile
+                errors = errors + 1
+                continue
+
+            npnodes = {}
+            npnodes = self.read_nodefile(npfile, npnodes)
+            found = False
+            for ( impfile, nodes ) in npnodes.items():
+                for node in nodes:
+                    for (nodefile, nodelist) in allnodes.items():
+                        for n in nodelist:                        
+                            if ( self.compare_nodes(n, node)):
+                                found = True
+                                break                        
+                if ( not found ):
+                    print 'NOTOK: Cannot find node defined in pool "' +np+'" 
in any nodefile:', node
+                    nodepools_ok = False
+
+        #
+        # Now make sure that all classes that reference nodepools have 
corresponding
+        # nodepool definitions
+        #
+
+        for ( k, v ) in classprops.items():
+            if ( k.startswith('scheduling.class.') and k.endswith('.nodepool') 
):
+                if ( not ( v in nodepools ) ):
+                    toks = k.split('.')
+                    classname = toks[2]
+                    print 'NOTOK: Class', classname, 'references non-existent 
nodepool', v
+                    nodepools_ok = False
+
+        if ( nodepools_ok ):
+            print 'OK: All nodepools are verified'
+        else:
+            print 'NOTOK: some nodepools are not correctly defined.'
+
+        return nodepools_ok
+
+    def verify_class_configuration(self, allnodes):
+        answer = True
+        # first, find the class definition
+        classfile = self.ducc_properties.get('ducc.rm.class.definitions')
+        classfile = self.resolve(classfile, self.propsfile)    # resolve the 
classfile relative to ducc.properties
+
+        print 'Class definition file is', classfile
+        classprops = DuccProperties()
+        try:
+            classprops.load(classfile)
+        except:
+            print 'NOTOK: Cannot read properties file', classfile
+            return False
+
+        # Verify nodepool definitions.
+        if ( not self.check_nodepools(classprops, allnodes) ):
+            # this check will emit necessary messages
+            answer = False
+
+        nodepools = classprops.get('scheduling.nodepool').split()
+        class_set = classprops.get('scheduling.class_set').split()
+        # first, make sure every class that is defined exists, has a policy, 
and a priority
+        # FAIR_SHARE classes, they must also have a weight
+        # if a nodeppol is assigned, it must also be one of the defined, and 
now verified, nodepools
+        for cl in class_set:
+            po = classprops.get('scheduling.class.' + cl +'.policy')
+            if ( po == None ):
+                print 'NOTOK: Missing policy definition for class "' + cl + '"'
+                answer = False
+            else:
+                we = classprops.get('scheduling.class.' + cl +'.share_weight')
+                if ( po == 'FAIR_SHARE' and we == None ):
+                    print 'NOTOK: Missing "weight" definition for class: "' + 
cl + '"'
+                    answer = False
+                    
+            pr = classprops.get('scheduling.class.' + cl +'.priority')
+            if ( pr == None ):
+                print 'NOTOK: Missing priority definition for class: "' + cl + 
'"'
+                answer = False
+            
+            clnp = classprops.get('scheduling.class.' + cl +'.nodepool')
+            if ( clnp != None ):
+                if ( not clnp in nodepools ):
+                    print 'NOTOK: Nodepool "' + clnp + '" is configured for 
class "' + cl + '" but has no definition.'
+                    answer = False
+
+        # Dig out the jobdriver class and insure it exists.  
+        jdclass = self.ducc_properties.get('ducc.jd.host.class')
+        if ( not jdclass in class_set ):
+            print 'NOTOK: Job Driver class "' + jdclass + '" is not defined 
(see ducc.properties: ducc.jd.host.class).'
+            answer = False
+
+        # if a default.name and/or default.name.reserve class is defined, make 
sure they exist
+        default_class = classprops.get('scheduling.default.name')
+        if ( (default_class != None) and (not default_class in class_set) ):
+            print 'NOTOK: Default class "' + default_class + '" is not 
defined.'
+            answer = False
+
+        default_reserve_class = 
classprops.get('scheduling.default.name.reserve')
+        if ( (default_reserve_class != None) and (not default_reserve_class in 
class_set) ):
+            print 'NOTOK: Default reserve class "' + default_reserve_class + 
'" is not defined.'
+            answer = False
+
+        return answer
+
     def __init__(self):
         DuccBase.__init__(self)
         self.duccling = None

Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc?rev=1503870&r1=1503869&r2=1503870&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc Tue Jul 16 20:29:54 2013
@@ -289,6 +289,12 @@ class StartDucc(DuccUtil):
                 print "Can't read nodefile", nf
                 ok = False
 
+        if ok and self.verify_class_configuration(nodes):
+            print "OK: Class configuration checked"
+        else:
+            print "NOTOK: Bad configuration, cannot start."
+            ok = False
+
         if ( not ok ):
             sys.exit(1)
                 

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java?rev=1503870&r1=1503869&r2=1503870&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java Tue Jul 16 20:29:54 2013
@@ -341,10 +341,12 @@ public class Scheduler
 
     Map<String, String> readNodepoolFile(String npfile)
     {
+        String methodName = "readNodepoolFile";
         String my_domain = getDomainName();
         String ducc_home = System.getProperty("DUCC_HOME");
         npfile = ducc_home + "/resources/" + npfile;
 
+        logger.info(methodName, null, "Domain name:", my_domain);
         Map<String, String> response = new HashMap<String, String>();
 
         try {
@@ -401,7 +403,7 @@ public class Scheduler
         // read in nodepools
         String npn = props.getProperty("scheduling.nodepool");
         if ( npn != null ) {
-            String[] npnames = npn.split(" ");
+            String[] npnames = npn.split("\\s+");
             for ( String nodepoolName : npnames ) {
                 int nporder = props.getIntProperty("scheduling.nodepool." + 
nodepoolName + ".order", 100);                
                 String npfile = props.getProperty("scheduling.nodepool." + 
nodepoolName).trim();


Reply via email to