Author: challngr
Date: Tue Jul 16 20:29:54 2013
New Revision: 1503870

URL: http://svn.apache.org/r1503870
Log:
UIMA-3081 Verify configuration before allowing startup.

Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc?rev=1503870&r1=1503869&r2=1503870&view=diff ============================================================================== --- uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc (original) +++ uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc Tue Jul 16 20:29:54 2013 @@ -59,6 +59,10 @@ class CheckDucc(DuccUtil): print " Check for agents on the nodes in nodefile. This option may be specified multiple time" print " for multiple nodefiles. The 'local' node is always checked" print "" + print " -c --configuration" + print " Do basic sanity checking on the configuration only. Note that configuration checking is always" + print " performed with most options. The [-c, --configuration] option does ONLY configuration checking." + print "" print " -u --user userid" print " Userid is the user whose processes check_ducc searches for. If not specified," print " the user executing check_ducc is used. If specified as 'all' then all ducc processes" @@ -66,7 +70,15 @@ class CheckDucc(DuccUtil): print "" print " -k --kill" print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This" - print " uses kill -9 and only kills processes owned by the invoking user." + print " uses kill -KILL (-9) and only kills processes owned by the invoking user." + print "" + print " -i --int" + print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This" + print " uses kill -INT (-2) and only kills processes owned by the invoking user." 
+ print "" + print " -q --quit" + print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This" + print " uses kill -QUIT (-3) and only kills processes owned by the invoking user." print "" print " -p --pids" print " Rewrite the PID file. The PID file is always rewritten if any changes to processes are made. Sometimes" @@ -85,43 +97,55 @@ class CheckDucc(DuccUtil): def main(self, argv): - self.show_ducc_environment() - try: - opts, args = getopt.getopt(argv, 'kn:prs:u:h?v') + opts, args = getopt.getopt(argv, 'cikn:pqrs:u:h?v', ['--configuration', '--nodelist=', '--user=', '--int', '--quit', '--kill', '--pids', '--reap', '--version']) except: self.usage("Invalid arguments " + ' '.join(argv)) nodefiles = [] user = os.environ['LOGNAME'] victim_user = None - kill = False + kill_signal = None reap = False redo_pids = False process_changes = False do_validate = False checkdate = 0 - + config_only = False + for ( o, a ) in opts: - if ( o == '-n' ) : + if o in ('-c', '--configuration'): + config_only = True + elif o in ('-n', '--nodelist'): nodefiles.append(a) - elif ( o == '-k' ) : - kill = True - elif ( o == '-u' ) : + elif o in ('-i', '--int'): + if ( kill_signal != None ): + print 'Conflicting kill signals: -INT and', kill_signal + return + kill_signal = '-INT' + elif o in ('-q', '--quit'): + if ( kill_signal != None ): + print 'Conflicting kill signals: -QUIT and', kill_signal + return + kill_signal = '-QUIT' + elif o in ('-k', '--kill'): + if ( kill_signal != None ): + print 'Conflicting kill signals: -KILL and', kill_signal + return + kill_signal = '-KILL' + elif o in ('-u', '--user'): victim_user = a - elif ( o == '-v'): - ducc_util.version() - elif ( o == '-r'): + elif o in ('-r', '--reap'): reap = True - elif ( o == '-p'): + elif o in ('-p', '--pids'): redo_pids = True - elif ( o == '-s'): + elif o in ('-s'): # intended to be called recursively from check_ducc, NOT from the command line do_validate = True checkdate = float(a) - 
elif ( o in ('-h', '-?') ): + elif o in ('-h', '-?', '--help'): self.usage(None) - elif ( o == '-v'): + elif o in ('-v', '--version'): self.version(None) else: print 'badarg', a @@ -134,6 +158,13 @@ class CheckDucc(DuccUtil): self.validate(checkdate) return + os.system('cat ' + self.DUCC_HOME + '/state/duccling.version') + # not -s option, do this only on local node + env = self.show_ducc_environment() + for e in env: + print e + + if ( reap and (user == 'ducc') ): usage('Can only reap non-udcc users') @@ -172,6 +203,13 @@ class CheckDucc(DuccUtil): if ( len(localnodes) > 0 ): nodes['local'] = localnodes + self.verify_jvm() + if self.verify_class_configuration(nodes): + print "OK: Class configuration checked" + + if ( config_only ): + return + # checking starts here checked = {} for (nodefile, nodelist) in nodes.items(): @@ -203,14 +241,14 @@ class CheckDucc(DuccUtil): continue process_id = found_user + ' ' + component + '@' + node + ' PID ' + pid - if ( kill ) : + if ( kill_signal != None ) : if ( user != found_user ): print spacer, "Not killing someone else's process.", process_id elif ( component == 'unknown-java' ): print spacer, 'Not killing non-ducc process', process_id else: - print spacer, 'Killing (kill -9)', process_id - self.kill_process(node, proc) + print spacer, 'Killing (' + kill_signal + ')', process_id + self.kill_process(node, proc, kill_signal) pids.delete(pid) process_changes = True elif ( reap ): @@ -236,7 +274,7 @@ class CheckDucc(DuccUtil): else: print 'no processes found.' 
- if ( not (kill or reap) ): + if ( not ((kill_signal != None) or reap) ): lines = self.ssh(node, True, self.DUCC_HOME + "/admin/check_ducc", "-s", str(int(time()))) while 1: line = lines.readline() @@ -249,7 +287,7 @@ class CheckDucc(DuccUtil): if ( reap ): return - if ( kill ): + if ( kill_signal != None ): self.stop_broker() self.remove_orchestrator_lock() Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py?rev=1503870&r1=1503869&r2=1503870&view=diff ============================================================================== --- uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py (original) +++ uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py Tue Jul 16 20:29:54 2013 @@ -235,10 +235,10 @@ class DuccUtil(DuccBase): def verify_jvm(self): jvm = self.java() - CMD = jvm + ' -fullversion > /dev/null 2>&1' + CMD = jvm + ' -version > /dev/null 2>&1' rc = os.system(CMD) if ( rc != 0 ): - print 'NOTOK', CMD, 'returns', rc, '. Must return rc 0. Startup cannot continue.' + print 'NOTOK', CMD, 'returns', int(rc), '. Must return rc 0. Startup cannot continue.' 
return False return True @@ -477,8 +477,8 @@ class DuccUtil(DuccBase): except: print 'Unable to remove orchestrator lock' - def kill_process(self, node, proc): - self.ssh(node, False, 'kill', '-KILL', proc[1]) + def kill_process(self, node, proc, signal): + self.ssh(node, False, 'kill', signal, proc[1]) def clean_shutdown(self): DUCC_JVM_OPTS = ' -Dducc.deploy.configuration=' + self.DUCC_HOME + "/resources/ducc.properties " @@ -547,7 +547,7 @@ class DuccUtil(DuccBase): manifest = DuccProperties() manifest.load_from_manifest(self.DUCC_HOME + '/lib/' + j) - response.append('ENV: %25s %18s %12s %s' % (j + ':', manifest.get('Ducc-Version'), 'compiled at', manifest.get('Ducc-Build-Date'))) + response.append('ENV: %25s %18s %12s %s' % (j + ':', manifest.get('Ducc-Version'), 'compiled at', manifest.get('Build-Date'))) return response @@ -590,6 +590,147 @@ class DuccUtil(DuccBase): #print 'RETURN', nodefile, ret return ret + def compare_nodes(self, n1, n2): + + if ( n1 == n2 ): # exact match - covers both short and both long + return True + + if ( n1.find('.') >= 0 ): # shortened n1 == n2? + t1 = n1.split('.') + n1A = t1[0] + if ( n1A == n2 ): + return True + + if ( n2.find('.') >= 0 ): # n1 == shortened n2? + t2 = n2.split('.') + n2A = t2[0] + if ( n1 == n2A ): + return True + return False + + # + # Make sure all the nodes in the configured nodepools are also in the startup list + # + def check_nodepools(self, classprops, allnodes): + # + # First make sure that all the nodepools that are declared have definition files + # and that the defined nodes are in some nodelist. + # + nodepools = classprops.get('scheduling.nodepool').split() + nodepools_ok = True + for np in nodepools: + npkey = 'scheduling.nodepool.' 
+ np + npfilename = classprops.get(npkey) + if ( npfilename == None ): + print 'NOTOK: Missing nodepool definition file for Nodepool', np + nodepools_ok = False + continue + + npfile = self.DUCC_HOME + '/resources/' + npfilename + if ( not os.path.exists(npfile) ): + print 'NOTOK: Cannot find nodepool file', npfile + errors = errors + 1 + continue + + npnodes = {} + npnodes = self.read_nodefile(npfile, npnodes) + found = False + for ( impfile, nodes ) in npnodes.items(): + for node in nodes: + for (nodefile, nodelist) in allnodes.items(): + for n in nodelist: + if ( self.compare_nodes(n, node)): + found = True + break + if ( not found ): + print 'NOTOK: Cannot find node defined in pool "' +np+'" in any nodefile:', node + nodepools_ok = False + + # + # Now make sure that all classes that reference nodepools have corresponding + # nodepool definitions + # + + for ( k, v ) in classprops.items(): + if ( k.startswith('scheduling.class.') and k.endswith('.nodepool') ): + if ( not ( v in nodepools ) ): + toks = k.split('.') + classname = toks[2] + print 'NOTOK: Class', classname, 'references non-existent nodepool', v + nodepools_ok = False + + if ( nodepools_ok ): + print 'OK: All nodepools are verified' + else: + print 'NOTOK: some nodepools are not correctly defined.' + + return nodepools_ok + + def verify_class_configuration(self, allnodes): + answer = True + # first, find the class definition + classfile = self.ducc_properties.get('ducc.rm.class.definitions') + classfile = self.resolve(classfile, self.propsfile) # resolve the classfile relative to ducc.properties + + print 'Class definition file is', classfile + classprops = DuccProperties() + try: + classprops.load(classfile) + except: + print 'NOTOK: Cannot read properties file', classfile + return False + + # Verify nodepool definitions. 
+ if ( not self.check_nodepools(classprops, allnodes) ): + # this check will emit necessary messages + answer = False + + nodepools = classprops.get('scheduling.nodepool').split() + class_set = classprops.get('scheduling.class_set').split() + # first, make sure every class that is defined exists, has a policy, and a priority + # FAIR_SHARE classes, they must also have a weight + # if a nodeppol is assigned, it must also be one of the defined, and now verified, nodepools + for cl in class_set: + po = classprops.get('scheduling.class.' + cl +'.policy') + if ( po == None ): + print 'NOTOK: Missing policy definition for class "' + cl + '"' + answer = False + else: + we = classprops.get('scheduling.class.' + cl +'.share_weight') + if ( po == 'FAIR_SHARE' and we == None ): + print 'NOTOK: Missing "weight" definition for class: "' + cl + '"' + answer = False + + pr = classprops.get('scheduling.class.' + cl +'.priority') + if ( pr == None ): + print 'NOTOK: Missing priority definition for class: "' + cl + '"' + answer = False + + clnp = classprops.get('scheduling.class.' + cl +'.nodepool') + if ( clnp != None ): + if ( not clnp in nodepools ): + print 'NOTOK: Nodepool "' + clnp + '" is configured for class "' + cl + '" but has no definition.' + answer = False + + # Dig out the jobdriver class and insure it exists. + jdclass = self.ducc_properties.get('ducc.jd.host.class') + if ( not jdclass in class_set ): + print 'NOTOK: Job Driver class "' + jdclass + '" is not defined (see ducc.properties: ducc.jd.host.class).' + answer = False + + # if a default.name and/or default.name.reserve class is defined, make sure they exist + default_class = classprops.get('scheduling.default.name') + if ( (default_class != None) and (not default_class in class_set) ): + print 'NOTOK: Default class "' + default_class + '" is not defined.' 
+ answer = False + + default_reserve_class = classprops.get('scheduling.default.name.reserve') + if ( (default_reserve_class != None) and (not default_reserve_class in class_set) ): + print 'NOTOK: Default reserve class "' + default_reserve_class + '" is not defined.' + answer = False + + return answer + def __init__(self): DuccBase.__init__(self) self.duccling = None Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc?rev=1503870&r1=1503869&r2=1503870&view=diff ============================================================================== --- uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc (original) +++ uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc Tue Jul 16 20:29:54 2013 @@ -289,6 +289,12 @@ class StartDucc(DuccUtil): print "Can't read nodefile", nf ok = False + if ok and self.verify_class_configuration(nodes): + print "OK: Class configuration checked" + else: + print "NOTOK: Bad configuration, cannot start." 
+ ok = False + if ( not ok ): sys.exit(1) Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java?rev=1503870&r1=1503869&r2=1503870&view=diff ============================================================================== --- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java (original) +++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Scheduler.java Tue Jul 16 20:29:54 2013 @@ -341,10 +341,12 @@ public class Scheduler Map<String, String> readNodepoolFile(String npfile) { + String methodName = "readNodepoolFile"; String my_domain = getDomainName(); String ducc_home = System.getProperty("DUCC_HOME"); npfile = ducc_home + "/resources/" + npfile; + logger.info(methodName, null, "Domain name:", my_domain); Map<String, String> response = new HashMap<String, String>(); try { @@ -401,7 +403,7 @@ public class Scheduler // read in nodepools String npn = props.getProperty("scheduling.nodepool"); if ( npn != null ) { - String[] npnames = npn.split(" "); + String[] npnames = npn.split("\\s+"); for ( String nodepoolName : npnames ) { int nporder = props.getIntProperty("scheduling.nodepool." + nodepoolName + ".order", 100); String npfile = props.getProperty("scheduling.nodepool." + nodepoolName).trim();