Author: challngr Date: Tue May 21 13:35:25 2013 New Revision: 1484806 URL: http://svn.apache.org/r1484806 Log: UIMA-2929 Move some site-local checks to GUCC-global checks.
Removed: uima/sandbox/uima-ducc/trunk/src/main/admin/DuccHello.java Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc uima/sandbox/uima-ducc/trunk/src/main/admin/ducc.py uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py uima/sandbox/uima-ducc/trunk/src/main/admin/local_hooks.py uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc?rev=1484806&r1=1484805&r2=1484806&view=diff ============================================================================== --- uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc (original) +++ uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc Tue May 21 13:35:25 2013 @@ -39,6 +39,7 @@ class CheckDucc(DuccUtil): def validate(self, checkdate): verify_slave_node(checkdate, self.ducc_properties) + self.check_clock_skew(checkdate) def verify_activemq(self): if ( self.is_amq_active() ): Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/ducc.py URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/ducc.py?rev=1484806&r1=1484805&r2=1484806&view=diff ============================================================================== --- uima/sandbox/uima-ducc/trunk/src/main/admin/ducc.py (original) +++ uima/sandbox/uima-ducc/trunk/src/main/admin/ducc.py Tue May 21 13:35:25 2013 @@ -23,6 +23,7 @@ import os import os.path import sys import getopt +import time from ducc_util import DuccUtil from ducc_util import DuccProperties @@ -76,6 +77,12 @@ class Ducc(DuccUtil): print "Must start agents separately" sys.exit(1) + if ( not self.verify_jvm() ): + return + + if ( not self.check_clock_skew(localdate) ): + return + dok = self.verify_duccling() if ( not dok ): print 'NOTOK ducc_ling is not set up correctly on node', self.localhost @@ -86,6 +93,8 @@ class Ducc(DuccUtil): # we assume that verify_local_node is spewing a line of the form # NOTOK error 
message # if all is not fine + print '0 ONE RETURNS' + return jvm_opts.append('-Djava.library.path=' + self.DUCC_HOME) @@ -172,7 +181,7 @@ class Ducc(DuccUtil): if ( args != None ): cmd.append(args) - #print 'CMD', cmd + #print 'CMD', cmd if ( pid == None ): if ( background ): pid = self.nohup(cmd) Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py?rev=1484806&r1=1484805&r2=1484806&view=diff ============================================================================== --- uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py (original) +++ uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py Tue May 21 13:35:25 2013 @@ -458,6 +458,48 @@ class DuccUtil: CLASSPATH = LIB + '/ducc-submit.jar' os.environ['CLASSPATH'] = CLASSPATH + def check_clock_skew(self, localdate): + user = os.environ['LOGNAME'] + bypass = (user != 'ducc') + + if bypass: + tag = 'NOTE' + else: + tag = 'NOTOK' + + # Check clock skew + ok = True + acceptable_skew = 300 + skew = abs(long(localdate) - long(time.time())) + if ( skew > (acceptable_skew) ): + ok = False + print tag, 'Clock skew[', skew, '] on', os.uname()[1], ". Remote time is", time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.localtime()) + return ok or bypass + + def check_orchestrator_lock(self): + lock = self.DUCC_HOME + '/state/orchestrator.lock' + if ( os.path.exists(lock) ): + print 'NOTOK WARNING The Orchestrator lock file', lock, 'exists. WARNING NOTOK' + print 'NOTOK WARNING Insure the Orchestrator is not running and clear this lock. WARNING NOTOK' + print 'NOTOK WARNING When the lock is clear try restarting the Orchestrator component. 
WARNING NOTOK' + time.sleep(5) + + return False + return True + + def get_duccling_version(self): + CMD = self.duccling + ' -v >' + self.DUCC_HOME + '/state/duccling.version' + os.system(CMD) + + def verify_jvm(self): + jvm = self.java() + CMD = jvm + ' -fullversion > /dev/null' + rc = os.system(CMD) + if ( rc != 0 ): + print 'NOTOK', CMD, 'returns', rc, '. Must return rc 0. Startup cannot continue.' + return False + return True + def verify_duccling(self): check_permission = True # if we're not ducc we don't care about permissions @@ -510,6 +552,25 @@ class DuccUtil: print "Missing ducc_ling" return False + # now make sure the version matches that on the master node + lines = self.popen(self.duccling + ' -v') + version_from_head = lines.readline().strip(); + + version_file = self.DUCC_HOME + '/state/duccling.version'; + if ( os.path.exists(version_file) ): + verfile = open(version_file) + for line in verfile: + line = line.strip(); + if ( line != version_from_head ): + print "Mismatched ducc_ling versions:" + print "MASTER version:", version_from_head + print "LOCAL version:", line + return False + verfile.close() + else: + print "ducc_ling version file missing, cannot verify version." + return False; + print 'ducc_ling OK' return True Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/local_hooks.py URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/local_hooks.py?rev=1484806&r1=1484805&r2=1484806&view=diff ============================================================================== --- uima/sandbox/uima-ducc/trunk/src/main/admin/local_hooks.py (original) +++ uima/sandbox/uima-ducc/trunk/src/main/admin/local_hooks.py Tue May 21 13:35:25 2013 @@ -1,26 +1,180 @@ #!/usr/bin/python +import subprocess +import os +import time +import grp, pwd +import platform +import resource + # -# Returns a list of 3-tuples of processes other than known ducc processes, for -# display by check_ducc. 
Each 3-tupls is of the form -# (proccessnamd, pid, user) -# where -# processid is the name of the process (preferably the short name, like 'java') -# pid is the pid of the process -# user is the owner of the process +# this is a watson plugin to check_ducc to find java-remote2 processes # + def find_other_processes(pid, user, line): - return [] + if ( line.find('-DBobsComponent=jr2') >= 0 ): + return [('java-remote2', pid, user)] -# -# This performs any installation-wise checking needed on each of the slave ducc nodes. -# If False is returned, ducc will not start an agent on the node. + return [('unknown-java', pid, user)] + +# +# This is run via ssh on every node in ducc.properties +# +# This performs watson-local node verification on startup and prevents +# agents from starting on non-viable nodes. +# +# Make sure the new home filesystem is used: +# df /home and make sure it is mounted from bluej670 +# Make sure all nfs mounts are good: /data/admin/test-mounts.pl => +# this returns OK or problems (From eae' stest-mounts.pl) +# +# Make sure the system clock is close to bluej672 +# +# Make sure user and group permissions are good: +# id ducc => make sure it has group=ducc and also has access to group=saicluster +# id challngr => make sure group=sailimited and has access to group=saicluster +# +# Make sure network DNS is good: ping bluej333 and make sure it is +# trying to: PING bluej333.bluej.net (192.168.2.103) 56(84) bytes of +# data +# +# Make sure prolog is installed ok: +# ls -l /usr/lib/sicstus-4.1.2/bin/jasper.jar => check that it is there +# ldd /usr/lib/libspnative4.1.2.so => check for no errors +# +# +# Make sure /tmp is not full :) +# def verify_slave_node(localdate, properties): - return True + + # if not ducc, don't enforce sanity, it's testing or a sim + user = os.environ['LOGNAME'] + bypass = (user != 'ducc') + + if bypass: + tag = 'NOTE' + else: + tag = 'NOTOK' + + # Check mounts and filesystems + cmd = '/data/admin/test-mounts.pl' + proc = 
subprocess.Popen(cmd, bufsize=0, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT) + p = proc.stdout + ok = True + while 1: + line = p.readline().strip() + #print os.uname()[1], line + if ( not line ): + break + if ( line != 'OK' ): + ok = False + + if ( not ok ): + print tag, 'Bad mounts on', os.uname()[1] + + # Make sure user ducc and group ducc exist user ducc is in group sailimited + # Verification of group and user ducc is done while verifying duccling + + if ( user == 'ducc' ): # in test, we don't want to make this check + specialgroup = 'sailimited' + grpinfo = grp.getgrnam('sailimited') + grmembers = grpinfo.gr_mem + + pwinfo = pwd.getpwnam('ducc') + if ( (grpinfo.gr_gid != pwinfo.pw_gid) and ( not ('ducc' in grmembers) ) ): + ok = False + print tag, 'User ducc is not in group "sailimited"' + + # make sure ping bluej333 is ok + node = 'bluej333' + #node = 'bluej658' + #node = 'bubba' + cmd = 'ping -c1 ' + node + proc = subprocess.Popen(cmd, bufsize=0, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT) + (sin, serr) = proc.communicate() + rc = proc.returncode + if ( rc == 1 ): + print tag, 'Ping resolves', node, 'but got no reply:' + ok = False + elif ( rc == 2 ): + print tag, 'Cannot ping', node, ':', sin.strip() + ok = False + + plat = platform.machine() + # make sure prolog is installed correctly + # ls -l /usr/lib/sicstus-4.1.2/bin/jasper.jar => check that it is there + if ( plat == 'x86_64' ): + prolog_jasper = '/usr/lib/sicstus-4.1.2/bin/jasper.jar' + prolog_so = '/usr/lib/libspnative4.1.2.so' + else: # ppc64 + prolog_jasper = '/usr/lib/sicstus-4.1.2/bin/jasper.jar' + prolog_so = '/usr/lib64/libspnative.so' + + try: + os.stat(prolog_jasper) + except: + print tag, "Cannot find", prolog_jasper + ok = False + + cmd = 'ldd ' + prolog_so + proc = subprocess.Popen(cmd, bufsize=0, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT) + (sin, sout) = proc.communicate() + rc = proc.returncode + + if ( rc != 0 ): + print tag, "Bad 
or missing prolog lib:", sin.strip() + ok = False + else: + lines = sin.split('\n') + for l in lines: + if ( l.find('not found') >= 0 ): + print tag, "Problem in ", prolog_so, ":", l.strip() + ok = False + + # make sure /tmp is not full + cmd = 'df -Pm /tmp' + proc = subprocess.Popen(cmd, bufsize=0, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT) + (sin, sout) = proc.communicate() + rc = proc.returncode + mintmp = 2000 # in mb + for l in sin.split('\n'): + if ( l.startswith('/') ): + toks = l.split() + if ( int(toks[3]) <= mintmp ): + print tag, '/tmp space is less than minimum of', int(mintmp), ":", line + ok = False + + # verify java. A hello must have been compiled by the master for this to work + here = os.getcwd() + os.chdir(os.environ['DUCC_HOME'] + '/admin') + java = properties.get('ducc.jvm') + if ( java == None ): + print tag, 'WARN: "ducc.jvm" is not configured, using "java" instead.' + java = 'java' + cmd = java + ' DuccHello' + proc = subprocess.Popen(cmd, bufsize=0, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT) + (sin, sout) = proc.communicate() + rc = proc.returncode + os.chdir(here) + if ( sin.strip() != 'hiya' ): + print tag, 'Cannot run java, HelloWorld failed.' + ok = False + + # check rlimits - rss and virtual must be unlimited + (softrss , hardrss) = resource.getrlimit(resource.RLIMIT_RSS) + (softvmem, hardvmem) = resource.getrlimit(resource.RLIMIT_AS) + if ( softrss != -1 ): + print tag, 'RSS limit is not unlimited:', softrss + ok = False + if ( softvmem != -1 ): + print tag, 'VMEM limit is not unlimited:', softvmem + ok = False + + return ok or bypass # -# This performs any installation-wise chacking on the master ducc node. -# If False is returned, ducc will not start. 
+# This is run on the master node (the "ducc head") before any of the verify_slave_node +# calls are made, to allow common setup or special tests # def verify_master_node(properties): return True Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc?rev=1484806&r1=1484805&r2=1484806&view=diff ============================================================================== --- uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc (original) +++ uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc Tue May 21 13:35:25 2013 @@ -91,7 +91,7 @@ class StartDucc(DuccUtil): line = lines.readline().strip() if ( not line ): break - # print '[] ' + line + #print '[] ' + line if ( line.startswith('PID') ): toks = line.split(' ') # get the PID print node, 'PID', toks[1] @@ -115,7 +115,7 @@ class StartDucc(DuccUtil): lines = self.ssh(host, True, "'", self.DUCC_HOME + '/admin/ducc.py', '-c' 'agent', '-b', '-d', str(time.time()), '--nodup', "'") while 1: line = lines.readline().strip() - #print '[]' + line + #print '[l]' + line if ( line.startswith('PID') ): toks = line.split(' ') pid = toks[1] @@ -210,6 +210,11 @@ class StartDucc(DuccUtil): for e in environ: print e + if ( not self.verify_jvm() ): + sys.exit(1); + + self.get_duccling_version(); + nodefiles = [] components = [] management = False @@ -256,6 +261,9 @@ class StartDucc(DuccUtil): if ( management ): components = self.default_components + if ( 'or' in components ): + self.check_orchestrator_lock() + if ( not verify_master_node(self.ducc_properties) ): print 'FAIL: Cannot run javac to run java verification' return @@ -276,7 +284,7 @@ class StartDucc(DuccUtil): if ( not ok ): sys.exit(1) - + # activeMQ needs to be started externally before starting any DUCC processes if ( self.automanage and ('broker' in components) ): if ( self.is_amq_active() ): @@ -297,6 +305,7 @@ class StartDucc(DuccUtil): # if we are asked to start any 
of the managemnt processes, do this first if ( len(components) != 0 ): print 'Starting', or_parms + for com in components: if ( com == 'broker' ): pass # already started