Author: challngr
Date: Tue May 21 13:35:25 2013
New Revision: 1484806

URL: http://svn.apache.org/r1484806
Log:
UIMA-2929 Move some site-local checks to DUCC-global checks.

Removed:
    uima/sandbox/uima-ducc/trunk/src/main/admin/DuccHello.java
Modified:
    uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc
    uima/sandbox/uima-ducc/trunk/src/main/admin/ducc.py
    uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py
    uima/sandbox/uima-ducc/trunk/src/main/admin/local_hooks.py
    uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc

Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc?rev=1484806&r1=1484805&r2=1484806&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc Tue May 21 13:35:25 2013
@@ -39,6 +39,7 @@ class CheckDucc(DuccUtil):
 
     def validate(self, checkdate):
         verify_slave_node(checkdate, self.ducc_properties)
+        self.check_clock_skew(checkdate)
 
     def verify_activemq(self):
         if ( self.is_amq_active() ):

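The checkdate that validate() now hands to check_clock_skew is the master's clock, captured as a seconds-since-epoch string and shipped to each node over ssh. A minimal sketch of the test check_clock_skew (added in ducc_util.py below) applies; ACCEPTABLE_SKEW and skew_ok are illustrative names, not part of the commit:

    import time

    ACCEPTABLE_SKEW = 300   # seconds, mirroring acceptable_skew in check_clock_skew

    def skew_ok(localdate):
        # compare the master's clock (seconds-since-epoch string) to this node's
        skew = abs(long(localdate) - long(time.time()))
        return skew <= ACCEPTABLE_SKEW
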
Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/ducc.py
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/ducc.py?rev=1484806&r1=1484805&r2=1484806&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/ducc.py (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/ducc.py Tue May 21 13:35:25 2013
@@ -23,6 +23,7 @@ import os
 import os.path
 import sys
 import getopt
+import time
 
 from ducc_util import DuccUtil
 from ducc_util import DuccProperties
@@ -76,6 +77,12 @@ class Ducc(DuccUtil):
                     print "Must start agents separately"
                     sys.exit(1)
                     
+                if ( not self.verify_jvm() ):
+                    return
+
+                if ( not self.check_clock_skew(localdate) ):
+                    return
+
                 dok = self.verify_duccling()
                 if ( not dok ):
                    print 'NOTOK ducc_ling is not set up correctly on node', self.localhost
@@ -86,6 +93,8 @@ class Ducc(DuccUtil):
                    # we assume that verify_local_node is spewing a line of the form
                     #    NOTOK error message
                     # if all is not fine
+                    print '0 ONE RETURNS'
+
                     return
 
                 jvm_opts.append('-Djava.library.path=' + self.DUCC_HOME) 
@@ -172,7 +181,7 @@ class Ducc(DuccUtil):
         if ( args != None ):
             cmd.append(args)
 
-        #print 'CMD', cmd
+            #print 'CMD', cmd
         if ( pid == None ):
             if ( background ):
                 pid = self.nohup(cmd)

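For context, the localdate compared above originates on the master: start_ducc ships str(time.time()) as the -d argument when it starts each agent over ssh (visible in the start_ducc diff below). A condensed, hypothetical sketch of that caller side; start_remote_agent, host, and ducc_home are illustrative names:

    import subprocess, time

    def start_remote_agent(host, ducc_home):
        # ship the master's clock so the remote ducc.py can measure skew
        cmd = ['ssh', host, ducc_home + '/admin/ducc.py',
               '-c', 'agent', '-b', '-d', str(time.time()), '--nodup']
        return subprocess.call(cmd)
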
Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py?rev=1484806&r1=1484805&r2=1484806&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py Tue May 21 13:35:25 2013
@@ -458,6 +458,48 @@ class DuccUtil:
         CLASSPATH = LIB + '/ducc-submit.jar'
         os.environ['CLASSPATH'] = CLASSPATH
 
+    def check_clock_skew(self, localdate):
+        user = os.environ['LOGNAME']
+        bypass = (user != 'ducc')
+        
+        if bypass:
+            tag = 'NOTE'
+        else:
+            tag = 'NOTOK'
+
+        # Check clock skew
+        ok = True
+        acceptable_skew = 300
+        skew = abs(long(localdate) - long(time.time()))
+        if ( skew > (acceptable_skew) ):
+            ok = False
+            print tag, 'Clock skew[', skew, '] on', os.uname()[1], ". Remote time is", time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())
+        return ok or bypass
+
+    def check_orchestrator_lock(self):
+        lock = self.DUCC_HOME + '/state/orchestrator.lock'
+        if ( os.path.exists(lock) ):
+            print 'NOTOK WARNING The Orchestrator lock file', lock, 'exists. WARNING NOTOK'
+            print 'NOTOK WARNING Ensure the Orchestrator is not running and clear this lock.                             WARNING NOTOK'
+            print 'NOTOK WARNING When the lock is clear try restarting the Orchestrator component.                        WARNING NOTOK'
+            time.sleep(5)
+
+            return False
+        return True
+
+    def get_duccling_version(self):
+        CMD = self.duccling + ' -v >' + self.DUCC_HOME + '/state/duccling.version'
+        os.system(CMD)
+
+    def verify_jvm(self):
+        jvm = self.java()
+        CMD = jvm + ' -fullversion > /dev/null 2>&1'
+        rc = os.system(CMD)
+        if ( rc != 0 ):
+            print 'NOTOK', CMD, 'returns', rc, '.  Must return rc 0.  Startup cannot continue.'
+            return False
+        return True
+
     def verify_duccling(self):
         
        check_permission = True                        # if we're not ducc we don't care about permissions
@@ -510,6 +552,25 @@ class DuccUtil:
                 print "Missing ducc_ling"
                 return False
              
+        # now make sure the version matches that on the master node
+        lines = self.popen(self.duccling + ' -v')
+        version_from_head = lines.readline().strip()
+
+        version_file = self.DUCC_HOME + '/state/duccling.version'
+        if ( os.path.exists(version_file) ):
+            verfile = open(version_file)
+            for line in verfile:
+                line = line.strip()
+                if ( line != version_from_head ):
+                    print "Mismatched ducc_ling versions:"
+                    print "MASTER version:", version_from_head
+                    print "LOCAL  version:", line
+                    return False
+            verfile.close()
+        else:
+            print "ducc_ling version file missing, cannot verify version."
+            return False
+
         print 'ducc_ling OK'
         return True
 
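Taken together, get_duccling_version and the new tail of verify_duccling form a version handshake: the master records its ducc_ling version in state/duccling.version, and every node compares its own 'ducc_ling -v' output against that record. A minimal sketch of the same idea, assuming 'ducc_ling -v' prints a single version line; both function names are illustrative:

    import os

    def record_master_version(duccling, ducc_home):
        # run on the master: snapshot its ducc_ling version (as get_duccling_version does)
        os.system(duccling + ' -v > ' + ducc_home + '/state/duccling.version')

    def node_version_matches(duccling, ducc_home):
        # run on each node: compare the local version against the master's record
        local = os.popen(duccling + ' -v').readline().strip()
        version_file = ducc_home + '/state/duccling.version'
        if ( not os.path.exists(version_file) ):
            return False                  # cannot verify without the record
        f = open(version_file)
        master = f.readline().strip()
        f.close()
        return master == local
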

Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/local_hooks.py
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/local_hooks.py?rev=1484806&r1=1484805&r2=1484806&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/local_hooks.py (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/local_hooks.py Tue May 21 13:35:25 2013
@@ -1,26 +1,180 @@
 #!/usr/bin/python
 
+import subprocess
+import os
+import time
+import grp, pwd
+import platform
+import resource
+
 #
-# Returns a list of 3-tuples of processes other than known ducc processes, for
-# display by check_ducc.  Each 3-tupls is of the form
-#   (proccessnamd, pid, user)
-# where
-#   processid is the name of the process (preferably the short name, like 'java')
-#   pid is the pid of the process
-#   user is the owner of the process
+# this is a watson plugin to check_ducc to find java-remote2 processes
 #
+
 def find_other_processes(pid, user, line):
-    return []
+    if ( line.find('-DBobsComponent=jr2') >= 0 ):
+        return [('java-remote2', pid, user)]
 
-# 
-# This performs any installation-wise checking needed on each of the slave ducc nodes.
-# If False is returned, ducc will not start an agent on the node.
+    return [('unknown-java', pid, user)]
+
+#
+# This is run via ssh on every node in ducc.properties
+#
+# This performs watson-local node verification on startup and prevents
+# agents from starting on non-viable nodes.
+#
+# Make sure the new home filesystem is used: 
+#       df /home and make sure it is mounted from bluej670
+#       Make sure all nfs mounts are good: /data/admin/test-mounts.pl =>
+#          this returns OK or problems (from eae's test-mounts.pl)
+#
+# Make sure the system clock is close to bluej672
+#
+# Make sure user and group permissions are good: 
+# id ducc => make sure it has group=ducc and also has access to group=saicluster
+# id challngr => make sure group=sailimited and has access to group=saicluster
+#
+# Make sure network DNS is good: ping bluej333 and make sure it resolves
+# correctly, e.g.: PING bluej333.bluej.net (192.168.2.103) 56(84) bytes of
+# data
+#
+# Make sure prolog is installed ok:
+# ls -l /usr/lib/sicstus-4.1.2/bin/jasper.jar => check that it is there
+# ldd /usr/lib/libspnative4.1.2.so => check for no errors
+#
+#
+# Make sure /tmp is not full :)
+#
 def verify_slave_node(localdate, properties):
-    return True
+
+    # if not ducc, don't enforce sanity, it's testing or a sim
+    user = os.environ['LOGNAME']
+    bypass = (user != 'ducc')
+
+    if bypass:
+        tag = 'NOTE'
+    else:
+        tag = 'NOTOK'
+
+    # Check mounts and filesystems
+    cmd = '/data/admin/test-mounts.pl'
+    proc = subprocess.Popen(cmd, bufsize=0, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT)
+    p = proc.stdout
+    ok = True
+    while 1:
+        line = p.readline().strip()
+        #print os.uname()[1], line
+        if ( not line ):
+            break
+        if ( line != 'OK' ):
+            ok = False
+
+    if ( not ok ):
+        print tag, 'Bad mounts on', os.uname()[1]
+
+    # Make sure user ducc and group ducc exist, and that user ducc is in group sailimited
+    # Verification of group and user ducc is done while verifying duccling
+
+    if ( user == 'ducc' ):              # in test, we don't want to make this check
+        specialgroup = 'sailimited'
+        grpinfo = grp.getgrnam(specialgroup)
+        grmembers = grpinfo.gr_mem
+        
+        pwinfo = pwd.getpwnam('ducc')
+        if ( (grpinfo.gr_gid != pwinfo.pw_gid) and ( not ('ducc' in grmembers) ) ):
+            ok = False
+            print tag, 'User ducc is not in group "sailimited"'
+
+    # make sure ping bluej333 is ok
+    node = 'bluej333'
+    #node = 'bluej658'
+    #node = 'bubba'
+    cmd = 'ping -c1 ' + node
+    proc = subprocess.Popen(cmd, bufsize=0, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT)
+    (sin, serr) = proc.communicate()
+    rc = proc.returncode
+    if ( rc == 1 ):
+        print tag, 'Ping resolves', node, 'but got no reply:', sin.strip()
+        ok = False
+    elif ( rc == 2 ):
+        print tag, 'Cannot ping', node, ':', sin.strip()
+        ok = False
+
+    plat = platform.machine()
+    # make sure prolog is installed correctly
+    # ls -l /usr/lib/sicstus-4.1.2/bin/jasper.jar => check that it is there
+    if ( plat == 'x86_64' ):
+        prolog_jasper = '/usr/lib/sicstus-4.1.2/bin/jasper.jar'
+        prolog_so     = '/usr/lib/libspnative4.1.2.so'
+    else:      # ppc64
+        prolog_jasper = '/usr/lib/sicstus-4.1.2/bin/jasper.jar'
+        prolog_so     = '/usr/lib64/libspnative.so'
+
+    try:
+        os.stat(prolog_jasper)
+    except:
+        print tag, "Cannot find", prolog_jasper 
+        ok = False
+
+    cmd = 'ldd ' + prolog_so
+    proc = subprocess.Popen(cmd, bufsize=0, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT)
+    (sin, sout) = proc.communicate()
+    rc = proc.returncode
+
+    if ( rc != 0 ):
+        print tag, "Bad or missing prolog lib:", sin.strip()
+        ok = False
+    else:
+        lines = sin.split('\n')
+        for l in lines:
+            if ( l.find('not found') >= 0 ):
+                print tag, "Problem in ", prolog_so, ":", l.strip()
+                ok = False
+
+    # make sure /tmp is not full
+    cmd = 'df -Pm /tmp'
+    proc = subprocess.Popen(cmd, bufsize=0, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT)
+    (sin, sout) = proc.communicate()
+    rc = proc.returncode
+    mintmp = 2000 # in mb
+    for l in sin.split('\n'):
+        if ( l.startswith('/') ):
+            toks = l.split()
+            if ( int(toks[3]) <= mintmp ):
+                print tag, '/tmp space is less than minimum of', mintmp, ":", l
+                ok = False
+
+    # verify java.  A hello must have been compiled by the master for this to work
+    here = os.getcwd()
+    os.chdir(os.environ['DUCC_HOME'] + '/admin')
+    java = properties.get('ducc.jvm')
+    if ( java == None ):
+        print tag, 'WARN: "ducc.jvm" is not configured, using "java" instead.'
+        java = 'java'
+    cmd = java + ' DuccHello'
+    proc = subprocess.Popen(cmd, bufsize=0, stdout=subprocess.PIPE, shell=True, stderr=subprocess.STDOUT)
+    (sin, sout) = proc.communicate()
+    rc = proc.returncode
+    os.chdir(here)
+    if ( sin.strip() != 'hiya' ):
+        print tag, 'Cannot run java, HelloWorld failed.'
+        ok = False
+
+    # check rlimits - rss and virtual must be unlimited
+    (softrss , hardrss)  = resource.getrlimit(resource.RLIMIT_RSS)
+    (softvmem, hardvmem) = resource.getrlimit(resource.RLIMIT_AS)
+    if ( softrss != -1 ):
+        print tag, 'RSS limit is not unlimited:', softrss
+        ok = False
+    if ( softvmem != -1 ):
+        print tag, 'VMEM limit is not unlimited:', softvmem
+        ok = False
+        
+    return ok or bypass
 
 #
-# This performs any installation-wise chacking on the master ducc node.
-# If False is returned, ducc will not start.
+# This is run on the master node (the "ducc head") before any of the verify_slave_node
+# calls are made, to allow common setup or special tests
 #
 def verify_master_node(properties):
     return True

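local_hooks.py is a per-installation plugin, and the bodies above are the Watson site's versions. An installation with no site-specific checks can keep the old stubbed contract, a sketch of which is:

    #!/usr/bin/python
    # Site-neutral local_hooks.py: no foreign processes to report,
    # every node and the master accepted as-is.

    def find_other_processes(pid, user, line):
        return []

    def verify_slave_node(localdate, properties):
        return True

    def verify_master_node(properties):
        return True
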
Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc?rev=1484806&r1=1484805&r2=1484806&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/start_ducc Tue May 21 13:35:25 2013
@@ -91,7 +91,7 @@ class StartDucc(DuccUtil):
             line = lines.readline().strip()
             if ( not line ):
                 break
-            # print '[] ' + line
+            #print '[] ' + line
             if ( line.startswith('PID') ):
                 toks = line.split(' ')    # get the PID
                 print node, 'PID', toks[1]
@@ -115,7 +115,7 @@ class StartDucc(DuccUtil):
            lines = self.ssh(host, True, "'", self.DUCC_HOME + '/admin/ducc.py', '-c', 'agent', '-b', '-d', str(time.time()), '--nodup', "'")
             while 1:
                 line = lines.readline().strip()
-                #print '[]' + line
+                #print '[l]' + line
                 if ( line.startswith('PID') ):
                     toks = line.split(' ')
                     pid = toks[1]
@@ -210,6 +210,11 @@ class StartDucc(DuccUtil):
         for e in environ:
             print e
 
+        if ( not self.verify_jvm() ):
+            sys.exit(1)
+
+        self.get_duccling_version()
+
         nodefiles = []
         components = []
         management = False
@@ -256,6 +261,9 @@ class StartDucc(DuccUtil):
         if ( management ):
             components = self.default_components
 
+        if ( 'or' in components ):
+            self.check_orchestrator_lock()
+
         if ( not verify_master_node(self.ducc_properties) ):
             print 'FAIL: Cannot run javac to run java verification'
             return
@@ -276,7 +284,7 @@ class StartDucc(DuccUtil):
 
         if ( not ok ):
             sys.exit(1)
-
+                
        # activeMQ needs to be started externally before starting any DUCC processes
         if ( self.automanage and ('broker' in components) ):
             if ( self.is_amq_active() ):
@@ -297,6 +305,7 @@ class StartDucc(DuccUtil):
        # if we are asked to start any of the management processes, do this first
         if ( len(components) != 0 ):
             print 'Starting', or_parms
+
             for com in components:
                 if ( com == 'broker' ):
                     pass     # already started

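Pulling the start_ducc changes together: the head node now runs three preflight steps before launching any component. A condensed sketch, under the assumption that ducc is a DuccUtil instance such as StartDucc; preflight itself is a hypothetical name, not a method in the commit:

    import sys

    def preflight(ducc, components):
        # ducc: a DuccUtil instance; 'or' selects the orchestrator component
        if ( not ducc.verify_jvm() ):          # 'java -fullversion' must return rc 0
            sys.exit(1)
        ducc.get_duccling_version()            # record the master ducc_ling version for the nodes
        if ( 'or' in components ):
            ducc.check_orchestrator_lock()     # warn if a stale orchestrator.lock exists
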
