Author: challngr
Date: Thu May  8 19:06:20 2014
New Revision: 1593381

URL: http://svn.apache.org/r1593381
Log:
UIMA-3814 First pass, deal with hanging nodes in check_ducc.

Modified:
    uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc
    uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py

Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc?rev=1593381&r1=1593380&r2=1593381&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/check_ducc Thu May  8 19:06:20 2014
@@ -23,6 +23,7 @@ import os
 import sys
 from time import time
 import getopt
+import signal
 
 from ducc_util import DuccUtil
 from ducc_base  import DuccProperties
@@ -34,6 +35,11 @@ from ducc_util import ThreadPool
 
 class CheckDucc(DuccUtil):
 
+
+    def __init__(self):
+        DuccUtil.__init__(self)
+        self.badnodes = []
+
     def validate(self, checkdate):
         verify_slave_node(checkdate, self.ducc_properties)
         self.check_clock_skew(checkdate)
@@ -97,20 +103,34 @@ class CheckDucc(DuccUtil):
             messages.append((spacer, 'no processes found.'))
 
         if ( self.kill_signal == None ):                    
+            response = "Node health checks return."
             if ( self.single_user ) :
                 lines = self.ssh(node, True, self.DUCC_HOME + 
"/admin/check_ducc", "-s", "-x", str(int(time())))
             else:
                 lines = self.ssh(node, True, self.DUCC_HOME + 
"/admin/check_ducc", "-x", str(int(time())))
             while 1:
                 line = lines.readline()
+                if ( 'signal' in line ):
+                    response = "Node health did not complete: " + line
+                    self.badnodes.append(node)
                 if ( not line ):
                     break
                 line = line.strip()
                 messages.append((spacer, line))
                 #messages.append((spacer, '[]', line))
+            messages.append((spacer, response))
 
         return messages
 
+    def signalHandler(self, signum, frame):
+        print "-------- Caught signal", signum, "--------"
+        if ( len(self.badnodes) != 0 ):
+            print "Health checks on these nodes did not return:"
+            for n in self.badnodes:
+                print n,
+                print ''
+        sys.exit(1)
+
     def usage(self, msg):
         if ( msg != None ):
             print msg
@@ -297,6 +317,8 @@ class CheckDucc(DuccUtil):
         self.threadpool = ThreadPool(n_nodes + 5)    # more for the head 
processes
         checked = {}
 
+        signal.signal(signal.SIGINT, self.signalHandler)
+
         try:
             for (nodefile, nodelist) in nodes.items():
                 if ( nodelist == None ):

Modified: uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py?rev=1593381&r1=1593380&r2=1593381&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/admin/ducc_util.py Thu May  8 19:06:20 2014
@@ -411,37 +411,38 @@ class DuccUtil(DuccBase):
 
     def ssh_ok(self, node, line):
         spacer = '   '
+        messages = []
         if ( line.startswith("Permission denied") ):
-            print ' '
-            print spacer, "ALERT: Passwordless SSH is not configured correctly 
for node", node
-            print spacer, "ALERT: SSH returns '" + line + "'"
-            return False
+            messages.append(' ')
+            messages.append(spacer + "ALERT: Passwordless SSH is not 
configured correctly for node " + node)
+            messages.append(spacer + "ALERT: SSH returns '" + line + "'")
+            return messages
 
         if ( line.startswith("Host key verification failed") ):
-            print ' '
-            print spacer, "ALERT: Passwordless SSH is not configured correctly 
for node", node
-            print spacer, "ALERT: SSH returns '" + line + "'"
-            return False
+            messages.append(' ')
+            messages.append(spacer + "ALERT: Passwordless SSH is not 
configured correctly for node " + node)
+            messages.append(spacer + "ALERT: SSH returns '" + line + "'")
+            return messages
 
         if ( line.find("Connection refused") >= 0 ):
-            print ' '
-            print spacer, "ALERT: SSH is not not enabled on node", node
-            print spacer, "ALERT: SSH returns '" + line + "'"
-            return False
+            messages.append(' ')
+            messages.append(spacer + "ALERT: SSH is not not enabled on node " 
+ node)
+            messages.append(spacer + "ALERT: SSH returns '" + line + "'")
+            return messages
         
         if ( line.find("Connection timed") >= 0 ):
-            print ' '
-            print spacer, "\nALERT: SSH did not respond with timeout of 10 
secnds", node
-            print spacer, "ALERT: SSH returns '" + line + "'"
-            return False
+            messages.append(' ')
+            messages.append(spacer + "\nALERT: SSH did not respond with 
timeout of 10 secnds " + node)
+            messages.append(spacer + "ALERT: SSH returns '" + line + "'")
+            return messages
         
         if ( line.find("No route")  >= 0 ):
-            print ' '
-            print spacer, 'ALERT: SSH cannot connect to host.'
-            print spacer, "ALERT: SSH returns '" + line + "'"
-            return False
+            messages.append(' ')
+            messages.append(spacer + 'ALERT: SSH cannot connect to host.')
+            messages.append(spacer + "ALERT: SSH returns '" + line + "'")
+            return messages
 
-        return True
+        return None
         
     #
     # Input is array lines from ps command looking for ducc processes owned 
this user.
@@ -466,7 +467,10 @@ class DuccUtil(DuccBase):
             if ( line.startswith('PID')):
                 continue
 
-            if ( not self.ssh_ok(line, node) ):
+            ssh_errors = self.ssh_ok(line, node)
+            if ( ssh_errors != None ):
+                for m in ssh_errors:
+                    print m
                 ok = False
                 continue
 


Reply via email to