- The active SC reboots if the arbitrator time somehow shows a big gap between heartbeats while watching the takeover request. The active SC may still be OK but gets rebooted unexpectedly. - Enhance the 'VM was frozen' detection to be based on both the arbitrator time and a local time counter. --- src/osaf/consensus/plugins/tcp/tcp.plugin | 43 ++++++++++++++++++----- 1 file changed, 35 insertions(+), 8 deletions(-)
diff --git a/src/osaf/consensus/plugins/tcp/tcp.plugin b/src/osaf/consensus/plugins/tcp/tcp.plugin index 0be20fcee..aaa1c1c3f 100755 --- a/src/osaf/consensus/plugins/tcp/tcp.plugin +++ b/src/osaf/consensus/plugins/tcp/tcp.plugin @@ -23,8 +23,24 @@ import sys import time import xmlrpc.client import syslog +import threading +counter_run = False +counter_time = 0.0 + +def time_counting(hb_interval): + ''' + When node is frozen, if it is VM, clock time not jump + but if it is container, clock time still jump. + This function to help know node is frozen or arbitrator server issue + ''' + global counter_run, counter_time + counter_time = 0.0 + while (counter_run): + time.sleep(hb_interval) + counter_time += hb_interval + class ArbitratorPlugin(object): """ This class represents a TCP Plugin """ @@ -478,6 +494,8 @@ class ArbitratorPlugin(object): return ret last_arb_timestamp = 0 + global counter_run, counter_time + counter = None while True: if key == self.takeover_request: if self.is_active() is False: @@ -486,15 +504,24 @@ class ArbitratorPlugin(object): while True: try: time_at_arb = self.proxy.heartbeat(self.hostname) - if last_arb_timestamp == 0: - last_arb_timestamp = time_at_arb - break - elif (time_at_arb - last_arb_timestamp) > self.timeout: - # VM was frozen? 
- syslog.syslog('VM was frozen!') - ret['code'] = 126 - return ret + if counter is not None: + counter_run = False + counter.join() + if (last_arb_timestamp != 0) and \ + (time_at_arb - last_arb_timestamp > self.timeout): + if counter_time < self.timeout: + syslog.syslog('VM was frozen!') + ret['code'] = 126 + return ret + syslog.syslog('Arb server issue?') + raise socket.error('Arb server issue?') else: + counter = threading.Thread( + target=time_counting, + args=(self.heartbeat_interval,)) + counter_run = True + counter.setDaemon(True) + counter.start() last_arb_timestamp = time_at_arb break except socket.error: -- 2.17.1 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel