- The active SC will reboot if the arb time somehow has a big gap between heartbeats while watching the takeover request. The active SC may still be OK but gets rebooted unexpectedly. - Enhance the "VM was frozen" detection to be based on both arb time and local time. If the local time gap is normal, the problem is most likely on the arb server side. --- src/osaf/consensus/plugins/tcp/tcp.plugin | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/src/osaf/consensus/plugins/tcp/tcp.plugin b/src/osaf/consensus/plugins/tcp/tcp.plugin index 0be20fcee..c3d6cd924 100755 --- a/src/osaf/consensus/plugins/tcp/tcp.plugin +++ b/src/osaf/consensus/plugins/tcp/tcp.plugin @@ -478,6 +478,7 @@ class ArbitratorPlugin(object): return ret last_arb_timestamp = 0 + last_local_time = 0 while True: if key == self.takeover_request: if self.is_active() is False: @@ -485,17 +486,20 @@ class ArbitratorPlugin(object): break while True: try: + local_time = int(time.time()) time_at_arb = self.proxy.heartbeat(self.hostname) - if last_arb_timestamp == 0: - last_arb_timestamp = time_at_arb - break - elif (time_at_arb - last_arb_timestamp) > self.timeout: + if last_arb_timestamp != 0 and \ + (time_at_arb - last_arb_timestamp) > self.timeout: + if (local_time - last_local_time ) < self.timeout: + syslog.syslog('Arbitrator server slow?') + raise socket.error("Arbitrator server slow?") # VM was frozen? syslog.syslog('VM was frozen!') ret['code'] = 126 return ret else: last_arb_timestamp = time_at_arb + last_local_time = local_time break except socket.error: # can't heartbeat, need to self-fence (if peer down) -- 2.17.1 _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel