Otherwise a few branches would never be taken, and the resulting
behaviour was not quite straightforward.

Only increment the try counter when we really retry, and log each retry.

Signed-off-by: Thomas Lamprecht <[email protected]>
---
 src/PVE/HA/LRM.pm     |  5 ++++-
 src/PVE/HA/Manager.pm | 22 ++++++++++++++++++----
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm
index d7b54da..2692ca8 100644
--- a/src/PVE/HA/LRM.pm
+++ b/src/PVE/HA/LRM.pm
@@ -588,7 +588,6 @@ sub handle_service_exitcode {
 
            $tries->{$sid} = 0 if !defined($tries->{$sid});
 
-           $tries->{$sid}++;
            if ($tries->{$sid} >= $max_restart) {
                $haenv->log('err', "unable to start service $sid on local node".
                           " after $tries->{$sid} retries");
@@ -596,6 +595,10 @@ sub handle_service_exitcode {
                return ERROR;
            }
 
+           $tries->{$sid}++;
+
+           $haenv->log('warning', "restart policy: retry number $tries->{$sid}" .
+                       " for service '$sid'");
            # tell CRM that we retry the start
            return ETRY_AGAIN;
        }
diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm
index 48826e7..21a34dd 100644
--- a/src/PVE/HA/Manager.pm
+++ b/src/PVE/HA/Manager.pm
@@ -556,14 +556,25 @@ sub next_state_started {
 
            my $try_next = 0;
            if ($lrm_res) {
-               if ($lrm_res->{exit_code} == ERROR) {
+               my $ec = $lrm_res->{exit_code};
+               if ($ec == SUCCESS) {
+
+                   $master_status->{relocate_trial}->{$sid} = 0;
+
+               } elsif ($ec == ETRY_AGAIN) {
+
+                   # do nothing, the LRM wants to try again
+
+               } elsif ($ec == ERROR) {
+                   # apply our relocate policy if we got ERROR from the LRM
 
                    my $try = $master_status->{relocate_trial}->{$sid} || 0;
 
                    if ($try < $cd->{max_relocate}) {
 
                        $try++;
-                       $try_next = 1; # tell select_service_node to relocate
+                       # tell select_service_node to relocate if possible
+                       $try_next = 1;
 
                        $haenv->log('warning', "starting service $sid on node".
                                   " '$sd->{node}' failed, relocating service.");
@@ -577,8 +588,11 @@ sub next_state_started {
                        return;
 
                    }
-               } elsif ($lrm_res->{exit_code} == SUCCESS) {
-                   $master_status->{relocate_trial}->{$sid} = 0;
+               } else {
+                   $haenv->log('err', "service '$sid' got unrecoverable error" .
+                               " (exit code $ec))");
+                   # we have no save way out (yet) for other errors
+                   &$change_service_state($self, $sid, 'error');
                }
            }
 
-- 
2.1.4


_______________________________________________
pve-devel mailing list
[email protected]
http://pve.proxmox.com/cgi-bin/mailman/listinfo/pve-devel

Reply via email to