Otherwise a few branches would never be taken, and the resulting behaviour was not straightforward.
Only increment tries if we really retry and log retries Signed-off-by: Thomas Lamprecht <[email protected]> --- src/PVE/HA/LRM.pm | 5 ++++- src/PVE/HA/Manager.pm | 22 ++++++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/PVE/HA/LRM.pm b/src/PVE/HA/LRM.pm index d7b54da..2692ca8 100644 --- a/src/PVE/HA/LRM.pm +++ b/src/PVE/HA/LRM.pm @@ -588,7 +588,6 @@ sub handle_service_exitcode { $tries->{$sid} = 0 if !defined($tries->{$sid}); - $tries->{$sid}++; if ($tries->{$sid} >= $max_restart) { $haenv->log('err', "unable to start service $sid on local node". " after $tries->{$sid} retries"); @@ -596,6 +595,10 @@ sub handle_service_exitcode { return ERROR; } + $tries->{$sid}++; + + $haenv->log('warning', "restart policy: retry number $tries->{$sid}" . + " for service '$sid'"); # tell CRM that we retry the start return ETRY_AGAIN; } diff --git a/src/PVE/HA/Manager.pm b/src/PVE/HA/Manager.pm index 48826e7..21a34dd 100644 --- a/src/PVE/HA/Manager.pm +++ b/src/PVE/HA/Manager.pm @@ -556,14 +556,25 @@ sub next_state_started { my $try_next = 0; if ($lrm_res) { - if ($lrm_res->{exit_code} == ERROR) { + my $ec = $lrm_res->{exit_code}; + if ($ec == SUCCESS) { + + $master_status->{relocate_trial}->{$sid} = 0; + + } elsif ($ec == ETRY_AGAIN) { + + # do nothing, the LRM wants to try again + + } elsif ($ec == ERROR) { + # apply our relocate policy if we got ERROR from the LRM my $try = $master_status->{relocate_trial}->{$sid} || 0; if ($try < $cd->{max_relocate}) { $try++; - $try_next = 1; # tell select_service_node to relocate + # tell select_service_node to relocate if possible + $try_next = 1; $haenv->log('warning', "starting service $sid on node". " '$sd->{node}' failed, relocating service."); @@ -577,8 +588,11 @@ sub next_state_started { return; } - } elsif ($lrm_res->{exit_code} == SUCCESS) { - $master_status->{relocate_trial}->{$sid} = 0; + } else { + $haenv->log('err', "service '$sid' got unrecoverable error" . 
+ " (exit code $ec))"); + # we have no save way out (yet) for other errors + &$change_service_state($self, $sid, 'error'); } } -- 2.1.4 _______________________________________________ pve-devel mailing list [email protected] http://pve.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
