Do not deliver an -EAGAIN error code returned by recovery to a potential caller of dlm_new_lockspace(). Instead, if recovery returns -EAGAIN, run it again, in the hope that after an additional schedule() it no longer returns -EAGAIN. If the maximum number of retries (DLM_RECOVERY_MAX_RETRIES) is reached, we fence ourselves by running panic().
Signed-off-by: Alexander Aring <aahri...@redhat.com> --- fs/dlm/recoverd.c | 71 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 90e8b7f440da..2bd3bbe53828 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -20,6 +20,7 @@ #include "requestqueue.h" #include "recoverd.h" +#define DLM_RECOVERY_MAX_RETRIES 5 /* If the start for which we're re-enabling locking (seq) has been superseded by a newer stop (ls_recover_seq), we need to leave locking disabled. @@ -259,7 +260,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) static void do_ls_recovery(struct dlm_ls *ls) { struct dlm_recover *rv = NULL; - int error; + int error, count = 0; spin_lock(&ls->ls_recover_lock); rv = ls->ls_recover_args; @@ -269,30 +270,54 @@ static void do_ls_recovery(struct dlm_ls *ls) spin_unlock(&ls->ls_recover_lock); if (rv) { - error = ls_recover(ls, rv); - switch (error) { - case 0: - ls->ls_recovery_result = 0; - complete(&ls->ls_recovery_done); - - dlm_lsop_recover_done(ls); - break; - case -EINTR: - /* if recovery was interrupted -EINTR we wait for the next - * ls_recover() iteration until it hopefully succeeds. + do { + /* we try DLM_MAX_RECOVERY_RETRIES times again to run + * recovery, if any -EAGAIN is not resolved this + * time we will let DLM_ASSERT() fence ourself. 
*/ - log_rinfo(ls, "%s %llu interrupted and should be queued to run again", - __func__, (unsigned long long)rv->seq); - break; - default: - log_rinfo(ls, "%s %llu error %d", __func__, - (unsigned long long)rv->seq, error); + DLM_ASSERT(count < DLM_RECOVERY_MAX_RETRIES, + pr_err("%s %llu too many recovery retries %d\n", + __func__, (unsigned long long)rv->seq, + DLM_RECOVERY_MAX_RETRIES);); + + error = ls_recover(ls, rv); + switch (error) { + case 0: + ls->ls_recovery_result = 0; + complete(&ls->ls_recovery_done); + + dlm_lsop_recover_done(ls); + break; + case -EINTR: + /* if recovery was interrupted -EINTR we wait for the next + * ls_recover() iteration until it hopefully succeeds. + */ + log_rinfo(ls, + "%s %llu interrupted and should be queued to run again", + __func__, (unsigned long long)rv->seq); + break; + case -EAGAIN: + /* either API is returning -EAGAIN or some critical errors + * returning -EAGAIN which let the recovery run again. There + * is a schedule() between it in the hope that the error resolves + * itself. If not the above DLM_ASSERT() will hit. + */ + log_rinfo(ls, "%s %llu recovery wants to run again", + __func__, (unsigned long long)rv->seq); + schedule(); + break; + default: + log_rinfo(ls, "%s %llu error %d", __func__, + (unsigned long long)rv->seq, error); - /* let new_lockspace() get aware of critical error */ - ls->ls_recovery_result = error; - complete(&ls->ls_recovery_done); - break; - } + /* let new_lockspace() get aware of critical error */ + ls->ls_recovery_result = error; + complete(&ls->ls_recovery_done); + break; + } + + count++; + } while (error == -EAGAIN); kfree(rv->nodes); kfree(rv); -- 2.31.1