This patch changes the behaviour of the dlm_new_lockspace() function to wait until recovery has either succeeded or failed. Previously, a possible waiter on ls_members_done waited only until dlm_recover_members() was done, regardless of whether it succeeded (including the case where it was interrupted) or failed. The result was returned to the waiter in dlm_new_lockspace(); on success, the caller was able to use the lockspace at this point.
This behaviour is now changed to wait for a complete run of the recovery functionality, which is done by ls_recover(). The result can be either success or failure and is delivered back to a possible waiter on ls_recovery_done. A possible waiter is then able to use the lockspace, or to run its error handling if recovery failed. If recovery gets interrupted, e.g. because one of the several checks of dlm_locking_stopped() returns true, a possible waiter on ls_recovery_done keeps waiting until ls_recover() succeeds or fails. One reason why the recovery task gets interrupted is that another dlm_ls_stop() was called while ls_recover() was running. Another call of dlm_ls_stop() means that the recovery task will call ls_recover() again with a possibly new configuration delivered by the cluster manager. Most dlm kernel users, e.g. gfs2 or cluster-md, have their own wait handling to wait for recovery to be done after calling dlm_new_lockspace(). This becomes unnecessary now but still works. Users can update their code because dlm takes care of it now. A simple way to interrupt recovery is to call dlm_new_lockspace() and dlm_release_lockspace() in a loop on several cluster nodes. This has the effect that the cluster manager will interrupt the recovery with new membership information over and over again. 
Signed-off-by: Alexander Aring <aahri...@redhat.com> --- fs/dlm/dlm_internal.h | 4 ++-- fs/dlm/lockspace.c | 9 +++++---- fs/dlm/member.c | 13 ------------- fs/dlm/recoverd.c | 13 +++++++++++++ 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 776c3ed519f0..c03388a3875c 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -606,8 +606,8 @@ struct dlm_ls { wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; - struct completion ls_members_done; - int ls_members_result; + struct completion ls_recovery_done; + int ls_recovery_result; struct miscdevice ls_device; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 19ed41a5da93..0c3613d09c5e 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -548,8 +548,8 @@ static int new_lockspace(const char *name, const char *cluster, init_waitqueue_head(&ls->ls_uevent_wait); ls->ls_uevent_result = 0; - init_completion(&ls->ls_members_done); - ls->ls_members_result = -1; + init_completion(&ls->ls_recovery_done); + ls->ls_recovery_result = -1; mutex_init(&ls->ls_cb_mutex); INIT_LIST_HEAD(&ls->ls_cb_delay); @@ -645,8 +645,9 @@ static int new_lockspace(const char *name, const char *cluster, if (error) goto out_recoverd; - wait_for_completion(&ls->ls_members_done); - error = ls->ls_members_result; + /* wait until recovery is successful or failed */ + wait_for_completion(&ls->ls_recovery_done); + error = ls->ls_recovery_result; if (error) goto out_members; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 67b056634f03..2af2ccfe43a9 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -587,19 +587,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) *neg_out = neg; error = ping_members(ls); - /* error -EINTR means that a new recovery action is triggered. - * We ignore this recovery action and let run the new one which might - * have new member configuration. 
- */ - if (error == -EINTR) - error = 0; - - /* new_lockspace() may be waiting to know if the config - * is good or bad - */ - ls->ls_members_result = error; - complete(&ls->ls_members_done); - log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index b5b519cde20b..98c17f74927f 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -243,6 +243,9 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) jiffies_to_msecs(jiffies - start)); mutex_unlock(&ls->ls_recoverd_active); + ls->ls_recovery_result = 0; + complete(&ls->ls_recovery_done); + dlm_lsop_recover_done(ls); return 0; @@ -251,6 +254,16 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) log_rinfo(ls, "dlm_recover %llu error %d", (unsigned long long)rv->seq, error); mutex_unlock(&ls->ls_recoverd_active); + + /* let new_lockspace() get aware of critical error if recovery + * was interrupted -EINTR we wait for the next ls_recover() + * iteration until it succeeds. + */ + if (error != -EINTR) { + ls->ls_recovery_result = error; + complete(&ls->ls_recovery_done); + } + return error; } -- 2.31.1