Re: [Cluster-devel] [GFS2 PATCH v6 11/26] gfs2: log error reform

2019-09-04 Thread Bob Peterson
Hi Andreas,

- Original Message -
> > +*/
> > +   if (sdp->sd_log_error) {
> > +   gfs2_io_error_bh(sdp, bh);
> 
> some of the error handling here is still sketchy: the only place where
> sd_log_error is set without withdrawing the filesystem is
> quotad_error. If the filesystem has already been marked
> SDF_WITHDRAWING or SDF_WITHDRAWN, gfs2_io_error_bh will be a no-op. It
> seems that we want to set SDF_WITHDRAWING here for the quotad_error
> case instead of calling gfs2_io_error_bh?
> 
> > +   } else if (buffer_busy(bh)) {
> > continue;
> > -   if (!buffer_uptodate(bh) &&
> > -   !test_and_set_bit(SDF_AIL1_IO_ERROR, >sd_flags)) {
> > +   } else if (!buffer_uptodate(bh) &&
> > +  !cmpxchg(>sd_log_error, 0, -EIO)) {
> > gfs2_io_error_bh(sdp, bh);
> > set_bit(SDF_WITHDRAWING, >sd_flags);
> > }

The main idea was to move busy buffers to tr_ail2_list after
an errors have been flagged (before the test for buffer_busy()).
Would something like this be more acceptable?

@@ -200,10 +199,19 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, 
struct gfs2_trans *tr)
 bd_ail_st_list) {
bh = bd->bd_bh;
gfs2_assert(sdp, bd->bd_tr == tr);
-   if (buffer_busy(bh))
+   /*
+* If another process flagged an io error, e.g. writing to the
+* journal, error all other bhs and move them off the ail1 to
+* prevent a tight loop when unmount tries to flush ail1,
+* regardless of whether they're still busy. If no outside
+* errors were found and the buffer is busy, move to the next.
+* If the ail buffer is not busy and caught an error, flag it
+* for others.
+*/
+   if (!sdp->sd_log_error && buffer_busy(bh))
continue;
if (!buffer_uptodate(bh) &&
-   !test_and_set_bit(SDF_AIL1_IO_ERROR, >sd_flags)) {
+   !cmpxchg(>sd_log_error, 0, -EIO)) {
gfs2_io_error_bh(sdp, bh);
set_bit(SDF_WITHDRAWING, >sd_flags);
}



Regards,

Bob Peterson



Re: [Cluster-devel] [GFS2 PATCH v6 11/26] gfs2: log error reform

2019-08-20 Thread Andreas Gruenbacher
Bob,

On Thu, May 23, 2019 at 3:05 PM Bob Peterson  wrote:
> Before this patch, gfs2 kept track of journal io errors in two
> places sd_log_error and the SDF_AIL1_IO_ERROR flag in sd_flags.
> This patch consolidates the two into sd_log_error so that it
> reflects the first error encountered writing to the journal.
> In future patches, we will take advantage of this by checking
> this value rather than having to check both when reacting to
> io errors.
>
> In addition, this fixes a tight loop in unmount: If buffers
> get on the ail1 list and an io error occurs elsewhere, the
> ail1 list would never be cleared because they were always busy.
> So unmount would hang, waiting for the ail1 list to empty.
>
> Signed-off-by: Bob Peterson 
> ---
>  fs/gfs2/incore.h |  7 +++
>  fs/gfs2/log.c| 20 +++-
>  fs/gfs2/quota.c  |  2 +-
>  3 files changed, 19 insertions(+), 10 deletions(-)
>
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index b261168be298..39cec5361ba5 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -620,9 +620,8 @@ enum {
> SDF_RORECOVERY  = 7, /* read only recovery */
> SDF_SKIP_DLM_UNLOCK = 8,
> SDF_FORCE_AIL_FLUSH = 9,
> -   SDF_AIL1_IO_ERROR   = 10,
> -   SDF_FS_FROZEN   = 11,
> -   SDF_WITHDRAWING = 12, /* Will withdraw eventually */
> +   SDF_FS_FROZEN   = 10,
> +   SDF_WITHDRAWING = 11, /* Will withdraw eventually */
>  };
>
>  enum gfs2_freeze_state {
> @@ -831,7 +830,7 @@ struct gfs2_sbd {
> atomic_t sd_log_in_flight;
> struct bio *sd_log_bio;
> wait_queue_head_t sd_log_flush_wait;
> -   int sd_log_error;
> +   int sd_log_error; /* First log error */
>
> atomic_t sd_reserving_log;
> wait_queue_head_t sd_reserving_log_wait;
> diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
> index 0fe11bde796b..9784763fbb4e 100644
> --- a/fs/gfs2/log.c
> +++ b/fs/gfs2/log.c
> @@ -108,8 +108,7 @@ __acquires(>sd_ail_lock)
>
> if (!buffer_busy(bh)) {
> if (!buffer_uptodate(bh) &&
> -   !test_and_set_bit(SDF_AIL1_IO_ERROR,
> - >sd_flags)) {
> +   !cmpxchg(>sd_log_error, 0, -EIO)) {
> gfs2_io_error_bh(sdp, bh);
> set_bit(SDF_WITHDRAWING, >sd_flags);
> }
> @@ -203,10 +202,21 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, 
> struct gfs2_trans *tr)
>  bd_ail_st_list) {
> bh = bd->bd_bh;
> gfs2_assert(sdp, bd->bd_tr == tr);
> -   if (buffer_busy(bh))
> +   /**
> +* If another process flagged an io error, e.g. writing to the
> +* journal, error all other bhs and move them off the ail1 to
> +* prevent a tight loop when unmount tries to flush ail1,
> +* regardless of whether they're still busy. If no outside
> +* errors were found and the buffer is busy, move to the next.
> +* If the ail buffer is not busy and caught an error, flag it
> +* for others.
> +*/
> +   if (sdp->sd_log_error) {
> +   gfs2_io_error_bh(sdp, bh);

some of the error handling here is still sketchy: the only place where
sd_log_error is set without withdrawing the filesystem is
quotad_error. If the filesystem has already been marked
SDF_WITHDRAWING or SDF_WITHDRAWN, gfs2_io_error_bh will be a no-op. It
seems that we want to set SDF_WITHDRAWING here for the quotad_error
case instead of calling gfs2_io_error_bh?

> +   } else if (buffer_busy(bh)) {
> continue;
> -   if (!buffer_uptodate(bh) &&
> -   !test_and_set_bit(SDF_AIL1_IO_ERROR, >sd_flags)) {
> +   } else if (!buffer_uptodate(bh) &&
> +  !cmpxchg(>sd_log_error, 0, -EIO)) {
> gfs2_io_error_bh(sdp, bh);
> set_bit(SDF_WITHDRAWING, >sd_flags);
> }
> diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
> index a8dfc86fd682..8871fca9102f 100644
> --- a/fs/gfs2/quota.c
> +++ b/fs/gfs2/quota.c
> @@ -1480,7 +1480,7 @@ static void quotad_error(struct gfs2_sbd *sdp, const 
> char *msg, int error)
> return;
> if (!gfs2_withdrawn(sdp)) {
> fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
> -   sdp->sd_log_error = error;
> +   cmpxchg(>sd_log_error, 0, error);
> wake_up(>sd_logd_waitq);
> }
>  }
> --
> 2.21.0
>

Thanks,
Andreas



[Cluster-devel] [GFS2 PATCH v6 11/26] gfs2: log error reform

2019-05-23 Thread Bob Peterson
Before this patch, gfs2 kept track of journal io errors in two
places sd_log_error and the SDF_AIL1_IO_ERROR flag in sd_flags.
This patch consolidates the two into sd_log_error so that it
reflects the first error encountered writing to the journal.
In future patches, we will take advantage of this by checking
this value rather than having to check both when reacting to
io errors.

In addition, this fixes a tight loop in unmount: If buffers
get on the ail1 list and an io error occurs elsewhere, the
ail1 list would never be cleared because they were always busy.
So unmount would hang, waiting for the ail1 list to empty.

Signed-off-by: Bob Peterson 
---
 fs/gfs2/incore.h |  7 +++
 fs/gfs2/log.c| 20 +++-
 fs/gfs2/quota.c  |  2 +-
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b261168be298..39cec5361ba5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -620,9 +620,8 @@ enum {
SDF_RORECOVERY  = 7, /* read only recovery */
SDF_SKIP_DLM_UNLOCK = 8,
SDF_FORCE_AIL_FLUSH = 9,
-   SDF_AIL1_IO_ERROR   = 10,
-   SDF_FS_FROZEN   = 11,
-   SDF_WITHDRAWING = 12, /* Will withdraw eventually */
+   SDF_FS_FROZEN   = 10,
+   SDF_WITHDRAWING = 11, /* Will withdraw eventually */
 };
 
 enum gfs2_freeze_state {
@@ -831,7 +830,7 @@ struct gfs2_sbd {
atomic_t sd_log_in_flight;
struct bio *sd_log_bio;
wait_queue_head_t sd_log_flush_wait;
-   int sd_log_error;
+   int sd_log_error; /* First log error */
 
atomic_t sd_reserving_log;
wait_queue_head_t sd_reserving_log_wait;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 0fe11bde796b..9784763fbb4e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -108,8 +108,7 @@ __acquires(>sd_ail_lock)
 
if (!buffer_busy(bh)) {
if (!buffer_uptodate(bh) &&
-   !test_and_set_bit(SDF_AIL1_IO_ERROR,
- >sd_flags)) {
+   !cmpxchg(>sd_log_error, 0, -EIO)) {
gfs2_io_error_bh(sdp, bh);
set_bit(SDF_WITHDRAWING, >sd_flags);
}
@@ -203,10 +202,21 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, 
struct gfs2_trans *tr)
 bd_ail_st_list) {
bh = bd->bd_bh;
gfs2_assert(sdp, bd->bd_tr == tr);
-   if (buffer_busy(bh))
+   /**
+* If another process flagged an io error, e.g. writing to the
+* journal, error all other bhs and move them off the ail1 to
+* prevent a tight loop when unmount tries to flush ail1,
+* regardless of whether they're still busy. If no outside
+* errors were found and the buffer is busy, move to the next.
+* If the ail buffer is not busy and caught an error, flag it
+* for others.
+*/
+   if (sdp->sd_log_error) {
+   gfs2_io_error_bh(sdp, bh);
+   } else if (buffer_busy(bh)) {
continue;
-   if (!buffer_uptodate(bh) &&
-   !test_and_set_bit(SDF_AIL1_IO_ERROR, >sd_flags)) {
+   } else if (!buffer_uptodate(bh) &&
+  !cmpxchg(>sd_log_error, 0, -EIO)) {
gfs2_io_error_bh(sdp, bh);
set_bit(SDF_WITHDRAWING, >sd_flags);
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a8dfc86fd682..8871fca9102f 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1480,7 +1480,7 @@ static void quotad_error(struct gfs2_sbd *sdp, const char 
*msg, int error)
return;
if (!gfs2_withdrawn(sdp)) {
fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
-   sdp->sd_log_error = error;
+   cmpxchg(>sd_log_error, 0, error);
wake_up(>sd_logd_waitq);
}
 }
-- 
2.21.0