On Sun, Nov 04, 2007 at 12:19:19PM +0100, Torsten Kaiser wrote:
> On 11/2/07, David Chinner <[EMAIL PROTECTED]> wrote:
> > That's stalled waiting on the inode cluster buffer lock. That implies
> > that the inode lcuser is already being written out and the inode has
> > been redirtied during writeout.
> >
> > Does the kernel you are testing have the "flush inodes in ascending
> > inode number order" patches applied? If so, can you remove that
> > patch and see if the problem goes away?
> 
> I can now confirm, that I see this also with the current mainline-git-version
> I used 2.6.24-rc1-git-b4f555081fdd27d13e6ff39d455d5aefae9d2c0c
> plus the fix for the sg changes in ieee1394.

Ok, so it's probably a side effect of the writeback changes.

Attached are two patches (two because one was in a separate patchset as
a standalone change) that should prevent async writeback from blocking
on locked inode cluster buffers. Apply the xfs-factor-inotobp patch first.
Can you see if this fixes the problem?

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group


---
 fs/xfs/xfs_inode.c |  283 ++++++++++++++++++++++++-----------------------------
 1 file changed, 129 insertions(+), 154 deletions(-)

Index: 2.6.x-xfs-new/fs/xfs/xfs_inode.c
===================================================================
--- 2.6.x-xfs-new.orig/fs/xfs/xfs_inode.c       2007-09-12 15:41:22.000000000 
+1000
+++ 2.6.x-xfs-new/fs/xfs/xfs_inode.c    2007-09-13 08:57:06.395641940 +1000
@@ -124,6 +124,126 @@ xfs_inobp_check(
 #endif
 
 /*
+ * Simple wrapper for calling xfs_imap() that includes error
+ * and bounds checking
+ */
+STATIC int
+xfs_ino_to_imap(
+       xfs_mount_t     *mp,
+       xfs_trans_t     *tp,
+       xfs_ino_t       ino,
+       xfs_imap_t      *imap,
+       uint            imap_flags)
+{
+       int             error;
+
+       error = xfs_imap(mp, tp, ino, imap, imap_flags);
+       if (error) {
+               cmn_err(CE_WARN, "xfs_ino_to_imap: xfs_imap()  returned an "
+                               "error %d on %s.  Returning error.",
+                               error, mp->m_fsname);
+               return error;
+       }
+
+       /*
+        * If the inode number maps to a block outside the bounds
+        * of the file system then return NULL rather than calling
+        * read_buf and panicing when we get an error from the
+        * driver.
+        */
+       if ((imap->im_blkno + imap->im_len) >
+           XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+               xfs_fs_cmn_err(CE_ALERT, mp, "xfs_ino_to_imap: "
+                       "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
+                       " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
+                       (unsigned long long) imap->im_blkno,
+                       (unsigned long long) imap->im_len,
+                       XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+               return XFS_ERROR(EINVAL);
+       }
+       return 0;
+}
+
+/*
+ * Find the buffer associated with the given inode map
+ * We do basic validation checks on the buffer once it has been
+ * retrieved from disk.
+ */
+STATIC int
+xfs_imap_to_bp(
+       xfs_mount_t     *mp,
+       xfs_trans_t     *tp,
+       xfs_imap_t      *imap,
+       xfs_buf_t       **bpp,
+       uint            buf_flags,
+       uint            imap_flags)
+{
+       int             error;
+       int             i;
+       int             ni;
+       xfs_buf_t       *bp;
+
+       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+                                  (int)imap->im_len, XFS_BUF_LOCK, &bp);
+       if (error) {
+               cmn_err(CE_WARN, "xfs_imap_to_bp: xfs_trans_read_buf()returned "
+                               "an error %d on %s.  Returning error.",
+                               error, mp->m_fsname);
+               return error;
+       }
+
+       /*
+        * Validate the magic number and version of every inode in the buffer
+        * (if DEBUG kernel) or the first inode in the buffer, otherwise.
+        */
+#ifdef DEBUG
+       ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
+#else  /* usual case */
+       ni = 1;
+#endif
+
+       for (i = 0; i < ni; i++) {
+               int             di_ok;
+               xfs_dinode_t    *dip;
+
+               dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+                                       (i << mp->m_sb.sb_inodelog));
+               di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC 
&&
+                           XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
+               if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+                                               XFS_ERRTAG_ITOBP_INOTOBP,
+                                               XFS_RANDOM_ITOBP_INOTOBP))) {
+                       if (imap_flags & XFS_IMAP_BULKSTAT) {
+                               xfs_trans_brelse(tp, bp);
+                               return XFS_ERROR(EINVAL);
+                       }
+                       XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
+                                               XFS_ERRLEVEL_HIGH, mp, dip);
+#ifdef DEBUG
+                       cmn_err(CE_PANIC,
+                                       "Device %s - bad inode magic/vsn "
+                                       "daddr %lld #%d (magic=%x)",
+                               XFS_BUFTARG_NAME(mp->m_ddev_targp),
+                               (unsigned long long)imap->im_blkno, i,
+                               be16_to_cpu(dip->di_core.di_magic));
+#endif
+                       xfs_trans_brelse(tp, bp);
+                       return XFS_ERROR(EFSCORRUPTED);
+               }
+       }
+
+       xfs_inobp_check(mp, bp);
+
+       /*
+        * Mark the buffer as an inode buffer now that it looks good
+        */
+       XFS_BUF_SET_VTYPE(bp, B_FS_INO);
+
+       *bpp = bp;
+       return 0;
+}
+
+/*
  * This routine is called to map an inode number within a file
  * system to the buffer containing the on-disk version of the
  * inode.  It returns a pointer to the buffer containing the
@@ -145,72 +265,19 @@ xfs_inotobp(
        xfs_buf_t       **bpp,
        int             *offset)
 {
-       int             di_ok;
        xfs_imap_t      imap;
        xfs_buf_t       *bp;
        int             error;
-       xfs_dinode_t    *dip;
 
-       /*
-        * Call the space management code to find the location of the
-        * inode on disk.
-        */
        imap.im_blkno = 0;
-       error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
-       if (error != 0) {
-               cmn_err(CE_WARN,
-       "xfs_inotobp: xfs_imap()  returned an "
-       "error %d on %s.  Returning error.", error, mp->m_fsname);
+       error = xfs_ino_to_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
+       if (error)
                return error;
-       }
-
-       /*
-        * If the inode number maps to a block outside the bounds of the
-        * file system then return NULL rather than calling read_buf
-        * and panicing when we get an error from the driver.
-        */
-       if ((imap.im_blkno + imap.im_len) >
-           XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-               cmn_err(CE_WARN,
-       "xfs_inotobp: inode number (%llu + %d) maps to a block outside the 
bounds "
-       "of the file system %s.  Returning EINVAL.",
-                       (unsigned long long)imap.im_blkno,
-                       imap.im_len, mp->m_fsname);
-               return XFS_ERROR(EINVAL);
-       }
-
-       /*
-        * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
-        * default to just a read_buf() call.
-        */
-       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
-                                  (int)imap.im_len, XFS_BUF_LOCK, &bp);
 
-       if (error) {
-               cmn_err(CE_WARN,
-       "xfs_inotobp: xfs_trans_read_buf()  returned an "
-       "error %d on %s.  Returning error.", error, mp->m_fsname);
+       error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0);
+       if (error)
                return error;
-       }
-       dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
-       di_ok =
-               be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
-               XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
-       if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
-                       XFS_RANDOM_ITOBP_INOTOBP))) {
-               XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
-               xfs_trans_brelse(tp, bp);
-               cmn_err(CE_WARN,
-       "xfs_inotobp: XFS_TEST_ERROR()  returned an "
-       "error on %s.  Returning EFSCORRUPTED.",  mp->m_fsname);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
-       xfs_inobp_check(mp, bp);
 
-       /*
-        * Set *dipp to point to the on-disk inode in the buffer.
-        */
        *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
        *bpp = bp;
        *offset = imap.im_boffset;
@@ -251,41 +318,15 @@ xfs_itobp(
        xfs_imap_t      imap;
        xfs_buf_t       *bp;
        int             error;
-       int             i;
-       int             ni;
 
        if (ip->i_blkno == (xfs_daddr_t)0) {
-               /*
-                * Call the space management code to find the location of the
-                * inode on disk.
-                */
                imap.im_blkno = bno;
-               if ((error = xfs_imap(mp, tp, ip->i_ino, &imap,
-                                       XFS_IMAP_LOOKUP | imap_flags)))
+               error = xfs_ino_to_imap(mp, tp, ip->i_ino, &imap,
+                                       XFS_IMAP_LOOKUP | imap_flags);
+               if (error)
                        return error;
 
                /*
-                * If the inode number maps to a block outside the bounds
-                * of the file system then return NULL rather than calling
-                * read_buf and panicing when we get an error from the
-                * driver.
-                */
-               if ((imap.im_blkno + imap.im_len) >
-                   XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-#ifdef DEBUG
-                       xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
-                                       "(imap.im_blkno (0x%llx) "
-                                       "+ imap.im_len (0x%llx)) > "
-                                       " XFS_FSB_TO_BB(mp, "
-                                       "mp->m_sb.sb_dblocks) (0x%llx)",
-                                       (unsigned long long) imap.im_blkno,
-                                       (unsigned long long) imap.im_len,
-                                       XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
-#endif /* DEBUG */
-                       return XFS_ERROR(EINVAL);
-               }
-
-               /*
                 * Fill in the fields in the inode that will be used to
                 * map the inode to its buffer from now on.
                 */
@@ -303,76 +344,10 @@ xfs_itobp(
        }
        ASSERT(bno == 0 || bno == imap.im_blkno);
 
-       /*
-        * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
-        * default to just a read_buf() call.
-        */
-       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
-                                  (int)imap.im_len, XFS_BUF_LOCK, &bp);
-       if (error) {
-#ifdef DEBUG
-               xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
-                               "xfs_trans_read_buf() returned error %d, "
-                               "imap.im_blkno 0x%llx, imap.im_len 0x%llx",
-                               error, (unsigned long long) imap.im_blkno,
-                               (unsigned long long) imap.im_len);
-#endif /* DEBUG */
+       error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
+       if (error)
                return error;
-       }
-
-       /*
-        * Validate the magic number and version of every inode in the buffer
-        * (if DEBUG kernel) or the first inode in the buffer, otherwise.
-        * No validation is done here in userspace (xfs_repair).
-        */
-#if !defined(__KERNEL__)
-       ni = 0;
-#elif defined(DEBUG)
-       ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
-#else  /* usual case */
-       ni = 1;
-#endif
-
-       for (i = 0; i < ni; i++) {
-               int             di_ok;
-               xfs_dinode_t    *dip;
-
-               dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-                                       (i << mp->m_sb.sb_inodelog));
-               di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC 
&&
-                           XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
-               if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
-                                               XFS_ERRTAG_ITOBP_INOTOBP,
-                                               XFS_RANDOM_ITOBP_INOTOBP))) {
-                       if (imap_flags & XFS_IMAP_BULKSTAT) {
-                               xfs_trans_brelse(tp, bp);
-                               return XFS_ERROR(EINVAL);
-                       }
-#ifdef DEBUG
-                       cmn_err(CE_ALERT,
-                                       "Device %s - bad inode magic/vsn "
-                                       "daddr %lld #%d (magic=%x)",
-                               XFS_BUFTARG_NAME(mp->m_ddev_targp),
-                               (unsigned long long)imap.im_blkno, i,
-                               be16_to_cpu(dip->di_core.di_magic));
-#endif
-                       XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
-                                            mp, dip);
-                       xfs_trans_brelse(tp, bp);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
-       }
-
-       xfs_inobp_check(mp, bp);
 
-       /*
-        * Mark the buffer as an inode buffer now that it looks good
-        */
-       XFS_BUF_SET_VTYPE(bp, B_FS_INO);
-
-       /*
-        * Set *dipp to point to the on-disk inode in the buffer.
-        */
        *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
        *bpp = bp;
        return 0;
---
 fs/xfs/linux-2.6/xfs_super.c |    3 +-
 fs/xfs/linux-2.6/xfs_vnode.h |    5 ---
 fs/xfs/xfs_inode.c           |   33 ++++++++++++++++---------
 fs/xfs/xfs_inode.h           |    7 +++--
 fs/xfs/xfs_vnodeops.c        |   55 +++++++++----------------------------------
 5 files changed, 41 insertions(+), 62 deletions(-)

Index: 2.6.x-xfs-new/fs/xfs/xfs_inode.c
===================================================================
--- 2.6.x-xfs-new.orig/fs/xfs/xfs_inode.c       2007-11-05 10:17:36.000000000 
+1100
+++ 2.6.x-xfs-new/fs/xfs/xfs_inode.c    2007-11-05 10:33:49.590268027 +1100
@@ -306,14 +306,15 @@ xfs_inotobp(
  * 0 for the disk block address.
  */
 int
-xfs_itobp(
+xfs_itobp_flags(
        xfs_mount_t     *mp,
        xfs_trans_t     *tp,
        xfs_inode_t     *ip,
        xfs_dinode_t    **dipp,
        xfs_buf_t       **bpp,
        xfs_daddr_t     bno,
-       uint            imap_flags)
+       uint            imap_flags,
+       uint            buf_flags)
 {
        xfs_imap_t      imap;
        xfs_buf_t       *bp;
@@ -344,10 +345,17 @@ xfs_itobp(
        }
        ASSERT(bno == 0 || bno == imap.im_blkno);
 
-       error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
+       error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
        if (error)
                return error;
 
+       if (!bp) {
+               ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+               ASSERT(tp == NULL);
+               *bpp = NULL;
+               return EAGAIN;
+       }
+
        *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
        *bpp = bp;
        return 0;
@@ -3068,15 +3076,6 @@ xfs_iflush(
        }
 
        /*
-        * Get the buffer containing the on-disk inode.
-        */
-       error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0);
-       if (error) {
-               xfs_ifunlock(ip);
-               return error;
-       }
-
-       /*
         * Decide how buffer will be flushed out.  This is done before
         * the call to xfs_iflush_int because this field is zeroed by it.
         */
@@ -3125,6 +3124,16 @@ xfs_iflush(
        }
 
        /*
+        * Get the buffer containing the on-disk inode.
+        */
+       error = xfs_itobp_flags(mp, NULL, ip, &dip, &bp, 0, 0,
+                       (flags == INT_ASYNC) ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
+       if (error ||!bp) {
+               xfs_ifunlock(ip);
+               return error;
+       }
+
+       /*
         * First flush out the inode that xfs_iflush was called with.
         */
        error = xfs_iflush_int(ip, bp);
Index: 2.6.x-xfs-new/fs/xfs/xfs_inode.h
===================================================================
--- 2.6.x-xfs-new.orig/fs/xfs/xfs_inode.h       2007-11-02 13:44:46.000000000 
+1100
+++ 2.6.x-xfs-new/fs/xfs/xfs_inode.h    2007-11-05 10:25:44.885153248 +1100
@@ -488,9 +488,12 @@ int                xfs_finish_reclaim_all(struct xfs_m
 /*
  * xfs_inode.c prototypes.
  */
-int            xfs_itobp(struct xfs_mount *, struct xfs_trans *,
+int            xfs_itobp_flags(struct xfs_mount *, struct xfs_trans *,
                          xfs_inode_t *, struct xfs_dinode **, struct xfs_buf 
**,
-                         xfs_daddr_t, uint);
+                         xfs_daddr_t, uint, uint);
+#define xfs_itobp(mp, tp, ip, dipp, bpp, bno, iflags)  \
+       xfs_itobp_flags(mp, tp, ip, dipp, bpp, bno, iflags, XFS_BUF_LOCK)
+
 int            xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
                          xfs_inode_t **, xfs_daddr_t, uint);
 int            xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
Index: 2.6.x-xfs-new/fs/xfs/linux-2.6/xfs_super.c
===================================================================
--- 2.6.x-xfs-new.orig/fs/xfs/linux-2.6/xfs_super.c     2007-11-02 
13:44:50.000000000 +1100
+++ 2.6.x-xfs-new/fs/xfs/linux-2.6/xfs_super.c  2007-11-05 10:39:05.969204451 
+1100
@@ -840,7 +840,8 @@ xfs_fs_write_inode(
        struct inode            *inode,
        int                     sync)
 {
-       int                     error = 0, flags = FLUSH_INODE;
+       int                     error = 0;
+       int                     flags = 0;
 
        xfs_itrace_entry(XFS_I(inode));
        if (sync) {
Index: 2.6.x-xfs-new/fs/xfs/linux-2.6/xfs_vnode.h
===================================================================
--- 2.6.x-xfs-new.orig/fs/xfs/linux-2.6/xfs_vnode.h     2007-10-02 
16:01:47.000000000 +1000
+++ 2.6.x-xfs-new/fs/xfs/linux-2.6/xfs_vnode.h  2007-11-05 10:40:49.103817818 
+1100
@@ -73,12 +73,9 @@ typedef enum bhv_vrwlock {
 #define IO_INVIS       0x00020         /* don't update inode timestamps */
 
 /*
- * Flags for vop_iflush call
+ * Flags for xfs_inode_flush
  */
 #define FLUSH_SYNC             1       /* wait for flush to complete   */
-#define FLUSH_INODE            2       /* flush the inode itself       */
-#define FLUSH_LOG              4       /* force the last log entry for
-                                        * this inode out to disk       */
 
 /*
  * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
Index: 2.6.x-xfs-new/fs/xfs/xfs_vnodeops.c
===================================================================
--- 2.6.x-xfs-new.orig/fs/xfs/xfs_vnodeops.c    2007-11-05 10:02:05.000000000 
+1100
+++ 2.6.x-xfs-new/fs/xfs/xfs_vnodeops.c 2007-11-05 10:37:53.398623943 +1100
@@ -3556,29 +3556,6 @@ xfs_inode_flush(
            ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
                return 0;
 
-       if (flags & FLUSH_LOG) {
-               if (iip && iip->ili_last_lsn) {
-                       xlog_t          *log = mp->m_log;
-                       xfs_lsn_t       sync_lsn;
-                       int             s, log_flags = XFS_LOG_FORCE;
-
-                       s = GRANT_LOCK(log);
-                       sync_lsn = log->l_last_sync_lsn;
-                       GRANT_UNLOCK(log, s);
-
-                       if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
-                               if (flags & FLUSH_SYNC)
-                                       log_flags |= XFS_LOG_SYNC;
-                               error = xfs_log_force(mp, iip->ili_last_lsn, 
log_flags);
-                               if (error)
-                                       return error;
-                       }
-
-                       if (ip->i_update_core == 0)
-                               return 0;
-               }
-       }
-
        /*
         * We make this non-blocking if the inode is contended,
         * return EAGAIN to indicate to the caller that they
@@ -3586,30 +3563,22 @@ xfs_inode_flush(
         * blocking on inodes inside another operation right
         * now, they get caught later by xfs_sync.
         */
-       if (flags & FLUSH_INODE) {
-               int     flush_flags;
-
-               if (flags & FLUSH_SYNC) {
-                       xfs_ilock(ip, XFS_ILOCK_SHARED);
-                       xfs_iflock(ip);
-               } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-                       if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
-                               xfs_iunlock(ip, XFS_ILOCK_SHARED);
-                               return EAGAIN;
-                       }
-               } else {
+       if (flags & FLUSH_SYNC) {
+               xfs_ilock(ip, XFS_ILOCK_SHARED);
+               xfs_iflock(ip);
+       } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+               if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
+                       xfs_iunlock(ip, XFS_ILOCK_SHARED);
                        return EAGAIN;
                }
-
-               if (flags & FLUSH_SYNC)
-                       flush_flags = XFS_IFLUSH_SYNC;
-               else
-                       flush_flags = XFS_IFLUSH_ASYNC;
-
-               error = xfs_iflush(ip, flush_flags);
-               xfs_iunlock(ip, XFS_ILOCK_SHARED);
+       } else {
+               return EAGAIN;
        }
 
+       error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
+                                                   : XFS_IFLUSH_ASYNC);
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
        return error;
 }
 

Reply via email to