This is updated version of patch aimed to fix direct_io error handling issue
i've previously sent 2 wheeks ago. If you don't like anything in this patch
plese let me know.

Changes:
  - comments added. I think now it is clearly describe things.
  - patch prepared against 2.6.20-mm2

How this patch tested:
  - LTP test, and other readv/writev op tests.
  - fsstress test.
  - manual direct_io tests.

Log:
 - Move common segment checks to separate helper function.
 - Trim off blocks after generic_file_direct_write() has failed.
 - Update out of date comments about direct_io locking rules.

Signed-off-by: Monakhov Dmitriy <[EMAIL PROTECTED]>
---
 mm/filemap.c |  107 +++++++++++++++++++++++++++++++++++----------------------
 1 files changed, 66 insertions(+), 41 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index a9284c2..c81184c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1147,6 +1147,38 @@ success:
        return size;
 }
 
+/*
+ * Performs necessary checks before doing a write
+ *
+ * Adjust number of segments and amount of bytes to write.
+ * Returns appropriate error code that caller should return or
+ * zero in case that write should be allowed.
+ */
+int generic_segment_checks(const struct iovec *iov, unsigned long *nr_segs,
+               size_t *count, unsigned long access_flags)
+{
+       unsigned long   seg;
+       for (seg = 0; seg < *nr_segs; seg++) {
+               const struct iovec *iv = &iov[seg];
+
+               /*
+                * If any segment has a negative length, or the cumulative
+                * length ever wraps negative then return -EINVAL.
+                */
+               *count += iv->iov_len;
+               if (unlikely((ssize_t)(*count|iv->iov_len) < 0))
+                       return -EINVAL;
+               if (access_ok(access_flags, iv->iov_base, iv->iov_len))
+                       continue;
+               if (seg == 0)
+                       return -EFAULT;
+               *nr_segs = seg;
+               *count -= iv->iov_len;  /* This segment is no good */
+               break;
+       }
+       return 0;
+}
+
 /**
  * generic_file_aio_read - generic filesystem read routine
  * @iocb:      kernel I/O control block
@@ -1168,24 +1200,9 @@ generic_file_aio_read(struct kiocb *iocb, const struct 
iovec *iov,
        loff_t *ppos = &iocb->ki_pos;
 
        count = 0;
-       for (seg = 0; seg < nr_segs; seg++) {
-               const struct iovec *iv = &iov[seg];
-
-               /*
-                * If any segment has a negative length, or the cumulative
-                * length ever wraps negative then return -EINVAL.
-                */
-               count += iv->iov_len;
-               if (unlikely((ssize_t)(count|iv->iov_len) < 0))
-                       return -EINVAL;
-               if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
-                       continue;
-               if (seg == 0)
-                       return -EFAULT;
-               nr_segs = seg;
-               count -= iv->iov_len;   /* This segment is no good */
-               break;
-       }
+       retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+       if (retval)
+               return retval;
 
        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (filp->f_flags & O_DIRECT) {
@@ -2080,8 +2097,9 @@ generic_file_direct_write(struct kiocb *iocb, const 
struct iovec *iov,
        /*
         * Sync the fs metadata but not the minor inode changes and
         * of course not the data as we did direct DMA for the IO.
-        * i_mutex is held, which protects generic_osync_inode() from
-        * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
+        * i_mutex may not being held, if so some specific locking
+        * ordering must protect generic_osync_inode() from livelocking.
+        * AIO O_DIRECT ops attempt to sync metadata here.
         */
        if ((written >= 0 || written == -EIOCBQUEUED) &&
            ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -2271,30 +2289,14 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, 
const struct iovec *iov,
        size_t ocount;          /* original count */
        size_t count;           /* after file limit checks */
        struct inode    *inode = mapping->host;
-       unsigned long   seg;
        loff_t          pos;
        ssize_t         written;
        ssize_t         err;
 
        ocount = 0;
-       for (seg = 0; seg < nr_segs; seg++) {
-               const struct iovec *iv = &iov[seg];
-
-               /*
-                * If any segment has a negative length, or the cumulative
-                * length ever wraps negative then return -EINVAL.
-                */
-               ocount += iv->iov_len;
-               if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
-                       return -EINVAL;
-               if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
-                       continue;
-               if (seg == 0)
-                       return -EFAULT;
-               nr_segs = seg;
-               ocount -= iv->iov_len;  /* This segment is no good */
-               break;
-       }
+       err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+       if (err)
+               return err;
        if (unlikely(aio_restarted())) {
                /* nothing to transfer, may just need to sync data */
                return ocount;
@@ -2420,6 +2422,29 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const 
struct iovec *iov,
        mutex_lock(&inode->i_mutex);
        ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
                        &iocb->ki_pos);
+       /* 
+        * If __generic_file_aio_write_nolock has failed.
+        * This may happen because of:
+        * 1) Bad segment found (failed before actual write attempt)
+        * 2) Segments are good, but actual write operation failed
+        *    and may have instantiated a few blocks outside i_size.
+        *   a) in case of buffered write these blocks was already
+        *      trimmed by generic_file_buffered_write()
+        *   b) in case of O_DIRECT these blocks weren't trimmed yet.
+        *
+        * In case of (2b) these blocks have to be trimmed off again.
+        */
+       if (unlikely( ret < 0 && file->f_flags & O_DIRECT)) {
+               unsigned long nr_segs_avail = nr_segs;
+               size_t count = 0;
+               if (!generic_segment_checks(iov, &nr_segs_avail, &count,
+                               VERIFY_READ)) {
+                       /*It is (2b) case, because segments are good*/
+                       loff_t isize = i_size_read(inode);
+                       if (pos + count > isize)
+                               vmtruncate(inode, isize);
+               }
+       }
        mutex_unlock(&inode->i_mutex);
 
        if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -2436,8 +2461,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const 
struct iovec *iov,
 EXPORT_SYMBOL(generic_file_aio_write);
 
 /*
- * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if 
something
- * went wrong during pagecache shootdown.
+ * May be called without i_mutex for writes to S_ISREG files.
+ * Returns -EIO if something went wrong during pagecache shootdown.
  */
 static ssize_t
 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-- 
1.5.0.1

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to