Re: [f2fs-dev] [PATCH 07/11] iomap: update ki_pos in iomap_file_buffered_write

2023-05-25 Thread Andreas Gruenbacher
On Wed, May 24, 2023 at 8:54 AM Christoph Hellwig  wrote:
> All callers of iomap_file_buffered_write need to updated ki_pos, move it
> into common code.

Thanks for this set of cleanups, especially for the patch killing
current->backing_dev_info.

Reviewed-by: Andreas Gruenbacher 

> Signed-off-by: Christoph Hellwig 
> Acked-by: Damien Le Moal 
> Reviewed-by: Darrick J. Wong 
> ---
>  fs/gfs2/file.c | 4 +---
>  fs/iomap/buffered-io.c | 9 ++---
>  fs/xfs/xfs_file.c  | 2 --
>  fs/zonefs/file.c   | 4 +---
>  4 files changed, 8 insertions(+), 11 deletions(-)
>
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index 904a0d6ac1a1a9..c6a7555d5ad8bb 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -1044,10 +1044,8 @@ static ssize_t gfs2_file_buffered_write(struct kiocb 
> *iocb,
> pagefault_disable();
> ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
> pagefault_enable();
> -   if (ret > 0) {
> -   iocb->ki_pos += ret;
> +   if (ret > 0)
> written += ret;
> -   }
>
> if (inode == sdp->sd_rindex)
> gfs2_glock_dq_uninit(statfs_gh);
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 063133ec77f49e..550525a525c45c 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -864,16 +864,19 @@ iomap_file_buffered_write(struct kiocb *iocb, struct 
> iov_iter *i,
> .len= iov_iter_count(i),
> .flags  = IOMAP_WRITE,
> };
> -   int ret;
> +   ssize_t ret;
>
> if (iocb->ki_flags & IOCB_NOWAIT)
> iter.flags |= IOMAP_NOWAIT;
>
> while ((ret = iomap_iter(&iter, ops)) > 0)
> iter.processed = iomap_write_iter(&iter, i);
> -   if (iter.pos == iocb->ki_pos)
> +
> +   if (unlikely(ret < 0))
> return ret;
> -   return iter.pos - iocb->ki_pos;
> +   ret = iter.pos - iocb->ki_pos;
> +   iocb->ki_pos += ret;
> +   return ret;
>  }
>  EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 431c3fd0e2b598..d57443db633637 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -720,8 +720,6 @@ xfs_file_buffered_write(
> trace_xfs_file_buffered_write(iocb, from);
> ret = iomap_file_buffered_write(iocb, from,
> &xfs_buffered_write_iomap_ops);
> -   if (likely(ret >= 0))
> -   iocb->ki_pos += ret;
>
> /*
>  * If we hit a space limit, try to free up some lingering preallocated
> diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
> index 132f01d3461f14..e212d0636f848e 100644
> --- a/fs/zonefs/file.c
> +++ b/fs/zonefs/file.c
> @@ -643,9 +643,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb 
> *iocb,
> goto inode_unlock;
>
> ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
> -   if (ret > 0)
> -   iocb->ki_pos += ret;
> -   else if (ret == -EIO)
> +   if (ret == -EIO)
> zonefs_io_error(inode, true);
>
>  inode_unlock:
> --
> 2.39.2
>



___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [f2fs-dev] [Cluster-devel] [PATCH 05/17] filemap: update ki_pos in generic_perform_write

2023-04-24 Thread Andreas Gruenbacher
On Mon, Apr 24, 2023 at 8:22 AM Christoph Hellwig  wrote:
> All callers of generic_perform_write need to updated ki_pos, move it into
> common code.

We've actually got a similar situation with
iomap_file_buffered_write() and its callers. Would it make sense to
fix that up as well?

> Signed-off-by: Christoph Hellwig 
> ---
>  fs/ceph/file.c | 2 --
>  fs/ext4/file.c | 9 +++--
>  fs/f2fs/file.c | 1 -
>  fs/nfs/file.c  | 1 -
>  mm/filemap.c   | 8 
>  5 files changed, 7 insertions(+), 14 deletions(-)
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index f4d8bf7dec88a8..feeb9882ef635a 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -1894,8 +1894,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, 
> struct iov_iter *from)
>  * can not run at the same time
>  */
> written = generic_perform_write(iocb, from);
> -   if (likely(written >= 0))
> -   iocb->ki_pos = pos + written;
> ceph_end_io_write(inode);
> }
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 0b8b4499e5ca18..1026acaf1235a0 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -291,12 +291,9 @@ static ssize_t ext4_buffered_write_iter(struct kiocb 
> *iocb,
>
>  out:
> inode_unlock(inode);
> -   if (likely(ret > 0)) {
> -   iocb->ki_pos += ret;
> -   ret = generic_write_sync(iocb, ret);
> -   }
> -
> -   return ret;
> +   if (unlikely(ret <= 0))
> +   return ret;
> +   return generic_write_sync(iocb, ret);
>  }
>
>  static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t 
> offset,
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index f4ab23efcf85f8..5a9ae054b6da7d 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -4511,7 +4511,6 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb 
> *iocb,
> current->backing_dev_info = NULL;
>
> if (ret > 0) {
> -   iocb->ki_pos += ret;
> f2fs_update_iostat(F2FS_I_SB(inode), inode,
> APP_BUFFERED_IO, ret);
> }
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index 893625eacab9fa..abdae2b29369be 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -666,7 +666,6 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct 
> iov_iter *from)
> goto out;
>
> written = result;
> -   iocb->ki_pos += written;
> nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
>
> if (mntflags & NFS_MOUNT_WRITE_EAGER) {
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 2723104cc06a12..0110bde3708b3f 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3960,7 +3960,10 @@ ssize_t generic_perform_write(struct kiocb *iocb, 
> struct iov_iter *i)
> balance_dirty_pages_ratelimited(mapping);
> } while (iov_iter_count(i));
>
> -   return written ? written : status;
> +   if (!written)
> +   return status;
> +   iocb->ki_pos += written;

Could be turned into:
iocb->ki_pos = pos;

> +   return written;
>  }
>  EXPORT_SYMBOL(generic_perform_write);
>
> @@ -4039,7 +4042,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, 
> struct iov_iter *from)
> endbyte = pos + status - 1;
> err = filemap_write_and_wait_range(mapping, pos, endbyte);
> if (err == 0) {
> -   iocb->ki_pos = endbyte + 1;
> written += status;
> invalidate_mapping_pages(mapping,
>  pos >> PAGE_SHIFT,
> @@ -4052,8 +4054,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, 
> struct iov_iter *from)
> }
> } else {
> written = generic_perform_write(iocb, from);
> -   if (likely(written > 0))
> -   iocb->ki_pos += written;
> }
>  out:
> current->backing_dev_info = NULL;
> --
> 2.39.2
>

Thanks,
Andreas



___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [f2fs-dev] [Cluster-devel] [PATCH v5 17/23] gfs2: Convert gfs2_write_cache_jdata() to use filemap_get_folios_tag()

2023-01-05 Thread Andreas Gruenbacher
   */
> -   *done_index = page->index + 1;
> +   *done_index = folio->index +
> +   folio_nr_pages(folio);
> ret = 1;
> break;
> }
> @@ -305,8 +310,8 @@ static int gfs2_write_cache_jdata(struct address_space 
> *mapping,
>  {
> int ret = 0;
> int done = 0;
> -   struct pagevec pvec;
> -   int nr_pages;
> +   struct folio_batch fbatch;
> +   int nr_folios;
> pgoff_t writeback_index;
> pgoff_t index;
> pgoff_t end;
> @@ -315,7 +320,7 @@ static int gfs2_write_cache_jdata(struct address_space 
> *mapping,
> int range_whole = 0;
> xa_mark_t tag;
>
> -   pagevec_init(&pvec);
> +   folio_batch_init(&fbatch);
> if (wbc->range_cyclic) {
> writeback_index = mapping->writeback_index; /* prev offset */
> index = writeback_index;
> @@ -341,17 +346,18 @@ static int gfs2_write_cache_jdata(struct address_space 
> *mapping,
> tag_pages_for_writeback(mapping, index, end);
> done_index = index;
> while (!done && (index <= end)) {
> -   nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, 
> end,
> -   tag);
> -   if (nr_pages == 0)
> +   nr_folios = filemap_get_folios_tag(mapping, &index, end,
> +   tag, &fbatch);
> +       if (nr_folios == 0)
> break;
>
> -   ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, 
> &done_index);
> +   ret = gfs2_write_jdata_batch(mapping, wbc, &fbatch,
> +   &done_index);
> if (ret)
> done = 1;
> if (ret > 0)
> ret = 0;
> -   pagevec_release(&pvec);
> +   folio_batch_release(&fbatch);
> cond_resched();
> }
>
> --
> 2.38.1
>

Reviewed-by: Andreas Gruenbacher 

Thanks,
Andreas



___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [f2fs-dev] [Cluster-devel] [PATCH 23/27] block: add a bdev_max_discard_sectors helper

2022-04-06 Thread Andreas Gruenbacher
On Wed, Apr 6, 2022 at 8:07 AM Christoph Hellwig  wrote:
>
> Add a helper to query the number of sectors support per each discard bio
> based on the block device and use this helper to stop various places from
> poking into the request_queue to see if discard is supported and if so how
> much.  This mirrors what is done e.g. for write zeroes as well.
>
> Signed-off-by: Christoph Hellwig 
> ---
>  block/blk-core.c|  2 +-
>  block/blk-lib.c |  2 +-
>  block/ioctl.c   |  3 +--
>  drivers/block/drbd/drbd_main.c  |  2 +-
>  drivers/block/drbd/drbd_nl.c| 12 +++-
>  drivers/block/drbd/drbd_receiver.c  |  5 ++---
>  drivers/block/loop.c|  9 +++--
>  drivers/block/rnbd/rnbd-srv-dev.h   |  6 +-
>  drivers/block/xen-blkback/xenbus.c  |  2 +-
>  drivers/md/bcache/request.c |  4 ++--
>  drivers/md/bcache/super.c   |  2 +-
>  drivers/md/bcache/sysfs.c   |  2 +-
>  drivers/md/dm-cache-target.c|  9 +
>  drivers/md/dm-clone-target.c|  9 +
>  drivers/md/dm-io.c  |  2 +-
>  drivers/md/dm-log-writes.c  |  3 +--
>  drivers/md/dm-raid.c|  9 ++---
>  drivers/md/dm-table.c   |  4 +---
>  drivers/md/dm-thin.c|  9 +
>  drivers/md/dm.c |  2 +-
>  drivers/md/md-linear.c  |  4 ++--
>  drivers/md/raid0.c  |  2 +-
>  drivers/md/raid1.c  |  6 +++---
>  drivers/md/raid10.c |  8 
>  drivers/md/raid5-cache.c|  2 +-
>  drivers/target/target_core_device.c |  8 +++-
>  fs/btrfs/extent-tree.c  |  4 ++--
>  fs/btrfs/ioctl.c|  2 +-
>  fs/exfat/file.c |  2 +-
>  fs/exfat/super.c| 10 +++---
>  fs/ext4/ioctl.c | 10 +++---
>  fs/ext4/super.c | 10 +++---
>  fs/f2fs/f2fs.h  |  3 +--
>  fs/f2fs/segment.c   |  6 ++
>  fs/fat/file.c   |  2 +-
>  fs/fat/inode.c  | 10 +++---
>  fs/gfs2/rgrp.c  |  2 +-
>  fs/jbd2/journal.c   |  7 ++-
>  fs/jfs/ioctl.c  |  2 +-
>  fs/jfs/super.c  |  8 ++--
>  fs/nilfs2/ioctl.c   |  2 +-
>  fs/ntfs3/file.c |  2 +-
>  fs/ntfs3/super.c|  2 +-
>  fs/ocfs2/ioctl.c|  2 +-
>  fs/xfs/xfs_discard.c|  2 +-
>  fs/xfs/xfs_super.c  | 12 
>  include/linux/blkdev.h  |  5 +
>  mm/swapfile.c   | 17 ++---
>  48 files changed, 87 insertions(+), 163 deletions(-)
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 937bb6b863317..b5c3a8049134c 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -820,7 +820,7 @@ void submit_bio_noacct(struct bio *bio)
>
> switch (bio_op(bio)) {
> case REQ_OP_DISCARD:
> -   if (!blk_queue_discard(q))
> +   if (!bdev_max_discard_sectors(bdev))
> goto not_supported;
> break;
> case REQ_OP_SECURE_ERASE:
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index 2ae32a722851c..8b4b66d3a9bfc 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -53,7 +53,7 @@ int __blkdev_issue_discard(struct block_device *bdev, 
> sector_t sector,
> return -EOPNOTSUPP;
> op = REQ_OP_SECURE_ERASE;
> } else {
> -   if (!blk_queue_discard(q))
> +   if (!bdev_max_discard_sectors(bdev))
> return -EOPNOTSUPP;
> op = REQ_OP_DISCARD;
> }
> diff --git a/block/ioctl.c b/block/ioctl.c
> index ad3771b268b81..c2cd3ba5290ce 100644
> --- a/block/ioctl.c
> +++ b/block/ioctl.c
> @@ -87,14 +87,13 @@ static int blk_ioctl_discard(struct block_device *bdev, 
> fmode_t mode,
>  {
> uint64_t range[2];
> uint64_t start, len;
> -   struct request_queue *q = bdev_get_queue(bdev);
> struct inode *inode = bdev->bd_inode;
> int err;
>
> if (!(mode & FMODE_WRITE))
> return -EBADF;
>
> -   if (!blk_queue_discard(q))
> +   if (!bdev_max_discard_sectors(bdev))
> return -EOPNOTSUPP;
>
> if (copy_from_user(range, (void __user *)arg, sizeof(range)))
> diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
> index 9d43aadde19ad..8fd89a1b0b7b3 100644
> --- a/drivers/block/drbd/drbd_main.c
> +++ b/drivers/block/drbd/drbd_main.c
> @@ -942,7 +942,7 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, 
> int trigger_reply, enu
> cpu_to_be32(bdev_alignment_offset(bdev));
> p->qlim->io_min = cpu_to_be32(bd

[f2fs-dev] [PATCH 2/2] gfs2: Rework read and page fault locking

2020-06-19 Thread Andreas Gruenbacher
The cache consistency model of filesystems like gfs2 is such that if
data is found in the page cache, the data is up to date and can be used
without taking any filesystem locks.  If a page is not cached,
filesystem locks must be taken before populating the page cache.

Thus far,  gfs2 has taken the filesystem locks inside the ->readpage and
->readpages address space operations.  This was already causing lock
ordering problems, but commit d4388340ae0b ("fs: convert mpage_readpages
to mpage_readahead") made things worse: the ->readahead operation is
called with the pages to readahead locked, so grabbing the inode's glock
can now deadlock with processes which are holding the inode glock while
trying to lock the same pages.

Fix this by taking the inode glock in the ->read_iter file and ->fault
vm operations.  To avoid taking the inode glock when the data is already
cached, the ->read_iter file operation first tries to read the data with
the IOCB_CACHED flag set.  If that fails, the inode glock is locked and
the operation is repeated without the IOCB_CACHED flag.

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/aops.c | 27 ++
 fs/gfs2/file.c | 61 --
 2 files changed, 61 insertions(+), 27 deletions(-)

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 72c9560f4467..73c2fe768a3f 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -513,26 +513,10 @@ static int __gfs2_readpage(void *file, struct page *page)
 
 static int gfs2_readpage(struct file *file, struct page *page)
 {
-   struct address_space *mapping = page->mapping;
-   struct gfs2_inode *ip = GFS2_I(mapping->host);
-   struct gfs2_holder gh;
int error;
 
-   unlock_page(page);
-   gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
-   error = gfs2_glock_nq(&gh);
-   if (unlikely(error))
-   goto out;
-   error = AOP_TRUNCATED_PAGE;
-   lock_page(page);
-   if (page->mapping == mapping && !PageUptodate(page))
-   error = __gfs2_readpage(file, page);
-   else
-   unlock_page(page);
-   gfs2_glock_dq(&gh);
-out:
-   gfs2_holder_uninit(&gh);
-   if (error && error != AOP_TRUNCATED_PAGE)
+   error = __gfs2_readpage(file, page);
+   if (error)
lock_page(page);
return error;
 }
@@ -598,16 +582,9 @@ static void gfs2_readahead(struct readahead_control *rac)
 {
struct inode *inode = rac->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
-   struct gfs2_holder gh;
 
-   gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
-   if (gfs2_glock_nq(&gh))
-   goto out_uninit;
if (!gfs2_is_stuffed(ip))
mpage_readahead(rac, gfs2_block_map);
-   gfs2_glock_dq(&gh);
-out_uninit:
-   gfs2_holder_uninit(&gh);
 }
 
 /**
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index fe305e4bfd37..f729b0ff2a3c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -558,8 +558,29 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
return block_page_mkwrite_return(ret);
 }
 
+static vm_fault_t gfs2_fault(struct vm_fault *vmf)
+{
+   struct inode *inode = file_inode(vmf->vma->vm_file);
+   struct gfs2_inode *ip = GFS2_I(inode);
+   struct gfs2_holder gh;
+   vm_fault_t ret;
+   int err;
+
+   gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+   err = gfs2_glock_nq(&gh);
+   if (err) {
+   ret = block_page_mkwrite_return(err);
+   goto out_uninit;
+   }
+   ret = filemap_fault(vmf);
+   gfs2_glock_dq(&gh);
+out_uninit:
+   gfs2_holder_uninit(&gh);
+   return ret;
+}
+
 static const struct vm_operations_struct gfs2_vm_ops = {
-   .fault = filemap_fault,
+   .fault = gfs2_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = gfs2_page_mkwrite,
 };
@@ -824,15 +845,51 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, 
struct iov_iter *from)
 
 static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
+   struct gfs2_inode *ip;
+   struct gfs2_holder gh;
+   size_t written = 0;
ssize_t ret;
 
+   gfs2_holder_mark_uninitialized(&gh);
if (iocb->ki_flags & IOCB_DIRECT) {
ret = gfs2_file_direct_read(iocb, to);
if (likely(ret != -ENOTBLK))
return ret;
iocb->ki_flags &= ~IOCB_DIRECT;
}
-   return generic_file_read_iter(iocb, to);
+   iocb->ki_flags |= IOCB_CACHED;
+   ret = generic_file_read_iter(iocb, to);
+   iocb->ki_flags &= ~IOCB_CACHED;
+   if (ret >= 0) {
+   if (!iov_iter_count(to))
+   return ret;
+   written = ret;
+   } else {
+ 

[f2fs-dev] [PATCH 1/2] fs: Add IOCB_CACHED flag for generic_file_read_iter

2020-06-19 Thread Andreas Gruenbacher
Add an IOCB_CACHED flag which indicates to generic_file_read_iter that
it should only regard the page cache, without triggering any filesystem
I/O for the actual request or for readahead.  With this flag, -EAGAIN is
returned when regular I/O would be triggered similar to the IOCB_NOWAIT
flag, and -ECANCELED is returned when readahead would be triggered.

This allows the caller to perform a tentative read out of the page
cache, and to retry the read if the requested pages are not cached.

Please see the next commit for what this is used for.

Signed-off-by: Andreas Gruenbacher 
---
 include/linux/fs.h |  1 +
 mm/filemap.c   | 16 ++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6c4ab4dc1cd7..74eade571b1c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -315,6 +315,7 @@ enum rw_hint {
 #define IOCB_SYNC  (1 << 5)
 #define IOCB_WRITE (1 << 6)
 #define IOCB_NOWAIT(1 << 7)
+#define IOCB_CACHED(1 << 8)
 
 struct kiocb {
struct file *ki_filp;
diff --git a/mm/filemap.c b/mm/filemap.c
index f0ae9a6308cb..bd11f27bf6ae 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2028,7 +2028,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 
page = find_get_page(mapping, index);
if (!page) {
-   if (iocb->ki_flags & IOCB_NOWAIT)
+   if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED))
goto would_block;
page_cache_sync_readahead(mapping,
ra, filp,
@@ -2038,12 +2038,17 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
goto no_cached_page;
}
if (PageReadahead(page)) {
+   if (iocb->ki_flags & IOCB_CACHED) {
+   put_page(page);
+   error = -ECANCELED;
+   goto out;
+   }
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
if (!PageUptodate(page)) {
-   if (iocb->ki_flags & IOCB_NOWAIT) {
+   if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED)) {
put_page(page);
goto would_block;
}
@@ -2249,6 +2254,13 @@ EXPORT_SYMBOL_GPL(generic_file_buffered_read);
  *
  * This is the "read_iter()" routine for all filesystems
  * that can use the page cache directly.
+ *
+ * In the IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN should be
+ * returned if completing the request would require I/O; this does not prevent
+ * readahead.  The IOCB_CACHED flag indicates that -EAGAIN should be returned
+ * as under the IOCB_NOWAIT flag, and that -ECANCELED should be returned when
+ * readhead would be triggered.
+ *
  * Return:
  * * number of bytes copied, even for partial reads
  * * negative error code if nothing was read
-- 
2.26.2



___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


[f2fs-dev] [RFC PATCH 0/2] gfs2 readahead regression in v5.8-rc1

2020-06-19 Thread Andreas Gruenbacher
Hello,

can the two patches in this set still be considered for v5.8?

Commit d4388340ae0b ("fs: convert mpage_readpages to mpage_readahead")
which converts gfs2 and other filesystems to use the new ->readahead
address space operation is leading to deadlocks between the inode glocks
and page locks: ->readahead is called with the pages to readahead
already locked.  When gfs2_readahead then tries to lock the associated
inode glock, another process already holding the inode glock may be
trying to lock the same pages.

We could work around this in gfs by using a LM_FLAG_TRY lock in
->readahead for now.  The real reason for this deadlock is that gfs2
shouldn't be taking the inode glock in ->readahead in the first place
though, so I'd prefer to fix this "properly" instead.  Unfortunately,
this depends on a new IOCB_CACHED flag for generic_file_read_iter.

A previous version was posted in November:

https://lore.kernel.org/linux-fsdevel/20191122235324.17245-1-agrue...@redhat.com/

Thanks,
Andreas

Andreas Gruenbacher (2):
  fs: Add IOCB_CACHED flag for generic_file_read_iter
  gfs2: Rework read and page fault locking

 fs/gfs2/aops.c | 27 ++--
 fs/gfs2/file.c | 61 --
 include/linux/fs.h |  1 +
 mm/filemap.c   | 16 ++--
 4 files changed, 76 insertions(+), 29 deletions(-)


base-commit: af42d3466bdc8f39806b26f593604fdc54140bcb
-- 
2.26.2



___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [f2fs-dev] [Cluster-devel] [PATCH v11 16/25] fs: Convert mpage_readpages to mpage_readahead

2020-06-18 Thread Andreas Gruenbacher
On Thu, Jun 18, 2020 at 5:03 PM Matthew Wilcox  wrote:
>
> On Thu, Jun 18, 2020 at 02:46:03PM +0200, Andreas Gruenbacher wrote:
> > On Wed, Jun 17, 2020 at 4:22 AM Matthew Wilcox  wrote:
> > > On Wed, Jun 17, 2020 at 02:57:14AM +0200, Andreas Grünbacher wrote:
> > > > Right, the approach from the following thread might fix this:
> > > >
> > > > https://lore.kernel.org/linux-fsdevel/20191122235324.17245-1-agrue...@redhat.com/T/#t
> > >
> > > In general, I think this is a sound approach.
> > >
> > > Specifically, I think FAULT_FLAG_CACHED can go away.  map_pages()
> > > will bring in the pages which are in the page cache, so when we get to
> > > gfs2_fault(), we know there's a reason to acquire the glock.
> >
> > We'd still be grabbing a glock while holding a dependent page lock.
> > Another process could be holding the glock and could try to grab the
> > same page lock (i.e., a concurrent writer), leading to the same kind
> > of deadlock.
>
> What I'm saying is that gfs2_fault should just be:
>
> +static vm_fault_t gfs2_fault(struct vm_fault *vmf)
> +{
> +   struct inode *inode = file_inode(vmf->vma->vm_file);
> +   struct gfs2_inode *ip = GFS2_I(inode);
> +   struct gfs2_holder gh;
> +   vm_fault_t ret;
> +   int err;
> +
> +   gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
> +   err = gfs2_glock_nq(&gh);
> +   if (err) {
> +   ret = block_page_mkwrite_return(err);
> +   goto out_uninit;
> +   }
> +   ret = filemap_fault(vmf);
> +   gfs2_glock_dq(&gh);
> +out_uninit:
> +   gfs2_holder_uninit(&gh);
> +   return ret;
> +}
>
> because by the time gfs2_fault() is called, map_pages() has already been
> called and has failed to insert the necessary page, so we should just
> acquire the glock now instead of trying again to look for the page in
> the page cache.

Okay, that's great.

Thanks,
Andreas



___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [f2fs-dev] [Cluster-devel] [PATCH v11 16/25] fs: Convert mpage_readpages to mpage_readahead

2020-06-18 Thread Andreas Gruenbacher
On Wed, Jun 17, 2020 at 4:22 AM Matthew Wilcox  wrote:
> On Wed, Jun 17, 2020 at 02:57:14AM +0200, Andreas Grünbacher wrote:
> > Am Mi., 17. Juni 2020 um 02:33 Uhr schrieb Matthew Wilcox 
> > :
> > >
> > > On Wed, Jun 17, 2020 at 12:36:13AM +0200, Andreas Gruenbacher wrote:
> > > > Am Mi., 15. Apr. 2020 um 23:39 Uhr schrieb Matthew Wilcox 
> > > > :
> > > > > From: "Matthew Wilcox (Oracle)" 
> > > > >
> > > > > Implement the new readahead aop and convert all callers (block_dev,
> > > > > exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6,
> > > > > reiserfs & udf).  The callers are all trivial except for GFS2 & OCFS2.
> > > >
> > > > This patch leads to an ABBA deadlock in xfstest generic/095 on gfs2.
> > > >
> > > > Our lock hierarchy is such that the inode cluster lock ("inode glock")
> > > > for an inode needs to be taken before any page locks in that inode's
> > > > address space.
> > >
> > > How does that work for ...
> > >
> > > writepage:  yes, unlocks (see below)
> > > readpage:   yes, unlocks
> > > invalidatepage: yes
> > > releasepage:yes
> > > freepage:   yes
> > > isolate_page:   yes
> > > migratepage:yes (both)
> > > putback_page:   yes
> > > launder_page:   yes
> > > is_partially_uptodate:  yes
> > > error_remove_page:  yes
> > >
> > > Is there a reason that you don't take the glock in the higher level
> > > ops which are called before readhead gets called?  I'm looking at XFS,
> > > and it takes the xfs_ilock SHARED in xfs_file_buffered_aio_read()
> > > (called from xfs_file_read_iter).
> >
> > Right, the approach from the following thread might fix this:
> >
> > https://lore.kernel.org/linux-fsdevel/20191122235324.17245-1-agrue...@redhat.com/T/#t
>
> In general, I think this is a sound approach.
>
> Specifically, I think FAULT_FLAG_CACHED can go away.  map_pages()
> will bring in the pages which are in the page cache, so when we get to
> gfs2_fault(), we know there's a reason to acquire the glock.

We'd still be grabbing a glock while holding a dependent page lock.
Another process could be holding the glock and could try to grab the
same page lock (i.e., a concurrent writer), leading to the same kind
of deadlock.

Andreas



___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [f2fs-dev] [Cluster-devel] [PATCH v11 16/25] fs: Convert mpage_readpages to mpage_readahead

2020-06-16 Thread Andreas Gruenbacher
Am Mi., 15. Apr. 2020 um 23:39 Uhr schrieb Matthew Wilcox :
> From: "Matthew Wilcox (Oracle)" 
>
> Implement the new readahead aop and convert all callers (block_dev,
> exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6,
> reiserfs & udf).  The callers are all trivial except for GFS2 & OCFS2.

This patch leads to an ABBA deadlock in xfstest generic/095 on gfs2.

Our lock hierarchy is such that the inode cluster lock ("inode glock")
for an inode needs to be taken before any page locks in that inode's
address space. However, the readahead address space operation is
called with the pages already locked. When we try to grab the inode
glock inside gfs2_readahead, we'll deadlock with processes that are
holding that inode glock and trying to lock one of those same pages.

One possible solution is to use a trylock on the glock in
gfs2_readahead, and to give up the readahead in case of a locking
conflict. I have no idea how this is going to affect performance.

Any other ideas?

Thanks,
Andreas



___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


[f2fs-dev] [PATCH v4] fs: Fix page_mkwrite off-by-one errors

2020-01-08 Thread Andreas Gruenbacher
Hi Darrick,

here's an updated version with the latest feedback incorporated.  Hope
you find that useful.

As far as the f2fs merge conflict goes, I've been told by Linus not to
resolve those kinds of conflicts but to point them out when sending the
merge request.  So this shouldn't be a big deal.

Changes:

* Turn page_mkwrite_check_truncate into a non-inline function.
* Get rid of now-unused mapping variable in ext4_page_mkwrite.
* In btrfs_page_mkwrite, don't ignore the return value of
  block_page_mkwrite_return (no change in behavior).
* Clean up the f2fs_vm_page_mkwrite changes as suggested by
  Jaegeuk Kim.

Thanks,
Andreas

--

The check in block_page_mkwrite that is meant to determine whether an
offset is within the inode size is off by one.  This bug has been copied
into iomap_page_mkwrite and several filesystems (ubifs, ext4, f2fs,
ceph).

Fix that by introducing a new page_mkwrite_check_truncate helper that
checks for truncate and computes the bytes in the page up to EOF.  Use
the helper in the above mentioned filesystems.

In addition, use the new helper in btrfs as well.

Signed-off-by: Andreas Gruenbacher 
Acked-by: David Sterba  (btrfs)
Acked-by: Richard Weinberger  (ubifs)
Acked-by: Theodore Ts'o  (ext4)
Acked-by: Chao Yu  (f2fs)
---
 fs/btrfs/inode.c| 16 +---
 fs/buffer.c | 16 +++-
 fs/ceph/addr.c  |  2 +-
 fs/ext4/inode.c | 15 ---
 fs/f2fs/file.c  | 19 +++
 fs/iomap/buffered-io.c  | 18 +-
 fs/ubifs/file.c |  3 +--
 include/linux/pagemap.h |  2 ++
 mm/filemap.c| 28 
 9 files changed, 56 insertions(+), 63 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e3c76645cad7..23e6f614e000 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9011,16 +9011,15 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
goto out_noreserve;
}
 
-   ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
lock_page(page);
-   size = i_size_read(inode);
 
-   if ((page->mapping != inode->i_mapping) ||
-   (page_start >= size)) {
-   /* page got truncated out from underneath us */
+   ret2 = page_mkwrite_check_truncate(page, inode);
+   if (ret2 < 0) {
+   ret = block_page_mkwrite_return(ret2);
goto out_unlock;
}
+   zero_start = ret2;
wait_on_page_writeback(page);
 
lock_extent_bits(io_tree, page_start, page_end, &cached_state);
@@ -9041,6 +9040,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
goto again;
}
 
+   size = i_size_read(inode);
if (page->index == ((size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start,
  fs_info->sectorsize);
@@ -9073,12 +9073,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
}
ret2 = 0;
 
-   /* page is wholly or partially inside EOF */
-   if (page_start + PAGE_SIZE > size)
-   zero_start = offset_in_page(size);
-   else
-   zero_start = PAGE_SIZE;
-
if (zero_start != PAGE_SIZE) {
kaddr = kmap(page);
memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
diff --git a/fs/buffer.c b/fs/buffer.c
index d8c7242426bb..53aabde57ca7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2499,23 +2499,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, 
struct vm_fault *vmf,
struct page *page = vmf->page;
struct inode *inode = file_inode(vma->vm_file);
unsigned long end;
-   loff_t size;
int ret;
 
lock_page(page);
-   size = i_size_read(inode);
-   if ((page->mapping != inode->i_mapping) ||
-   (page_offset(page) > size)) {
-   /* We overload EFAULT to mean page got truncated */
-   ret = -EFAULT;
+   ret = page_mkwrite_check_truncate(page, inode);
+   if (ret < 0)
goto out_unlock;
-   }
-
-   /* page is wholly or partially inside EOF */
-   if (((page->index + 1) << PAGE_SHIFT) > size)
-   end = size & ~PAGE_MASK;
-   else
-   end = PAGE_SIZE;
+   end = ret;
 
ret = __block_write_begin(page, 0, end, get_block);
if (!ret)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7ab616601141..ef958aa4adb4 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1575,7 +1575,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
do {
lock_page(page);
 
-   if ((off > size) || (page->mapping != inode->i_mapping)) {
+   if (page_mkwrite_check_truncate(page, inode) < 0) {
unlock_page(page);
ret = VM_FAULT_NO

Re: [f2fs-dev] [PATCH v3] fs: Fix page_mkwrite off-by-one errors

2019-12-18 Thread Andreas Gruenbacher
On Wed, Dec 18, 2019 at 7:55 PM Darrick J. Wong  wrote:
> On Wed, Dec 18, 2019 at 02:09:35PM +0100, Andreas Gruenbacher wrote:
> > Hi Darrick,
> >
> > can this fix go in via the xfs tree?
>
> Er, I'd rather not touch five other filesystems via the XFS tree.
> However, a more immediate problem that I think I see is...
>
> > Thanks,
> > Andreas
> >
> > --
> >
> > The check in block_page_mkwrite that is meant to determine whether an
> > offset is within the inode size is off by one.  This bug has been copied
> > into iomap_page_mkwrite and several filesystems (ubifs, ext4, f2fs,
> > ceph).
> >
> > Fix that by introducing a new page_mkwrite_check_truncate helper that
> > checks for truncate and computes the bytes in the page up to EOF.  Use
> > the helper in the above mentioned filesystems.
> >
> > In addition, use the new helper in btrfs as well.
> >
> > Signed-off-by: Andreas Gruenbacher 
> > Acked-by: David Sterba  (btrfs part)
> > Acked-by: Richard Weinberger  (ubifs part)
> > ---
> >  fs/btrfs/inode.c| 15 ---
> >  fs/buffer.c | 16 +++-
> >  fs/ceph/addr.c  |  2 +-
> >  fs/ext4/inode.c | 14 --
> >  fs/f2fs/file.c  | 19 +++
> >  fs/iomap/buffered-io.c  | 18 +-
> >  fs/ubifs/file.c |  3 +--
> >  include/linux/pagemap.h | 28 
> >  8 files changed, 53 insertions(+), 62 deletions(-)
> >
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index 56032c518b26..86c6fcd8139d 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -9016,13 +9016,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> >   ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
> >  again:
> >   lock_page(page);
> > - size = i_size_read(inode);
> >
> > - if ((page->mapping != inode->i_mapping) ||
> > - (page_start >= size)) {
> > - /* page got truncated out from underneath us */
> > + ret2 = page_mkwrite_check_truncate(page, inode);
> > + if (ret2 < 0)
> >   goto out_unlock;
>
> ...here we try to return -EFAULT as vm_fault_t.  Notice how btrfs returns
> VM_FAULT_* values directly and never calls block_page_mkwrite_return?  I
> know dsterba acked this, but I cannot see how this is correct?

Well, page_mkwrite_check_truncate can only fail with -EFAULT, in which
case btrfs_page_mkwrite will return VM_FAULT_NOPAGE. It would be
cleaner not to discard page_mkwrite_check_truncate's return value
though.

> > - }
> > + zero_start = ret2;
> >   wait_on_page_writeback(page);
> >
> >   lock_extent_bits(io_tree, page_start, page_end, &cached_state);
> > @@ -9043,6 +9041,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> >   goto again;
> >   }
> >
> > + size = i_size_read(inode);
> >   if (page->index == ((size - 1) >> PAGE_SHIFT)) {
> >   reserved_space = round_up(size - page_start,
> > fs_info->sectorsize);
> > @@ -9075,12 +9074,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
> >   }
> >   ret2 = 0;
> >
> > - /* page is wholly or partially inside EOF */
> > - if (page_start + PAGE_SIZE > size)
> > - zero_start = offset_in_page(size);
> > - else
> > - zero_start = PAGE_SIZE;
> > -
> >   if (zero_start != PAGE_SIZE) {
> >   kaddr = kmap(page);
> >   memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
> > diff --git a/fs/buffer.c b/fs/buffer.c
> > index d8c7242426bb..53aabde57ca7 100644
> > --- a/fs/buffer.c
> > +++ b/fs/buffer.c
> > @@ -2499,23 +2499,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, 
> > struct vm_fault *vmf,
> >   struct page *page = vmf->page;
> >   struct inode *inode = file_inode(vma->vm_file);
> >   unsigned long end;
> > - loff_t size;
> >   int ret;
> >
> >   lock_page(page);
> > - size = i_size_read(inode);
> > - if ((page->mapping != inode->i_mapping) ||
> > - (page_offset(page) > size)) {
> > - /* We overload EFAULT to mean page got truncated */
> > - ret = -EFAULT;
> > + ret = page_mkwrite_check_truncate(page, inode);
> > + if (ret < 0)
> >   goto out_unlock;
> > -   

[f2fs-dev] [PATCH v3] fs: Fix page_mkwrite off-by-one errors

2019-12-18 Thread Andreas Gruenbacher
Hi Darrick,

can this fix go in via the xfs tree?

Thanks,
Andreas

--

The check in block_page_mkwrite that is meant to determine whether an
offset is within the inode size is off by one.  This bug has been copied
into iomap_page_mkwrite and several filesystems (ubifs, ext4, f2fs,
ceph).

Fix that by introducing a new page_mkwrite_check_truncate helper that
checks for truncate and computes the bytes in the page up to EOF.  Use
the helper in the above mentioned filesystems.

In addition, use the new helper in btrfs as well.

Signed-off-by: Andreas Gruenbacher 
Acked-by: David Sterba  (btrfs part)
Acked-by: Richard Weinberger  (ubifs part)
---
 fs/btrfs/inode.c| 15 ---
 fs/buffer.c | 16 +++-
 fs/ceph/addr.c  |  2 +-
 fs/ext4/inode.c | 14 --
 fs/f2fs/file.c  | 19 +++
 fs/iomap/buffered-io.c  | 18 +-
 fs/ubifs/file.c |  3 +--
 include/linux/pagemap.h | 28 
 8 files changed, 53 insertions(+), 62 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 56032c518b26..86c6fcd8139d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9016,13 +9016,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
lock_page(page);
-   size = i_size_read(inode);
 
-   if ((page->mapping != inode->i_mapping) ||
-   (page_start >= size)) {
-   /* page got truncated out from underneath us */
+   ret2 = page_mkwrite_check_truncate(page, inode);
+   if (ret2 < 0)
goto out_unlock;
-   }
+   zero_start = ret2;
wait_on_page_writeback(page);
 
lock_extent_bits(io_tree, page_start, page_end, &cached_state);
@@ -9043,6 +9041,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
goto again;
}
 
+   size = i_size_read(inode);
if (page->index == ((size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start,
  fs_info->sectorsize);
@@ -9075,12 +9074,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
}
ret2 = 0;
 
-   /* page is wholly or partially inside EOF */
-   if (page_start + PAGE_SIZE > size)
-   zero_start = offset_in_page(size);
-   else
-   zero_start = PAGE_SIZE;
-
if (zero_start != PAGE_SIZE) {
kaddr = kmap(page);
memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
diff --git a/fs/buffer.c b/fs/buffer.c
index d8c7242426bb..53aabde57ca7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2499,23 +2499,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, 
struct vm_fault *vmf,
struct page *page = vmf->page;
struct inode *inode = file_inode(vma->vm_file);
unsigned long end;
-   loff_t size;
int ret;
 
lock_page(page);
-   size = i_size_read(inode);
-   if ((page->mapping != inode->i_mapping) ||
-   (page_offset(page) > size)) {
-   /* We overload EFAULT to mean page got truncated */
-   ret = -EFAULT;
+   ret = page_mkwrite_check_truncate(page, inode);
+   if (ret < 0)
goto out_unlock;
-   }
-
-   /* page is wholly or partially inside EOF */
-   if (((page->index + 1) << PAGE_SHIFT) > size)
-   end = size & ~PAGE_MASK;
-   else
-   end = PAGE_SIZE;
+   end = ret;
 
ret = __block_write_begin(page, 0, end, get_block);
if (!ret)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7ab616601141..ef958aa4adb4 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1575,7 +1575,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
do {
lock_page(page);
 
-   if ((off > size) || (page->mapping != inode->i_mapping)) {
+   if (page_mkwrite_check_truncate(page, inode) < 0) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
break;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 28f28de0c1b6..51ab1d2cac80 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5871,7 +5871,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
-   loff_t size;
unsigned long len;
int err;
vm_fault_t ret;
@@ -5907,18 +5906,13 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
}
 
lock_page(page);
-   size = i_size_read(inode);
-   /* Page got truncated from under us? */
-   if (page->mapping != mapping || page_offset(page) > size) {
+   err = page_mkwrite_check_truncate(page, inode);
+   if (err < 0) {
  

[f2fs-dev] [PATCH v2] fs: Fix page_mkwrite off-by-one errors

2019-11-29 Thread Andreas Gruenbacher
The check in block_page_mkwrite meant to determine whether an offset is
within the inode size is off by one.  This bug has spread to
iomap_page_mkwrite and to several filesystems (ubifs, ext4, f2fs, ceph).
To fix that, introduce a new page_mkwrite_check_truncate helper that
checks for truncate and computes the bytes in the page up to EOF, and
use that helper in the above mentioned filesystems and in btrfs.

Signed-off-by: Andreas Gruenbacher 

---

This patch has a trivial conflict with commit "iomap: Fix overflow in
iomap_page_mkwrite" in Darrick's iomap pull request for 5.5:

  https://lore.kernel.org/lkml/20191125190907.GN6219@magnolia/
---
 fs/btrfs/inode.c| 15 ---
 fs/buffer.c | 16 +++-
 fs/ceph/addr.c  |  2 +-
 fs/ext4/inode.c | 14 --
 fs/f2fs/file.c  | 19 +++
 fs/iomap/buffered-io.c  | 17 -
 fs/ubifs/file.c |  3 +--
 include/linux/pagemap.h | 24 
 8 files changed, 48 insertions(+), 62 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 015910079e73..019948101bc2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8990,13 +8990,11 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
lock_page(page);
-   size = i_size_read(inode);
 
-   if ((page->mapping != inode->i_mapping) ||
-   (page_start >= size)) {
-   /* page got truncated out from underneath us */
+   ret2 = page_mkwrite_check_truncate(page, inode);
+   if (ret2 < 0)
goto out_unlock;
-   }
+   zero_start = ret2;
wait_on_page_writeback(page);
 
lock_extent_bits(io_tree, page_start, page_end, &cached_state);
@@ -9017,6 +9015,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
goto again;
}
 
+   size = i_size_read(inode);
if (page->index == ((size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start,
  fs_info->sectorsize);
@@ -9049,12 +9048,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
}
ret2 = 0;
 
-   /* page is wholly or partially inside EOF */
-   if (page_start + PAGE_SIZE > size)
-   zero_start = offset_in_page(size);
-   else
-   zero_start = PAGE_SIZE;
-
if (zero_start != PAGE_SIZE) {
kaddr = kmap(page);
memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
diff --git a/fs/buffer.c b/fs/buffer.c
index 86a38b979323..b162ec65910e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2459,23 +2459,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, 
struct vm_fault *vmf,
struct page *page = vmf->page;
struct inode *inode = file_inode(vma->vm_file);
unsigned long end;
-   loff_t size;
int ret;
 
lock_page(page);
-   size = i_size_read(inode);
-   if ((page->mapping != inode->i_mapping) ||
-   (page_offset(page) > size)) {
-   /* We overload EFAULT to mean page got truncated */
-   ret = -EFAULT;
+   ret = page_mkwrite_check_truncate(page, inode);
+   if (ret < 0)
goto out_unlock;
-   }
-
-   /* page is wholly or partially inside EOF */
-   if (((page->index + 1) << PAGE_SHIFT) > size)
-   end = size & ~PAGE_MASK;
-   else
-   end = PAGE_SIZE;
+   end = ret;
 
ret = __block_write_begin(page, 0, end, get_block);
if (!ret)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7ab616601141..ef958aa4adb4 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1575,7 +1575,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
do {
lock_page(page);
 
-   if ((off > size) || (page->mapping != inode->i_mapping)) {
+   if (page_mkwrite_check_truncate(page, inode) < 0) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
break;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 516faa280ced..23bf095e0b29 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -6186,7 +6186,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
-   loff_t size;
unsigned long len;
int err;
vm_fault_t ret;
@@ -6222,18 +6221,13 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
}
 
lock_page(page);
-   size = i_size_read(inode);
-   /* Page got truncated from under us? */
-   if (page->mapping != mapping || page_offset(page) > size) {
+   err = page_mkwrite_check_truncate(page, inode);
+   if (err &

[f2fs-dev] [PATCH] fs: Fix page_mkwrite off-by-one errors

2019-11-27 Thread Andreas Gruenbacher
Fix a check in block_page_mkwrite meant to determine whether an offset
is within the inode size.  This error has spread to several filesystems
and to iomap_page_mkwrite, so fix those instances as well.

Signed-off-by: Andreas Gruenbacher 

---

This patch has a trivial conflict with commit "iomap: Fix overflow in
iomap_page_mkwrite" in Darrick's iomap pull request for 5.5:

  https://lore.kernel.org/lkml/20191125190907.GN6219@magnolia/
---
 fs/buffer.c| 2 +-
 fs/ceph/addr.c | 2 +-
 fs/ext4/inode.c| 2 +-
 fs/f2fs/file.c | 2 +-
 fs/iomap/buffered-io.c | 2 +-
 fs/ubifs/file.c| 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 86a38b979323..152d391858d4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2465,7 +2465,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct 
vm_fault *vmf,
lock_page(page);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
-   (page_offset(page) > size)) {
+   (page_offset(page) >= size)) {
/* We overload EFAULT to mean page got truncated */
ret = -EFAULT;
goto out_unlock;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 7ab616601141..9fa0729ece41 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1575,7 +1575,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
do {
lock_page(page);
 
-   if ((off > size) || (page->mapping != inode->i_mapping)) {
+   if ((off >= size) || (page->mapping != inode->i_mapping)) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
break;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 516faa280ced..6dd4efe2fb63 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -6224,7 +6224,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
lock_page(page);
size = i_size_read(inode);
/* Page got truncated from under us? */
-   if (page->mapping != mapping || page_offset(page) > size) {
+   if (page->mapping != mapping || page_offset(page) >= size) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
goto out;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 29bc0a542759..3436be01af45 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -71,7 +71,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
down_read(&F2FS_I(inode)->i_mmap_sem);
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping ||
-   page_offset(page) > i_size_read(inode) ||
+   page_offset(page) >= i_size_read(inode) ||
!PageUptodate(page))) {
unlock_page(page);
err = -EFAULT;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e25901ae3ff4..d454dbab5133 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1041,7 +1041,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const 
struct iomap_ops *ops)
lock_page(page);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
-   (page_offset(page) > size)) {
+   (page_offset(page) >= size)) {
/* We overload EFAULT to mean page got truncated */
ret = -EFAULT;
goto out_unlock;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index cd52585c8f4f..ca0148ec77e6 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1564,7 +1564,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault 
*vmf)
 
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping ||
-page_offset(page) > i_size_read(inode))) {
+page_offset(page) >= i_size_read(inode))) {
/* Page got truncated out from underneath us */
goto sigbus;
}
-- 
2.20.1



___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


[f2fs-dev] [PATCH v3 7/7] f2fs: xattr simplifications

2015-10-04 Thread Andreas Gruenbacher
Now that the xattr handler is passed to the xattr handler operations, we
have access to the attribute name prefix, so simplify
f2fs_xattr_generic_list.

Also, f2fs_xattr_advise_list is only ever called for
f2fs_xattr_advise_handler; there is no need to double check for that.

Signed-off-by: Andreas Gruenbacher 
Cc: Jaegeuk Kim 
Cc: Changman Lee 
Cc: Chao Yu 
Cc: linux-f2fs-devel@lists.sourceforge.net
---
 fs/f2fs/xattr.c | 15 +++
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index e643173..862368a 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -30,33 +30,27 @@ static size_t f2fs_xattr_generic_list(const struct 
xattr_handler *handler,
const char *name, size_t len)
 {
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
-   int total_len, prefix_len = 0;
-   const char *prefix = NULL;
+   int total_len, prefix_len;
 
switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
if (!test_opt(sbi, XATTR_USER))
return -EOPNOTSUPP;
-   prefix = XATTR_USER_PREFIX;
-   prefix_len = XATTR_USER_PREFIX_LEN;
break;
case F2FS_XATTR_INDEX_TRUSTED:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
-   prefix = XATTR_TRUSTED_PREFIX;
-   prefix_len = XATTR_TRUSTED_PREFIX_LEN;
break;
case F2FS_XATTR_INDEX_SECURITY:
-   prefix = XATTR_SECURITY_PREFIX;
-   prefix_len = XATTR_SECURITY_PREFIX_LEN;
break;
default:
return -EINVAL;
}
 
+   prefix_len = strlen(handler->prefix);
total_len = prefix_len + len + 1;
if (list && total_len <= list_size) {
-   memcpy(list, prefix, prefix_len);
+   memcpy(list, handler->prefix, prefix_len);
memcpy(list + prefix_len, name, len);
list[prefix_len + len] = '\0';
}
@@ -123,9 +117,6 @@ static size_t f2fs_xattr_advise_list(const struct 
xattr_handler *handler,
const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
size_t size;
 
-   if (handler->flags != F2FS_XATTR_INDEX_ADVISE)
-   return 0;
-
size = strlen(xname) + 1;
if (list && size <= list_size)
memcpy(list, xname, size);
-- 
2.5.0


--
___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


[f2fs-dev] [PATCH v2 7/7] f2fs: xattr simplifications

2015-09-22 Thread Andreas Gruenbacher
Now that the xattr handler is passed to the xattr handler operations, we
have access to the attribute name prefix, so simplify
f2fs_xattr_generic_list.

Also, f2fs_xattr_advise_list is only ever called for
f2fs_xattr_advise_handler; there is no need to double check for that.

Signed-off-by: Andreas Gruenbacher 
---
 fs/f2fs/xattr.c | 15 +++
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index e643173..862368a 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -30,33 +30,27 @@ static size_t f2fs_xattr_generic_list(const struct 
xattr_handler *handler,
const char *name, size_t len)
 {
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
-   int total_len, prefix_len = 0;
-   const char *prefix = NULL;
+   int total_len, prefix_len;
 
switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
if (!test_opt(sbi, XATTR_USER))
return -EOPNOTSUPP;
-   prefix = XATTR_USER_PREFIX;
-   prefix_len = XATTR_USER_PREFIX_LEN;
break;
case F2FS_XATTR_INDEX_TRUSTED:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
-   prefix = XATTR_TRUSTED_PREFIX;
-   prefix_len = XATTR_TRUSTED_PREFIX_LEN;
break;
case F2FS_XATTR_INDEX_SECURITY:
-   prefix = XATTR_SECURITY_PREFIX;
-   prefix_len = XATTR_SECURITY_PREFIX_LEN;
break;
default:
return -EINVAL;
}
 
+   prefix_len = strlen(handler->prefix);
total_len = prefix_len + len + 1;
if (list && total_len <= list_size) {
-   memcpy(list, prefix, prefix_len);
+   memcpy(list, handler->prefix, prefix_len);
memcpy(list + prefix_len, name, len);
list[prefix_len + len] = '\0';
}
@@ -123,9 +117,6 @@ static size_t f2fs_xattr_advise_list(const struct 
xattr_handler *handler,
const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
size_t size;
 
-   if (handler->flags != F2FS_XATTR_INDEX_ADVISE)
-   return 0;
-
size = strlen(xname) + 1;
if (list && size <= list_size)
memcpy(list, xname, size);
-- 
2.4.3


--
___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [f2fs-dev] [PATCH 04/18] fs: add generic xattr_acl handlers

2013-12-17 Thread Andreas Gruenbacher
Christoph,

> +static int
> +posix_acl_xattr_set(struct dentry *dentry, const char *name,
> + const void *value, size_t size, int flags, int type)
> +{
> + struct inode *inode = dentry->d_inode;
> + struct posix_acl *acl = NULL;
> + int ret;
> +
> + if (!IS_POSIXACL(inode))
> + return -EOPNOTSUPP;
> + if (S_ISLNK(inode->i_mode) || !inode->i_op->set_acl)
> + return -EOPNOTSUPP;

Sama here, I would remove the S_ISLNK() check.

Andreas

--
Rapidly troubleshoot problems before they affect your business. Most IT 
organizations don't have a clear picture of how application performance 
affects their revenue. With AppDynamics, you get 100% visibility into your 
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [f2fs-dev] [PATCH 02/18] fs: add get_acl helper

2013-12-17 Thread Andreas Gruenbacher
Christoph,

> +struct posix_acl *get_acl(struct inode *inode, int type)
> +{
> + struct posix_acl *acl;
> +
> + acl = get_cached_acl(inode, type);
> + if (acl != ACL_NOT_CACHED)
> + return acl;
> +
> + if (!IS_POSIXACL(inode))
> + return NULL;
> +
> + /*
> +  * A filesystem can force a ACL callback by just never filling the
> +  * ACL cache. But normally you'd fill the cache either at inode
> +  * instantiation time, or on the first ->get_acl call.
> +  *
> +  * If the filesystem doesn't have a get_acl() function at all, we'll
> +  * just create the negative cache entry.
> +  */
> + if (!inode->i_op->get_acl) {
> + set_cached_acl(inode, type, NULL);
> + return ERR_PTR(-EAGAIN);

The function should return NULL here.

Andreas

--
Rapidly troubleshoot problems before they affect your business. Most IT 
organizations don't have a clear picture of how application performance 
affects their revenue. With AppDynamics, you get 100% visibility into your 
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [f2fs-dev] [PATCH 05/18] fs: make posix_acl_chmod more useful

2013-12-17 Thread Andreas Gruenbacher
Christoph,

> +int
> +posix_acl_chmod(struct inode *inode)
> +{
> + struct posix_acl *acl;
> + int ret = 0;
> +
> + if (S_ISLNK(inode->i_mode) || !inode->i_op->set_acl)
> + return -EOPNOTSUPP;

Symlinks never have get_acl callbacks, so I would remove the S_ISLNK() check 
here.

Andreas

--
Rapidly troubleshoot problems before they affect your business. Most IT 
organizations don't have a clear picture of how application performance 
affects their revenue. With AppDynamics, you get 100% visibility into your 
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [f2fs-dev] [PATCH 16/18] gfs2: use generic posix ACL infrastructure

2013-12-17 Thread Andreas Gruenbacher
Christoph,

gfs2 has a left-over get_acl callback in gfs2_symlink_iops in
fs/gfs2/inode.c, from a long time ago, which should be removed
as well.

Andreas

--
Rapidly troubleshoot problems before they affect your business. Most IT 
organizations don't have a clear picture of how application performance 
affects their revenue. With AppDynamics, you get 100% visibility into your 
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


Re: [f2fs-dev] [PATCH 00/18] Consolidate Posix ACL implementation

2013-12-10 Thread Andreas Gruenbacher
Christoph,

nice work, and a pretty diffstat.

I see that get_acl and set_acl are being defined in some but not all symlink 
inode operations (for example, btrfs them while ext4 does not), and that 
posix_acl_xattr_set() doesn't check if set_acl is defined.  Symlinks cannot 
have ACLs, so set_acl should either never be defined for symlinks (and a NULL 
check is then needed in posix_acl_xattr_set()), or it is defined in all inode 
operations, and S_ISNLNK() check is needed in posix_acl_xattr_set().  That 
latter check should probably be added in any case to be on the safe side.

  Test case:

  setfattr -h -n system.posix_acl_access \
   -v 0sAgEABgD/AgAGABMEAAAEAAYA/xAABgD/IAAEAP8= \
   symlink

Patch 6 also declares posix_acl_prepare() but this function is never 
introduced; this must be a leftover from a previous version.

Thanks,
Andreas

--
Rapidly troubleshoot problems before they affect your business. Most IT 
organizations don't have a clear picture of how application performance 
affects their revenue. With AppDynamics, you get 100% visibility into your 
Java,.NET, & PHP application. Start your 15-day FREE TRIAL of AppDynamics Pro!
http://pubads.g.doubleclick.net/gampad/clk?id=84349831&iu=/4140/ostg.clktrk
___
Linux-f2fs-devel mailing list
Linux-f2fs-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel