date:20181015

From: Darrick J. Wong 

Back when the XFS reflink code only supported clone_file_range, we were
only able to return zero or negative error codes to userspace.  However,
now that copy_file_range (which returns bytes copied) can use XFS'
clone_file_range, we have the opportunity to return partial results.
For example, if userspace sends a 1GB clone request and we run out of
space halfway through, we at least can tell userspace that we completed
512M of that request like a regular write.

Signed-off-by: Darrick J. Wong 
---
 fs/xfs/xfs_file.c|5 +
 fs/xfs/xfs_reflink.c |   20 +++-
 fs/xfs/xfs_reflink.h |2 +-
 3 files changed, 17 insertions(+), 10 deletions(-)


diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 38fde4e11714..7d42ab8fe6e1 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -928,14 +928,11 @@ xfs_file_remap_range(
loff_t  len,
unsigned intremap_flags)
 {
-   int ret;
-
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
 
-   ret = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+   return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
len, remap_flags);
-   return ret < 0 ? ret : len;
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e8e86646bb4b..af3368862c56 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1123,6 +1123,7 @@ xfs_reflink_remap_blocks(
struct xfs_inode*dest,
xfs_fileoff_t   destoff,
xfs_filblks_t   len,
+   xfs_filblks_t   *remapped_len,
xfs_off_t   new_isize)
 {
struct xfs_bmbt_irecimap;
@@ -1130,6 +1131,7 @@ xfs_reflink_remap_blocks(
int error = 0;
xfs_filblks_t   range_len;
 
+   *remapped_len = 0;
/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
while (len) {
uintlock_mode;
@@ -1168,6 +1170,7 @@ xfs_reflink_remap_blocks(
srcoff += range_len;
destoff += range_len;
len -= range_len;
+   *remapped_len += range_len;
}
 
return 0;
@@ -1382,7 +1385,7 @@ xfs_reflink_remap_prep(
 /*
  * Link a range of blocks from one file to another.
  */
-int
+loff_t
 xfs_reflink_remap_range(
struct file *file_in,
loff_t  pos_in,
@@ -1397,9 +1400,10 @@ xfs_reflink_remap_range(
struct xfs_inode*dest = XFS_I(inode_out);
struct xfs_mount*mp = src->i_mount;
xfs_fileoff_t   sfsbno, dfsbno;
-   xfs_filblks_t   fsblen;
+   xfs_filblks_t   fsblen, remappedfsb = 0;
+   loff_t  remapped_bytes = 0;
xfs_extlen_tcowextsize;
-   ssize_t ret;
+   int ret;
 
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return -EOPNOTSUPP;
@@ -1415,11 +1419,17 @@ xfs_reflink_remap_range(
 
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
 
+   if (len == 0) {
+   ret = 0;
+   goto out_unlock;
+   }
+
dfsbno = XFS_B_TO_FSBT(mp, pos_out);
sfsbno = XFS_B_TO_FSBT(mp, pos_in);
fsblen = XFS_B_TO_FSB(mp, len);
ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
-   pos_out + len);
+   &remappedfsb, pos_out + len);
+   remapped_bytes = min_t(loff_t, len, XFS_FSB_TO_B(mp, remappedfsb));
if (ret)
goto out_unlock;
 
@@ -1442,7 +1452,7 @@ xfs_reflink_remap_range(
xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
-   return ret;
+   return remapped_bytes > 0 ? remapped_bytes : ret;
 }
 
 /*
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index c3c46c276fe1..cbc26ff79a8f 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -27,7 +27,7 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, 
xfs_off_t offset,
 extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
 extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
-extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t len,
unsigned int remap_flags);
 extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,

[PATCH 24/26] xfs: fix pagecache truncation prior to reflink

From: Darrick J. Wong 

Prior to remapping blocks, it is necessary to remove pages from the
destination file's page cache.  Unfortunately, the truncation is not
aggressive enough -- if page size > block size, we'll end up zeroing
subpage blocks instead of removing them.  So, round the start offset
down and the end offset up to page boundaries.  We already wrote all
the dirty data so the larger range shouldn't be a problem.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Dave Chinner 
---
 fs/xfs/xfs_reflink.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 9b1ea42c81d1..e8e86646bb4b 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1369,8 +1369,9 @@ xfs_reflink_remap_prep(
goto out_unlock;
 
/* Zap any page cache for the destination file's range. */
-   truncate_inode_pages_range(&inode_out->i_data, pos_out,
-  PAGE_ALIGN(pos_out + *len) - 1);
+   truncate_inode_pages_range(&inode_out->i_data,
+   round_down(pos_out, PAGE_SIZE),
+   round_up(pos_out + *len, PAGE_SIZE) - 1);
 
return 1;
 out_unlock:

[PATCH 26/26] xfs: remove redundant remap partial EOF block checks

From: Darrick J. Wong 

Now that we've moved the partial EOF block checks to the VFS helpers, we
can remove the redundant functionality from XFS.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Dave Chinner 
---
 fs/xfs/xfs_reflink.c |   19 ---
 1 file changed, 19 deletions(-)


diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index af3368862c56..755d4a9446e3 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1307,7 +1307,6 @@ xfs_reflink_remap_prep(
struct inode*inode_out = file_inode(file_out);
struct xfs_inode*dest = XFS_I(inode_out);
boolsame_inode = (inode_in == inode_out);
-   u64 blkmask = i_blocksize(inode_in) - 1;
ssize_t ret;
 
/* Lock both files against IO */
@@ -1335,24 +1334,6 @@ xfs_reflink_remap_prep(
if (ret < 0 || *len == 0)
goto out_unlock;
 
-   /*
-* If the dedupe data matches, chop off the partial EOF block
-* from the source file so we don't try to dedupe the partial
-* EOF block.
-*/
-   if (remap_flags & REMAP_FILE_DEDUP) {
-   *len &= ~blkmask;
-   } else if (*len & blkmask) {
-   /*
-* The user is attempting to share a partial EOF block,
-* if it's inside the destination EOF then reject it.
-*/
-   if (pos_out + *len < i_size_read(inode_out)) {
-   ret = -EINVAL;
-   goto out_unlock;
-   }
-   }
-
/* Attach dquots to dest inode before changing block map */
ret = xfs_qm_dqattach(dest);
if (ret)

[PATCH 23/26] ocfs2: remove ocfs2_reflink_remap_range

From: Darrick J. Wong 

Since ocfs2_remap_file_range is a thin shell around
ocfs2_remap_remap_range, move everything from the latter into the
former.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/file.c |   68 +++-
 fs/ocfs2/refcounttree.c |  113 +++
 fs/ocfs2/refcounttree.h |   24 +++---
 3 files changed, 102 insertions(+), 103 deletions(-)


diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8125c5ccf821..fe570824b991 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2531,11 +2531,75 @@ static loff_t ocfs2_remap_file_range(struct file 
*file_in, loff_t pos_in,
 struct file *file_out, loff_t pos_out,
 loff_t len, unsigned int remap_flags)
 {
+   struct inode *inode_in = file_inode(file_in);
+   struct inode *inode_out = file_inode(file_out);
+   struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+   struct buffer_head *in_bh = NULL, *out_bh = NULL;
+   bool same_inode = (inode_in == inode_out);
+   loff_t remapped = 0;
+   ssize_t ret;
+
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
+   if (!ocfs2_refcount_tree(osb))
+   return -EOPNOTSUPP;
+   if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+   return -EROFS;
 
-   return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-   len, remap_flags);
+   /* Lock both files against IO */
+   ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+   if (ret)
+   return ret;
+
+   /* Check file eligibility and prepare for block sharing. */
+   ret = -EINVAL;
+   if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+   (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+   goto out_unlock;
+
+   ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+   &len, remap_flags);
+   if (ret < 0 || len == 0)
+   goto out_unlock;
+
+   /* Lock out changes to the allocation maps and remap. */
+   down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+   if (!same_inode)
+   down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+ SINGLE_DEPTH_NESTING);
+
+   /* Zap any page cache for the destination file's range. */
+   truncate_inode_pages_range(&inode_out->i_data,
+  round_down(pos_out, PAGE_SIZE),
+  round_up(pos_out + len, PAGE_SIZE) - 1);
+
+   remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
+   inode_out, out_bh, pos_out, len);
+   up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+   if (!same_inode)
+   up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+   if (remapped < 0) {
+   ret = remapped;
+   mlog_errno(ret);
+   goto out_unlock;
+   }
+
+   /*
+* Empty the extent map so that we may get the right extent
+* record from the disk.
+*/
+   ocfs2_extent_map_trunc(inode_in, 0);
+   ocfs2_extent_map_trunc(inode_out, 0);
+
+   ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+   if (ret) {
+   mlog_errno(ret);
+   goto out_unlock;
+   }
+
+out_unlock:
+   ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+   return remapped > 0 ? remapped : ret;
 }
 
 const struct inode_operations ocfs2_file_iops = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index c7409578657b..dc66b80585ec 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4468,9 +4468,9 @@ int ocfs2_reflink_ioctl(struct inode *inode,
 }
 
 /* Update destination inode size, if necessary. */
-static int ocfs2_reflink_update_dest(struct inode *dest,
-struct buffer_head *d_bh,
-loff_t newlen)
+int ocfs2_reflink_update_dest(struct inode *dest,
+ struct buffer_head *d_bh,
+ loff_t newlen)
 {
handle_t *handle;
int ret;
@@ -4621,13 +4621,13 @@ static loff_t ocfs2_reflink_remap_extent(struct inode 
*s_inode,
 }
 
 /* Set up refcount tree and remap s_inode to t_inode. */
-static loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
-struct buffer_head *s_bh,
-loff_t pos_in,
-struct inode *t_inode,
-struct buffer_head *t_bh,
-loff_t pos_out,
-loff_t len)
+loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+

[PATCH 22/26] ocfs2: support partial clone range and dedupe range

From: Darrick J. Wong 

Change the ocfs2 remap code to allow for returning partial results.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/file.c |7 +
 fs/ocfs2/refcounttree.c |   72 +--
 fs/ocfs2/refcounttree.h |   12 
 3 files changed, 46 insertions(+), 45 deletions(-)


diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index fbaeafe44b5f..8125c5ccf821 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2531,14 +2531,11 @@ static loff_t ocfs2_remap_file_range(struct file 
*file_in, loff_t pos_in,
 struct file *file_out, loff_t pos_out,
 loff_t len, unsigned int remap_flags)
 {
-   int ret;
-
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
 
-   ret = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-   len, remap_flags);
-   return ret < 0 ? ret : len;
+   return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+   len, remap_flags);
 }
 
 const struct inode_operations ocfs2_file_iops = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7c709229e108..c7409578657b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4507,14 +4507,14 @@ static int ocfs2_reflink_update_dest(struct inode *dest,
 }
 
 /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
-static int ocfs2_reflink_remap_extent(struct inode *s_inode,
- struct buffer_head *s_bh,
- loff_t pos_in,
- struct inode *t_inode,
- struct buffer_head *t_bh,
- loff_t pos_out,
- loff_t len,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
+struct buffer_head *s_bh,
+loff_t pos_in,
+struct inode *t_inode,
+struct buffer_head *t_bh,
+loff_t pos_out,
+loff_t len,
+struct ocfs2_cached_dealloc_ctxt 
*dealloc)
 {
struct ocfs2_extent_tree s_et;
struct ocfs2_extent_tree t_et;
@@ -4522,8 +4522,9 @@ static int ocfs2_reflink_remap_extent(struct inode 
*s_inode,
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *ref_tree;
struct ocfs2_super *osb;
+   loff_t remapped_bytes = 0;
loff_t pstart, plen;
-   u32 p_cluster, num_clusters, slast, spos, tpos;
+   u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0;
unsigned int ext_flags;
int ret = 0;
 
@@ -4605,30 +4606,34 @@ static int ocfs2_reflink_remap_extent(struct inode 
*s_inode,
 next_loop:
spos += num_clusters;
tpos += num_clusters;
+   remapped_clus += num_clusters;
}
 
-out:
-   return ret;
+   goto out;
 out_unlock_refcount:
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh);
-   return ret;
+out:
+   remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus);
+   remapped_bytes = min_t(loff_t, len, remapped_bytes);
+
+   return remapped_bytes > 0 ? remapped_bytes : ret;
 }
 
 /* Set up refcount tree and remap s_inode to t_inode. */
-static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
- struct buffer_head *s_bh,
- loff_t pos_in,
- struct inode *t_inode,
- struct buffer_head *t_bh,
- loff_t pos_out,
- loff_t len)
+static loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+struct buffer_head *s_bh,
+loff_t pos_in,
+struct inode *t_inode,
+struct buffer_head *t_bh,
+loff_t pos_out,
+loff_t len)
 {
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_super *osb;
struct ocfs2_dinode *dis;
struct ocfs2_dinode *dit;
-   int ret;
+   loff_t ret;
 
osb = OCFS2_SB(s_inode->i_sb);
dis = (struct ocfs2_dinode *)s_bh->b_data;
@@ -4700,7 +4705,7 @@ static int ocfs2_reflink_remap_blocks(struct inode 
*s_inode,
/* Actually remap extents now. */
ret = ocfs2_

[PATCH 19/26] vfs: clean up generic_remap_file_range_prep return value

From: Darrick J. Wong 

Since the remap prep function can update the length of the remap
request, we can change this function to return the usual return status
instead of the odd behavior it has now.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/refcounttree.c |2 +-
 fs/read_write.c |6 +++---
 fs/xfs/xfs_reflink.c|4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 6a42c04ac0ab..46bbd315c39f 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4852,7 +4852,7 @@ int ocfs2_reflink_remap_range(struct file *file_in,
 
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
&len, remap_flags);
-   if (ret <= 0)
+   if (ret < 0 || len == 0)
goto out_unlock;
 
/* Lock out changes to the allocation maps and remap. */
diff --git a/fs/read_write.c b/fs/read_write.c
index 450e038e8617..37a7d3fe35d8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1872,8 +1872,8 @@ static int vfs_dedupe_file_range_compare(struct inode 
*src, loff_t srcoff,
  * sense, and then flush all dirty data.  Caller must ensure that the
  * inodes have been locked against any other modifications.
  *
- * Returns: 0 for "nothing to clone", 1 for "something to clone", or
- * the usual negative error code.
+ * If there's an error, then the usual negative error code is returned.
+ * Otherwise returns 0 with *len set to the request length.
  */
 int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
  struct file *file_out, loff_t pos_out,
@@ -1954,7 +1954,7 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
if (ret)
return ret;
 
-   return 1;
+   return 0;
 }
 EXPORT_SYMBOL(generic_remap_file_range_prep);
 
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 3dbe5fb7e9c0..9b1ea42c81d1 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1329,7 +1329,7 @@ xfs_reflink_remap_prep(
 
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
len, remap_flags);
-   if (ret <= 0)
+   if (ret < 0 || *len == 0)
goto out_unlock;
 
/*
@@ -1409,7 +1409,7 @@ xfs_reflink_remap_range(
/* Prepare and then clone file data. */
ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
&len, remap_flags);
-   if (ret <= 0)
+   if (ret < 0 || len == 0)
return ret;
 
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

[PATCH 15/26] vfs: plumb remap flags through the vfs clone functions

From: Darrick J. Wong 

Plumb a remap_flags argument through the {do,vfs}_clone_file_range
functions so that clone can take advantage of it.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/ioctl.c |2 +-
 fs/nfsd/vfs.c  |2 +-
 fs/overlayfs/copy_up.c |2 +-
 fs/overlayfs/file.c|6 +++---
 fs/read_write.c|   13 +
 include/linux/fs.h |4 ++--
 6 files changed, 17 insertions(+), 12 deletions(-)


diff --git a/fs/ioctl.c b/fs/ioctl.c
index 72537b68c272..505275ec5596 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -232,7 +232,7 @@ static long ioctl_file_clone(struct file *dst_file, 
unsigned long srcfd,
if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
goto fdput;
cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
- olen);
+ olen, 0);
if (cloned < 0)
ret = cloned;
else if (olen && cloned != olen)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ac6cb6101cbe..726fc5b2b27a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -543,7 +543,7 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 
src_pos, struct file *dst,
 {
loff_t cloned;
 
-   cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count);
+   cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
if (count && cloned != count)
cloned = -EINVAL;
return nfserrno(cloned < 0 ? cloned : 0);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 8750b7235516..5f82fece64a0 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -142,7 +142,7 @@ static int ovl_copy_up_data(struct path *old, struct path 
*new, loff_t len)
}
 
/* Try to use clone_file_range to clone up within the same fs */
-   cloned = do_clone_file_range(old_file, 0, new_file, 0, len);
+   cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
if (cloned == len)
goto out;
/* Couldn't clone, so now we try to copy the data */
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 6c3fec6168e9..0393815c8971 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -462,7 +462,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t 
pos_in,
 
case OVL_CLONE:
ret = vfs_clone_file_range(real_in.file, pos_in,
-  real_out.file, pos_out, len);
+  real_out.file, pos_out, len, flags);
break;
 
case OVL_DEDUPE:
@@ -512,8 +512,8 @@ static loff_t ovl_remap_file_range(struct file *file_in, 
loff_t pos_in,
 !ovl_inode_upper(file_inode(file_out
return -EPERM;
 
-   return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0,
-   op);
+   return ovl_copyfile(file_in, pos_in, file_out, pos_out, len,
+   remap_flags, op);
 }
 
 const struct file_operations ovl_file_operations = {
diff --git a/fs/read_write.c b/fs/read_write.c
index 906e78be5001..791b406e8264 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1857,12 +1857,15 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
 EXPORT_SYMBOL(generic_remap_file_range_prep);
 
 loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
-  struct file *file_out, loff_t pos_out, loff_t len)
+  struct file *file_out, loff_t pos_out,
+  loff_t len, unsigned int remap_flags)
 {
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
loff_t ret;
 
+   WARN_ON_ONCE(remap_flags);
+
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
@@ -1893,7 +1896,7 @@ loff_t do_clone_file_range(struct file *file_in, loff_t 
pos_in,
return ret;
 
ret = file_in->f_op->remap_file_range(file_in, pos_in,
-   file_out, pos_out, len, 0);
+   file_out, pos_out, len, remap_flags);
if (ret < 0)
return ret;
 
@@ -1904,12 +1907,14 @@ loff_t do_clone_file_range(struct file *file_in, loff_t 
pos_in,
 EXPORT_SYMBOL(do_clone_file_range);
 
 loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
-   struct file *file_out, loff_t pos_out, loff_t len)
+   struct file *file_out, loff_t pos_out,
+   loff_t len, unsigned int remap_flags)
 {
loff_t ret;
 
file_start_write(file_out);
-   ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
+   ret = do_clone_file_range(file_in, pos_in, file_out, pos_out,

[PATCH 21/26] ocfs2: fix pagecache truncation prior to reflink

From: Darrick J. Wong 

Prior to remapping blocks, it is necessary to remove pages from the
destination file's page cache.  Unfortunately, the truncation is not
aggressive enough -- if page size > block size, we'll end up zeroing
subpage blocks instead of removing them.  So, round the start offset
down and the end offset up to page boundaries.  We already wrote all
the dirty data so the larger range should be fine.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/refcounttree.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 2a5c96bc9677..7c709229e108 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4862,8 +4862,9 @@ int ocfs2_reflink_remap_range(struct file *file_in,
  SINGLE_DEPTH_NESTING);
 
/* Zap any page cache for the destination file's range. */
-   truncate_inode_pages_range(&inode_out->i_data, pos_out,
-  PAGE_ALIGN(pos_out + len) - 1);
+   truncate_inode_pages_range(&inode_out->i_data,
+  round_down(pos_out, PAGE_SIZE),
+  round_up(pos_out + len, PAGE_SIZE) - 1);
 
ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
 out_bh, pos_out, len);

[PATCH 16/26] vfs: plumb remap flags through the vfs dedupe functions

From: Darrick J. Wong 

Plumb a remap_flags argument through the vfs_dedupe_file_range_one
functions so that dedupe can take advantage of it.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/overlayfs/file.c |3 ++-
 fs/read_write.c |9 ++---
 include/linux/fs.h  |2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)


diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 0393815c8971..84dd957efa24 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -467,7 +467,8 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t 
pos_in,
 
case OVL_DEDUPE:
ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
-   real_out.file, pos_out, len);
+   real_out.file, pos_out, len,
+   flags);
break;
}
revert_creds(old_cred);
diff --git a/fs/read_write.c b/fs/read_write.c
index 791b406e8264..f6ab5beb935a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -2019,10 +2019,12 @@ EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
 
 loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
 struct file *dst_file, loff_t dst_pos,
-loff_t len)
+loff_t len, unsigned int remap_flags)
 {
loff_t ret;
 
+   WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP));
+
ret = mnt_want_write_file(dst_file);
if (ret)
return ret;
@@ -2053,7 +2055,7 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, 
loff_t src_pos,
}
 
ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
-   dst_pos, len, REMAP_FILE_DEDUP);
+   dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
 out_drop_write:
mnt_drop_write_file(dst_file);
 
@@ -2121,7 +2123,8 @@ int vfs_dedupe_file_range(struct file *file, struct 
file_dedupe_range *same)
}
 
deduped = vfs_dedupe_file_range_one(file, off, dst_file,
-   info->dest_offset, len);
+   info->dest_offset, len,
+   0);
if (deduped == -EBADE)
info->status = FILE_DEDUPE_RANGE_DIFFERS;
else if (deduped < 0)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bc353a5224a4..c0ae85a7bd9d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1854,7 +1854,7 @@ extern int vfs_dedupe_file_range(struct file *file,
 struct file_dedupe_range *same);
 extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
struct file *dst_file, loff_t dst_pos,
-   loff_t len);
+   loff_t len, unsigned int remap_flags);
 
 
 struct super_operations {

[PATCH 17/26] vfs: enable remap callers that can handle short operations

From: Darrick J. Wong 

Plumb in a remap flag that enables the filesystem remap handler to
shorten remapping requests for callers that can handle it.  Now
copy_file_range can report partial success (in case we run up against
alignment problems, resource limits, etc.).

We also enable CAN_SHORTEN for fideduperange to maintain existing
userspace-visible behavior where xfs/btrfs shorten the dedupe range to
avoid stale post-eof data exposure.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c|   28 
 include/linux/fs.h |7 +--
 mm/filemap.c   |   16 
 3 files changed, 37 insertions(+), 14 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index f6ab5beb935a..ee9314b7bfc3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1593,7 +1593,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t 
pos_in,
 
cloned = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out,
-   min_t(loff_t, MAX_RW_COUNT, len), 0);
+   min_t(loff_t, MAX_RW_COUNT, len),
+   REMAP_FILE_CAN_SHORTEN);
if (cloned > 0) {
ret = cloned;
goto done;
@@ -1721,6 +1722,8 @@ static int remap_verify_area(struct file *file, loff_t 
pos, loff_t len,
  * can't meaningfully compare post-EOF contents.
  *
  * For clone we only link a partial EOF block above the destination file's EOF.
+ *
+ * Shorten the request if possible.
  */
 static int generic_remap_check_len(struct inode *inode_in,
   struct inode *inode_out,
@@ -1729,16 +1732,24 @@ static int generic_remap_check_len(struct inode 
*inode_in,
   unsigned int remap_flags)
 {
u64 blkmask = i_blocksize(inode_in) - 1;
+   loff_t new_len = *len;
 
if ((*len & blkmask) == 0)
return 0;
 
-   if (remap_flags & REMAP_FILE_DEDUP)
-   *len &= ~blkmask;
-   else if (pos_out + *len < i_size_read(inode_out))
-   return -EINVAL;
+   if ((remap_flags & REMAP_FILE_DEDUP) ||
+   pos_out + *len < i_size_read(inode_out))
+   new_len &= ~blkmask;
 
-   return 0;
+   if (new_len == *len)
+   return 0;
+
+   if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
+   *len = new_len;
+   return 0;
+   }
+
+   return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
 }
 
 /* Update inode timestamps and remove security privileges when remapping. */
@@ -2023,7 +2034,8 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, 
loff_t src_pos,
 {
loff_t ret;
 
-   WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP));
+   WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
+REMAP_FILE_CAN_SHORTEN));
 
ret = mnt_want_write_file(dst_file);
if (ret)
@@ -2124,7 +2136,7 @@ int vfs_dedupe_file_range(struct file *file, struct 
file_dedupe_range *same)
 
deduped = vfs_dedupe_file_range_one(file, off, dst_file,
info->dest_offset, len,
-   0);
+   REMAP_FILE_CAN_SHORTEN);
if (deduped == -EBADE)
info->status = FILE_DEDUPE_RANGE_DIFFERS;
else if (deduped < 0)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c0ae85a7bd9d..594fe4ba0b15 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1726,14 +1726,17 @@ struct block_device_operations;
  * If it is called with len == 0 that means "remap to end of source file".
  *
  * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
+ * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
  */
 #define REMAP_FILE_DEDUP   (1 << 0)
+#define REMAP_FILE_CAN_SHORTEN (1 << 1)
 
 /* All valid REMAP_FILE flags */
-#define REMAP_FILE_VALID_FLAGS (REMAP_FILE_DEDUP)
+#define REMAP_FILE_VALID_FLAGS (REMAP_FILE_DEDUP | \
+REMAP_FILE_CAN_SHORTEN)
 
 /* REMAP_FILE flags taken care of by the vfs. */
-#define REMAP_FILE_ADVISORY(0)
+#define REMAP_FILE_ADVISORY(REMAP_FILE_CAN_SHORTEN)
 
 struct iov_iter;
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e93269efafe..898eb358f7d2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3051,8 +3051,12 @@ int generic_remap_checks(struct file *file_in, loff_t 
pos_in,
if (pos_in + count == size_in) {
bcount = ALIGN(size_in, bs) - pos_in;
} else {
-   if (!IS_ALIGNED(count, bs))
-   return -EINVAL;
+   if (!IS_ALIGNED(count, bs)) {
+   if (remap_flags & REMAP_F

[PATCH 20/26] ocfs2: truncate page cache for clone destination file before remapping

From: Darrick J. Wong 

When cloning blocks into another file, truncate the page cache before we
start remapping blocks so that concurrent reads wait for us to finish.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/refcounttree.c |   10 --
 1 file changed, 4 insertions(+), 6 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 46bbd315c39f..2a5c96bc9677 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4861,14 +4861,12 @@ int ocfs2_reflink_remap_range(struct file *file_in,
down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
  SINGLE_DEPTH_NESTING);
 
-   ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
-out_bh, pos_out, len);
-
/* Zap any page cache for the destination file's range. */
-   if (!ret)
-   truncate_inode_pages_range(&inode_out->i_data, pos_out,
-  PAGE_ALIGN(pos_out + len) - 1);
+   truncate_inode_pages_range(&inode_out->i_data, pos_out,
+  PAGE_ALIGN(pos_out + len) - 1);
 
+   ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
+out_bh, pos_out, len);
up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
if (!same_inode)
up_write(&OCFS2_I(inode_out)->ip_alloc_sem);

[PATCH 18/26] vfs: hide file range comparison function

From: Darrick J. Wong 

There are no callers of vfs_dedupe_file_range_compare, so we might as
well make it a static helper and remove the export.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
Reviewed-by: Christoph Hellwig 
---
 fs/read_write.c|  187 +---
 include/linux/fs.h |3 -
 2 files changed, 91 insertions(+), 99 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index ee9314b7bfc3..450e038e8617 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1776,6 +1776,97 @@ static int generic_remap_file_range_target(struct file 
*file,
return file_remove_privs(file);
 }
 
+/*
+ * Read a page's worth of file data into the page cache.  Return the page
+ * locked.
+ */
+static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+{
+   struct page *page;
+
+   page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
+   if (IS_ERR(page))
+   return page;
+   if (!PageUptodate(page)) {
+   put_page(page);
+   return ERR_PTR(-EIO);
+   }
+   lock_page(page);
+   return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ * Caller must have locked both inodes to prevent write races.
+ */
+static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+struct inode *dest, loff_t destoff,
+loff_t len, bool *is_same)
+{
+   loff_t src_poff;
+   loff_t dest_poff;
+   void *src_addr;
+   void *dest_addr;
+   struct page *src_page;
+   struct page *dest_page;
+   loff_t cmp_len;
+   bool same;
+   int error;
+
+   error = -EINVAL;
+   same = true;
+   while (len) {
+   src_poff = srcoff & (PAGE_SIZE - 1);
+   dest_poff = destoff & (PAGE_SIZE - 1);
+   cmp_len = min(PAGE_SIZE - src_poff,
+ PAGE_SIZE - dest_poff);
+   cmp_len = min(cmp_len, len);
+   if (cmp_len <= 0)
+   goto out_error;
+
+   src_page = vfs_dedupe_get_page(src, srcoff);
+   if (IS_ERR(src_page)) {
+   error = PTR_ERR(src_page);
+   goto out_error;
+   }
+   dest_page = vfs_dedupe_get_page(dest, destoff);
+   if (IS_ERR(dest_page)) {
+   error = PTR_ERR(dest_page);
+   unlock_page(src_page);
+   put_page(src_page);
+   goto out_error;
+   }
+   src_addr = kmap_atomic(src_page);
+   dest_addr = kmap_atomic(dest_page);
+
+   flush_dcache_page(src_page);
+   flush_dcache_page(dest_page);
+
+   if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+   same = false;
+
+   kunmap_atomic(dest_addr);
+   kunmap_atomic(src_addr);
+   unlock_page(dest_page);
+   unlock_page(src_page);
+   put_page(dest_page);
+   put_page(src_page);
+
+   if (!same)
+   break;
+
+   srcoff += cmp_len;
+   destoff += cmp_len;
+   len -= cmp_len;
+   }
+
+   *is_same = same;
+   return 0;
+
+out_error:
+   return error;
+}
+
 /*
  * Check that the two inodes are eligible for cloning, the ranges make
  * sense, and then flush all dirty data.  Caller must ensure that the
@@ -1932,102 +2023,6 @@ loff_t vfs_clone_file_range(struct file *file_in, 
loff_t pos_in,
 }
 EXPORT_SYMBOL(vfs_clone_file_range);
 
-/*
- * Read a page's worth of file data into the page cache.  Return the page
- * locked.
- */
-static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
-{
-   struct address_space *mapping;
-   struct page *page;
-   pgoff_t n;
-
-   n = offset >> PAGE_SHIFT;
-   mapping = inode->i_mapping;
-   page = read_mapping_page(mapping, n, NULL);
-   if (IS_ERR(page))
-   return page;
-   if (!PageUptodate(page)) {
-   put_page(page);
-   return ERR_PTR(-EIO);
-   }
-   lock_page(page);
-   return page;
-}
-
-/*
- * Compare extents of two files to see if they are the same.
- * Caller must have locked both inodes to prevent write races.
- */
-int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t destoff,
- loff_t len, bool *is_same)
-{
-   loff_t src_poff;
-   loff_t dest_poff;
-   void *src_addr;
-   void *dest_addr;
-   struct page *src_page;
-   struct page *dest_page;
-   loff_t cmp_len;
-   bool same;
-   int error;
-
-   error = -EINVAL;
-   same = true;
-

[PATCH 14/26] vfs: make remap_file_range functions take and return bytes completed

From: Darrick J. Wong 

Change the remap_file_range functions to take a number of bytes to
operate upon and return the number of bytes they operated on.  This is a
requirement for allowing fs implementations to return short clone/dedupe
results to the user, which will enable us to obey resource limits in a
graceful manner.

A subsequent patch will enable copy_file_range to signal to the
->clone_file_range implementation that it can handle a short length,
which will be returned in the function's return value.  For now the
short return is not implemented anywhere so the behavior won't change --
either copy_file_range manages to clone the entire range or it tries an
alternative.

Neither clone ioctl can take advantage of this, alas.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 Documentation/filesystems/vfs.txt |6 ++---
 fs/btrfs/ctree.h  |6 ++---
 fs/btrfs/ioctl.c  |   13 ++
 fs/cifs/cifsfs.c  |6 ++---
 fs/ioctl.c|   10 +++-
 fs/nfs/nfs4file.c |6 ++---
 fs/nfsd/vfs.c |8 +-
 fs/ocfs2/file.c   |   16 ++--
 fs/ocfs2/refcounttree.c   |2 +-
 fs/ocfs2/refcounttree.h   |2 +-
 fs/overlayfs/copy_up.c|6 ++---
 fs/overlayfs/file.c   |   12 +
 fs/read_write.c   |   49 -
 fs/xfs/xfs_file.c |9 +--
 fs/xfs/xfs_reflink.c  |4 ++-
 fs/xfs/xfs_reflink.h  |2 +-
 include/linux/fs.h|   27 +++-
 mm/filemap.c  |2 +-
 18 files changed, 106 insertions(+), 80 deletions(-)


diff --git a/Documentation/filesystems/vfs.txt 
b/Documentation/filesystems/vfs.txt
index bb3183334ab9..8ba47d9d6cae 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -883,9 +883,9 @@ struct file_operations {
unsigned (*mmap_capabilities)(struct file *);
 #endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, 
loff_t, size_t, unsigned int);
-   int (*remap_file_range)(struct file *file_in, loff_t pos_in,
-   struct file *file_out, loff_t pos_out,
-   u64 len, unsigned int remap_flags);
+   loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
+  struct file *file_out, loff_t pos_out,
+  loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
 };
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 124a05662fc2..771a961d77ad 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3247,9 +3247,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page 
**pages,
  size_t num_pages, loff_t pos, size_t write_bytes,
  struct extent_state **cached);
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-int btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
-  struct file *file_out, loff_t pos_out, u64 len,
-  unsigned int remap_flags);
+loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bfd99c66723e..b0c513e10977 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4328,10 +4328,12 @@ static noinline int btrfs_clone_files(struct file 
*file, struct file *file_src,
return ret;
 }
 
-int btrfs_remap_file_range(struct file *src_file, loff_t off,
-   struct file *dst_file, loff_t destoff, u64 len,
+loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+   struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
 {
+   int ret;
+
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
 
@@ -4349,10 +4351,11 @@ int btrfs_remap_file_range(struct file *src_file, 
loff_t off,
return -EINVAL;
}
 
-   return btrfs_extent_same(src, off, len, dst, destoff);
+   ret = btrfs_extent_same(src, off, len, dst, destoff);
+   } else {
+   ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
}
-
-   return btrfs_clone_files(dst_file, src_file, off, len, destoff);
+   return ret < 0 ? ret : len;
 }
 
 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e8144d0dcde2..5ca71c6c8be2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/c

[PATCH 13/26] vfs: create generic_remap_file_range_touch to update inode metadata

From: Darrick J. Wong 

Create a new VFS helper to handle inode metadata updates when remapping
into a file.  If the operation can possibly alter the file contents, we
must update the ctime and mtime and remove security privileges, just
like we do for regular file writes.  Wire up ocfs2 to ensure consistent
behavior.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c  |   28 
 fs/xfs/xfs_reflink.c |   23 ---
 2 files changed, 28 insertions(+), 23 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index ebcbfc4f2907..3f6392f1d5d4 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1737,6 +1737,30 @@ static int generic_remap_check_len(struct inode 
*inode_in,
return 0;
 }
 
+/* Update inode timestamps and remove security privileges when remapping. */
+static int generic_remap_file_range_target(struct file *file,
+  unsigned int remap_flags)
+{
+   int ret;
+
+   /* If can't alter the file contents, we're done. */
+   if (remap_flags & REMAP_FILE_DEDUP)
+   return 0;
+
+   /* Update the timestamps, since we can alter file contents. */
+   if (!(file->f_mode & FMODE_NOCMTIME)) {
+   ret = file_update_time(file);
+   if (ret)
+   return ret;
+   }
+
+   /*
+* Clear the security bits if the process is not being run by root.
+* This keeps people from modifying setuid and setgid binaries.
+*/
+   return file_remove_privs(file);
+}
+
 /*
  * Check that the two inodes are eligible for cloning, the ranges make
  * sense, and then flush all dirty data.  Caller must ensure that the
@@ -1820,6 +1844,10 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
if (ret)
return ret;
 
+   ret = generic_remap_file_range_target(file_out, remap_flags);
+   if (ret)
+   return ret;
+
return 1;
 }
 EXPORT_SYMBOL(generic_remap_file_range_prep);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 29aab196ce7e..2d7dd8b28d7c 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1372,29 +1372,6 @@ xfs_reflink_remap_prep(
truncate_inode_pages_range(&inode_out->i_data, pos_out,
   PAGE_ALIGN(pos_out + *len) - 1);
 
-   /* If we're altering the file contents... */
-   if (!(remap_flags & REMAP_FILE_DEDUP)) {
-   /*
-* ...update the timestamps (which will grab the ilock again
-* from xfs_fs_dirty_inode, so we have to call it before we
-* take the ilock).
-*/
-   if (!(file_out->f_mode & FMODE_NOCMTIME)) {
-   ret = file_update_time(file_out);
-   if (ret)
-   goto out_unlock;
-   }
-
-   /*
-* ...clear the security bits if the process is not being run
-* by root.  This keeps people from modifying setuid and setgid
-* binaries.
-*/
-   ret = file_remove_privs(file_out);
-   if (ret)
-   goto out_unlock;
-   }
-
return 1;
 out_unlock:
xfs_reflink_remap_unlock(file_in, file_out);

[PATCH 09/26] vfs: rename clone_verify_area to remap_verify_area

From: Darrick J. Wong 

Since we use clone_verify_area for both clone and dedupe range checks,
rename the function to make it clear that it's for both.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c |   10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index aca75a97a695..734c5661fb69 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1686,7 +1686,7 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t 
__user *, off_in,
return ret;
 }
 
-static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool 
write)
+static int remap_verify_area(struct file *file, loff_t pos, u64 len, bool 
write)
 {
struct inode *inode = file_inode(file);
 
@@ -1852,11 +1852,11 @@ int do_clone_file_range(struct file *file_in, loff_t 
pos_in,
if (!file_in->f_op->clone_file_range)
return -EOPNOTSUPP;
 
-   ret = clone_verify_area(file_in, pos_in, len, false);
+   ret = remap_verify_area(file_in, pos_in, len, false);
if (ret)
return ret;
 
-   ret = clone_verify_area(file_out, pos_out, len, true);
+   ret = remap_verify_area(file_out, pos_out, len, true);
if (ret)
return ret;
 
@@ -1989,7 +1989,7 @@ int vfs_dedupe_file_range_one(struct file *src_file, 
loff_t src_pos,
if (ret)
return ret;
 
-   ret = clone_verify_area(dst_file, dst_pos, len, true);
+   ret = remap_verify_area(dst_file, dst_pos, len, true);
if (ret < 0)
goto out_drop_write;
 
@@ -2051,7 +2051,7 @@ int vfs_dedupe_file_range(struct file *file, struct 
file_dedupe_range *same)
if (!S_ISREG(src->i_mode))
goto out;
 
-   ret = clone_verify_area(file, off, len, false);
+   ret = remap_verify_area(file, off, len, false);
if (ret < 0)
goto out;
ret = 0;

[PATCH 07/26] vfs: skip zero-length dedupe requests

From: Darrick J. Wong 

Don't bother calling the filesystem for a zero-length dedupe request;
we can return zero and exit.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c |5 +
 1 file changed, 5 insertions(+)


diff --git a/fs/read_write.c b/fs/read_write.c
index 0f0a6efdd502..f5395d8da741 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -2009,6 +2009,11 @@ int vfs_dedupe_file_range_one(struct file *src_file, 
loff_t src_pos,
if (!dst_file->f_op->dedupe_file_range)
goto out_drop_write;
 
+   if (len == 0) {
+   ret = 0;
+   goto out_drop_write;
+   }
+
ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
dst_file, dst_pos, len);
 out_drop_write:

[PATCH 11/26] vfs: pass remap flags to generic_remap_file_range_prep

From: Darrick J. Wong 

Plumb the remap flags through the filesystem from the vfs function
dispatcher all the way to the prep function to prepare for behavior
changes in subsequent patches.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/ocfs2/file.c |2 +-
 fs/ocfs2/refcounttree.c |4 ++--
 fs/ocfs2/refcounttree.h |2 +-
 fs/read_write.c |   14 +++---
 fs/xfs/xfs_file.c   |2 +-
 fs/xfs/xfs_reflink.c|   21 +++--
 fs/xfs/xfs_reflink.h|3 ++-
 include/linux/fs.h  |2 +-
 8 files changed, 26 insertions(+), 24 deletions(-)


diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0b757a24567c..9809b0e5746f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2538,7 +2538,7 @@ static int ocfs2_remap_file_range(struct file *file_in,
return -EINVAL;
 
return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-len, remap_flags & REMAP_FILE_DEDUP);
+len, remap_flags);
 }
 
 const struct inode_operations ocfs2_file_iops = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 36c56dfbe485..df9781567ec0 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4825,7 +4825,7 @@ int ocfs2_reflink_remap_range(struct file *file_in,
  struct file *file_out,
  loff_t pos_out,
  u64 len,
- bool is_dedupe)
+ unsigned int remap_flags)
 {
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -4851,7 +4851,7 @@ int ocfs2_reflink_remap_range(struct file *file_in,
goto out_unlock;
 
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
-   &len, is_dedupe);
+   &len, remap_flags);
if (ret <= 0)
goto out_unlock;
 
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 4af55bf4b35b..d2c5f526edff 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -120,6 +120,6 @@ int ocfs2_reflink_remap_range(struct file *file_in,
  struct file *file_out,
  loff_t pos_out,
  u64 len,
- bool is_dedupe);
+ unsigned int remap_flags);
 
 #endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/read_write.c b/fs/read_write.c
index 766bdcb381f3..201381689284 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1722,14 +1722,14 @@ static int generic_remap_check_len(struct inode 
*inode_in,
   struct inode *inode_out,
   loff_t pos_out,
   u64 *len,
-  bool is_dedupe)
+  unsigned int remap_flags)
 {
u64 blkmask = i_blocksize(inode_in) - 1;
 
if ((*len & blkmask) == 0)
return 0;
 
-   if (is_dedupe)
+   if (remap_flags & REMAP_FILE_DEDUP)
*len &= ~blkmask;
else if (pos_out + *len < i_size_read(inode_out))
return -EINVAL;
@@ -1747,7 +1747,7 @@ static int generic_remap_check_len(struct inode *inode_in,
  */
 int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
  struct file *file_out, loff_t pos_out,
- u64 *len, bool is_dedupe)
+ u64 *len, unsigned int remap_flags)
 {
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -1771,7 +1771,7 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
if (*len == 0) {
loff_t isize = i_size_read(inode_in);
 
-   if (is_dedupe || pos_in == isize)
+   if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
return 0;
if (pos_in > isize)
return -EINVAL;
@@ -1782,7 +1782,7 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
 
/* Check that we don't violate system file offset limits. */
ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
-   is_dedupe);
+   (remap_flags & REMAP_FILE_DEDUP));
if (ret)
return ret;
 
@@ -1804,7 +1804,7 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
/*
 * Check that the extents are the same.
 */
-   if (is_dedupe) {
+   if (remap_flags & REMAP_FILE_DEDUP) {
boolis_same = false;
 
ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
@@ -1816,7

[PATCH 10/26] vfs: combine the clone and dedupe into a single remap_file_range

From: Darrick J. Wong 

Combine the clone_file_range and dedupe_file_range operations into a
single remap_file_range file operation dispatch since they're
fundamentally the same operation.  The differences between the two can
be made in the prep functions.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 Documentation/filesystems/vfs.txt |   13 +--
 fs/btrfs/ctree.h  |8 ++-
 fs/btrfs/file.c   |3 +-
 fs/btrfs/ioctl.c  |   45 +++--
 fs/cifs/cifsfs.c  |   22 +++---
 fs/nfs/nfs4file.c |   10 ++--
 fs/ocfs2/file.c   |   24 +++-
 fs/overlayfs/file.c   |   30 ++---
 fs/read_write.c   |   18 +++
 fs/xfs/xfs_file.c |   23 ++-
 include/linux/fs.h|   20 +---
 11 files changed, 110 insertions(+), 106 deletions(-)


diff --git a/Documentation/filesystems/vfs.txt 
b/Documentation/filesystems/vfs.txt
index a6c6a8af48a2..bb3183334ab9 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -883,8 +883,9 @@ struct file_operations {
unsigned (*mmap_capabilities)(struct file *);
 #endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, 
loff_t, size_t, unsigned int);
-   int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, 
u64);
-   int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, 
u64);
+   int (*remap_file_range)(struct file *file_in, loff_t pos_in,
+   struct file *file_out, loff_t pos_out,
+   u64 len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
 };
 
@@ -960,11 +961,9 @@ otherwise noted.
 
   copy_file_range: called by the copy_file_range(2) system call.
 
-  clone_file_range: called by the ioctl(2) system call for FICLONERANGE and
-   FICLONE commands.
-
-  dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE
-   command.
+  remap_file_range: called by the ioctl(2) system call for FICLONERANGE and
+   FICLONE and FIDEDUPERANGE commands to remap file ranges.  Note that
+   a zero length implies "remap to end of source file".
 
   fadvise: possibly called by the fadvise64() system call.
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..124a05662fc2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3218,9 +3218,6 @@ void btrfs_get_block_group_info(struct list_head 
*groups_list,
struct btrfs_ioctl_space_info *space);
 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
   struct btrfs_ioctl_balance_args *bargs);
-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
-   struct file *dst_file, loff_t dst_loff,
-   u64 olen);
 
 /* file.c */
 int __init btrfs_auto_defrag_init(void);
@@ -3250,8 +3247,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page 
**pages,
  size_t num_pages, loff_t pos, size_t write_bytes,
  struct extent_state **cached);
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
-  struct file *file_out, loff_t pos_out, u64 len);
+int btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+  struct file *file_out, loff_t pos_out, u64 len,
+  unsigned int remap_flags);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2be00e873e92..9a963f061393 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3269,8 +3269,7 @@ const struct file_operations btrfs_file_operations = {
 #ifdef CONFIG_COMPAT
.compat_ioctl   = btrfs_compat_ioctl,
 #endif
-   .clone_file_range = btrfs_clone_file_range,
-   .dedupe_file_range = btrfs_dedupe_file_range,
+   .remap_file_range = btrfs_remap_file_range,
 };
 
 void __cold btrfs_auto_defrag_exit(void)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d60b6caf09e8..bfd99c66723e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3627,26 +3627,6 @@ static int btrfs_extent_same(struct inode *src, u64 
loff, u64 olen,
return ret;
 }
 
-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
-   struct file *dst_file, loff_t dst_loff,
-   u64 olen)
-{
-   struct inode *src = file_inode(src_file);
-   struct inode *dst = file_inode(dst_file);
-   u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
-
-   if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
-   /*
-

[PATCH 08/26] vfs: rename vfs_clone_file_prep to be more descriptive

From: Darrick J. Wong 

The vfs_clone_file_prep is a generic function to be called by filesystem
implementations only.  Rename the prefix to generic_ and make it more
clear that it applies to remap operations, not just clones.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/ocfs2/refcounttree.c |2 +-
 fs/read_write.c |8 
 fs/xfs/xfs_reflink.c|2 +-
 include/linux/fs.h  |6 +++---
 4 files changed, 9 insertions(+), 9 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 19e03936c5e1..36c56dfbe485 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4850,7 +4850,7 @@ int ocfs2_reflink_remap_range(struct file *file_in,
(OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
goto out_unlock;
 
-   ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out,
+   ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
&len, is_dedupe);
if (ret <= 0)
goto out_unlock;
diff --git a/fs/read_write.c b/fs/read_write.c
index f5395d8da741..aca75a97a695 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1745,9 +1745,9 @@ static int generic_remap_check_len(struct inode *inode_in,
  * Returns: 0 for "nothing to clone", 1 for "something to clone", or
  * the usual negative error code.
  */
-int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
-   struct file *file_out, loff_t pos_out,
-   u64 *len, bool is_dedupe)
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ u64 *len, bool is_dedupe)
 {
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -1822,7 +1822,7 @@ int vfs_clone_file_prep(struct file *file_in, loff_t 
pos_in,
 
return 1;
 }
-EXPORT_SYMBOL(vfs_clone_file_prep);
+EXPORT_SYMBOL(generic_remap_file_range_prep);
 
 int do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 281d5f53f2ec..a7757a128a78 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1326,7 +1326,7 @@ xfs_reflink_remap_prep(
if (IS_DAX(inode_in) || IS_DAX(inode_out))
goto out_unlock;
 
-   ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out,
+   ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
len, is_dedupe);
if (ret <= 0)
goto out_unlock;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ba93a6e7dac4..55729e1c2e75 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1825,9 +1825,9 @@ extern ssize_t vfs_readv(struct file *, const struct 
iovec __user *,
unsigned long, loff_t *, rwf_t);
 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
   loff_t, size_t, unsigned int);
-extern int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
-  struct file *file_out, loff_t pos_out,
-  u64 *count, bool is_dedupe);
+extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+struct file *file_out, loff_t pos_out,
+u64 *count, bool is_dedupe);
 extern int do_clone_file_range(struct file *file_in, loff_t pos_in,
   struct file *file_out, loff_t pos_out, u64 len);
 extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,

[PATCH 06/26] vfs: avoid problematic remapping requests into partial EOF block

From: Darrick J. Wong 

A deduplication data corruption is exposed in XFS and btrfs. It is
caused by extending the block match range to include the partial EOF
block, but then allowing unknown data beyond EOF to be considered a
"match" to data in the destination file because the comparison is only
made to the end of the source file. This corrupts the destination file
when the source extent is shared with it.

The VFS remapping prep functions  only support whole block dedupe, but
we still need to appear to support whole file dedupe correctly.  Hence
if the dedupe request includes the last block of the souce file, don't
include it in the actual dedupe operation. If the rest of the range
dedupes successfully, then reject the entire request.  A subsequent
patch will enable us to shorten dedupe requests correctly.

When reflinking sub-file ranges, a data corruption can occur when the
source file range includes a partial EOF block. This shares the unknown
data beyond EOF into the second file at a position inside EOF, exposing
stale data in the second file.

If the reflink request includes the last block of the souce file, only
proceed with the reflink operation if it lands at or past the
destination file's current EOF. If it lands within the destination file
EOF, reject the entire request with -EINVAL and make the caller go the
hard way.  A subsequent patch will enable us to shorten reflink requests
correctly.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
---
 fs/read_write.c |   33 +
 1 file changed, 33 insertions(+)


diff --git a/fs/read_write.c b/fs/read_write.c
index 2456da3f8a41..0f0a6efdd502 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1708,6 +1708,34 @@ static int clone_verify_area(struct file *file, loff_t 
pos, u64 len, bool write)
 
return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
 }
+/*
+ * Ensure that we don't remap a partial EOF block in the middle of something
+ * else.  Assume that the offsets have already been checked for block
+ * alignment.
+ *
+ * For deduplication we always scale down to the previous block because we
+ * can't meaningfully compare post-EOF contents.
+ *
+ * For clone we only link a partial EOF block above the destination file's EOF.
+ */
+static int generic_remap_check_len(struct inode *inode_in,
+  struct inode *inode_out,
+  loff_t pos_out,
+  u64 *len,
+  bool is_dedupe)
+{
+   u64 blkmask = i_blocksize(inode_in) - 1;
+
+   if ((*len & blkmask) == 0)
+   return 0;
+
+   if (is_dedupe)
+   *len &= ~blkmask;
+   else if (pos_out + *len < i_size_read(inode_out))
+   return -EINVAL;
+
+   return 0;
+}
 
 /*
  * Check that the two inodes are eligible for cloning, the ranges make
@@ -1787,6 +1815,11 @@ int vfs_clone_file_prep(struct file *file_in, loff_t 
pos_in,
return -EBADE;
}
 
+   ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
+   is_dedupe);
+   if (ret)
+   return ret;
+
return 1;
 }
 EXPORT_SYMBOL(vfs_clone_file_prep);

[PATCH 05/26] vfs: strengthen checking of file range inputs to generic_remap_checks

From: Darrick J. Wong 

File range remapping, if allowed to run past the destination file's EOF,
is an optimization on a regular file write.  Regular file writes that
extend the file length are subject to various constraints which are not
checked by range cloning.

This is a correctness problem because we're never allowed to touch
ranges that the page cache can't support (s_maxbytes); we're not
supposed to deal with large offsets (MAX_NON_LFS) if O_LARGEFILE isn't
set; and we must obey resource limits (RLIMIT_FSIZE).

Therefore, add these checks to the new generic_remap_checks function so
that we curtail unexpected behavior.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
Reviewed-by: Christoph Hellwig 
---
 mm/filemap.c |   91 ++
 1 file changed, 59 insertions(+), 32 deletions(-)


diff --git a/mm/filemap.c b/mm/filemap.c
index 47e6bfd45a91..08ad210fee49 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2915,6 +2915,49 @@ struct page *read_cache_page_gfp(struct address_space 
*mapping,
 }
 EXPORT_SYMBOL(read_cache_page_gfp);
 
+static int generic_access_check_limits(struct file *file, loff_t pos,
+  loff_t *count)
+{
+   struct inode *inode = file->f_mapping->host;
+
+   /* Don't exceed the LFS limits. */
+   if (unlikely(pos + *count > MAX_NON_LFS &&
+   !(file->f_flags & O_LARGEFILE))) {
+   if (pos >= MAX_NON_LFS)
+   return -EFBIG;
+   *count = min(*count, (loff_t)MAX_NON_LFS - pos);
+   }
+
+   /*
+* Don't operate on ranges the page cache doesn't support.
+*
+* If we have written data it becomes a short write.  If we have
+* exceeded without writing data we send a signal and return EFBIG.
+* Linus frestrict idea will clean these up nicely..
+*/
+   if (unlikely(pos >= inode->i_sb->s_maxbytes))
+   return -EFBIG;
+
+   *count = min(*count, inode->i_sb->s_maxbytes - pos);
+   return 0;
+}
+
+static int generic_write_check_limits(struct file *file, loff_t pos,
+ loff_t *count)
+{
+   unsigned long limit = rlimit(RLIMIT_FSIZE);
+
+   if (limit != RLIM_INFINITY) {
+   if (pos >= limit) {
+   send_sig(SIGXFSZ, current, 0);
+   return -EFBIG;
+   }
+   *count = min(*count, (loff_t)limit - pos);
+   }
+
+   return generic_access_check_limits(file, pos, count);
+}
+
 /*
  * Performs necessary checks before doing a write
  *
@@ -2926,8 +2969,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, 
struct iov_iter *from)
 {
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
-   unsigned long limit = rlimit(RLIMIT_FSIZE);
-   loff_t pos;
+   loff_t count;
+   int ret;
 
if (!iov_iter_count(from))
return 0;
@@ -2936,40 +2979,15 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, 
struct iov_iter *from)
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
 
-   pos = iocb->ki_pos;
-
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EINVAL;
 
-   if (limit != RLIM_INFINITY) {
-   if (iocb->ki_pos >= limit) {
-   send_sig(SIGXFSZ, current, 0);
-   return -EFBIG;
-   }
-   iov_iter_truncate(from, limit - (unsigned long)pos);
-   }
+   count = iov_iter_count(from);
+   ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+   if (ret)
+   return ret;
 
-   /*
-* LFS rule
-*/
-   if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
-   !(file->f_flags & O_LARGEFILE))) {
-   if (pos >= MAX_NON_LFS)
-   return -EFBIG;
-   iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
-   }
-
-   /*
-* Are we about to exceed the fs block limit ?
-*
-* If we have written data it becomes a short write.  If we have
-* exceeded without writing data we send a signal and return EFBIG.
-* Linus frestrict idea will clean these up nicely..
-*/
-   if (unlikely(pos >= inode->i_sb->s_maxbytes))
-   return -EFBIG;
-
-   iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
+   iov_iter_truncate(from, count);
return iov_iter_count(from);
 }
 EXPORT_SYMBOL(generic_write_checks);
@@ -2991,6 +3009,7 @@ int generic_remap_checks(struct file *file_in, loff_t 
pos_in,
uint64_t bcount;
loff_t size_in, size_out;
loff_t bs = inode_out->i_sb->s_blocksize;
+   int ret;
 
/* The start of both ranges must be aligned to an fs block. */

[PATCH 12/26] vfs: pass remap flags to generic_remap_checks

From: Darrick J. Wong 

Pass the same remap flags to generic_remap_checks for consistency.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c|2 +-
 include/linux/fs.h |2 +-
 mm/filemap.c   |4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index 201381689284..ebcbfc4f2907 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1782,7 +1782,7 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
 
/* Check that we don't violate system file offset limits. */
ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
-   (remap_flags & REMAP_FILE_DEDUP));
+   remap_flags);
if (ret)
return ret;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c2800953937a..1aa3bc1bb092 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2981,7 +2981,7 @@ extern int generic_file_readonly_mmap(struct file *, 
struct vm_area_struct *);
 extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
 extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
-   uint64_t *count, bool is_dedupe);
+   uint64_t *count, unsigned int remap_flags);
 extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 08ad210fee49..b0f1f6d93d9c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3001,7 +3001,7 @@ EXPORT_SYMBOL(generic_write_checks);
  */
 int generic_remap_checks(struct file *file_in, loff_t pos_in,
 struct file *file_out, loff_t pos_out,
-uint64_t *req_count, bool is_dedupe)
+uint64_t *req_count, unsigned int remap_flags)
 {
struct inode *inode_in = file_in->f_mapping->host;
struct inode *inode_out = file_out->f_mapping->host;
@@ -3023,7 +3023,7 @@ int generic_remap_checks(struct file *file_in, loff_t 
pos_in,
size_out = i_size_read(inode_out);
 
/* Dedupe requires both ranges to be within EOF. */
-   if (is_dedupe &&
+   if ((remap_flags & REMAP_FILE_DEDUP) &&
(pos_in >= size_in || pos_in + count > size_in ||
 pos_out >= size_out || pos_out + count > size_out))
return -EINVAL;

[PATCH 03/26] vfs: check file ranges before cloning files

From: Darrick J. Wong 

Move the file range checks from vfs_clone_file_prep into a separate
generic_remap_checks function so that all the checks are collected in a
central location.  This forms the basis for adding more checks from
generic_write_checks that will make cloning's input checking more
consistent with write input checking.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
Reviewed-by: Amir Goldstein 
---
 fs/ocfs2/refcounttree.c |2 +
 fs/read_write.c |   55 +
 fs/xfs/xfs_reflink.c|2 +
 include/linux/fs.h  |9 --
 mm/filemap.c|   69 +++
 5 files changed, 90 insertions(+), 47 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7a5ee145c733..19e03936c5e1 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4850,7 +4850,7 @@ int ocfs2_reflink_remap_range(struct file *file_in,
(OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
goto out_unlock;
 
-   ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
+   ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out,
&len, is_dedupe);
if (ret <= 0)
goto out_unlock;
diff --git a/fs/read_write.c b/fs/read_write.c
index 260797b01851..d6e8e242a15f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1717,13 +1717,12 @@ static int clone_verify_area(struct file *file, loff_t 
pos, u64 len, bool write)
  * Returns: 0 for "nothing to clone", 1 for "something to clone", or
  * the usual negative error code.
  */
-int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
-  struct inode *inode_out, loff_t pos_out,
-  u64 *len, bool is_dedupe)
+int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
+   struct file *file_out, loff_t pos_out,
+   u64 *len, bool is_dedupe)
 {
-   loff_t bs = inode_out->i_sb->s_blocksize;
-   loff_t blen;
-   loff_t isize;
+   struct inode *inode_in = file_inode(file_in);
+   struct inode *inode_out = file_inode(file_out);
bool same_inode = (inode_in == inode_out);
int ret;
 
@@ -1740,10 +1739,10 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
 
-   isize = i_size_read(inode_in);
-
/* Zero length dedupe exits immediately; reflink goes to EOF. */
if (*len == 0) {
+   loff_t isize = i_size_read(inode_in);
+
if (is_dedupe || pos_in == isize)
return 0;
if (pos_in > isize)
@@ -1751,36 +1750,11 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
*len = isize - pos_in;
}
 
-   /* Ensure offsets don't wrap and the input is inside i_size */
-   if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
-   pos_in + *len > isize)
-   return -EINVAL;
-
-   /* Don't allow dedupe past EOF in the dest file */
-   if (is_dedupe) {
-   loff_t  disize;
-
-   disize = i_size_read(inode_out);
-   if (pos_out >= disize || pos_out + *len > disize)
-   return -EINVAL;
-   }
-
-   /* If we're linking to EOF, continue to the block boundary. */
-   if (pos_in + *len == isize)
-   blen = ALIGN(isize, bs) - pos_in;
-   else
-   blen = *len;
-
-   /* Only reflink if we're aligned to block boundaries */
-   if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
-   !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
-   return -EINVAL;
-
-   /* Don't allow overlapped reflink within the same file */
-   if (same_inode) {
-   if (pos_out + blen > pos_in && pos_out < pos_in + blen)
-   return -EINVAL;
-   }
+   /* Check that we don't violate system file offset limits. */
+   ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
+   is_dedupe);
+   if (ret)
+   return ret;
 
/* Wait for the completion of any pending IOs on both files */
inode_dio_wait(inode_in);
@@ -1813,7 +1787,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
 
return 1;
 }
-EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
+EXPORT_SYMBOL(vfs_clone_file_prep);
 
 int do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len)
@@ -1851,9 +1825,6 @@ int do_clone_file_range(struct file *file_in, loff_t 
pos_in,
if (ret)
return ret;
 
-   if (pos_in + len > i_size_read(inode_in))
-

[PATCH 01/26] xfs: add a per-xfs trace_printk macro

From: Darrick J. Wong 

Add a "xfs_tprintk" macro so that developers can use trace_printk to
print out arbitrary debugging information with the XFS device name
attached to the trace output.

Signed-off-by: Darrick J. Wong 
---
 fs/xfs/xfs_error.h |6 ++
 1 file changed, 6 insertions(+)


diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 246d3e989c6c..5caa8bdf6c38 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -76,6 +76,11 @@ extern int xfs_errortag_set(struct xfs_mount *mp, unsigned 
int error_tag,
unsigned int tag_value);
 extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
 extern int xfs_errortag_clearall(struct xfs_mount *mp);
+
+/* trace printk version of xfs_err and friends */
+#define xfs_tprintk(mp, fmt, args...) \
+   trace_printk("dev %d:%d " fmt, MAJOR((mp)->m_super->s_dev), \
+   MINOR((mp)->m_super->s_dev), ##args)
 #else
 #define xfs_errortag_init(mp)  (0)
 #define xfs_errortag_del(mp)
@@ -83,6 +88,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
 #define xfs_errortag_set(mp, tag, val) (ENOSYS)
 #define xfs_errortag_add(mp, tag)  (ENOSYS)
 #define xfs_errortag_clearall(mp)  (ENOSYS)
+#define xfs_tprintk(mp, fmt, args...)  do { } while (0)
 #endif /* DEBUG */
 
 /*

[PATCH 04/26] vfs: exit early from zero length remap operations

From: Darrick J. Wong 

If a remap caller asks us to remap to the source file's EOF and the
source file has zero bytes, exit early.

Signed-off-by: Darrick J. Wong 
---
 fs/read_write.c |2 ++
 1 file changed, 2 insertions(+)


diff --git a/fs/read_write.c b/fs/read_write.c
index d6e8e242a15f..2456da3f8a41 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1748,6 +1748,8 @@ int vfs_clone_file_prep(struct file *file_in, loff_t 
pos_in,
if (pos_in > isize)
return -EINVAL;
*len = isize - pos_in;
+   if (*len == 0)
+   return 0;
}
 
/* Check that we don't violate system file offset limits. */

[PATCH v5 00/26] fs: fixes for serious clone/dedupe problems

Hi all,

Dave, Eric, and I have been chasing a stale data exposure bug in the XFS
reflink implementation, and tracked it down to reflink forgetting to do
some of the file-extending activities that must happen for regular
writes.

We then started auditing the clone, dedupe, and copyfile code and
realized that from a file contents perspective, clonerange isn't any
different from a regular file write. Unfortunately, we also noticed
that *unlike* a regular write, clonerange skips a ton of overflow
checks, such as validating the ranges against s_maxbytes, MAX_NON_LFS,
and RLIMIT_FSIZE. We also observed that cloning into a file did not
strip security privileges (suid, capabilities) like a regular write
would. I also noticed that xfs and ocfs2 need to dump the page cache
before remapping blocks, not after.

In fixing the range checking problems I also realized that both dedupe
and copyfile tell userspace how much of the requested operation was
acted upon. Since the range validation can shorten a clone request (or
we can ENOSPC midway through), we might as well plumb the short
operation reporting back through the VFS indirection code to userspace.

So, here's the whole giant pile of patches[1] that fix all the problems.
This branch is against current upstream (4.19-rc8). The patch
"generic: test reflink side effects" recently sent to fstests exercises
the fixes in this series. Tests are in [2].

--D

[1]
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=djwong-devel
[2]
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/log/?h=djwong-devel

[PATCH 02/26] vfs: vfs_clone_file_prep_inodes should return EINVAL for a clone from beyond EOF

From: Darrick J. Wong 

vfs_clone_file_prep_inodes cannot return 0 if it is asked to remap from
a zero byte file because that's what btrfs does.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
---
 fs/read_write.c |3 ---
 1 file changed, 3 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index 8a2737f0d61d..260797b01851 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1740,10 +1740,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
 
-   /* Are we going all the way to the end? */
isize = i_size_read(inode_in);
-   if (isize == 0)
-   return 0;
 
/* Zero length dedupe exits immediately; reflink goes to EOF. */
if (*len == 0) {

Re: Spurious mount point

Chris Murphy:

> I wasn't aware that SUSE was now using the @ location for
> snapshots, or that it was using Btrfs for /home. For a
> while it's been XFS with a Btrfs sysroot.

Ours does use btrfs for `/' and xfs for `/home' *except*
`/home/hana', which is strange and wonderful, because the
standard usage is there in fstab and hana mount is created
somewhere else.

-- 
()  ascii ribbon campaign -- against html e-mail
/\  http://preview.tinyurl.com/qcy6mjc [archived]

Re: Spurious mount point

2018-10-15 Thread Chris Murphy

On Mon, Oct 15, 2018 at 3:26 PM, Anton Shepelev  wrote:
> Chris Murphy to Anton Shepelev:
>
>> > How can I track down the origin of this mount point:
>> >
>> > /dev/sda2 on /home/hana type btrfs 
>> > (rw,relatime,space_cache,subvolid=259,subvol=/@/.snapshots/1/snapshot/hana)
>> >
>> > if it is not present in /etc/fstab?  I shouldn't like to
>> > find/grep thoughout the whole filesystem.
>>
>> Sounds like some service taking snapshots regularly is
>> managing this.  Maybe this is Mint or Ubuntu and you're
>> using Timeshift?
>
> It is SUSE Linux and (probably) its tool called `snapper',
> but I have not found a clue in its documentation.

I wasn't aware that SUSE was now using the @ location for snapshots,
or that it was using Btrfs for /home. For a while it's been XFS with a
Btrfs sysroot.





-- 
Chris Murphy

Re: Spurious mount point

Chris Murphy to Anton Shepelev:

> > How can I track down the origin of this mount point:
> >
> > /dev/sda2 on /home/hana type btrfs 
> > (rw,relatime,space_cache,subvolid=259,subvol=/@/.snapshots/1/snapshot/hana)
> >
> > if it is not present in /etc/fstab?  I shouldn't like to
> > find/grep thoughout the whole filesystem.
>
> Sounds like some service taking snapshots regularly is
> managing this.  Maybe this is Mint or Ubuntu and you're
> using Timeshift?

It is SUSE Linux and (probably) its tool called `snapper',
but I have not found a clue in its documentation.

> Maybe it'll show up in the journal if you add boot
> parameter 'systemd.log_level=debug' and reboot; then use
> 'journalctl -b | grep mount' and it should show all
> instances logged instances of mount events: systemd,
> udisks2, maybe others?

I will try that as soon as the client lets me reboot that
machine.

-- 
()  ascii ribbon campaign -- against html e-mail
/\  http://preview.tinyurl.com/qcy6mjc [archived]

Re: Spurious mount point

2018-10-15 Thread Chris Murphy

On Mon, Oct 15, 2018 at 9:05 AM, Anton Shepelev  wrote:
> Hello, all
>
> How can I track down the origin of this mount point:
>
>/dev/sda2 on /home/hana type btrfs 
> (rw,relatime,space_cache,subvolid=259,subvol=/@/.snapshots/1/snapshot/hana)
>
> if it is not present in /etc/fstab?  I shouldn't like to
> find/grep thoughout the whole filesystem.
>
> --
> ()  ascii ribbon campaign - against html e-mail
> /\  http://preview.tinyurl.com/qcy6mjc [archived]

Sounds like some service taking snapshots regularly is managing this.
Maybe this is Mint or Ubuntu and you're using Timeshift?

Maybe it'll show up in the journal if you add boot parameter
'systemd.log_level=debug' and reboot; then use 'journalctl -b | grep
mount' and it should show all instances logged instances of mount
events: systemd, udisks2, maybe others?

-- 
Chris Murphy

Re: reproducible builds with btrfs seed feature

2018-10-15 Thread Chris Murphy

On Mon, Oct 15, 2018 at 6:29 AM, Austin S. Hemmelgarn
 wrote:
> On 2018-10-13 18:28, Chris Murphy wrote:

>> The end result is creating two Btrfs volumes would yield image files
>> with matching hashes.
>
> So in other words, you care about matching the block layout _exactly_.

Only because that's the easiest way to verify reproducibility without
any ambiguity.

If someone's compromised a build system such that everyone is getting
the malicious payload, but they can hide it behind a subvolume or
reflink that's not used by default, could someone plausibly cause
selective use of their malicious payload? I dunno I leave that for
more crafty people. But even if it's a tiny bit of ambiguity, it's
non-zero. Hashing a file that contains the entire file system is
unambiguous.

I think populating the image with --rootdir at mkfs time should be
pretty deterministic. One stream in and out. No generations, no
snapshot, no delayed allocation. It'd be quite similar to mksquashfs.
I guess I'd have to try it a few times, and see if really the only
differences are uuids and times, and not allocation related things.

-- 
Chris Murphy

Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range

On Mon, Oct 15, 2018 at 10:13:17AM -0700, Darrick J. Wong wrote:
> > RFR_TO_SRC_EOF is checked in generic_remap_file_range_prep,
> > so the file system should know about it  Also looking at it again now
> > it seems entirely superflous - we can just pass down then len == we
> > use in higher level code instead of having a flag and will side step
> > the issue here.
> 
> I'm not a fan of hidden behaviors like that, particularly when we
> already have a flags field where callers can explicitly ask for the
> to-eof behavior.

This just means we have a flag to mean ken is 0 and needs to be filled,
rather than encoding that in the field itself.  If you fell better we
can replace 0 with 0x and still encode it in the field.

> > RFR_CAN_SHORTEN is advisory as no one has to shorten, but that can
> > easily be solved by including it everywhere.
> 
> CAN_SHORTEN isn't included everywhere

Include it everywhere as in allow it in ever ->remap_file instance.

> I sort of thought about introducing a new copy_file_range flag that
> would just do deduplication and allow for opportunistic "dedup as much
> as you can" but ... meh.  Maybe I'll just drop the patch instead; we can
> revisit that when anyone wants a better dedupe interface.

Sounds fine to me.  The btrfs ioctl is really ugly, but then again
there is no pressing need for something better.

Re: [PATCH 16/25] vfs: make remapping to source file eof more explicit

On Mon, Oct 15, 2018 at 08:32:19AM -0700, Darrick J. Wong wrote:
> > Why not put that in the name to make it more descriptive?
> 
> I'm all ears for better suggestions. :)

I think the best idea is no flag - just carry through the special
zero..

Re: [PATCH 10/25] vfs: create generic_remap_file_range_touch to update inode metadata

On Mon, Oct 15, 2018 at 09:30:01AM -0700, Darrick J. Wong wrote:
> I originally thought "touch" because it updates [cm]time. :)
> 
> Though looking at the final code, I think this can just be called from
> the end of generic_remap_file_range_prep, so we can skip the export and
> all that other stuff.

I though about that, but the locking didn't seem to quite work out
between xfs and ocfs.

Nevermind the big elephant of actually converting btrfs to the "VFS"
helper - I think if that doesn't work out it is rather questionable
how generic they actually are.

Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range

On Mon, Oct 15, 2018 at 05:47:19AM -0700, Christoph Hellwig wrote:
> On Mon, Oct 15, 2018 at 09:04:13AM +0300, Amir Goldstein wrote:
> > I supposed you figured out the reason already.
> 
> No, I hadn't.
> 
> > It makes it appearance in patch 16/25 as RFR_VFS_FLAGS.
> > All those "advisory" flags, we want to pass them in to filesystem as FYI,
> > but we don't want to explicitly add support for e.g. RFR_CAN_SHORTEN
> > to every filesystem, when vfs has already taken care of the advice.
> 
> I don't think this model makes sense.  If they really are purely
> handled in the VFS we can mask them before passing them to the file
> system, if not we need to check them, or the they are avisory and
> we can have a simple #define instead of the helper.
> 
> RFR_TO_SRC_EOF is checked in generic_remap_file_range_prep,
> so the file system should know about it  Also looking at it again now
> it seems entirely superflous - we can just pass down then len == we
> use in higher level code instead of having a flag and will side step
> the issue here.

I'm not a fan of hidden behaviors like that, particularly when we
already have a flags field where callers can explicitly ask for the
to-eof behavior.

> RFR_CAN_SHORTEN is advisory as no one has to shorten, but that can
> easily be solved by including it everywhere.

CAN_SHORTEN isn't included everywhere -- FICLONE{,RANGE} don't enable it
because they have no way to communicate the number of bytes cloned back
to userspace.  Either we can clone every byte the user asked for, or we
send back -EINVAL.  (Maybe I'm misinterpreting what you meant by 'solved
by including it everywhere'?)

> RFR_SHORT_DEDUPE is as far as I can tell entirely superflous to
> start with, as RFR_CAN_SHORTEN can be used instead.

For now it's superfluous.  At first I was thinking that we could return
a short bytes_deduped if, say, the first part of the range actually did
match, but it became pretty obvious via shared/010 that duperemove can't
handle that, so we really must stick to the existing btrfs behavior.

The existing btrfs behavior is that we can round the length down to
avoid deduping partial EOF blocks, but we return the original length
(i.e. lie) in bytes_deduped when we do that.

I sort of thought about introducing a new copy_file_range flag that
would just do deduplication and allow for opportunistic "dedup as much
as you can" but ... meh.  Maybe I'll just drop the patch instead; we can
revisit that when anyone wants a better dedupe interface.

> So something like this in fs.h:
> 
> #define REMAP_FILE_ADVISORY_FLAGS REMAP_FILE_CAN_SHORTEN
> 
> And then in the file system:
> 
>   if (flags & ~REMAP_FILE_ADVISORY_FLAGS)
>   -EINVAL;
> 
> or
> 
>   if (flags & ~(REMAP_FILE_ADVISORY_FLAGS | REMAP_FILE_DEDUP))
>   -EINVAL;
> 
> should be all that is needed.

Sounds good to me.

--D

Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range

On Sun, Oct 14, 2018 at 10:19:27AM -0700, Christoph Hellwig wrote:
> > unsigned (*mmap_capabilities)(struct file *);
> >  #endif
> > ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, 
> > loff_t, size_t, unsigned int);
> > -   int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, 
> > u64);
> > -   int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, 
> > u64);
> > +   int (*remap_file_range)(struct file *file_in, loff_t pos_in,
> > +   struct file *file_out, loff_t pos_out,
> > +   u64 len, unsigned int remap_flags);
> 
> None of the other methods in this file name their parameters.  While
> I generally don't like people leaving them out, in the end consistency
> is even more important.
> 
> > +int btrfs_remap_file_range(struct file *src_file, loff_t off,
> > +   struct file *dst_file, loff_t destoff, u64 len,
> > +   unsigned int remap_flags)
> >  {
> > +   if (!remap_check_flags(remap_flags, RFR_SAME_DATA))
> > +   return -EINVAL;
> > +
> > +   if (remap_flags & RFR_SAME_DATA) {
> 
> So at least for btrfs there seems to be no shared code at all below
> the function calls.  This kinda speaks against the argument that
> they fundamentally are the same..

They /do/ share/ code -- eventually both btrfs_extent_same and
btrfs_clone_files call btrfs_clone.  xfs and ocfs2 call the same paths
internally too; it's only the vfs helpers that have the extra page cache
comparisons if it's a dedup operation.

> > +/*
> > + * These flags control the behavior of the remap_file_range function 
> > pointer.
> > + *
> > + * RFR_SAME_DATA: only remap if contents identical (i.e. deduplicate)
> > + */
> > +#define RFR_SAME_DATA  (1 << 0)
> > +
> > +#define RFR_VALID_FLAGS(RFR_SAME_DATA)
> 
> RFR?  Why not REMAP_FILE_*  Also why not the well understood
> REMAP_FILE_DEDUP instead of the odd SAME_DATA?

Sure.  I had begin to dislike typing RFR anyway.

> > +
> > +/*
> > + * Filesystem remapping implementations should call this helper on their
> > + * remap flags to filter out flags that the implementation doesn't support.
> > + *
> > + * Returns true if the flags are ok, false otherwise.
> > + */
> > +static inline bool remap_check_flags(unsigned int remap_flags,
> > +unsigned int supported_flags)
> > +{
> > +   return (remap_flags & ~(supported_flags & RFR_VALID_FLAGS)) == 0;
> > +}
> 
> Any reason to even bother with a helper for this?  ->fallocate
> seems to be doing fine without the helper, and the resulting code
> seems a lot easier to understand to me.

(Will respond to these at the current end of the flags thread.)

> > @@ -1759,10 +1779,9 @@ struct file_operations {
> >  #endif
> > ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
> > loff_t, size_t, unsigned int);
> > -   int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
> > -   u64);
> > -   int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
> > -   u64);
> > +   int (*remap_file_range)(struct file *file_in, loff_t pos_in,
> > +   struct file *file_out, loff_t pos_out,
> > +   u64 len, unsigned int remap_flags);
> 
> Same comment here.  Didn't we have some nice doc tools to avoid this
> duplication? :)

We do, but vfs.txt hasn't been ported to any of that.

--D

Re: [PATCH 10/25] vfs: create generic_remap_file_range_touch to update inode metadata

On Sun, Oct 14, 2018 at 10:21:31AM -0700, Christoph Hellwig wrote:
> > +/* Update inode timestamps and remove security privileges when remapping. 
> > */
> > +int generic_remap_file_range_touch(struct file *file, bool is_dedupe)
> > +{
> > +   int ret;
> > +
> > +   /* If can't alter the file contents, we're done. */
> > +   if (is_dedupe)
> > +   return 0;
> > +
> > +   /* Update the timestamps, since we can alter file contents. */
> > +   if (!(file->f_mode & FMODE_NOCMTIME)) {
> > +   ret = file_update_time(file);
> > +   if (ret)
> > +   return ret;
> > +   }
> > +
> > +   /*
> > +* Clear the security bits if the process is not being run by root.
> > +* This keeps people from modifying setuid and setgid binaries.
> > +*/
> > +   return file_remove_privs(file);
> > +}
> > +EXPORT_SYMBOL(generic_remap_file_range_touch);
> 
> The name seems a little out of touch with what it actually does.

I originally thought "touch" because it updates [cm]time. :)

Though looking at the final code, I think this can just be called from
the end of generic_remap_file_range_prep, so we can skip the export and
all that other stuff.

> Also why a bool argument instead of the more descriptive flags which
> introduced a few patches ago?

Hmm, yes, the remap_flags transition can move up to before this patch.

--D

Re: [PATCH 24/25] xfs: support returning partial reflink results

On Mon, Oct 15, 2018 at 10:05:36AM +1100, Dave Chinner wrote:
> On Sun, Oct 14, 2018 at 10:35:46AM -0700, Christoph Hellwig wrote:
> > On Fri, Oct 12, 2018 at 05:08:32PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong 
> > > 
> > > Back when the XFS reflink code only supported clone_file_range, we were
> > > only able to return zero or negative error codes to userspace.  However,
> > > now that copy_file_range (which returns bytes copied) can use XFS'
> > > clone_file_range, we have the opportunity to return partial results.
> > > For example, if userspace sends a 1GB clone request and we run out of
> > > space halfway through, we at least can tell userspace that we completed
> > > 512M of that request like a regular write.
> > > 
> > > Signed-off-by: Darrick J. Wong 
> > > ---
> > >  fs/xfs/xfs_file.c|5 +
> > >  fs/xfs/xfs_reflink.c |   20 +++-
> > >  fs/xfs/xfs_reflink.h |2 +-
> > >  3 files changed, 17 insertions(+), 10 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > > index bc9e94bcb7a3..b2b15b8dc4a1 100644
> > > --- a/fs/xfs/xfs_file.c
> > > +++ b/fs/xfs/xfs_file.c
> > > @@ -928,14 +928,11 @@ xfs_file_remap_range(
> > >   loff_t  len,
> > >   unsigned intremap_flags)
> > >  {
> > > - int ret;
> > > -
> > >   if (!remap_check_flags(remap_flags, RFR_SAME_DATA))
> > >   return -EINVAL;
> > >  
> > > - ret = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> > > + return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> > >   len, remap_flags);
> > 
> > Is there any reason not to merge xfs_file_remap_range and
> > xfs_reflink_remap_range at this point?
> 
> Yeah, that seems like a good idea to me - pulling all the
> vfs/generic code interactions back up into xfs_file.c would match
> how the rest of the file operations are layered w.r.t. external and
> internal XFS code...

Yeah, ditto ocfs2.  I'll look into throwing a few more refactors onto
the end of this series... 8-D

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> da...@fromorbit.com

Re: Interpreting `btrfs filesystem show'

2018-10-15 Thread Martin Steigerwald

Hugo Mills - 15.10.18, 16:26:
> On Mon, Oct 15, 2018 at 05:24:08PM +0300, Anton Shepelev wrote:
> > Hello, all
> > 
> > While trying to resolve free space problems, and found that
> > 
> > I cannot interpret the output of:
> > > btrfs filesystem show
> > 
> > Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
> > Total devices 1 FS bytes used 34.06GiB
> > devid1 size 40.00GiB used 37.82GiB path /dev/sda2
> > 
> > How come the total used value is less than the value listed
> > for the only device?
> 
>"Used" on the device is the mount of space allocated. "Used" on the
> FS is the total amount of actual data and metadata in that
> allocation.
> 
>You will also need to look at the output of "btrfs fi df" to see
> the breakdown of the 37.82 GiB into data, metadata and currently
> unused.

I usually use btrfs fi usage -T, cause

1. It has all the information.

2. It differentiates between used and allocated.

% btrfs fi usage -T /
Overall:
Device size: 100.00GiB
Device allocated: 54.06GiB
Device unallocated:   45.94GiB
Device missing:  0.00B
Used: 46.24GiB
Free (estimated): 25.58GiB  (min: 25.58GiB)
Data ratio:   2.00
Metadata ratio:   2.00
Global reserve:   70.91MiB  (used: 0.00B)

Data Metadata  System  
Id Path RAID1RAID1 RAID1Unallocated
--   -  ---
 2 /dev/mapper/msata-debian 25.00GiB   2.00GiB 32.00MiB22.97GiB
 1 /dev/mapper/sata-debian  25.00GiB   2.00GiB 32.00MiB22.97GiB
--   -  ---
   Total25.00GiB   2.00GiB 32.00MiB45.94GiB
   Used 22.38GiB 754.66MiB 16.00KiB  


For RAID it in some place reports the raw size and sometimes the logical 
size. Especially in the "Total" line I find this a bit inconsistent. 
"RAID1" columns show logical size, "Unallocated" shows raw size.

Also "Used:" in the global section shows raw size and "Free 
(estimated):" shows logical size.

Thanks
-- 
Martin

Re: [PATCH 11/25] vfs: pass remap flags to generic_remap_file_range_prep

On Sun, Oct 14, 2018 at 10:37:38AM -0700, Christoph Hellwig wrote:
> > +   bool is_dedupe = (remap_flags & RFR_SAME_DATA);
> 
> Btw, I think the code would be cleaner if we dropped this variable.

Ok to both.  I'll move up the patch to replace is_dedupe with
remap_flags to avoid churning the _touch function too, btw.

--D

Re: [PATCH 16/25] vfs: make remapping to source file eof more explicit

On Sun, Oct 14, 2018 at 10:24:33AM -0700, Christoph Hellwig wrote:
> On Fri, Oct 12, 2018 at 05:07:37PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong 
> > 
> > Create a RFR_TO_SRC_EOF flag to explicitly declare that the caller wants
> > the remap implementation to remap to the end of the source file, once
> > the files are locked.
> 
> The name looks like a cat threw up on your keyboard :)

Yeah... :(

> From reading the code this seems to ask for a whole file remap, right?

Nope.  In the original btrfs clonerange ioctl, length == 0 meant "to EOF".
If you made a call like this:

struct btrfs_ioctl_clone_range_args x = {
.src_offset = 16384,
.src_length = 0,
.dest_offset = 0,
.src_fd = whatever,
};
ftruncate(dest_fd, 0);
ioctl(dest_fd, BTRFS_IOC_CLONE, &x);

Then dest_fd ends up with the contents of [16k...EOF] from src_fd.
It's annoying to have the magic length number (and no flags?) but we're
stuck with this quirk of the interface.

> Why not put that in the name to make it more descriptive?

I'm all ears for better suggestions. :)

--D

Spurious mount point

Hello, all

How can I track down the origin of this mount point:

   /dev/sda2 on /home/hana type btrfs 
(rw,relatime,space_cache,subvolid=259,subvol=/@/.snapshots/1/snapshot/hana)

if it is not present in /etc/fstab?  I shouldn't like to
find/grep thoughout the whole filesystem.

-- 
()  ascii ribbon campaign - against html e-mail
/\  http://preview.tinyurl.com/qcy6mjc [archived]

Re: Interpreting `btrfs filesystem show'

2018-10-15 Thread Austin S. Hemmelgarn

On 2018-10-15 10:42, Anton Shepelev wrote:

Hugo Mills to Anton Shepelev:

While trying to resolve free space problems, and found
that I cannot interpret the output of:

btrfs filesystem show

Label: none uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
Total devices 1 FS bytes used 34.06GiB
devid1 size 40.00GiB used 37.82GiB path /dev/sda2

How come the total used value is less than the value
listed for the only device?

"Used" on the device is the mount of space allocated.
"Used" on the FS is the total amount of actual data and
metadata in that allocation.

You will also need to look at the output of "btrfs fi
df" to see the breakdown of the 37.82 GiB into data,
metadata and currently unused.

See
https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools
for the details

Thank you, Hugo, understood. mount/amount is a very fitting
typo :-)

Does the standard `du' tool work correctly for btfrfs?

For the default 'physical usage' mode, it functionally does not work
correctly, because it does not know about reflinks. The easiest way to
see this is to create a couple of snapshots of a subvolume alongside the
subvolume, and then run `du -s --totals` on those snapshots and the
subvolume. It will report the total space usage to be equal to the sum
of the values reported for each snapshot and the subvolume, when it
should instead only count the space usage for shared data once.

For the 'apparent usage' mode provided by the GNU implementation, it
does work correctly.

Re: Interpreting `btrfs filesystem show'

2018-10-15 Thread Hugo Mills

On Mon, Oct 15, 2018 at 05:40:40PM +0300, Anton Shepelev wrote:
> Hugo Mills to Anton Shepelev:
> 
> >>While trying to resolve free space problems, and found
> >>that I cannot interpret the output of:
> >>
> >>> btrfs filesystem show
> >>
> >>Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
> >>Total devices 1 FS bytes used 34.06GiB
> >>devid1 size 40.00GiB used 37.82GiB path /dev/sda2
> >>
> >>How come the total used value is less than the value
> >>listed for the only device?
> >
> >   "Used" on the device is the mount of space allocated.
> >"Used" on the FS is the total amount of actual data and
> >metadata in that allocation.
> >
> >   You will also need to look at the output of "btrfs fi
> >df" to see the breakdown of the 37.82 GiB into data,
> >metadata and currently unused.
> >
> >See
> >https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools
> > for the details
> 
> Thank you, Hugo, understood.  mount/amount is a very fitting
> typo :-)
> 
> Do the standard `du' and `du' tools report correct values
> with btrfs?

   Well...

   du will tell you the size of the files you asked it about, but it
doesn't know about reflinks, so it'll double-count if you've got a
reflink copy of something. Other than that, it should be accurate, I
think. There's also a "btrfs fi du" which can tell you the amount of
shared and unique data as well, so you can know, for example, how much
space you'll reclaim if you delete those files.

   df should be mostly OK, but it does sometimes get its estimate of
the total usable size of the FS wrong, particularly if the FS is
unbalanced. However, as the FS fills up, the estimate gets better,
because it gets more evenly balanced across devices over time.

   Hugo.

-- 
Hugo Mills | "Your problem is that you have a negative
hugo@... carfax.org.uk | personality."
http://carfax.org.uk/  | "No, I don't!"
PGP: E2AB1DE4  |  Londo and Vir, Babylon 5

signature.asc
Description: Digital signature

Re: Interpreting `btrfs filesystem show'

Hugo Mills to Anton Shepelev:

>>While trying to resolve free space problems, and found
>>that I cannot interpret the output of:
>>
>>> btrfs filesystem show
>>
>>Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
>>Total devices 1 FS bytes used 34.06GiB
>>devid1 size 40.00GiB used 37.82GiB path /dev/sda2
>>
>>How come the total used value is less than the value
>>listed for the only device?
>
>   "Used" on the device is the mount of space allocated.
>"Used" on the FS is the total amount of actual data and
>metadata in that allocation.
>
>   You will also need to look at the output of "btrfs fi
>df" to see the breakdown of the 37.82 GiB into data,
>metadata and currently unused.
>
>See
>https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools
> for the details

Thank you, Hugo, understood.  mount/amount is a very fitting
typo :-)

Does the standard `du' tool work correctly for btfrfs?

-- 
()  ascii ribbon campaign - against html e-mail
/\  http://preview.tinyurl.com/qcy6mjc [archived]

Re: Interpreting `btrfs filesystem show'

2018-10-15 Thread Hugo Mills

On Mon, Oct 15, 2018 at 02:26:41PM +, Hugo Mills wrote:
> On Mon, Oct 15, 2018 at 05:24:08PM +0300, Anton Shepelev wrote:
> > Hello, all
> > 
> > While trying to resolve free space problems, and found that
> > I cannot interpret the output of:
> > 
> > > btrfs filesystem show
> > 
> > Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
> > Total devices 1 FS bytes used 34.06GiB
> > devid1 size 40.00GiB used 37.82GiB path /dev/sda2
> > 
> > How come the total used value is less than the value listed
> > for the only device?
> 
>"Used" on the device is the mount of space allocated. "Used" on the

s/mount/amount/

> FS is the total amount of actual data and metadata in that allocation.
> 
>You will also need to look at the output of "btrfs fi df" to see
> the breakdown of the 37.82 GiB into data, metadata and currently
> unused.
> 
>See 
> https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools
>  for the details.
> 
>Hugo.
> 

-- 
Hugo Mills | "Your problem is that you have a negative
hugo@... carfax.org.uk | personality."
http://carfax.org.uk/  | "No, I don't!"
PGP: E2AB1DE4  |  Londo and Vir, Babylon 5


signature.asc
Description: Digital signature

Re: Interpreting `btrfs filesystem show'

2018-10-15 Thread Hugo Mills

On Mon, Oct 15, 2018 at 05:24:08PM +0300, Anton Shepelev wrote:
> Hello, all
> 
> While trying to resolve free space problems, and found that
> I cannot interpret the output of:
> 
> > btrfs filesystem show
> 
> Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
> Total devices 1 FS bytes used 34.06GiB
> devid1 size 40.00GiB used 37.82GiB path /dev/sda2
> 
> How come the total used value is less than the value listed
> for the only device?

   "Used" on the device is the mount of space allocated. "Used" on the
FS is the total amount of actual data and metadata in that allocation.

   You will also need to look at the output of "btrfs fi df" to see
the breakdown of the 37.82 GiB into data, metadata and currently
unused.

   See 
https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools
 for the details.

   Hugo.

-- 
Hugo Mills | "Your problem is that you have a negative
hugo@... carfax.org.uk | personality."
http://carfax.org.uk/  | "No, I don't!"
PGP: E2AB1DE4  |  Londo and Vir, Babylon 5

signature.asc
Description: Digital signature

Interpreting `btrfs filesystem show'

Hello, all

While trying to resolve free space problems, and found that
I cannot interpret the output of:

> btrfs filesystem show

Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
Total devices 1 FS bytes used 34.06GiB
devid1 size 40.00GiB used 37.82GiB path /dev/sda2

How come the total used value is less than the value listed
for the only device?

-- 
()  ascii ribbon campaign - against html e-mail
/\  http://preview.tinyurl.com/qcy6mjc [archived]

[PATCH] btrfs: Adjust loop in free_extent_buffer

2018-10-15 Thread Nikolay Borisov

The loop construct in free_extent_buffer was added in
242e18c7c1a8 ("Btrfs: reduce lock contention on extent buffer locks")
as means of reducing the times the eb lock is taken, the non-last ref
count is decremented and lock is released. As the special handling
of UNMAPPED extent buffers was removed now there is only one decrement
op which is happening for EXTENT_BUFFER_UNMAPPED case. This commit
modifies the loop condition so that in case of UNMAPPED buffers the
eb's lock is taken only if we are 100% sure the eb is going to be freed
by the current executor of the code. Additionally, remove superfluous 
ref count  ops in btrfs test. 

Signed-off-by: Nikolay Borisov 
---

This was tested with xfstest multiple times + unloading of btrfs module to 
ensure no eb leaks are present. 


 fs/btrfs/extent_io.c | 4 +++-
 fs/btrfs/tests/btrfs-tests.c | 2 --
 fs/btrfs/tests/inode-tests.c | 6 --
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0e9d4f28379d..544a1a5fd416 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5095,7 +5095,9 @@ void free_extent_buffer(struct extent_buffer *eb)
 
while (1) {
refs = atomic_read(&eb->refs);
-   if (refs <= 3)
+   if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 
3)
+   || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
+   refs == 1))
break;
old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
if (old == refs)
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 4cb8fcfd3ec4..8a59597f1883 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -177,8 +177,6 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
if (root->node) {
/* One for allocate_extent_buffer */
free_extent_buffer(root->node);
-   /* One for get_exent_buffer */
-   free_extent_buffer(root->node);
}
kfree(root);
 }
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 64043f028820..af0c8e30d9e2 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -254,11 +254,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, 
u32 nodesize)
goto out;
}
 
-   /*
-* We will just free a dummy node if it's ref count is 2 so we need an
-* extra ref so our searches don't accidentally release our page.
-*/
-   extent_buffer_get(root->node);
btrfs_set_header_nritems(root->node, 0);
btrfs_set_header_level(root->node, 0);
ret = -EINVAL;
@@ -860,7 +855,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
 
-   extent_buffer_get(root->node);
btrfs_set_header_nritems(root->node, 0);
btrfs_set_header_level(root->node, 0);
BTRFS_I(inode)->root = root;
-- 
2.7.4

Re: [PATCH 0/6] Some trivail cleanup about dealyed-refs

2018-10-15 Thread David Sterba

On Mon, Oct 15, 2018 at 10:39:19AM +0800, Lu Fengqi wrote:
> On Thu, Oct 11, 2018 at 01:51:37PM +0200, David Sterba wrote:
> >On Thu, Oct 11, 2018 at 01:40:32PM +0800, Lu Fengqi wrote:
> >> There is no functional change. Just improve readablity.
> >> 
> >> PATCH 1-4 parameter cleanup patches
> >> PATCH 5 cleanup about btrfs_select_ref_head
> >> PATCH 6 switch int to bool; add some comment
> >> 
> >> Lu Fengqi (6):
> >>   btrfs: delayed-ref: pass delayed_refs directly to
> >> btrfs_select_ref_head()
> >>   btrfs: delayed-ref: pass delayed_refs directly to
> >> btrfs_delayed_ref_lock()
> >>   btrfs: remove fs_info from btrfs_check_space_for_delayed_refs
> >>   btrfs: remove fs_info from btrfs_should_throttle_delayed_refs
> >>   btrfs: simplify btrfs_select_ref_head and cleanup some local variables
> >>   btrfs: switch return_bigger to bool in find_ref_head
> >
> >1-4 and 6 added to misc-next, thanks.
> 
> There is not patch 2 at the misc-next branch. So it was forgotten?

You're right, got lost somewhere, now in misc-next. Thanks for checking.

Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range

2018-10-15 Thread Matthew Wilcox

On Sun, Oct 14, 2018 at 10:19:27AM -0700, Christoph Hellwig wrote:
> > unsigned (*mmap_capabilities)(struct file *);
> >  #endif
> > ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, 
> > loff_t, size_t, unsigned int);
> > -   int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, 
> > u64);
> > -   int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, 
> > u64);
> > +   int (*remap_file_range)(struct file *file_in, loff_t pos_in,
> > +   struct file *file_out, loff_t pos_out,
> > +   u64 len, unsigned int remap_flags);
> 
> None of the other methods in this file name their parameters.  While
> I generally don't like people leaving them out, in the end consistency
> is even more important.

I would agree with you *except* that the parameters do not follow memcpy()
traditional order (dst, src, len).  Instead they are (src, dst, len), so we
should probably name them to advise the poor sod who has to implement this
that we've chosen an inconsistent API.

Or we could fix it.

Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range

2018-10-15 Thread Amir Goldstein

On Mon, Oct 15, 2018 at 3:47 PM Christoph Hellwig  wrote:
>
> On Mon, Oct 15, 2018 at 09:04:13AM +0300, Amir Goldstein wrote:
> > I supposed you figured out the reason already.
>
> No, I hadn't.
>
> > It makes it appearance in patch 16/25 as RFR_VFS_FLAGS.
> > All those "advisory" flags, we want to pass them in to filesystem as FYI,
> > but we don't want to explicitly add support for e.g. RFR_CAN_SHORTEN
> > to every filesystem, when vfs has already taken care of the advice.
>
> I don't think this model makes sense.  If they really are purely
> handled in the VFS we can mask them before passing them to the file
> system, if not we need to check them, or the they are avisory and
> we can have a simple #define instead of the helper.
>
> RFR_TO_SRC_EOF is checked in generic_remap_file_range_prep,
> so the file system should know about it  Also looking at it again now
> it seems entirely superflous - we can just pass down then len == we
> use in higher level code instead of having a flag and will side step
> the issue here.
>
> RFR_CAN_SHORTEN is advisory as no one has to shorten, but that can
> easily be solved by including it everywhere.
>
> RFR_SHORT_DEDUPE is as far as I can tell entirely superflous to
> start with, as RFR_CAN_SHORTEN can be used instead.
>
> So something like this in fs.h:
>
> #define REMAP_FILE_ADVISORY_FLAGS   REMAP_FILE_CAN_SHORTEN
>
> And then in the file system:
>
> if (flags & ~REMAP_FILE_ADVISORY_FLAGS)
> -EINVAL;
>
> or
>
> if (flags & ~(REMAP_FILE_ADVISORY_FLAGS | REMAP_FILE_DEDUP))
> -EINVAL;
>
> should be all that is needed.

Yeh, I suppose that makes sense.

Thanks,
Amir.

Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range