[PATCH 25/26] xfs: support returning partial reflink results

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Back when the XFS reflink code only supported clone_file_range, we were
only able to return zero or negative error codes to userspace.  However,
now that copy_file_range (which returns bytes copied) can use XFS'
clone_file_range, we have the opportunity to return partial results.
For example, if userspace sends a 1GB clone request and we run out of
space halfway through, we at least can tell userspace that we completed
512M of that request like a regular write.

Signed-off-by: Darrick J. Wong 
---
 fs/xfs/xfs_file.c|5 +
 fs/xfs/xfs_reflink.c |   20 +++-
 fs/xfs/xfs_reflink.h |2 +-
 3 files changed, 17 insertions(+), 10 deletions(-)


diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 38fde4e11714..7d42ab8fe6e1 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -928,14 +928,11 @@ xfs_file_remap_range(
loff_t  len,
unsigned intremap_flags)
 {
-   int ret;
-
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
 
-   ret = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+   return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
len, remap_flags);
-   return ret < 0 ? ret : len;
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e8e86646bb4b..af3368862c56 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1123,6 +1123,7 @@ xfs_reflink_remap_blocks(
struct xfs_inode*dest,
xfs_fileoff_t   destoff,
xfs_filblks_t   len,
+   xfs_filblks_t   *remapped_len,
xfs_off_t   new_isize)
 {
struct xfs_bmbt_irecimap;
@@ -1130,6 +1131,7 @@ xfs_reflink_remap_blocks(
int error = 0;
xfs_filblks_t   range_len;
 
+   *remapped_len = 0;
/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
while (len) {
uintlock_mode;
@@ -1168,6 +1170,7 @@ xfs_reflink_remap_blocks(
srcoff += range_len;
destoff += range_len;
len -= range_len;
+   *remapped_len += range_len;
}
 
return 0;
@@ -1382,7 +1385,7 @@ xfs_reflink_remap_prep(
 /*
  * Link a range of blocks from one file to another.
  */
-int
+loff_t
 xfs_reflink_remap_range(
struct file *file_in,
loff_t  pos_in,
@@ -1397,9 +1400,10 @@ xfs_reflink_remap_range(
struct xfs_inode*dest = XFS_I(inode_out);
struct xfs_mount*mp = src->i_mount;
xfs_fileoff_t   sfsbno, dfsbno;
-   xfs_filblks_t   fsblen;
+   xfs_filblks_t   fsblen, remappedfsb = 0;
+   loff_t  remapped_bytes = 0;
xfs_extlen_tcowextsize;
-   ssize_t ret;
+   int ret;
 
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return -EOPNOTSUPP;
@@ -1415,11 +1419,17 @@ xfs_reflink_remap_range(
 
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
 
+   if (len == 0) {
+   ret = 0;
+   goto out_unlock;
+   }
+
dfsbno = XFS_B_TO_FSBT(mp, pos_out);
sfsbno = XFS_B_TO_FSBT(mp, pos_in);
fsblen = XFS_B_TO_FSB(mp, len);
ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
-   pos_out + len);
+   &remappedfsb, pos_out + len);
+   remapped_bytes = min_t(loff_t, len, XFS_FSB_TO_B(mp, remappedfsb));
if (ret)
goto out_unlock;
 
@@ -1442,7 +1452,7 @@ xfs_reflink_remap_range(
xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
-   return ret;
+   return remapped_bytes > 0 ? remapped_bytes : ret;
 }
 
 /*
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index c3c46c276fe1..cbc26ff79a8f 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -27,7 +27,7 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, 
xfs_off_t offset,
 extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
 extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
-extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t len,
unsigned int remap_flags);
 extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,



[PATCH 24/26] xfs: fix pagecache truncation prior to reflink

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Prior to remapping blocks, it is necessary to remove pages from the
destination file's page cache.  Unfortunately, the truncation is not
aggressive enough -- if page size > block size, we'll end up zeroing
subpage blocks instead of removing them.  So, round the start offset
down and the end offset up to page boundaries.  We already wrote all
the dirty data so the larger range shouldn't be a problem.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Dave Chinner 
---
 fs/xfs/xfs_reflink.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 9b1ea42c81d1..e8e86646bb4b 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1369,8 +1369,9 @@ xfs_reflink_remap_prep(
goto out_unlock;
 
/* Zap any page cache for the destination file's range. */
-   truncate_inode_pages_range(&inode_out->i_data, pos_out,
-  PAGE_ALIGN(pos_out + *len) - 1);
+   truncate_inode_pages_range(&inode_out->i_data,
+   round_down(pos_out, PAGE_SIZE),
+   round_up(pos_out + *len, PAGE_SIZE) - 1);
 
return 1;
 out_unlock:



[PATCH 26/26] xfs: remove redundant remap partial EOF block checks

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Now that we've moved the partial EOF block checks to the VFS helpers, we
can remove the redundant functionality from XFS.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Dave Chinner 
---
 fs/xfs/xfs_reflink.c |   19 ---
 1 file changed, 19 deletions(-)


diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index af3368862c56..755d4a9446e3 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1307,7 +1307,6 @@ xfs_reflink_remap_prep(
struct inode*inode_out = file_inode(file_out);
struct xfs_inode*dest = XFS_I(inode_out);
boolsame_inode = (inode_in == inode_out);
-   u64 blkmask = i_blocksize(inode_in) - 1;
ssize_t ret;
 
/* Lock both files against IO */
@@ -1335,24 +1334,6 @@ xfs_reflink_remap_prep(
if (ret < 0 || *len == 0)
goto out_unlock;
 
-   /*
-* If the dedupe data matches, chop off the partial EOF block
-* from the source file so we don't try to dedupe the partial
-* EOF block.
-*/
-   if (remap_flags & REMAP_FILE_DEDUP) {
-   *len &= ~blkmask;
-   } else if (*len & blkmask) {
-   /*
-* The user is attempting to share a partial EOF block,
-* if it's inside the destination EOF then reject it.
-*/
-   if (pos_out + *len < i_size_read(inode_out)) {
-   ret = -EINVAL;
-   goto out_unlock;
-   }
-   }
-
/* Attach dquots to dest inode before changing block map */
ret = xfs_qm_dqattach(dest);
if (ret)



[PATCH 23/26] ocfs2: remove ocfs2_reflink_remap_range

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Since ocfs2_remap_file_range is a thin shell around
ocfs2_remap_remap_range, move everything from the latter into the
former.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/file.c |   68 +++-
 fs/ocfs2/refcounttree.c |  113 +++
 fs/ocfs2/refcounttree.h |   24 +++---
 3 files changed, 102 insertions(+), 103 deletions(-)


diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8125c5ccf821..fe570824b991 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2531,11 +2531,75 @@ static loff_t ocfs2_remap_file_range(struct file 
*file_in, loff_t pos_in,
 struct file *file_out, loff_t pos_out,
 loff_t len, unsigned int remap_flags)
 {
+   struct inode *inode_in = file_inode(file_in);
+   struct inode *inode_out = file_inode(file_out);
+   struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
+   struct buffer_head *in_bh = NULL, *out_bh = NULL;
+   bool same_inode = (inode_in == inode_out);
+   loff_t remapped = 0;
+   ssize_t ret;
+
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
+   if (!ocfs2_refcount_tree(osb))
+   return -EOPNOTSUPP;
+   if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+   return -EROFS;
 
-   return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-   len, remap_flags);
+   /* Lock both files against IO */
+   ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
+   if (ret)
+   return ret;
+
+   /* Check file eligibility and prepare for block sharing. */
+   ret = -EINVAL;
+   if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
+   (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
+   goto out_unlock;
+
+   ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+   &len, remap_flags);
+   if (ret < 0 || len == 0)
+   goto out_unlock;
+
+   /* Lock out changes to the allocation maps and remap. */
+   down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+   if (!same_inode)
+   down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
+ SINGLE_DEPTH_NESTING);
+
+   /* Zap any page cache for the destination file's range. */
+   truncate_inode_pages_range(&inode_out->i_data,
+  round_down(pos_out, PAGE_SIZE),
+  round_up(pos_out + len, PAGE_SIZE) - 1);
+
+   remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
+   inode_out, out_bh, pos_out, len);
+   up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
+   if (!same_inode)
+   up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
+   if (remapped < 0) {
+   ret = remapped;
+   mlog_errno(ret);
+   goto out_unlock;
+   }
+
+   /*
+* Empty the extent map so that we may get the right extent
+* record from the disk.
+*/
+   ocfs2_extent_map_trunc(inode_in, 0);
+   ocfs2_extent_map_trunc(inode_out, 0);
+
+   ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
+   if (ret) {
+   mlog_errno(ret);
+   goto out_unlock;
+   }
+
+out_unlock:
+   ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
+   return remapped > 0 ? remapped : ret;
 }
 
 const struct inode_operations ocfs2_file_iops = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index c7409578657b..dc66b80585ec 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4468,9 +4468,9 @@ int ocfs2_reflink_ioctl(struct inode *inode,
 }
 
 /* Update destination inode size, if necessary. */
-static int ocfs2_reflink_update_dest(struct inode *dest,
-struct buffer_head *d_bh,
-loff_t newlen)
+int ocfs2_reflink_update_dest(struct inode *dest,
+ struct buffer_head *d_bh,
+ loff_t newlen)
 {
handle_t *handle;
int ret;
@@ -4621,13 +4621,13 @@ static loff_t ocfs2_reflink_remap_extent(struct inode 
*s_inode,
 }
 
 /* Set up refcount tree and remap s_inode to t_inode. */
-static loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
-struct buffer_head *s_bh,
-loff_t pos_in,
-struct inode *t_inode,
-struct buffer_head *t_bh,
-loff_t pos_out,
-loff_t len)
+loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+   

[PATCH 22/26] ocfs2: support partial clone range and dedupe range

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Change the ocfs2 remap code to allow for returning partial results.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/file.c |7 +
 fs/ocfs2/refcounttree.c |   72 +--
 fs/ocfs2/refcounttree.h |   12 
 3 files changed, 46 insertions(+), 45 deletions(-)


diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index fbaeafe44b5f..8125c5ccf821 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2531,14 +2531,11 @@ static loff_t ocfs2_remap_file_range(struct file 
*file_in, loff_t pos_in,
 struct file *file_out, loff_t pos_out,
 loff_t len, unsigned int remap_flags)
 {
-   int ret;
-
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
 
-   ret = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-   len, remap_flags);
-   return ret < 0 ? ret : len;
+   return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+   len, remap_flags);
 }
 
 const struct inode_operations ocfs2_file_iops = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7c709229e108..c7409578657b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4507,14 +4507,14 @@ static int ocfs2_reflink_update_dest(struct inode *dest,
 }
 
 /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
-static int ocfs2_reflink_remap_extent(struct inode *s_inode,
- struct buffer_head *s_bh,
- loff_t pos_in,
- struct inode *t_inode,
- struct buffer_head *t_bh,
- loff_t pos_out,
- loff_t len,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
+struct buffer_head *s_bh,
+loff_t pos_in,
+struct inode *t_inode,
+struct buffer_head *t_bh,
+loff_t pos_out,
+loff_t len,
+struct ocfs2_cached_dealloc_ctxt 
*dealloc)
 {
struct ocfs2_extent_tree s_et;
struct ocfs2_extent_tree t_et;
@@ -4522,8 +4522,9 @@ static int ocfs2_reflink_remap_extent(struct inode 
*s_inode,
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *ref_tree;
struct ocfs2_super *osb;
+   loff_t remapped_bytes = 0;
loff_t pstart, plen;
-   u32 p_cluster, num_clusters, slast, spos, tpos;
+   u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0;
unsigned int ext_flags;
int ret = 0;
 
@@ -4605,30 +4606,34 @@ static int ocfs2_reflink_remap_extent(struct inode 
*s_inode,
 next_loop:
spos += num_clusters;
tpos += num_clusters;
+   remapped_clus += num_clusters;
}
 
-out:
-   return ret;
+   goto out;
 out_unlock_refcount:
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh);
-   return ret;
+out:
+   remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus);
+   remapped_bytes = min_t(loff_t, len, remapped_bytes);
+
+   return remapped_bytes > 0 ? remapped_bytes : ret;
 }
 
 /* Set up refcount tree and remap s_inode to t_inode. */
-static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
- struct buffer_head *s_bh,
- loff_t pos_in,
- struct inode *t_inode,
- struct buffer_head *t_bh,
- loff_t pos_out,
- loff_t len)
+static loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+struct buffer_head *s_bh,
+loff_t pos_in,
+struct inode *t_inode,
+struct buffer_head *t_bh,
+loff_t pos_out,
+loff_t len)
 {
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_super *osb;
struct ocfs2_dinode *dis;
struct ocfs2_dinode *dit;
-   int ret;
+   loff_t ret;
 
osb = OCFS2_SB(s_inode->i_sb);
dis = (struct ocfs2_dinode *)s_bh->b_data;
@@ -4700,7 +4705,7 @@ static int ocfs2_reflink_remap_blocks(struct inode 
*s_inode,
/* Actually remap extents now. */
ret = ocfs2_

[PATCH 19/26] vfs: clean up generic_remap_file_range_prep return value

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Since the remap prep function can update the length of the remap
request, we can change this function to return the usual return status
instead of the odd behavior it has now.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/refcounttree.c |2 +-
 fs/read_write.c |6 +++---
 fs/xfs/xfs_reflink.c|4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 6a42c04ac0ab..46bbd315c39f 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4852,7 +4852,7 @@ int ocfs2_reflink_remap_range(struct file *file_in,
 
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
&len, remap_flags);
-   if (ret <= 0)
+   if (ret < 0 || len == 0)
goto out_unlock;
 
/* Lock out changes to the allocation maps and remap. */
diff --git a/fs/read_write.c b/fs/read_write.c
index 450e038e8617..37a7d3fe35d8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1872,8 +1872,8 @@ static int vfs_dedupe_file_range_compare(struct inode 
*src, loff_t srcoff,
  * sense, and then flush all dirty data.  Caller must ensure that the
  * inodes have been locked against any other modifications.
  *
- * Returns: 0 for "nothing to clone", 1 for "something to clone", or
- * the usual negative error code.
+ * If there's an error, then the usual negative error code is returned.
+ * Otherwise returns 0 with *len set to the request length.
  */
 int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
  struct file *file_out, loff_t pos_out,
@@ -1954,7 +1954,7 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
if (ret)
return ret;
 
-   return 1;
+   return 0;
 }
 EXPORT_SYMBOL(generic_remap_file_range_prep);
 
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 3dbe5fb7e9c0..9b1ea42c81d1 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1329,7 +1329,7 @@ xfs_reflink_remap_prep(
 
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
len, remap_flags);
-   if (ret <= 0)
+   if (ret < 0 || *len == 0)
goto out_unlock;
 
/*
@@ -1409,7 +1409,7 @@ xfs_reflink_remap_range(
/* Prepare and then clone file data. */
ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
&len, remap_flags);
-   if (ret <= 0)
+   if (ret < 0 || len == 0)
return ret;
 
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);



[PATCH 15/26] vfs: plumb remap flags through the vfs clone functions

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Plumb a remap_flags argument through the {do,vfs}_clone_file_range
functions so that clone can take advantage of it.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/ioctl.c |2 +-
 fs/nfsd/vfs.c  |2 +-
 fs/overlayfs/copy_up.c |2 +-
 fs/overlayfs/file.c|6 +++---
 fs/read_write.c|   13 +
 include/linux/fs.h |4 ++--
 6 files changed, 17 insertions(+), 12 deletions(-)


diff --git a/fs/ioctl.c b/fs/ioctl.c
index 72537b68c272..505275ec5596 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -232,7 +232,7 @@ static long ioctl_file_clone(struct file *dst_file, 
unsigned long srcfd,
if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
goto fdput;
cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
- olen);
+ olen, 0);
if (cloned < 0)
ret = cloned;
else if (olen && cloned != olen)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ac6cb6101cbe..726fc5b2b27a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -543,7 +543,7 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 
src_pos, struct file *dst,
 {
loff_t cloned;
 
-   cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count);
+   cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
if (count && cloned != count)
cloned = -EINVAL;
return nfserrno(cloned < 0 ? cloned : 0);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 8750b7235516..5f82fece64a0 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -142,7 +142,7 @@ static int ovl_copy_up_data(struct path *old, struct path 
*new, loff_t len)
}
 
/* Try to use clone_file_range to clone up within the same fs */
-   cloned = do_clone_file_range(old_file, 0, new_file, 0, len);
+   cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
if (cloned == len)
goto out;
/* Couldn't clone, so now we try to copy the data */
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 6c3fec6168e9..0393815c8971 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -462,7 +462,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t 
pos_in,
 
case OVL_CLONE:
ret = vfs_clone_file_range(real_in.file, pos_in,
-  real_out.file, pos_out, len);
+  real_out.file, pos_out, len, flags);
break;
 
case OVL_DEDUPE:
@@ -512,8 +512,8 @@ static loff_t ovl_remap_file_range(struct file *file_in, 
loff_t pos_in,
 !ovl_inode_upper(file_inode(file_out
return -EPERM;
 
-   return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0,
-   op);
+   return ovl_copyfile(file_in, pos_in, file_out, pos_out, len,
+   remap_flags, op);
 }
 
 const struct file_operations ovl_file_operations = {
diff --git a/fs/read_write.c b/fs/read_write.c
index 906e78be5001..791b406e8264 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1857,12 +1857,15 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
 EXPORT_SYMBOL(generic_remap_file_range_prep);
 
 loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
-  struct file *file_out, loff_t pos_out, loff_t len)
+  struct file *file_out, loff_t pos_out,
+  loff_t len, unsigned int remap_flags)
 {
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
loff_t ret;
 
+   WARN_ON_ONCE(remap_flags);
+
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
@@ -1893,7 +1896,7 @@ loff_t do_clone_file_range(struct file *file_in, loff_t 
pos_in,
return ret;
 
ret = file_in->f_op->remap_file_range(file_in, pos_in,
-   file_out, pos_out, len, 0);
+   file_out, pos_out, len, remap_flags);
if (ret < 0)
return ret;
 
@@ -1904,12 +1907,14 @@ loff_t do_clone_file_range(struct file *file_in, loff_t 
pos_in,
 EXPORT_SYMBOL(do_clone_file_range);
 
 loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
-   struct file *file_out, loff_t pos_out, loff_t len)
+   struct file *file_out, loff_t pos_out,
+   loff_t len, unsigned int remap_flags)
 {
loff_t ret;
 
file_start_write(file_out);
-   ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
+   ret = do_clone_file_range(file_in, pos_in, file_out, pos_out,

[PATCH 21/26] ocfs2: fix pagecache truncation prior to reflink

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Prior to remapping blocks, it is necessary to remove pages from the
destination file's page cache.  Unfortunately, the truncation is not
aggressive enough -- if page size > block size, we'll end up zeroing
subpage blocks instead of removing them.  So, round the start offset
down and the end offset up to page boundaries.  We already wrote all
the dirty data so the larger range should be fine.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/refcounttree.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 2a5c96bc9677..7c709229e108 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4862,8 +4862,9 @@ int ocfs2_reflink_remap_range(struct file *file_in,
  SINGLE_DEPTH_NESTING);
 
/* Zap any page cache for the destination file's range. */
-   truncate_inode_pages_range(&inode_out->i_data, pos_out,
-  PAGE_ALIGN(pos_out + len) - 1);
+   truncate_inode_pages_range(&inode_out->i_data,
+  round_down(pos_out, PAGE_SIZE),
+  round_up(pos_out + len, PAGE_SIZE) - 1);
 
ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
 out_bh, pos_out, len);



[PATCH 16/26] vfs: plumb remap flags through the vfs dedupe functions

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Plumb a remap_flags argument through the vfs_dedupe_file_range_one
functions so that dedupe can take advantage of it.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/overlayfs/file.c |3 ++-
 fs/read_write.c |9 ++---
 include/linux/fs.h  |2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)


diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 0393815c8971..84dd957efa24 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -467,7 +467,8 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t 
pos_in,
 
case OVL_DEDUPE:
ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
-   real_out.file, pos_out, len);
+   real_out.file, pos_out, len,
+   flags);
break;
}
revert_creds(old_cred);
diff --git a/fs/read_write.c b/fs/read_write.c
index 791b406e8264..f6ab5beb935a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -2019,10 +2019,12 @@ EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
 
 loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
 struct file *dst_file, loff_t dst_pos,
-loff_t len)
+loff_t len, unsigned int remap_flags)
 {
loff_t ret;
 
+   WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP));
+
ret = mnt_want_write_file(dst_file);
if (ret)
return ret;
@@ -2053,7 +2055,7 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, 
loff_t src_pos,
}
 
ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
-   dst_pos, len, REMAP_FILE_DEDUP);
+   dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
 out_drop_write:
mnt_drop_write_file(dst_file);
 
@@ -2121,7 +2123,8 @@ int vfs_dedupe_file_range(struct file *file, struct 
file_dedupe_range *same)
}
 
deduped = vfs_dedupe_file_range_one(file, off, dst_file,
-   info->dest_offset, len);
+   info->dest_offset, len,
+   0);
if (deduped == -EBADE)
info->status = FILE_DEDUPE_RANGE_DIFFERS;
else if (deduped < 0)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bc353a5224a4..c0ae85a7bd9d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1854,7 +1854,7 @@ extern int vfs_dedupe_file_range(struct file *file,
 struct file_dedupe_range *same);
 extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
struct file *dst_file, loff_t dst_pos,
-   loff_t len);
+   loff_t len, unsigned int remap_flags);
 
 
 struct super_operations {



[PATCH 17/26] vfs: enable remap callers that can handle short operations

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Plumb in a remap flag that enables the filesystem remap handler to
shorten remapping requests for callers that can handle it.  Now
copy_file_range can report partial success (in case we run up against
alignment problems, resource limits, etc.).

We also enable CAN_SHORTEN for fideduperange to maintain existing
userspace-visible behavior where xfs/btrfs shorten the dedupe range to
avoid stale post-eof data exposure.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c|   28 
 include/linux/fs.h |7 +--
 mm/filemap.c   |   16 
 3 files changed, 37 insertions(+), 14 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index f6ab5beb935a..ee9314b7bfc3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1593,7 +1593,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t 
pos_in,
 
cloned = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out,
-   min_t(loff_t, MAX_RW_COUNT, len), 0);
+   min_t(loff_t, MAX_RW_COUNT, len),
+   REMAP_FILE_CAN_SHORTEN);
if (cloned > 0) {
ret = cloned;
goto done;
@@ -1721,6 +1722,8 @@ static int remap_verify_area(struct file *file, loff_t 
pos, loff_t len,
  * can't meaningfully compare post-EOF contents.
  *
  * For clone we only link a partial EOF block above the destination file's EOF.
+ *
+ * Shorten the request if possible.
  */
 static int generic_remap_check_len(struct inode *inode_in,
   struct inode *inode_out,
@@ -1729,16 +1732,24 @@ static int generic_remap_check_len(struct inode 
*inode_in,
   unsigned int remap_flags)
 {
u64 blkmask = i_blocksize(inode_in) - 1;
+   loff_t new_len = *len;
 
if ((*len & blkmask) == 0)
return 0;
 
-   if (remap_flags & REMAP_FILE_DEDUP)
-   *len &= ~blkmask;
-   else if (pos_out + *len < i_size_read(inode_out))
-   return -EINVAL;
+   if ((remap_flags & REMAP_FILE_DEDUP) ||
+   pos_out + *len < i_size_read(inode_out))
+   new_len &= ~blkmask;
 
-   return 0;
+   if (new_len == *len)
+   return 0;
+
+   if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
+   *len = new_len;
+   return 0;
+   }
+
+   return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
 }
 
 /* Update inode timestamps and remove security privileges when remapping. */
@@ -2023,7 +2034,8 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, 
loff_t src_pos,
 {
loff_t ret;
 
-   WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP));
+   WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
+REMAP_FILE_CAN_SHORTEN));
 
ret = mnt_want_write_file(dst_file);
if (ret)
@@ -2124,7 +2136,7 @@ int vfs_dedupe_file_range(struct file *file, struct 
file_dedupe_range *same)
 
deduped = vfs_dedupe_file_range_one(file, off, dst_file,
info->dest_offset, len,
-   0);
+   REMAP_FILE_CAN_SHORTEN);
if (deduped == -EBADE)
info->status = FILE_DEDUPE_RANGE_DIFFERS;
else if (deduped < 0)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c0ae85a7bd9d..594fe4ba0b15 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1726,14 +1726,17 @@ struct block_device_operations;
  * If it is called with len == 0 that means "remap to end of source file".
  *
  * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
+ * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
  */
 #define REMAP_FILE_DEDUP   (1 << 0)
+#define REMAP_FILE_CAN_SHORTEN (1 << 1)
 
 /* All valid REMAP_FILE flags */
-#define REMAP_FILE_VALID_FLAGS (REMAP_FILE_DEDUP)
+#define REMAP_FILE_VALID_FLAGS (REMAP_FILE_DEDUP | \
+REMAP_FILE_CAN_SHORTEN)
 
 /* REMAP_FILE flags taken care of by the vfs. */
-#define REMAP_FILE_ADVISORY(0)
+#define REMAP_FILE_ADVISORY(REMAP_FILE_CAN_SHORTEN)
 
 struct iov_iter;
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e93269efafe..898eb358f7d2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3051,8 +3051,12 @@ int generic_remap_checks(struct file *file_in, loff_t 
pos_in,
if (pos_in + count == size_in) {
bcount = ALIGN(size_in, bs) - pos_in;
} else {
-   if (!IS_ALIGNED(count, bs))
-   return -EINVAL;
+   if (!IS_ALIGNED(count, bs)) {
+   if (remap_flags & REMAP_F

[PATCH 20/26] ocfs2: truncate page cache for clone destination file before remapping

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

When cloning blocks into another file, truncate the page cache before we
start remapping blocks so that concurrent reads wait for us to finish.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/refcounttree.c |   10 --
 1 file changed, 4 insertions(+), 6 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 46bbd315c39f..2a5c96bc9677 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4861,14 +4861,12 @@ int ocfs2_reflink_remap_range(struct file *file_in,
down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
  SINGLE_DEPTH_NESTING);
 
-   ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
-out_bh, pos_out, len);
-
/* Zap any page cache for the destination file's range. */
-   if (!ret)
-   truncate_inode_pages_range(&inode_out->i_data, pos_out,
-  PAGE_ALIGN(pos_out + len) - 1);
+   truncate_inode_pages_range(&inode_out->i_data, pos_out,
+  PAGE_ALIGN(pos_out + len) - 1);
 
+   ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
+out_bh, pos_out, len);
up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
if (!same_inode)
up_write(&OCFS2_I(inode_out)->ip_alloc_sem);



[PATCH 18/26] vfs: hide file range comparison function

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

There are no callers of vfs_dedupe_file_range_compare, so we might as
well make it a static helper and remove the export.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
Reviewed-by: Christoph Hellwig 
---
 fs/read_write.c|  187 +---
 include/linux/fs.h |3 -
 2 files changed, 91 insertions(+), 99 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index ee9314b7bfc3..450e038e8617 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1776,6 +1776,97 @@ static int generic_remap_file_range_target(struct file 
*file,
return file_remove_privs(file);
 }
 
+/*
+ * Read a page's worth of file data into the page cache.  Return the page
+ * locked.
+ */
+static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+{
+   struct page *page;
+
+   page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
+   if (IS_ERR(page))
+   return page;
+   if (!PageUptodate(page)) {
+   put_page(page);
+   return ERR_PTR(-EIO);
+   }
+   lock_page(page);
+   return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ * Caller must have locked both inodes to prevent write races.
+ */
+static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+struct inode *dest, loff_t destoff,
+loff_t len, bool *is_same)
+{
+   loff_t src_poff;
+   loff_t dest_poff;
+   void *src_addr;
+   void *dest_addr;
+   struct page *src_page;
+   struct page *dest_page;
+   loff_t cmp_len;
+   bool same;
+   int error;
+
+   error = -EINVAL;
+   same = true;
+   while (len) {
+   src_poff = srcoff & (PAGE_SIZE - 1);
+   dest_poff = destoff & (PAGE_SIZE - 1);
+   cmp_len = min(PAGE_SIZE - src_poff,
+ PAGE_SIZE - dest_poff);
+   cmp_len = min(cmp_len, len);
+   if (cmp_len <= 0)
+   goto out_error;
+
+   src_page = vfs_dedupe_get_page(src, srcoff);
+   if (IS_ERR(src_page)) {
+   error = PTR_ERR(src_page);
+   goto out_error;
+   }
+   dest_page = vfs_dedupe_get_page(dest, destoff);
+   if (IS_ERR(dest_page)) {
+   error = PTR_ERR(dest_page);
+   unlock_page(src_page);
+   put_page(src_page);
+   goto out_error;
+   }
+   src_addr = kmap_atomic(src_page);
+   dest_addr = kmap_atomic(dest_page);
+
+   flush_dcache_page(src_page);
+   flush_dcache_page(dest_page);
+
+   if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+   same = false;
+
+   kunmap_atomic(dest_addr);
+   kunmap_atomic(src_addr);
+   unlock_page(dest_page);
+   unlock_page(src_page);
+   put_page(dest_page);
+   put_page(src_page);
+
+   if (!same)
+   break;
+
+   srcoff += cmp_len;
+   destoff += cmp_len;
+   len -= cmp_len;
+   }
+
+   *is_same = same;
+   return 0;
+
+out_error:
+   return error;
+}
+
 /*
  * Check that the two inodes are eligible for cloning, the ranges make
  * sense, and then flush all dirty data.  Caller must ensure that the
@@ -1932,102 +2023,6 @@ loff_t vfs_clone_file_range(struct file *file_in, 
loff_t pos_in,
 }
 EXPORT_SYMBOL(vfs_clone_file_range);
 
-/*
- * Read a page's worth of file data into the page cache.  Return the page
- * locked.
- */
-static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
-{
-   struct address_space *mapping;
-   struct page *page;
-   pgoff_t n;
-
-   n = offset >> PAGE_SHIFT;
-   mapping = inode->i_mapping;
-   page = read_mapping_page(mapping, n, NULL);
-   if (IS_ERR(page))
-   return page;
-   if (!PageUptodate(page)) {
-   put_page(page);
-   return ERR_PTR(-EIO);
-   }
-   lock_page(page);
-   return page;
-}
-
-/*
- * Compare extents of two files to see if they are the same.
- * Caller must have locked both inodes to prevent write races.
- */
-int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t destoff,
- loff_t len, bool *is_same)
-{
-   loff_t src_poff;
-   loff_t dest_poff;
-   void *src_addr;
-   void *dest_addr;
-   struct page *src_page;
-   struct page *dest_page;
-   loff_t cmp_len;
-   bool same;
-   int error;
-
-   error = -EINVAL;
-   same = true;
-   

[PATCH 14/26] vfs: make remap_file_range functions take and return bytes completed

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Change the remap_file_range functions to take a number of bytes to
operate upon and return the number of bytes they operated on.  This is a
requirement for allowing fs implementations to return short clone/dedupe
results to the user, which will enable us to obey resource limits in a
graceful manner.

A subsequent patch will enable copy_file_range to signal to the
->clone_file_range implementation that it can handle a short length,
which will be returned in the function's return value.  For now the
short return is not implemented anywhere so the behavior won't change --
either copy_file_range manages to clone the entire range or it tries an
alternative.

Neither clone ioctl can take advantage of this, alas.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 Documentation/filesystems/vfs.txt |6 ++---
 fs/btrfs/ctree.h  |6 ++---
 fs/btrfs/ioctl.c  |   13 ++
 fs/cifs/cifsfs.c  |6 ++---
 fs/ioctl.c|   10 +++-
 fs/nfs/nfs4file.c |6 ++---
 fs/nfsd/vfs.c |8 +-
 fs/ocfs2/file.c   |   16 ++--
 fs/ocfs2/refcounttree.c   |2 +-
 fs/ocfs2/refcounttree.h   |2 +-
 fs/overlayfs/copy_up.c|6 ++---
 fs/overlayfs/file.c   |   12 +
 fs/read_write.c   |   49 -
 fs/xfs/xfs_file.c |9 +--
 fs/xfs/xfs_reflink.c  |4 ++-
 fs/xfs/xfs_reflink.h  |2 +-
 include/linux/fs.h|   27 +++-
 mm/filemap.c  |2 +-
 18 files changed, 106 insertions(+), 80 deletions(-)


diff --git a/Documentation/filesystems/vfs.txt 
b/Documentation/filesystems/vfs.txt
index bb3183334ab9..8ba47d9d6cae 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -883,9 +883,9 @@ struct file_operations {
unsigned (*mmap_capabilities)(struct file *);
 #endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, 
loff_t, size_t, unsigned int);
-   int (*remap_file_range)(struct file *file_in, loff_t pos_in,
-   struct file *file_out, loff_t pos_out,
-   u64 len, unsigned int remap_flags);
+   loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
+  struct file *file_out, loff_t pos_out,
+  loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
 };
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 124a05662fc2..771a961d77ad 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3247,9 +3247,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page 
**pages,
  size_t num_pages, loff_t pos, size_t write_bytes,
  struct extent_state **cached);
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-int btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
-  struct file *file_out, loff_t pos_out, u64 len,
-  unsigned int remap_flags);
+loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bfd99c66723e..b0c513e10977 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4328,10 +4328,12 @@ static noinline int btrfs_clone_files(struct file 
*file, struct file *file_src,
return ret;
 }
 
-int btrfs_remap_file_range(struct file *src_file, loff_t off,
-   struct file *dst_file, loff_t destoff, u64 len,
+loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+   struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
 {
+   int ret;
+
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
 
@@ -4349,10 +4351,11 @@ int btrfs_remap_file_range(struct file *src_file, 
loff_t off,
return -EINVAL;
}
 
-   return btrfs_extent_same(src, off, len, dst, destoff);
+   ret = btrfs_extent_same(src, off, len, dst, destoff);
+   } else {
+   ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
}
-
-   return btrfs_clone_files(dst_file, src_file, off, len, destoff);
+   return ret < 0 ? ret : len;
 }
 
 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e8144d0dcde2..5ca71c6c8be2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/c

[PATCH 13/26] vfs: create generic_remap_file_range_touch to update inode metadata

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Create a new VFS helper to handle inode metadata updates when remapping
into a file.  If the operation can possibly alter the file contents, we
must update the ctime and mtime and remove security privileges, just
like we do for regular file writes.  Wire up ocfs2 to ensure consistent
behavior.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c  |   28 
 fs/xfs/xfs_reflink.c |   23 ---
 2 files changed, 28 insertions(+), 23 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index ebcbfc4f2907..3f6392f1d5d4 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1737,6 +1737,30 @@ static int generic_remap_check_len(struct inode 
*inode_in,
return 0;
 }
 
+/* Update inode timestamps and remove security privileges when remapping. */
+static int generic_remap_file_range_target(struct file *file,
+  unsigned int remap_flags)
+{
+   int ret;
+
+   /* If can't alter the file contents, we're done. */
+   if (remap_flags & REMAP_FILE_DEDUP)
+   return 0;
+
+   /* Update the timestamps, since we can alter file contents. */
+   if (!(file->f_mode & FMODE_NOCMTIME)) {
+   ret = file_update_time(file);
+   if (ret)
+   return ret;
+   }
+
+   /*
+* Clear the security bits if the process is not being run by root.
+* This keeps people from modifying setuid and setgid binaries.
+*/
+   return file_remove_privs(file);
+}
+
 /*
  * Check that the two inodes are eligible for cloning, the ranges make
  * sense, and then flush all dirty data.  Caller must ensure that the
@@ -1820,6 +1844,10 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
if (ret)
return ret;
 
+   ret = generic_remap_file_range_target(file_out, remap_flags);
+   if (ret)
+   return ret;
+
return 1;
 }
 EXPORT_SYMBOL(generic_remap_file_range_prep);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 29aab196ce7e..2d7dd8b28d7c 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1372,29 +1372,6 @@ xfs_reflink_remap_prep(
truncate_inode_pages_range(&inode_out->i_data, pos_out,
   PAGE_ALIGN(pos_out + *len) - 1);
 
-   /* If we're altering the file contents... */
-   if (!(remap_flags & REMAP_FILE_DEDUP)) {
-   /*
-* ...update the timestamps (which will grab the ilock again
-* from xfs_fs_dirty_inode, so we have to call it before we
-* take the ilock).
-*/
-   if (!(file_out->f_mode & FMODE_NOCMTIME)) {
-   ret = file_update_time(file_out);
-   if (ret)
-   goto out_unlock;
-   }
-
-   /*
-* ...clear the security bits if the process is not being run
-* by root.  This keeps people from modifying setuid and setgid
-* binaries.
-*/
-   ret = file_remove_privs(file_out);
-   if (ret)
-   goto out_unlock;
-   }
-
return 1;
 out_unlock:
xfs_reflink_remap_unlock(file_in, file_out);



[PATCH 09/26] vfs: rename clone_verify_area to remap_verify_area

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Since we use clone_verify_area for both clone and dedupe range checks,
rename the function to make it clear that it's for both.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c |   10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index aca75a97a695..734c5661fb69 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1686,7 +1686,7 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t 
__user *, off_in,
return ret;
 }
 
-static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool 
write)
+static int remap_verify_area(struct file *file, loff_t pos, u64 len, bool 
write)
 {
struct inode *inode = file_inode(file);
 
@@ -1852,11 +1852,11 @@ int do_clone_file_range(struct file *file_in, loff_t 
pos_in,
if (!file_in->f_op->clone_file_range)
return -EOPNOTSUPP;
 
-   ret = clone_verify_area(file_in, pos_in, len, false);
+   ret = remap_verify_area(file_in, pos_in, len, false);
if (ret)
return ret;
 
-   ret = clone_verify_area(file_out, pos_out, len, true);
+   ret = remap_verify_area(file_out, pos_out, len, true);
if (ret)
return ret;
 
@@ -1989,7 +1989,7 @@ int vfs_dedupe_file_range_one(struct file *src_file, 
loff_t src_pos,
if (ret)
return ret;
 
-   ret = clone_verify_area(dst_file, dst_pos, len, true);
+   ret = remap_verify_area(dst_file, dst_pos, len, true);
if (ret < 0)
goto out_drop_write;
 
@@ -2051,7 +2051,7 @@ int vfs_dedupe_file_range(struct file *file, struct 
file_dedupe_range *same)
if (!S_ISREG(src->i_mode))
goto out;
 
-   ret = clone_verify_area(file, off, len, false);
+   ret = remap_verify_area(file, off, len, false);
if (ret < 0)
goto out;
ret = 0;



[PATCH 07/26] vfs: skip zero-length dedupe requests

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Don't bother calling the filesystem for a zero-length dedupe request;
we can return zero and exit.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c |5 +
 1 file changed, 5 insertions(+)


diff --git a/fs/read_write.c b/fs/read_write.c
index 0f0a6efdd502..f5395d8da741 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -2009,6 +2009,11 @@ int vfs_dedupe_file_range_one(struct file *src_file, 
loff_t src_pos,
if (!dst_file->f_op->dedupe_file_range)
goto out_drop_write;
 
+   if (len == 0) {
+   ret = 0;
+   goto out_drop_write;
+   }
+
ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
dst_file, dst_pos, len);
 out_drop_write:



[PATCH 11/26] vfs: pass remap flags to generic_remap_file_range_prep

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Plumb the remap flags through the filesystem from the vfs function
dispatcher all the way to the prep function to prepare for behavior
changes in subsequent patches.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/ocfs2/file.c |2 +-
 fs/ocfs2/refcounttree.c |4 ++--
 fs/ocfs2/refcounttree.h |2 +-
 fs/read_write.c |   14 +++---
 fs/xfs/xfs_file.c   |2 +-
 fs/xfs/xfs_reflink.c|   21 +++--
 fs/xfs/xfs_reflink.h|3 ++-
 include/linux/fs.h  |2 +-
 8 files changed, 26 insertions(+), 24 deletions(-)


diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0b757a24567c..9809b0e5746f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2538,7 +2538,7 @@ static int ocfs2_remap_file_range(struct file *file_in,
return -EINVAL;
 
return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-len, remap_flags & REMAP_FILE_DEDUP);
+len, remap_flags);
 }
 
 const struct inode_operations ocfs2_file_iops = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 36c56dfbe485..df9781567ec0 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4825,7 +4825,7 @@ int ocfs2_reflink_remap_range(struct file *file_in,
  struct file *file_out,
  loff_t pos_out,
  u64 len,
- bool is_dedupe)
+ unsigned int remap_flags)
 {
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -4851,7 +4851,7 @@ int ocfs2_reflink_remap_range(struct file *file_in,
goto out_unlock;
 
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
-   &len, is_dedupe);
+   &len, remap_flags);
if (ret <= 0)
goto out_unlock;
 
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 4af55bf4b35b..d2c5f526edff 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -120,6 +120,6 @@ int ocfs2_reflink_remap_range(struct file *file_in,
  struct file *file_out,
  loff_t pos_out,
  u64 len,
- bool is_dedupe);
+ unsigned int remap_flags);
 
 #endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/read_write.c b/fs/read_write.c
index 766bdcb381f3..201381689284 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1722,14 +1722,14 @@ static int generic_remap_check_len(struct inode 
*inode_in,
   struct inode *inode_out,
   loff_t pos_out,
   u64 *len,
-  bool is_dedupe)
+  unsigned int remap_flags)
 {
u64 blkmask = i_blocksize(inode_in) - 1;
 
if ((*len & blkmask) == 0)
return 0;
 
-   if (is_dedupe)
+   if (remap_flags & REMAP_FILE_DEDUP)
*len &= ~blkmask;
else if (pos_out + *len < i_size_read(inode_out))
return -EINVAL;
@@ -1747,7 +1747,7 @@ static int generic_remap_check_len(struct inode *inode_in,
  */
 int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
  struct file *file_out, loff_t pos_out,
- u64 *len, bool is_dedupe)
+ u64 *len, unsigned int remap_flags)
 {
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -1771,7 +1771,7 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
if (*len == 0) {
loff_t isize = i_size_read(inode_in);
 
-   if (is_dedupe || pos_in == isize)
+   if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
return 0;
if (pos_in > isize)
return -EINVAL;
@@ -1782,7 +1782,7 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
 
/* Check that we don't violate system file offset limits. */
ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
-   is_dedupe);
+   (remap_flags & REMAP_FILE_DEDUP));
if (ret)
return ret;
 
@@ -1804,7 +1804,7 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
/*
 * Check that the extents are the same.
 */
-   if (is_dedupe) {
+   if (remap_flags & REMAP_FILE_DEDUP) {
boolis_same = false;
 
ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
@@ -1816,7 

[PATCH 10/26] vfs: combine the clone and dedupe into a single remap_file_range

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Combine the clone_file_range and dedupe_file_range operations into a
single remap_file_range file operation dispatch since they're
fundamentally the same operation.  The differences between the two can
be made in the prep functions.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 Documentation/filesystems/vfs.txt |   13 +--
 fs/btrfs/ctree.h  |8 ++-
 fs/btrfs/file.c   |3 +-
 fs/btrfs/ioctl.c  |   45 +++--
 fs/cifs/cifsfs.c  |   22 +++---
 fs/nfs/nfs4file.c |   10 ++--
 fs/ocfs2/file.c   |   24 +++-
 fs/overlayfs/file.c   |   30 ++---
 fs/read_write.c   |   18 +++
 fs/xfs/xfs_file.c |   23 ++-
 include/linux/fs.h|   20 +---
 11 files changed, 110 insertions(+), 106 deletions(-)


diff --git a/Documentation/filesystems/vfs.txt 
b/Documentation/filesystems/vfs.txt
index a6c6a8af48a2..bb3183334ab9 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -883,8 +883,9 @@ struct file_operations {
unsigned (*mmap_capabilities)(struct file *);
 #endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, 
loff_t, size_t, unsigned int);
-   int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, 
u64);
-   int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, 
u64);
+   int (*remap_file_range)(struct file *file_in, loff_t pos_in,
+   struct file *file_out, loff_t pos_out,
+   u64 len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
 };
 
@@ -960,11 +961,9 @@ otherwise noted.
 
   copy_file_range: called by the copy_file_range(2) system call.
 
-  clone_file_range: called by the ioctl(2) system call for FICLONERANGE and
-   FICLONE commands.
-
-  dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE
-   command.
+  remap_file_range: called by the ioctl(2) system call for FICLONERANGE and
+   FICLONE and FIDEDUPERANGE commands to remap file ranges.  Note that
+   a zero length implies "remap to end of source file".
 
   fadvise: possibly called by the fadvise64() system call.
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..124a05662fc2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3218,9 +3218,6 @@ void btrfs_get_block_group_info(struct list_head 
*groups_list,
struct btrfs_ioctl_space_info *space);
 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
   struct btrfs_ioctl_balance_args *bargs);
-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
-   struct file *dst_file, loff_t dst_loff,
-   u64 olen);
 
 /* file.c */
 int __init btrfs_auto_defrag_init(void);
@@ -3250,8 +3247,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page 
**pages,
  size_t num_pages, loff_t pos, size_t write_bytes,
  struct extent_state **cached);
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
-  struct file *file_out, loff_t pos_out, u64 len);
+int btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+  struct file *file_out, loff_t pos_out, u64 len,
+  unsigned int remap_flags);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2be00e873e92..9a963f061393 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3269,8 +3269,7 @@ const struct file_operations btrfs_file_operations = {
 #ifdef CONFIG_COMPAT
.compat_ioctl   = btrfs_compat_ioctl,
 #endif
-   .clone_file_range = btrfs_clone_file_range,
-   .dedupe_file_range = btrfs_dedupe_file_range,
+   .remap_file_range = btrfs_remap_file_range,
 };
 
 void __cold btrfs_auto_defrag_exit(void)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d60b6caf09e8..bfd99c66723e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3627,26 +3627,6 @@ static int btrfs_extent_same(struct inode *src, u64 
loff, u64 olen,
return ret;
 }
 
-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
-   struct file *dst_file, loff_t dst_loff,
-   u64 olen)
-{
-   struct inode *src = file_inode(src_file);
-   struct inode *dst = file_inode(dst_file);
-   u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
-
-   if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
-   /*
-   

[PATCH 08/26] vfs: rename vfs_clone_file_prep to be more descriptive

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

The vfs_clone_file_prep is a generic function to be called by filesystem
implementations only.  Rename the prefix to generic_ and make it more
clear that it applies to remap operations, not just clones.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/ocfs2/refcounttree.c |2 +-
 fs/read_write.c |8 
 fs/xfs/xfs_reflink.c|2 +-
 include/linux/fs.h  |6 +++---
 4 files changed, 9 insertions(+), 9 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 19e03936c5e1..36c56dfbe485 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4850,7 +4850,7 @@ int ocfs2_reflink_remap_range(struct file *file_in,
(OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
goto out_unlock;
 
-   ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out,
+   ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
&len, is_dedupe);
if (ret <= 0)
goto out_unlock;
diff --git a/fs/read_write.c b/fs/read_write.c
index f5395d8da741..aca75a97a695 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1745,9 +1745,9 @@ static int generic_remap_check_len(struct inode *inode_in,
  * Returns: 0 for "nothing to clone", 1 for "something to clone", or
  * the usual negative error code.
  */
-int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
-   struct file *file_out, loff_t pos_out,
-   u64 *len, bool is_dedupe)
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ u64 *len, bool is_dedupe)
 {
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -1822,7 +1822,7 @@ int vfs_clone_file_prep(struct file *file_in, loff_t 
pos_in,
 
return 1;
 }
-EXPORT_SYMBOL(vfs_clone_file_prep);
+EXPORT_SYMBOL(generic_remap_file_range_prep);
 
 int do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 281d5f53f2ec..a7757a128a78 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1326,7 +1326,7 @@ xfs_reflink_remap_prep(
if (IS_DAX(inode_in) || IS_DAX(inode_out))
goto out_unlock;
 
-   ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out,
+   ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
len, is_dedupe);
if (ret <= 0)
goto out_unlock;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ba93a6e7dac4..55729e1c2e75 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1825,9 +1825,9 @@ extern ssize_t vfs_readv(struct file *, const struct 
iovec __user *,
unsigned long, loff_t *, rwf_t);
 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
   loff_t, size_t, unsigned int);
-extern int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
-  struct file *file_out, loff_t pos_out,
-  u64 *count, bool is_dedupe);
+extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+struct file *file_out, loff_t pos_out,
+u64 *count, bool is_dedupe);
 extern int do_clone_file_range(struct file *file_in, loff_t pos_in,
   struct file *file_out, loff_t pos_out, u64 len);
 extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,



[PATCH 06/26] vfs: avoid problematic remapping requests into partial EOF block

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

A deduplication data corruption is exposed in XFS and btrfs. It is
caused by extending the block match range to include the partial EOF
block, but then allowing unknown data beyond EOF to be considered a
"match" to data in the destination file because the comparison is only
made to the end of the source file. This corrupts the destination file
when the source extent is shared with it.

The VFS remapping prep functions  only support whole block dedupe, but
we still need to appear to support whole file dedupe correctly.  Hence
if the dedupe request includes the last block of the souce file, don't
include it in the actual dedupe operation. If the rest of the range
dedupes successfully, then reject the entire request.  A subsequent
patch will enable us to shorten dedupe requests correctly.

When reflinking sub-file ranges, a data corruption can occur when the
source file range includes a partial EOF block. This shares the unknown
data beyond EOF into the second file at a position inside EOF, exposing
stale data in the second file.

If the reflink request includes the last block of the souce file, only
proceed with the reflink operation if it lands at or past the
destination file's current EOF. If it lands within the destination file
EOF, reject the entire request with -EINVAL and make the caller go the
hard way.  A subsequent patch will enable us to shorten reflink requests
correctly.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
---
 fs/read_write.c |   33 +
 1 file changed, 33 insertions(+)


diff --git a/fs/read_write.c b/fs/read_write.c
index 2456da3f8a41..0f0a6efdd502 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1708,6 +1708,34 @@ static int clone_verify_area(struct file *file, loff_t 
pos, u64 len, bool write)
 
return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
 }
+/*
+ * Ensure that we don't remap a partial EOF block in the middle of something
+ * else.  Assume that the offsets have already been checked for block
+ * alignment.
+ *
+ * For deduplication we always scale down to the previous block because we
+ * can't meaningfully compare post-EOF contents.
+ *
+ * For clone we only link a partial EOF block above the destination file's EOF.
+ */
+static int generic_remap_check_len(struct inode *inode_in,
+  struct inode *inode_out,
+  loff_t pos_out,
+  u64 *len,
+  bool is_dedupe)
+{
+   u64 blkmask = i_blocksize(inode_in) - 1;
+
+   if ((*len & blkmask) == 0)
+   return 0;
+
+   if (is_dedupe)
+   *len &= ~blkmask;
+   else if (pos_out + *len < i_size_read(inode_out))
+   return -EINVAL;
+
+   return 0;
+}
 
 /*
  * Check that the two inodes are eligible for cloning, the ranges make
@@ -1787,6 +1815,11 @@ int vfs_clone_file_prep(struct file *file_in, loff_t 
pos_in,
return -EBADE;
}
 
+   ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
+   is_dedupe);
+   if (ret)
+   return ret;
+
return 1;
 }
 EXPORT_SYMBOL(vfs_clone_file_prep);



[PATCH 05/26] vfs: strengthen checking of file range inputs to generic_remap_checks

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

File range remapping, if allowed to run past the destination file's EOF,
is an optimization on a regular file write.  Regular file writes that
extend the file length are subject to various constraints which are not
checked by range cloning.

This is a correctness problem because we're never allowed to touch
ranges that the page cache can't support (s_maxbytes); we're not
supposed to deal with large offsets (MAX_NON_LFS) if O_LARGEFILE isn't
set; and we must obey resource limits (RLIMIT_FSIZE).

Therefore, add these checks to the new generic_remap_checks function so
that we curtail unexpected behavior.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
Reviewed-by: Christoph Hellwig 
---
 mm/filemap.c |   91 ++
 1 file changed, 59 insertions(+), 32 deletions(-)


diff --git a/mm/filemap.c b/mm/filemap.c
index 47e6bfd45a91..08ad210fee49 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2915,6 +2915,49 @@ struct page *read_cache_page_gfp(struct address_space 
*mapping,
 }
 EXPORT_SYMBOL(read_cache_page_gfp);
 
+static int generic_access_check_limits(struct file *file, loff_t pos,
+  loff_t *count)
+{
+   struct inode *inode = file->f_mapping->host;
+
+   /* Don't exceed the LFS limits. */
+   if (unlikely(pos + *count > MAX_NON_LFS &&
+   !(file->f_flags & O_LARGEFILE))) {
+   if (pos >= MAX_NON_LFS)
+   return -EFBIG;
+   *count = min(*count, (loff_t)MAX_NON_LFS - pos);
+   }
+
+   /*
+* Don't operate on ranges the page cache doesn't support.
+*
+* If we have written data it becomes a short write.  If we have
+* exceeded without writing data we send a signal and return EFBIG.
+* Linus frestrict idea will clean these up nicely..
+*/
+   if (unlikely(pos >= inode->i_sb->s_maxbytes))
+   return -EFBIG;
+
+   *count = min(*count, inode->i_sb->s_maxbytes - pos);
+   return 0;
+}
+
+static int generic_write_check_limits(struct file *file, loff_t pos,
+ loff_t *count)
+{
+   unsigned long limit = rlimit(RLIMIT_FSIZE);
+
+   if (limit != RLIM_INFINITY) {
+   if (pos >= limit) {
+   send_sig(SIGXFSZ, current, 0);
+   return -EFBIG;
+   }
+   *count = min(*count, (loff_t)limit - pos);
+   }
+
+   return generic_access_check_limits(file, pos, count);
+}
+
 /*
  * Performs necessary checks before doing a write
  *
@@ -2926,8 +2969,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, 
struct iov_iter *from)
 {
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
-   unsigned long limit = rlimit(RLIMIT_FSIZE);
-   loff_t pos;
+   loff_t count;
+   int ret;
 
if (!iov_iter_count(from))
return 0;
@@ -2936,40 +2979,15 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, 
struct iov_iter *from)
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
 
-   pos = iocb->ki_pos;
-
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EINVAL;
 
-   if (limit != RLIM_INFINITY) {
-   if (iocb->ki_pos >= limit) {
-   send_sig(SIGXFSZ, current, 0);
-   return -EFBIG;
-   }
-   iov_iter_truncate(from, limit - (unsigned long)pos);
-   }
+   count = iov_iter_count(from);
+   ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+   if (ret)
+   return ret;
 
-   /*
-* LFS rule
-*/
-   if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
-   !(file->f_flags & O_LARGEFILE))) {
-   if (pos >= MAX_NON_LFS)
-   return -EFBIG;
-   iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
-   }
-
-   /*
-* Are we about to exceed the fs block limit ?
-*
-* If we have written data it becomes a short write.  If we have
-* exceeded without writing data we send a signal and return EFBIG.
-* Linus frestrict idea will clean these up nicely..
-*/
-   if (unlikely(pos >= inode->i_sb->s_maxbytes))
-   return -EFBIG;
-
-   iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
+   iov_iter_truncate(from, count);
return iov_iter_count(from);
 }
 EXPORT_SYMBOL(generic_write_checks);
@@ -2991,6 +3009,7 @@ int generic_remap_checks(struct file *file_in, loff_t 
pos_in,
uint64_t bcount;
loff_t size_in, size_out;
loff_t bs = inode_out->i_sb->s_blocksize;
+   int ret;
 
/* The start of both ranges must be aligned to an fs block. */
  

[PATCH 12/26] vfs: pass remap flags to generic_remap_checks

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Pass the same remap flags to generic_remap_checks for consistency.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c|2 +-
 include/linux/fs.h |2 +-
 mm/filemap.c   |4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index 201381689284..ebcbfc4f2907 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1782,7 +1782,7 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
 
/* Check that we don't violate system file offset limits. */
ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
-   (remap_flags & REMAP_FILE_DEDUP));
+   remap_flags);
if (ret)
return ret;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c2800953937a..1aa3bc1bb092 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2981,7 +2981,7 @@ extern int generic_file_readonly_mmap(struct file *, 
struct vm_area_struct *);
 extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
 extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
-   uint64_t *count, bool is_dedupe);
+   uint64_t *count, unsigned int remap_flags);
 extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 08ad210fee49..b0f1f6d93d9c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3001,7 +3001,7 @@ EXPORT_SYMBOL(generic_write_checks);
  */
 int generic_remap_checks(struct file *file_in, loff_t pos_in,
 struct file *file_out, loff_t pos_out,
-uint64_t *req_count, bool is_dedupe)
+uint64_t *req_count, unsigned int remap_flags)
 {
struct inode *inode_in = file_in->f_mapping->host;
struct inode *inode_out = file_out->f_mapping->host;
@@ -3023,7 +3023,7 @@ int generic_remap_checks(struct file *file_in, loff_t 
pos_in,
size_out = i_size_read(inode_out);
 
/* Dedupe requires both ranges to be within EOF. */
-   if (is_dedupe &&
+   if ((remap_flags & REMAP_FILE_DEDUP) &&
(pos_in >= size_in || pos_in + count > size_in ||
 pos_out >= size_out || pos_out + count > size_out))
return -EINVAL;



[PATCH 03/26] vfs: check file ranges before cloning files

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Move the file range checks from vfs_clone_file_prep into a separate
generic_remap_checks function so that all the checks are collected in a
central location.  This forms the basis for adding more checks from
generic_write_checks that will make cloning's input checking more
consistent with write input checking.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
Reviewed-by: Amir Goldstein 
---
 fs/ocfs2/refcounttree.c |2 +
 fs/read_write.c |   55 +
 fs/xfs/xfs_reflink.c|2 +
 include/linux/fs.h  |9 --
 mm/filemap.c|   69 +++
 5 files changed, 90 insertions(+), 47 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7a5ee145c733..19e03936c5e1 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4850,7 +4850,7 @@ int ocfs2_reflink_remap_range(struct file *file_in,
(OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
goto out_unlock;
 
-   ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
+   ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out,
&len, is_dedupe);
if (ret <= 0)
goto out_unlock;
diff --git a/fs/read_write.c b/fs/read_write.c
index 260797b01851..d6e8e242a15f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1717,13 +1717,12 @@ static int clone_verify_area(struct file *file, loff_t 
pos, u64 len, bool write)
  * Returns: 0 for "nothing to clone", 1 for "something to clone", or
  * the usual negative error code.
  */
-int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
-  struct inode *inode_out, loff_t pos_out,
-  u64 *len, bool is_dedupe)
+int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
+   struct file *file_out, loff_t pos_out,
+   u64 *len, bool is_dedupe)
 {
-   loff_t bs = inode_out->i_sb->s_blocksize;
-   loff_t blen;
-   loff_t isize;
+   struct inode *inode_in = file_inode(file_in);
+   struct inode *inode_out = file_inode(file_out);
bool same_inode = (inode_in == inode_out);
int ret;
 
@@ -1740,10 +1739,10 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
 
-   isize = i_size_read(inode_in);
-
/* Zero length dedupe exits immediately; reflink goes to EOF. */
if (*len == 0) {
+   loff_t isize = i_size_read(inode_in);
+
if (is_dedupe || pos_in == isize)
return 0;
if (pos_in > isize)
@@ -1751,36 +1750,11 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
*len = isize - pos_in;
}
 
-   /* Ensure offsets don't wrap and the input is inside i_size */
-   if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
-   pos_in + *len > isize)
-   return -EINVAL;
-
-   /* Don't allow dedupe past EOF in the dest file */
-   if (is_dedupe) {
-   loff_t  disize;
-
-   disize = i_size_read(inode_out);
-   if (pos_out >= disize || pos_out + *len > disize)
-   return -EINVAL;
-   }
-
-   /* If we're linking to EOF, continue to the block boundary. */
-   if (pos_in + *len == isize)
-   blen = ALIGN(isize, bs) - pos_in;
-   else
-   blen = *len;
-
-   /* Only reflink if we're aligned to block boundaries */
-   if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
-   !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
-   return -EINVAL;
-
-   /* Don't allow overlapped reflink within the same file */
-   if (same_inode) {
-   if (pos_out + blen > pos_in && pos_out < pos_in + blen)
-   return -EINVAL;
-   }
+   /* Check that we don't violate system file offset limits. */
+   ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
+   is_dedupe);
+   if (ret)
+   return ret;
 
/* Wait for the completion of any pending IOs on both files */
inode_dio_wait(inode_in);
@@ -1813,7 +1787,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
 
return 1;
 }
-EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
+EXPORT_SYMBOL(vfs_clone_file_prep);
 
 int do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len)
@@ -1851,9 +1825,6 @@ int do_clone_file_range(struct file *file_in, loff_t 
pos_in,
if (ret)
return ret;
 
-   if (pos_in + len > i_size_read(inode_in))
- 

[PATCH 01/26] xfs: add a per-xfs trace_printk macro

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

Add a "xfs_tprintk" macro so that developers can use trace_printk to
print out arbitrary debugging information with the XFS device name
attached to the trace output.

Signed-off-by: Darrick J. Wong 
---
 fs/xfs/xfs_error.h |6 ++
 1 file changed, 6 insertions(+)


diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 246d3e989c6c..5caa8bdf6c38 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -76,6 +76,11 @@ extern int xfs_errortag_set(struct xfs_mount *mp, unsigned 
int error_tag,
unsigned int tag_value);
 extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
 extern int xfs_errortag_clearall(struct xfs_mount *mp);
+
+/* trace printk version of xfs_err and friends */
+#define xfs_tprintk(mp, fmt, args...) \
+   trace_printk("dev %d:%d " fmt, MAJOR((mp)->m_super->s_dev), \
+   MINOR((mp)->m_super->s_dev), ##args)
 #else
 #define xfs_errortag_init(mp)  (0)
 #define xfs_errortag_del(mp)
@@ -83,6 +88,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
 #define xfs_errortag_set(mp, tag, val) (ENOSYS)
 #define xfs_errortag_add(mp, tag)  (ENOSYS)
 #define xfs_errortag_clearall(mp)  (ENOSYS)
+#define xfs_tprintk(mp, fmt, args...)  do { } while (0)
 #endif /* DEBUG */
 
 /*



[PATCH 04/26] vfs: exit early from zero length remap operations

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

If a remap caller asks us to remap to the source file's EOF and the
source file has zero bytes, exit early.

Signed-off-by: Darrick J. Wong 
---
 fs/read_write.c |2 ++
 1 file changed, 2 insertions(+)


diff --git a/fs/read_write.c b/fs/read_write.c
index d6e8e242a15f..2456da3f8a41 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1748,6 +1748,8 @@ int vfs_clone_file_prep(struct file *file_in, loff_t 
pos_in,
if (pos_in > isize)
return -EINVAL;
*len = isize - pos_in;
+   if (*len == 0)
+   return 0;
}
 
/* Check that we don't violate system file offset limits. */



[PATCH v5 00/26] fs: fixes for serious clone/dedupe problems

2018-10-15 Thread Darrick J. Wong
Hi all,

Dave, Eric, and I have been chasing a stale data exposure bug in the XFS
reflink implementation, and tracked it down to reflink forgetting to do
some of the file-extending activities that must happen for regular
writes.

We then started auditing the clone, dedupe, and copyfile code and
realized that from a file contents perspective, clonerange isn't any
different from a regular file write.  Unfortunately, we also noticed
that *unlike* a regular write, clonerange skips a ton of overflow
checks, such as validating the ranges against s_maxbytes, MAX_NON_LFS,
and RLIMIT_FSIZE.  We also observed that cloning into a file did not
strip security privileges (suid, capabilities) like a regular write
would.  I also noticed that xfs and ocfs2 need to dump the page cache
before remapping blocks, not after.

In fixing the range checking problems I also realized that both dedupe
and copyfile tell userspace how much of the requested operation was
acted upon.  Since the range validation can shorten a clone request (or
we can ENOSPC midway through), we might as well plumb the short
operation reporting back through the VFS indirection code to userspace.

So, here's the whole giant pile of patches[1] that fix all the problems.
This branch is against current upstream (4.19-rc8).  The patch
"generic: test reflink side effects" recently sent to fstests exercises
the fixes in this series.  Tests are in [2].

--D

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=djwong-devel
[2] 
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/log/?h=djwong-devel


[PATCH 02/26] vfs: vfs_clone_file_prep_inodes should return EINVAL for a clone from beyond EOF

2018-10-15 Thread Darrick J. Wong
From: Darrick J. Wong 

vfs_clone_file_prep_inodes cannot return 0 if it is asked to remap from
a zero byte file because that's what btrfs does.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
---
 fs/read_write.c |3 ---
 1 file changed, 3 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index 8a2737f0d61d..260797b01851 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1740,10 +1740,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
 
-   /* Are we going all the way to the end? */
isize = i_size_read(inode_in);
-   if (isize == 0)
-   return 0;
 
/* Zero length dedupe exits immediately; reflink goes to EOF. */
if (*len == 0) {



Re: Spurious mount point

2018-10-15 Thread Anton Shepelev
Chris Murphy:

> I wasn't aware that SUSE was now using the @ location for
> snapshots, or that it was using Btrfs for /home. For a
> while it's been XFS with a Btrfs sysroot.

Ours does use btrfs for `/' and xfs for `/home' *except*
`/home/hana', which is strange and wonderful, because the
standard usage is there in fstab and hana mount is created
somewhere else.

-- 
()  ascii ribbon campaign -- against html e-mail
/\  http://preview.tinyurl.com/qcy6mjc [archived]


Re: Spurious mount point

2018-10-15 Thread Chris Murphy
On Mon, Oct 15, 2018 at 3:26 PM, Anton Shepelev  wrote:
> Chris Murphy to Anton Shepelev:
>
>> > How can I track down the origin of this mount point:
>> >
>> > /dev/sda2 on /home/hana type btrfs 
>> > (rw,relatime,space_cache,subvolid=259,subvol=/@/.snapshots/1/snapshot/hana)
>> >
>> > if it is not present in /etc/fstab?  I shouldn't like to
>> > find/grep thoughout the whole filesystem.
>>
>> Sounds like some service taking snapshots regularly is
>> managing this.  Maybe this is Mint or Ubuntu and you're
>> using Timeshift?
>
> It is SUSE Linux and (probably) its tool called `snapper',
> but I have not found a clue in its documentation.

I wasn't aware that SUSE was now using the @ location for snapshots,
or that it was using Btrfs for /home. For a while it's been XFS with a
Btrfs sysroot.





-- 
Chris Murphy


Re: Spurious mount point

2018-10-15 Thread Anton Shepelev
Chris Murphy to Anton Shepelev:

> > How can I track down the origin of this mount point:
> >
> > /dev/sda2 on /home/hana type btrfs 
> > (rw,relatime,space_cache,subvolid=259,subvol=/@/.snapshots/1/snapshot/hana)
> >
> > if it is not present in /etc/fstab?  I shouldn't like to
> > find/grep thoughout the whole filesystem.
>
> Sounds like some service taking snapshots regularly is
> managing this.  Maybe this is Mint or Ubuntu and you're
> using Timeshift?

It is SUSE Linux and (probably) its tool called `snapper',
but I have not found a clue in its documentation.

> Maybe it'll show up in the journal if you add boot
> parameter 'systemd.log_level=debug' and reboot; then use
> 'journalctl -b | grep mount' and it should show all
> instances logged instances of mount events: systemd,
> udisks2, maybe others?

I will try that as soon as the client lets me reboot that
machine.

-- 
()  ascii ribbon campaign -- against html e-mail
/\  http://preview.tinyurl.com/qcy6mjc [archived]


Re: Spurious mount point

2018-10-15 Thread Chris Murphy
On Mon, Oct 15, 2018 at 9:05 AM, Anton Shepelev  wrote:
> Hello, all
>
> How can I track down the origin of this mount point:
>
>/dev/sda2 on /home/hana type btrfs 
> (rw,relatime,space_cache,subvolid=259,subvol=/@/.snapshots/1/snapshot/hana)
>
> if it is not present in /etc/fstab?  I shouldn't like to
> find/grep thoughout the whole filesystem.
>
> --
> ()  ascii ribbon campaign - against html e-mail
> /\  http://preview.tinyurl.com/qcy6mjc [archived]

Sounds like some service taking snapshots regularly is managing this.
Maybe this is Mint or Ubuntu and you're using Timeshift?

Maybe it'll show up in the journal if you add boot parameter
'systemd.log_level=debug' and reboot; then use 'journalctl -b | grep
mount' and it should show all instances logged instances of mount
events: systemd, udisks2, maybe others?



-- 
Chris Murphy


Re: reproducible builds with btrfs seed feature

2018-10-15 Thread Chris Murphy
On Mon, Oct 15, 2018 at 6:29 AM, Austin S. Hemmelgarn
 wrote:
> On 2018-10-13 18:28, Chris Murphy wrote:

>> The end result is creating two Btrfs volumes would yield image files
>> with matching hashes.
>
> So in other words, you care about matching the block layout _exactly_.

Only because that's the easiest way to verify reproducibility without
any ambiguity.

If someone's compromised a build system such that everyone is getting
the malicious payload, but they can hide it behind a subvolume or
reflink that's not used by default, could someone plausibly cause
selective use of their malicious payload? I dunno I leave that for
more crafty people. But even if it's a tiny bit of ambiguity, it's
non-zero. Hashing a file that contains the entire file system is
unambiguous.

I think populating the image with --rootdir at mkfs time should be
pretty deterministic. One stream in and out. No generations, no
snapshot, no delayed allocation. It'd be quite similar to mksquashfs.
I guess I'd have to try it a few times, and see if really the only
differences are uuids and times, and not allocation related things.



-- 
Chris Murphy


Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range

2018-10-15 Thread Christoph Hellwig
On Mon, Oct 15, 2018 at 10:13:17AM -0700, Darrick J. Wong wrote:
> > RFR_TO_SRC_EOF is checked in generic_remap_file_range_prep,
> > so the file system should know about it  Also looking at it again now
> > it seems entirely superflous - we can just pass down then len == we
> > use in higher level code instead of having a flag and will side step
> > the issue here.
> 
> I'm not a fan of hidden behaviors like that, particularly when we
> already have a flags field where callers can explicitly ask for the
> to-eof behavior.

This just means we have a flag to mean ken is 0 and needs to be filled,
rather than encoding that in the field itself.  If you fell better we
can replace 0 with 0x and still encode it in the field.

> > RFR_CAN_SHORTEN is advisory as no one has to shorten, but that can
> > easily be solved by including it everywhere.
> 
> CAN_SHORTEN isn't included everywhere

Include it everywhere as in allow it in ever ->remap_file instance.

> I sort of thought about introducing a new copy_file_range flag that
> would just do deduplication and allow for opportunistic "dedup as much
> as you can" but ... meh.  Maybe I'll just drop the patch instead; we can
> revisit that when anyone wants a better dedupe interface.

Sounds fine to me.  The btrfs ioctl is really ugly, but then again
there is no pressing need for something better.


Re: [PATCH 16/25] vfs: make remapping to source file eof more explicit

2018-10-15 Thread Christoph Hellwig
On Mon, Oct 15, 2018 at 08:32:19AM -0700, Darrick J. Wong wrote:
> > Why not put that in the name to make it more descriptive?
> 
> I'm all ears for better suggestions. :)

I think the best idea is no flag - just carry through the special
zero..


Re: [PATCH 10/25] vfs: create generic_remap_file_range_touch to update inode metadata

2018-10-15 Thread Christoph Hellwig
On Mon, Oct 15, 2018 at 09:30:01AM -0700, Darrick J. Wong wrote:
> I originally thought "touch" because it updates [cm]time. :)
> 
> Though looking at the final code, I think this can just be called from
> the end of generic_remap_file_range_prep, so we can skip the export and
> all that other stuff.

I though about that, but the locking didn't seem to quite work out
between xfs and ocfs.

Nevermind the big elephant of actually converting btrfs to the "VFS"
helper - I think if that doesn't work out it is rather questionable
how generic they actually are.


Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range

2018-10-15 Thread Darrick J. Wong
On Mon, Oct 15, 2018 at 05:47:19AM -0700, Christoph Hellwig wrote:
> On Mon, Oct 15, 2018 at 09:04:13AM +0300, Amir Goldstein wrote:
> > I supposed you figured out the reason already.
> 
> No, I hadn't.
> 
> > It makes it appearance in patch 16/25 as RFR_VFS_FLAGS.
> > All those "advisory" flags, we want to pass them in to filesystem as FYI,
> > but we don't want to explicitly add support for e.g. RFR_CAN_SHORTEN
> > to every filesystem, when vfs has already taken care of the advice.
> 
> I don't think this model makes sense.  If they really are purely
> handled in the VFS we can mask them before passing them to the file
> system, if not we need to check them, or the they are avisory and
> we can have a simple #define instead of the helper.
> 
> RFR_TO_SRC_EOF is checked in generic_remap_file_range_prep,
> so the file system should know about it  Also looking at it again now
> it seems entirely superflous - we can just pass down then len == we
> use in higher level code instead of having a flag and will side step
> the issue here.

I'm not a fan of hidden behaviors like that, particularly when we
already have a flags field where callers can explicitly ask for the
to-eof behavior.

> RFR_CAN_SHORTEN is advisory as no one has to shorten, but that can
> easily be solved by including it everywhere.

CAN_SHORTEN isn't included everywhere -- FICLONE{,RANGE} don't enable it
because they have no way to communicate the number of bytes cloned back
to userspace.  Either we can clone every byte the user asked for, or we
send back -EINVAL.  (Maybe I'm misinterpreting what you meant by 'solved
by including it everywhere'?)

> RFR_SHORT_DEDUPE is as far as I can tell entirely superflous to
> start with, as RFR_CAN_SHORTEN can be used instead.

For now it's superfluous.  At first I was thinking that we could return
a short bytes_deduped if, say, the first part of the range actually did
match, but it became pretty obvious via shared/010 that duperemove can't
handle that, so we really must stick to the existing btrfs behavior.

The existing btrfs behavior is that we can round the length down to
avoid deduping partial EOF blocks, but we return the original length
(i.e. lie) in bytes_deduped when we do that.

I sort of thought about introducing a new copy_file_range flag that
would just do deduplication and allow for opportunistic "dedup as much
as you can" but ... meh.  Maybe I'll just drop the patch instead; we can
revisit that when anyone wants a better dedupe interface.

> So something like this in fs.h:
> 
> #define REMAP_FILE_ADVISORY_FLAGS REMAP_FILE_CAN_SHORTEN
> 
> And then in the file system:
> 
>   if (flags & ~REMAP_FILE_ADVISORY_FLAGS)
>   -EINVAL;
> 
> or
> 
>   if (flags & ~(REMAP_FILE_ADVISORY_FLAGS | REMAP_FILE_DEDUP))
>   -EINVAL;
> 
> should be all that is needed.

Sounds good to me.

--D


Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range

2018-10-15 Thread Darrick J. Wong
On Sun, Oct 14, 2018 at 10:19:27AM -0700, Christoph Hellwig wrote:
> > unsigned (*mmap_capabilities)(struct file *);
> >  #endif
> > ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, 
> > loff_t, size_t, unsigned int);
> > -   int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, 
> > u64);
> > -   int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, 
> > u64);
> > +   int (*remap_file_range)(struct file *file_in, loff_t pos_in,
> > +   struct file *file_out, loff_t pos_out,
> > +   u64 len, unsigned int remap_flags);
> 
> None of the other methods in this file name their parameters.  While
> I generally don't like people leaving them out, in the end consistency
> is even more important.
> 
> > +int btrfs_remap_file_range(struct file *src_file, loff_t off,
> > +   struct file *dst_file, loff_t destoff, u64 len,
> > +   unsigned int remap_flags)
> >  {
> > +   if (!remap_check_flags(remap_flags, RFR_SAME_DATA))
> > +   return -EINVAL;
> > +
> > +   if (remap_flags & RFR_SAME_DATA) {
> 
> So at least for btrfs there seems to be no shared code at all below
> the function calls.  This kinda speaks against the argument that
> they fundamentally are the same..

They /do/ share/ code -- eventually both btrfs_extent_same and
btrfs_clone_files call btrfs_clone.  xfs and ocfs2 call the same paths
internally too; it's only the vfs helpers that have the extra page cache
comparisons if it's a dedup operation.

> > +/*
> > + * These flags control the behavior of the remap_file_range function 
> > pointer.
> > + *
> > + * RFR_SAME_DATA: only remap if contents identical (i.e. deduplicate)
> > + */
> > +#define RFR_SAME_DATA  (1 << 0)
> > +
> > +#define RFR_VALID_FLAGS(RFR_SAME_DATA)
> 
> RFR?  Why not REMAP_FILE_*  Also why not the well understood
> REMAP_FILE_DEDUP instead of the odd SAME_DATA?

Sure.  I had begin to dislike typing RFR anyway.

> > +
> > +/*
> > + * Filesystem remapping implementations should call this helper on their
> > + * remap flags to filter out flags that the implementation doesn't support.
> > + *
> > + * Returns true if the flags are ok, false otherwise.
> > + */
> > +static inline bool remap_check_flags(unsigned int remap_flags,
> > +unsigned int supported_flags)
> > +{
> > +   return (remap_flags & ~(supported_flags & RFR_VALID_FLAGS)) == 0;
> > +}
> 
> Any reason to even bother with a helper for this?  ->fallocate
> seems to be doing fine without the helper, and the resulting code
> seems a lot easier to understand to me.

(Will respond to these at the current end of the flags thread.)

> > @@ -1759,10 +1779,9 @@ struct file_operations {
> >  #endif
> > ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
> > loff_t, size_t, unsigned int);
> > -   int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
> > -   u64);
> > -   int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
> > -   u64);
> > +   int (*remap_file_range)(struct file *file_in, loff_t pos_in,
> > +   struct file *file_out, loff_t pos_out,
> > +   u64 len, unsigned int remap_flags);
> 
> Same comment here.  Didn't we have some nice doc tools to avoid this
> duplication? :)

We do, but vfs.txt hasn't been ported to any of that.

--D


Re: [PATCH 10/25] vfs: create generic_remap_file_range_touch to update inode metadata

2018-10-15 Thread Darrick J. Wong
On Sun, Oct 14, 2018 at 10:21:31AM -0700, Christoph Hellwig wrote:
> > +/* Update inode timestamps and remove security privileges when remapping. 
> > */
> > +int generic_remap_file_range_touch(struct file *file, bool is_dedupe)
> > +{
> > +   int ret;
> > +
> > +   /* If can't alter the file contents, we're done. */
> > +   if (is_dedupe)
> > +   return 0;
> > +
> > +   /* Update the timestamps, since we can alter file contents. */
> > +   if (!(file->f_mode & FMODE_NOCMTIME)) {
> > +   ret = file_update_time(file);
> > +   if (ret)
> > +   return ret;
> > +   }
> > +
> > +   /*
> > +* Clear the security bits if the process is not being run by root.
> > +* This keeps people from modifying setuid and setgid binaries.
> > +*/
> > +   return file_remove_privs(file);
> > +}
> > +EXPORT_SYMBOL(generic_remap_file_range_touch);
> 
> The name seems a little out of touch with what it actually does.

I originally thought "touch" because it updates [cm]time. :)

Though looking at the final code, I think this can just be called from
the end of generic_remap_file_range_prep, so we can skip the export and
all that other stuff.

> Also why a bool argument instead of the more descriptive flags which
> introduced a few patches ago?

Hmm, yes, the remap_flags transition can move up to before this patch.

--D


Re: [PATCH 24/25] xfs: support returning partial reflink results

2018-10-15 Thread Darrick J. Wong
On Mon, Oct 15, 2018 at 10:05:36AM +1100, Dave Chinner wrote:
> On Sun, Oct 14, 2018 at 10:35:46AM -0700, Christoph Hellwig wrote:
> > On Fri, Oct 12, 2018 at 05:08:32PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong 
> > > 
> > > Back when the XFS reflink code only supported clone_file_range, we were
> > > only able to return zero or negative error codes to userspace.  However,
> > > now that copy_file_range (which returns bytes copied) can use XFS'
> > > clone_file_range, we have the opportunity to return partial results.
> > > For example, if userspace sends a 1GB clone request and we run out of
> > > space halfway through, we at least can tell userspace that we completed
> > > 512M of that request like a regular write.
> > > 
> > > Signed-off-by: Darrick J. Wong 
> > > ---
> > >  fs/xfs/xfs_file.c|5 +
> > >  fs/xfs/xfs_reflink.c |   20 +++-
> > >  fs/xfs/xfs_reflink.h |2 +-
> > >  3 files changed, 17 insertions(+), 10 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > > index bc9e94bcb7a3..b2b15b8dc4a1 100644
> > > --- a/fs/xfs/xfs_file.c
> > > +++ b/fs/xfs/xfs_file.c
> > > @@ -928,14 +928,11 @@ xfs_file_remap_range(
> > >   loff_t  len,
> > >   unsigned intremap_flags)
> > >  {
> > > - int ret;
> > > -
> > >   if (!remap_check_flags(remap_flags, RFR_SAME_DATA))
> > >   return -EINVAL;
> > >  
> > > - ret = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> > > + return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
> > >   len, remap_flags);
> > 
> > Is there any reason not to merge xfs_file_remap_range and
> > xfs_reflink_remap_range at this point?
> 
> Yeah, that seems like a good idea to me - pulling all the
> vfs/generic code interactions back up into xfs_file.c would match
> how the rest of the file operations are layered w.r.t. external and
> internal XFS code...

Yeah, ditto ocfs2.  I'll look into throwing a few more refactors onto
the end of this series... 8-D

--D

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> da...@fromorbit.com


Re: Interpreting `btrfs filesystem show'

2018-10-15 Thread Martin Steigerwald
Hugo Mills - 15.10.18, 16:26:
> On Mon, Oct 15, 2018 at 05:24:08PM +0300, Anton Shepelev wrote:
> > Hello, all
> > 
> > While trying to resolve free space problems, and found that
> > 
> > I cannot interpret the output of:
> > > btrfs filesystem show
> > 
> > Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
> > Total devices 1 FS bytes used 34.06GiB
> > devid1 size 40.00GiB used 37.82GiB path /dev/sda2
> > 
> > How come the total used value is less than the value listed
> > for the only device?
> 
>"Used" on the device is the mount of space allocated. "Used" on the
> FS is the total amount of actual data and metadata in that
> allocation.
> 
>You will also need to look at the output of "btrfs fi df" to see
> the breakdown of the 37.82 GiB into data, metadata and currently
> unused.

I usually use btrfs fi usage -T, cause

1. It has all the information.

2. It differentiates between used and allocated.

% btrfs fi usage -T /
Overall:
Device size: 100.00GiB
Device allocated: 54.06GiB
Device unallocated:   45.94GiB
Device missing:  0.00B
Used: 46.24GiB
Free (estimated): 25.58GiB  (min: 25.58GiB)
Data ratio:   2.00
Metadata ratio:   2.00
Global reserve:   70.91MiB  (used: 0.00B)

Data Metadata  System  
Id Path RAID1RAID1 RAID1Unallocated
--   -  ---
 2 /dev/mapper/msata-debian 25.00GiB   2.00GiB 32.00MiB22.97GiB
 1 /dev/mapper/sata-debian  25.00GiB   2.00GiB 32.00MiB22.97GiB
--   -  ---
   Total25.00GiB   2.00GiB 32.00MiB45.94GiB
   Used 22.38GiB 754.66MiB 16.00KiB  


For RAID it in some place reports the raw size and sometimes the logical 
size. Especially in the "Total" line I find this a bit inconsistent. 
"RAID1" columns show logical size, "Unallocated" shows raw size.

Also "Used:" in the global section shows raw size and "Free 
(estimated):" shows logical size.

Thanks
-- 
Martin




Re: [PATCH 11/25] vfs: pass remap flags to generic_remap_file_range_prep

2018-10-15 Thread Darrick J. Wong
On Sun, Oct 14, 2018 at 10:37:38AM -0700, Christoph Hellwig wrote:
> > +   bool is_dedupe = (remap_flags & RFR_SAME_DATA);
> 
> Btw, I think the code would be cleaner if we dropped this variable.

Ok to both.  I'll move up the patch to replace is_dedupe with
remap_flags to avoid churning the _touch function too, btw.

--D


Re: [PATCH 16/25] vfs: make remapping to source file eof more explicit

2018-10-15 Thread Darrick J. Wong
On Sun, Oct 14, 2018 at 10:24:33AM -0700, Christoph Hellwig wrote:
> On Fri, Oct 12, 2018 at 05:07:37PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong 
> > 
> > Create a RFR_TO_SRC_EOF flag to explicitly declare that the caller wants
> > the remap implementation to remap to the end of the source file, once
> > the files are locked.
> 
> The name looks like a cat threw up on your keyboard :)

Yeah... :(

> From reading the code this seems to ask for a whole file remap, right?

Nope.  In the original btrfs clonerange ioctl, length == 0 meant "to EOF".
If you made a call like this:

struct btrfs_ioctl_clone_range_args x = {
.src_offset = 16384,
.src_length = 0,
.dest_offset = 0,
.src_fd = whatever,
};
ftruncate(dest_fd, 0);
ioctl(dest_fd, BTRFS_IOC_CLONE, &x);

Then dest_fd ends up with the contents of [16k...EOF] from src_fd.
It's annoying to have the magic length number (and no flags?) but we're
stuck with this quirk of the interface.

> Why not put that in the name to make it more descriptive?

I'm all ears for better suggestions. :)

--D


Spurious mount point

2018-10-15 Thread Anton Shepelev
Hello, all

How can I track down the origin of this mount point:

   /dev/sda2 on /home/hana type btrfs 
(rw,relatime,space_cache,subvolid=259,subvol=/@/.snapshots/1/snapshot/hana)

if it is not present in /etc/fstab?  I shouldn't like to
find/grep thoughout the whole filesystem.

-- 
()  ascii ribbon campaign - against html e-mail
/\  http://preview.tinyurl.com/qcy6mjc [archived]


Re: Interpreting `btrfs filesystem show'

2018-10-15 Thread Austin S. Hemmelgarn

On 2018-10-15 10:42, Anton Shepelev wrote:

Hugo Mills to Anton Shepelev:


While trying to resolve free space problems, and found
that I cannot interpret the output of:


btrfs filesystem show


Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
Total devices 1 FS bytes used 34.06GiB
devid1 size 40.00GiB used 37.82GiB path /dev/sda2

How come the total used value is less than the value
listed for the only device?


   "Used" on the device is the mount of space allocated.
"Used" on the FS is the total amount of actual data and
metadata in that allocation.

   You will also need to look at the output of "btrfs fi
df" to see the breakdown of the 37.82 GiB into data,
metadata and currently unused.

See
https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools
 for the details


Thank you, Hugo, understood.  mount/amount is a very fitting
typo :-)

Does the standard `du' tool work correctly for btfrfs?

For the default 'physical usage' mode, it functionally does not work 
correctly, because it does not know about reflinks.  The easiest way to 
see this is to create a couple of snapshots of a subvolume alongside the 
subvolume, and then run `du -s --totals` on those snapshots and the 
subvolume.  It will report the total space usage to be equal to the sum 
of the values reported for each snapshot and the subvolume, when it 
should instead only count the space usage for shared data once.


For the 'apparent usage' mode provided by the GNU implementation, it 
does work correctly.


Re: Interpreting `btrfs filesystem show'

2018-10-15 Thread Hugo Mills
On Mon, Oct 15, 2018 at 05:40:40PM +0300, Anton Shepelev wrote:
> Hugo Mills to Anton Shepelev:
> 
> >>While trying to resolve free space problems, and found
> >>that I cannot interpret the output of:
> >>
> >>> btrfs filesystem show
> >>
> >>Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
> >>Total devices 1 FS bytes used 34.06GiB
> >>devid1 size 40.00GiB used 37.82GiB path /dev/sda2
> >>
> >>How come the total used value is less than the value
> >>listed for the only device?
> >
> >   "Used" on the device is the mount of space allocated.
> >"Used" on the FS is the total amount of actual data and
> >metadata in that allocation.
> >
> >   You will also need to look at the output of "btrfs fi
> >df" to see the breakdown of the 37.82 GiB into data,
> >metadata and currently unused.
> >
> >See
> >https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools
> > for the details
> 
> Thank you, Hugo, understood.  mount/amount is a very fitting
> typo :-)
> 
> Do the standard `du' and `du' tools report correct values
> with btrfs?

   Well...

   du will tell you the size of the files you asked it about, but it
doesn't know about reflinks, so it'll double-count if you've got a
reflink copy of something. Other than that, it should be accurate, I
think. There's also a "btrfs fi du" which can tell you the amount of
shared and unique data as well, so you can know, for example, how much
space you'll reclaim if you delete those files.

   df should be mostly OK, but it does sometimes get its estimate of
the total usable size of the FS wrong, particularly if the FS is
unbalanced. However, as the FS fills up, the estimate gets better,
because it gets more evenly balanced across devices over time.

   Hugo.

-- 
Hugo Mills | "Your problem is that you have a negative
hugo@... carfax.org.uk | personality."
http://carfax.org.uk/  | "No, I don't!"
PGP: E2AB1DE4  |  Londo and Vir, Babylon 5


signature.asc
Description: Digital signature


Re: Interpreting `btrfs filesystem show'

2018-10-15 Thread Anton Shepelev
Hugo Mills to Anton Shepelev:

>>While trying to resolve free space problems, and found
>>that I cannot interpret the output of:
>>
>>> btrfs filesystem show
>>
>>Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
>>Total devices 1 FS bytes used 34.06GiB
>>devid1 size 40.00GiB used 37.82GiB path /dev/sda2
>>
>>How come the total used value is less than the value
>>listed for the only device?
>
>   "Used" on the device is the mount of space allocated.
>"Used" on the FS is the total amount of actual data and
>metadata in that allocation.
>
>   You will also need to look at the output of "btrfs fi
>df" to see the breakdown of the 37.82 GiB into data,
>metadata and currently unused.
>
>See
>https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools
> for the details

Thank you, Hugo, understood.  mount/amount is a very fitting
typo :-)

Does the standard `du' tool work correctly for btfrfs?

-- 
()  ascii ribbon campaign - against html e-mail
/\  http://preview.tinyurl.com/qcy6mjc [archived]


Re: Interpreting `btrfs filesystem show'

2018-10-15 Thread Hugo Mills
On Mon, Oct 15, 2018 at 02:26:41PM +, Hugo Mills wrote:
> On Mon, Oct 15, 2018 at 05:24:08PM +0300, Anton Shepelev wrote:
> > Hello, all
> > 
> > While trying to resolve free space problems, and found that
> > I cannot interpret the output of:
> > 
> > > btrfs filesystem show
> > 
> > Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
> > Total devices 1 FS bytes used 34.06GiB
> > devid1 size 40.00GiB used 37.82GiB path /dev/sda2
> > 
> > How come the total used value is less than the value listed
> > for the only device?
> 
>"Used" on the device is the mount of space allocated. "Used" on the

s/mount/amount/

> FS is the total amount of actual data and metadata in that allocation.
> 
>You will also need to look at the output of "btrfs fi df" to see
> the breakdown of the 37.82 GiB into data, metadata and currently
> unused.
> 
>See 
> https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools
>  for the details.
> 
>Hugo.
> 

-- 
Hugo Mills | "Your problem is that you have a negative
hugo@... carfax.org.uk | personality."
http://carfax.org.uk/  | "No, I don't!"
PGP: E2AB1DE4  |  Londo and Vir, Babylon 5


signature.asc
Description: Digital signature


Re: Interpreting `btrfs filesystem show'

2018-10-15 Thread Hugo Mills
On Mon, Oct 15, 2018 at 05:24:08PM +0300, Anton Shepelev wrote:
> Hello, all
> 
> While trying to resolve free space problems, and found that
> I cannot interpret the output of:
> 
> > btrfs filesystem show
> 
> Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
> Total devices 1 FS bytes used 34.06GiB
> devid1 size 40.00GiB used 37.82GiB path /dev/sda2
> 
> How come the total used value is less than the value listed
> for the only device?

   "Used" on the device is the mount of space allocated. "Used" on the
FS is the total amount of actual data and metadata in that allocation.

   You will also need to look at the output of "btrfs fi df" to see
the breakdown of the 37.82 GiB into data, metadata and currently
unused.

   See 
https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools
 for the details.

   Hugo.

-- 
Hugo Mills | "Your problem is that you have a negative
hugo@... carfax.org.uk | personality."
http://carfax.org.uk/  | "No, I don't!"
PGP: E2AB1DE4  |  Londo and Vir, Babylon 5


signature.asc
Description: Digital signature


Interpreting `btrfs filesystem show'

2018-10-15 Thread Anton Shepelev
Hello, all

While trying to resolve free space problems, and found that
I cannot interpret the output of:

> btrfs filesystem show

Label: none  uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8
Total devices 1 FS bytes used 34.06GiB
devid1 size 40.00GiB used 37.82GiB path /dev/sda2

How come the total used value is less than the value listed
for the only device?

-- 
()  ascii ribbon campaign - against html e-mail
/\  http://preview.tinyurl.com/qcy6mjc [archived]


[PATCH] btrfs: Adjust loop in free_extent_buffer

2018-10-15 Thread Nikolay Borisov
The loop construct in free_extent_buffer was added in
242e18c7c1a8 ("Btrfs: reduce lock contention on extent buffer locks")
as means of reducing the times the eb lock is taken, the non-last ref
count is decremented and lock is released. As the special handling
of UNMAPPED extent buffers was removed now there is only one decrement
op which is happening for EXTENT_BUFFER_UNMAPPED case. This commit
modifies the loop condition so that in case of UNMAPPED buffers the
eb's lock is taken only if we are 100% sure the eb is going to be freed
by the current executor of the code. Additionally, remove superfluous 
ref count  ops in btrfs test. 

Signed-off-by: Nikolay Borisov 
---

This was tested with xfstest multiple times + unloading of btrfs module to 
ensure no eb leaks are present. 


 fs/btrfs/extent_io.c | 4 +++-
 fs/btrfs/tests/btrfs-tests.c | 2 --
 fs/btrfs/tests/inode-tests.c | 6 --
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0e9d4f28379d..544a1a5fd416 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5095,7 +5095,9 @@ void free_extent_buffer(struct extent_buffer *eb)
 
while (1) {
refs = atomic_read(&eb->refs);
-   if (refs <= 3)
+   if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 
3)
+   || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
+   refs == 1))
break;
old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
if (old == refs)
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 4cb8fcfd3ec4..8a59597f1883 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -177,8 +177,6 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
if (root->node) {
/* One for allocate_extent_buffer */
free_extent_buffer(root->node);
-   /* One for get_exent_buffer */
-   free_extent_buffer(root->node);
}
kfree(root);
 }
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 64043f028820..af0c8e30d9e2 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -254,11 +254,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, 
u32 nodesize)
goto out;
}
 
-   /*
-* We will just free a dummy node if it's ref count is 2 so we need an
-* extra ref so our searches don't accidentally release our page.
-*/
-   extent_buffer_get(root->node);
btrfs_set_header_nritems(root->node, 0);
btrfs_set_header_level(root->node, 0);
ret = -EINVAL;
@@ -860,7 +855,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
 
-   extent_buffer_get(root->node);
btrfs_set_header_nritems(root->node, 0);
btrfs_set_header_level(root->node, 0);
BTRFS_I(inode)->root = root;
-- 
2.7.4



Re: [PATCH 0/6] Some trivail cleanup about dealyed-refs

2018-10-15 Thread David Sterba
On Mon, Oct 15, 2018 at 10:39:19AM +0800, Lu Fengqi wrote:
> On Thu, Oct 11, 2018 at 01:51:37PM +0200, David Sterba wrote:
> >On Thu, Oct 11, 2018 at 01:40:32PM +0800, Lu Fengqi wrote:
> >> There is no functional change. Just improve readablity.
> >> 
> >> PATCH 1-4 parameter cleanup patches
> >> PATCH 5 cleanup about btrfs_select_ref_head
> >> PATCH 6 switch int to bool; add some comment
> >> 
> >> Lu Fengqi (6):
> >>   btrfs: delayed-ref: pass delayed_refs directly to
> >> btrfs_select_ref_head()
> >>   btrfs: delayed-ref: pass delayed_refs directly to
> >> btrfs_delayed_ref_lock()
> >>   btrfs: remove fs_info from btrfs_check_space_for_delayed_refs
> >>   btrfs: remove fs_info from btrfs_should_throttle_delayed_refs
> >>   btrfs: simplify btrfs_select_ref_head and cleanup some local variables
> >>   btrfs: switch return_bigger to bool in find_ref_head
> >
> >1-4 and 6 added to misc-next, thanks.
> 
> There is not patch 2 at the misc-next branch. So it was forgotten?

You're right, got lost somewhere, now in misc-next. Thanks for checking.


Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range

2018-10-15 Thread Matthew Wilcox
On Sun, Oct 14, 2018 at 10:19:27AM -0700, Christoph Hellwig wrote:
> > unsigned (*mmap_capabilities)(struct file *);
> >  #endif
> > ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, 
> > loff_t, size_t, unsigned int);
> > -   int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, 
> > u64);
> > -   int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, 
> > u64);
> > +   int (*remap_file_range)(struct file *file_in, loff_t pos_in,
> > +   struct file *file_out, loff_t pos_out,
> > +   u64 len, unsigned int remap_flags);
> 
> None of the other methods in this file name their parameters.  While
> I generally don't like people leaving them out, in the end consistency
> is even more important.

I would agree with you *except* that the parameters do not follow memcpy()
traditional order (dst, src, len).  Instead they are (src, dst, len), so we
should probably name them to advise the poor sod who has to implement this
that we've chosen an inconsistent API.

Or we could fix it.



Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range

2018-10-15 Thread Amir Goldstein
On Mon, Oct 15, 2018 at 3:47 PM Christoph Hellwig  wrote:
>
> On Mon, Oct 15, 2018 at 09:04:13AM +0300, Amir Goldstein wrote:
> > I supposed you figured out the reason already.
>
> No, I hadn't.
>
> > It makes it appearance in patch 16/25 as RFR_VFS_FLAGS.
> > All those "advisory" flags, we want to pass them in to filesystem as FYI,
> > but we don't want to explicitly add support for e.g. RFR_CAN_SHORTEN
> > to every filesystem, when vfs has already taken care of the advice.
>
> I don't think this model makes sense.  If they really are purely
> handled in the VFS we can mask them before passing them to the file
> system, if not we need to check them, or the they are avisory and
> we can have a simple #define instead of the helper.
>
> RFR_TO_SRC_EOF is checked in generic_remap_file_range_prep,
> so the file system should know about it  Also looking at it again now
> it seems entirely superflous - we can just pass down then len == we
> use in higher level code instead of having a flag and will side step
> the issue here.
>
> RFR_CAN_SHORTEN is advisory as no one has to shorten, but that can
> easily be solved by including it everywhere.
>
> RFR_SHORT_DEDUPE is as far as I can tell entirely superflous to
> start with, as RFR_CAN_SHORTEN can be used instead.
>
> So something like this in fs.h:
>
> #define REMAP_FILE_ADVISORY_FLAGS   REMAP_FILE_CAN_SHORTEN
>
> And then in the file system:
>
> if (flags & ~REMAP_FILE_ADVISORY_FLAGS)
> -EINVAL;
>
> or
>
> if (flags & ~(REMAP_FILE_ADVISORY_FLAGS | REMAP_FILE_DEDUP))
> -EINVAL;
>
> should be all that is needed.

Yeh, I suppose that makes sense.

Thanks,
Amir.


Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range

2018-10-15 Thread Christoph Hellwig
On Mon, Oct 15, 2018 at 09:04:13AM +0300, Amir Goldstein wrote:
> I supposed you figured out the reason already.

No, I hadn't.

> It makes it appearance in patch 16/25 as RFR_VFS_FLAGS.
> All those "advisory" flags, we want to pass them in to filesystem as FYI,
> but we don't want to explicitly add support for e.g. RFR_CAN_SHORTEN
> to every filesystem, when vfs has already taken care of the advice.

I don't think this model makes sense.  If they really are purely
handled in the VFS we can mask them before passing them to the file
system, if not we need to check them, or the they are avisory and
we can have a simple #define instead of the helper.

RFR_TO_SRC_EOF is checked in generic_remap_file_range_prep,
so the file system should know about it  Also looking at it again now
it seems entirely superflous - we can just pass down then len == we
use in higher level code instead of having a flag and will side step
the issue here.

RFR_CAN_SHORTEN is advisory as no one has to shorten, but that can
easily be solved by including it everywhere.

RFR_SHORT_DEDUPE is as far as I can tell entirely superflous to
start with, as RFR_CAN_SHORTEN can be used instead.

So something like this in fs.h:

#define REMAP_FILE_ADVISORY_FLAGS   REMAP_FILE_CAN_SHORTEN

And then in the file system:

if (flags & ~REMAP_FILE_ADVISORY_FLAGS)
-EINVAL;

or

if (flags & ~(REMAP_FILE_ADVISORY_FLAGS | REMAP_FILE_DEDUP))
-EINVAL;

should be all that is needed.


Re: reproducible builds with btrfs seed feature

2018-10-15 Thread Austin S. Hemmelgarn

On 2018-10-13 18:28, Chris Murphy wrote:

Is it practical and desirable to make Btrfs based OS installation
images reproducible? Or is Btrfs simply too complex and
non-deterministic? [1]

The main three problems with Btrfs right now for reproducibility are:
a. many objects have uuids other than the volume uuid; and mkfs only
lets us set the volume uuid
b. atime, ctime, mtime, otime; and no way to make them all the same
c. non-deterministic allocation of file extents, compression, inode
assignment, logical and physical address allocation

I'm imagining reproducible image creation would be a mkfs feature that
builds on Btrfs seed and --rootdir concepts to constrain Btrfs
features to maybe make reproducible Btrfs volumes possible:

- No raid
- Either all objects needing uuids can have those uuids specified by
switch, or possibly a defined set of uuids expressly for this use
case, or possibly all of them can just be zeros (eek? not sure)
- A flag to set all times the same
- Possibly require that target block device is zero filled before
creation of the Btrfs
- Possibly disallow subvolumes and snapshots
- Require the resulting image is seed/ro and maybe also a new
compat_ro flag to enforce that such Btrfs file systems cannot be
modified after the fact.
- Enforce a consistent means of allocation and compression

The end result is creating two Btrfs volumes would yield image files
with matching hashes.
So in other words, you care about matching the block layout _exactly_. 
This is a great idea for paranoid people, but it's usually overkill. 
Realistically, almost nothing in userspace cares about the block layout, 
worrying about it just makes verifying the reproduced image a bit easier 
(there's no reason you can't verify all the relevant data without doing 
a checksum or HMAC of the image as a whole).


If I had to guess, the biggest challenge would be allocation. But it's
also possible that such an image may have problems with "sprouts". A
non-removable sprout seems fairly straightforward and safe; but if a
"reproducible build" type of seed is removed, it seems like removal
needs to be smart enough to refresh *all* uuids found in the sprout: a
hard break from the seed.

Competing file systems, ext4 with make_ext4 fork, and squashfs. At the
moment I'm thinking it might be easier to teach squashfs integrity
checking than to make Btrfs reproducible.  But then I also think
restricting Btrfs features, and applying some requirements to
constrain Btrfs to make it reproducible, really enhances the Btrfs
seed-sprout feature.

Any thoughts? Useful? Difficult to implement?

Squashfs might be a better fit for this use case *if* it can be taught
about integrity checking. It does per file checksums for the purpose
of deduplication but those checksums aren't retained for later
integrity checking.
I've seen projects with SquashFS that store integrity data separately 
but leverage other infrastructure.  Methods I've seen so far include:


* GPG-signed SquashFS images, usually with detached signatures
* SquashFS with PAR2 integrity checking data
* SquashFS on top of dm-verity
* SquashFS on top of dm-integrity

The first two need to be externally checked prior to mount, but doing so 
is not hard.  The fourth is tricky to set up right, but provides better 
integration with encrypted images.  The third does exactly what's needed 
though.  You just use the embedded data variant of dm-verity, bind the 
resultant image to a loop device, activate dm-verity on the loop device, 
and mount the resultant mapped device like any other SquashFS image.


I've also seen some talk of using SquashFS with IMA and IMA appraisal, 
but I've not seen anybody actually _do_ that, and it wouldn't be on 
quite the level you seem to want (it verifies the files in the image, 
but not the image as a whole).


Re: BTRFS bad block management. Does it exist?

2018-10-15 Thread Austin S. Hemmelgarn

On 2018-10-14 07:08, waxhead wrote:

In case BTRFS fails to WRITE to a disk. What happens?
Does the bad area get mapped out somehow? Does it try again until it 
succeed or until it "times out" or reach a threshold counter?
Does it eventually try to write to a different disk (in case of using 
the raid1/10 profile?)


Building on Qu's answer (which is absolutely correct), BTRFS makes the 
perfectly reasonable assumption that you're not trying to use known bad 
hardware.  It's not alone in this respect either, pretty much every 
Linux filesystem makes the exact same assumption (and almost all 
non-Linux ones too), because it really is a perfectly reasonable 
assumption.  The only exception is ext[234], but they only support it 
statically (you can set the bad block list at mkfs time, but not 
afterwards, and they don't update it at runtime), and it's a holdover 
from earlier filesystems which originated at a time when storage was 
sufficiently expensive _and_ unreliable that you kept using disks until 
they were essentially completely dead.


The reality is that with modern storage hardware, if you have 
persistently bad sectors the device is either defective (and should be 
returned under warranty), or it's beyond expected EOL (and should just 
be replaced).  Most people know about SSD's doing block remapping to 
avoid bad blocks, but hard drives do it to, and they're actually rather 
good at it.  In both cases, enough spare blocks are provided that the 
device can handle average rates of media errors through the entirety of 
it's average life expectancy without running out of spare blocks.


On top of all of that though, it's fully possible to work around bad 
blocks in the block layer if you take the time to actually do it.  With 
a bit of reasonably simple math, you can easily set up an LVM volume 
that actively avoids all the bad blocks on a disk while still fully 
utilizing the rest of the volume.  Similarly, with a bit of work (and a 
partition table that supports _lots_ of partitions) you can work around 
bad blocks with an MD concatenated device.


[PATCH] btrfs: fix test btrfs/007 to not leave temporary files in /tmp

2018-10-15 Thread fdmanana
From: Filipe Manana 

This test was using the "mktemp -d" command to create a temporary
directory for storing send streams and computations from fssum, without
ever deleting them when it finishes. Therefore after running it for many
times it filled up all space from /tmp.

Fix this by using a temporary directory in TEST_DEV instead, as all the
more recent send/receive tests do, to store these files, and making sure
they get deleted when the test finishes. On average the sum of the size
of those files is between 5.5Mb to 6Mb, but changing the number of
operations for fsstress makes it even bigger.

Signed-off-by: Filipe Manana 
---
 tests/btrfs/007 | 33 +++--
 tests/btrfs/007.out |  1 -
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/tests/btrfs/007 b/tests/btrfs/007
index 09f2f011..438f2f27 100755
--- a/tests/btrfs/007
+++ b/tests/btrfs/007
@@ -16,14 +16,14 @@ seq=`basename $0`
 seqres=$RESULT_DIR/$seq
 echo "QA output created by $seq"
 
-tmp=`mktemp -d`
+tmp=/tmp/$$
 status=1
 
 _cleanup()
 {
-   echo "*** unmount"
-   _scratch_unmount 2>/dev/null
+   cd /
rm -f $tmp.*
+   rm -fr $send_files_dir
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -38,7 +38,11 @@ _require_scratch
 _require_fssum
 _require_seek_data_hole
 
+send_files_dir=$TEST_DIR/btrfs-test-$seq
+
 rm -f $seqres.full
+rm -fr $send_files_dir
+mkdir $send_files_dir
 
 workout()
 {
@@ -57,19 +61,20 @@ workout()
 
_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT 
$SCRATCH_MNT/incr
 
-   echo "# $BTRFS_UTIL_PROG send $SCRATCH_MNT/base > $tmp/base.snap" \
+   echo "# $BTRFS_UTIL_PROG send $SCRATCH_MNT/base > 
${send_files_dir}/base.snap" \
>> $seqres.full
-   $BTRFS_UTIL_PROG send $SCRATCH_MNT/base > $tmp/base.snap 2>> 
$seqres.full \
+   $BTRFS_UTIL_PROG send $SCRATCH_MNT/base > $send_files_dir/base.snap 2>> 
$seqres.full \
|| _fail "failed: '$@'"
echo "# $BTRFS_UTIL_PROG send -p $SCRATCH_MNT/base\
-   $SCRATCH_MNT/incr > $tmp/incr.snap" >> $seqres.full
+   $SCRATCH_MNT/incr > ${send_files_dir}/incr.snap" >> $seqres.full
$BTRFS_UTIL_PROG send -p $SCRATCH_MNT/base \
-   $SCRATCH_MNT/incr > $tmp/incr.snap 2>> $seqres.full \
+   $SCRATCH_MNT/incr > $send_files_dir/incr.snap 2>> $seqres.full \
|| _fail "failed: '$@'"
 
-   run_check $FSSUM_PROG -A -f -w $tmp/base.fssum $SCRATCH_MNT/base
-   run_check $FSSUM_PROG -A -f -w $tmp/incr.fssum -x 
$SCRATCH_MNT/incr/base \
-   $SCRATCH_MNT/incr
+   run_check $FSSUM_PROG -A -f -w $send_files_dir/base.fssum \
+   $SCRATCH_MNT/base
+   run_check $FSSUM_PROG -A -f -w $send_files_dir/incr.fssum \
+   -x $SCRATCH_MNT/incr/base $SCRATCH_MNT/incr
 
_scratch_unmount >/dev/null 2>&1
echo "*** mkfs -dsize=$fsz">>$seqres.full
@@ -78,11 +83,11 @@ workout()
|| _fail "size=$fsz mkfs failed"
_scratch_mount "-o noatime"
 
-   _run_btrfs_util_prog receive $SCRATCH_MNT < $tmp/base.snap
-   run_check $FSSUM_PROG -r $tmp/base.fssum $SCRATCH_MNT/base
+   _run_btrfs_util_prog receive $SCRATCH_MNT < $send_files_dir/base.snap
+   run_check $FSSUM_PROG -r $send_files_dir/base.fssum $SCRATCH_MNT/base
 
-   _run_btrfs_util_prog receive $SCRATCH_MNT < $tmp/incr.snap
-   run_check $FSSUM_PROG -r $tmp/incr.fssum $SCRATCH_MNT/incr
+   _run_btrfs_util_prog receive $SCRATCH_MNT < $send_files_dir/incr.snap
+   run_check $FSSUM_PROG -r $send_files_dir/incr.fssum $SCRATCH_MNT/incr
 }
 
 echo "*** test send / receive"
diff --git a/tests/btrfs/007.out b/tests/btrfs/007.out
index 8f8cec7d..5d029ceb 100644
--- a/tests/btrfs/007.out
+++ b/tests/btrfs/007.out
@@ -1,4 +1,3 @@
 QA output created by 007
 *** test send / receive
 *** done
-*** unmount
-- 
2.11.0



[PATCH] generic: test fsync after fallocate on a very small file

2018-10-15 Thread fdmanana
From: Filipe Manana 

Test that if we have a very small file, with a size smaller than the
block size, then fallocate a very small range within the block size but
past the file's current size, fsync the file and then power fail, after
mounting the filesystem all the file data is there and the file size is
correct.

This test is motivated by a failure in btrfs where it triggered an
assertion when using the no-holes feature, that is, when running with
MKFS_OPTIONS="-O no-holes". The btrfs issue is fixed by a patch for the
linux kernel titled:

 "Btrfs: fix assertion on fsync of regular file when using no-holes
  feature"

Signed-off-by: Filipe Manana 
---
 tests/generic/512 | 61 +++
 tests/generic/512.out |  9 
 tests/generic/group   |  1 +
 3 files changed, 71 insertions(+)
 create mode 100755 tests/generic/512
 create mode 100644 tests/generic/512.out

diff --git a/tests/generic/512 b/tests/generic/512
new file mode 100755
index ..f4e13c68
--- /dev/null
+++ b/tests/generic/512
@@ -0,0 +1,61 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 512
+#
+# Test that if we have a very small file, with a size smaller than the block
+# size, then fallocate a very small range within the block size but past the
+# file's current size, fsync the file and then power fail, after mounting the
+# filesystem all the file data is there and the file size is correct.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   _cleanup_flakey
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/dmflakey
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_xfs_io_command "falloc"
+_require_dm_target flakey
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_require_metadata_journaling $SCRATCH_DEV
+_init_flakey
+_mount_flakey
+
+$XFS_IO_PROG -f \
+-c "pwrite -S 0xb6 0 21" \
+-c "falloc 40 40" \
+-c "fsync" \
+$SCRATCH_MNT/foobar | _filter_xfs_io
+
+# Simulate a power failure and mount the filesystem. We expect no data loss
+# and a correct file size.
+_flakey_drop_and_remount
+
+echo "File content after power failure:"
+od -t x1 -A d $SCRATCH_MNT/foobar
+
+_unmount_flakey
+
+status=0
+exit
diff --git a/tests/generic/512.out b/tests/generic/512.out
new file mode 100644
index ..19a0a1b1
--- /dev/null
+++ b/tests/generic/512.out
@@ -0,0 +1,9 @@
+QA output created by 512
+wrote 21/21 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+File content after power failure:
+000 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6
+016 b6 b6 b6 b6 b6 00 00 00 00 00 00 00 00 00 00 00
+032 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+*
+080
diff --git a/tests/generic/group b/tests/generic/group
index 348214ac..d17a0248 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -514,3 +514,4 @@
 509 auto quick log
 510 auto quick log
 511 auto quick rw zero
+512 auto quick log prealloc
-- 
2.11.0



[PATCH] Btrfs: fix assertion on fsync of regular file when using no-holes feature

2018-10-15 Thread fdmanana
From: Filipe Manana 

When using the NO_HOLES feature and logging a regular file, we were
expecting that if we find an inline extent, that either its size in ram
(uncompressed and unenconded) matches the size of the file or if it does
not, that it matches the sector size and it represents compressed data.
This assertion does not cover a case where the length of the inline extent
is smaller then the sector size and also smaller the file's size, such
case is possible through fallocate. Example:

  $ mkfs.btrfs -f -O no-holes /dev/sdb
  $ mount /dev/sdb /mnt

  $ xfs_io -f -c "pwrite -S 0xb60 0 21" /mnt/foobar
  $ xfs_io -c "falloc 40 40" /mnt/foobar
  $ xfs_io -c "fsync" /mnt/foobar

In the abobe example we trigger the assertion because the inline extent's
length is 21 bytes while the file size is 80 bytes. The fallocate() call
merely updated the file's size and did not touch the existing inline
extent, as expected.

So fix this by adjusting the assertion so that an inline extent length
smaller then the file size is valid if the file size is smaller then the
filesystem's sector size.

A test case for fstests follows soon.

Reported-by: Anatoly Trosinenko 
Link: 
https://lore.kernel.org/linux-btrfs/CAE5jQCfRSBC7n4pUTFJcmHh109=gwyT9mFkCOL+NKfzswmR=_...@mail.gmail.com/
Signed-off-by: Filipe Manana 
---
 fs/btrfs/tree-log.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ed455dba..1673dccc76c2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4652,7 +4652,8 @@ static int btrfs_log_trailing_hole(struct 
btrfs_trans_handle *trans,
ASSERT(len == i_size ||
   (len == fs_info->sectorsize &&
btrfs_file_extent_compression(leaf, extent) !=
-   BTRFS_COMPRESS_NONE));
+   BTRFS_COMPRESS_NONE) ||
+  (len < i_size && i_size < fs_info->sectorsize));
return 0;
}
 
-- 
2.11.0



Re: How to recover from btrfs scrub errors? (uncorrectable errors, checksum error at logical)

2018-10-15 Thread Qu Wenruo


On 2018/10/15 下午3:50, Otto Kekäläinen wrote:
> Hello!
> 
> I am trying to figure out how to recover from errors detected by btrfs scrub.
> 
> Scrub status reports:
> 
> scrub status for 4f4479d5-648a-45b9-bcbf-978c766aeb41
> scrub started at Mon Oct 15 10:02:28 2018, running for 00:35:39
> total bytes scrubbed: 791.15GiB with 18 errors
> error details: csum=18
> corrected errors: 0, uncorrectable errors: 18, unverified errors: 0
> 
> Kernel log contains lines like
> 
>   BTRFS warning (device dm-8): checksum error at logical 7351706472448 on dev
>   /dev/mapper/disk6tb, sector 61412648, root 12725, inode 152358265,
> offset 483328:
>   path resolving failed with ret=-2
> 
> I've tried so far:
> - deleting the files (when path is visible)

Please ensure there are no other subvolumes/snapshots containing the
same file or reflink to it.

If path is not visible, please use the root and inode number to locate
the culprit file.
"find" command support to search using inode number.
And "btrfs subvolume list" command will show the subvolume number.

Also it's recommended to sync the fs before scrub, in case culprit inode
only get orphaned but not deleted from disk.

> - overwriting the files with new data

If you're only overwriting the culprit sector, it could get CoWed and
the original data extent is still there.

You need to ensure the old data is not referred by any other root/inode.
Please ensure there is no reflink/snapshot first.

Then delete the file or overwrite the whole culprit file.

Thanks,
Qu

> - changed disk (with btrfs replace)
> 
> The checksum errors however persist.
> How do I get rid of them?
> 
> 
> The files are logs and other non-vital information. I am fine by
> deleting the corrupted files. It is OK to recover so that I loose a
> few gigabytes of data, but not the entire filesystem.
> 
> Setup is a multi-disk btrfs filesystem, data single, metadata RAID-1
> Mounted with:
> 
> /dev/mapper/wdc3td on /data type btrfs
> (rw,noatime,compress=lzo,space_cache,subvolid=5,subvol=/)
> 
> I've read lots of online sources on the topic but none of these help
> me on how to recover from the current state:
> 
> https://btrfs.wiki.kernel.org/index.php/Btrfsck
> http://marc.merlins.org/perso/btrfs/post_2014-03-19_Btrfs-Tips_-Btrfs-Scrub-and-Btrfs-Filesystem-Repair.html
> https://wiki.archlinux.org/index.php/Identify_damaged_files#Find_damaged_files
> 



signature.asc
Description: OpenPGP digital signature


How to recover from btrfs scrub errors? (uncorrectable errors, checksum error at logical)

2018-10-15 Thread Otto Kekäläinen
Hello!

I am trying to figure out how to recover from errors detected by btrfs scrub.

Scrub status reports:

scrub status for 4f4479d5-648a-45b9-bcbf-978c766aeb41
scrub started at Mon Oct 15 10:02:28 2018, running for 00:35:39
total bytes scrubbed: 791.15GiB with 18 errors
error details: csum=18
corrected errors: 0, uncorrectable errors: 18, unverified errors: 0

Kernel log contains lines like

  BTRFS warning (device dm-8): checksum error at logical 7351706472448 on dev
  /dev/mapper/disk6tb, sector 61412648, root 12725, inode 152358265,
offset 483328:
  path resolving failed with ret=-2

I've tried so far:
- deleting the files (when path is visible)
- overwriting the files with new data
- changed disk (with btrfs replace)

The checksum errors however persist.
How do I get rid of them?


The files are logs and other non-vital information. I am fine by
deleting the corrupted files. It is OK to recover so that I loose a
few gigabytes of data, but not the entire filesystem.

Setup is a multi-disk btrfs filesystem, data single, metadata RAID-1
Mounted with:

/dev/mapper/wdc3td on /data type btrfs
(rw,noatime,compress=lzo,space_cache,subvolid=5,subvol=/)

I've read lots of online sources on the topic but none of these help
me on how to recover from the current state:

https://btrfs.wiki.kernel.org/index.php/Btrfsck
http://marc.merlins.org/perso/btrfs/post_2014-03-19_Btrfs-Tips_-Btrfs-Scrub-and-Btrfs-Filesystem-Repair.html
https://wiki.archlinux.org/index.php/Identify_damaged_files#Find_damaged_files