[PATCH 25/26] xfs: support returning partial reflink results
From: Darrick J. Wong Back when the XFS reflink code only supported clone_file_range, we were only able to return zero or negative error codes to userspace. However, now that copy_file_range (which returns bytes copied) can use XFS' clone_file_range, we have the opportunity to return partial results. For example, if userspace sends a 1GB clone request and we run out of space halfway through, we at least can tell userspace that we completed 512M of that request like a regular write. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c|5 + fs/xfs/xfs_reflink.c | 20 +++- fs/xfs/xfs_reflink.h |2 +- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 38fde4e11714..7d42ab8fe6e1 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -928,14 +928,11 @@ xfs_file_remap_range( loff_t len, unsigned intremap_flags) { - int ret; - if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - ret = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, + return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, len, remap_flags); - return ret < 0 ? ret : len; } STATIC int diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e8e86646bb4b..af3368862c56 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1123,6 +1123,7 @@ xfs_reflink_remap_blocks( struct xfs_inode*dest, xfs_fileoff_t destoff, xfs_filblks_t len, + xfs_filblks_t *remapped_len, xfs_off_t new_isize) { struct xfs_bmbt_irecimap; @@ -1130,6 +1131,7 @@ xfs_reflink_remap_blocks( int error = 0; xfs_filblks_t range_len; + *remapped_len = 0; /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ while (len) { uintlock_mode; @@ -1168,6 +1170,7 @@ xfs_reflink_remap_blocks( srcoff += range_len; destoff += range_len; len -= range_len; + *remapped_len += range_len; } return 0; @@ -1382,7 +1385,7 @@ xfs_reflink_remap_prep( /* * Link a range of blocks from one file to another. */ -int +loff_t xfs_reflink_remap_range( struct file *file_in, loff_t pos_in, @@ -1397,9 +1400,10 @@ xfs_reflink_remap_range( struct xfs_inode*dest = XFS_I(inode_out); struct xfs_mount*mp = src->i_mount; xfs_fileoff_t sfsbno, dfsbno; - xfs_filblks_t fsblen; + xfs_filblks_t fsblen, remappedfsb = 0; + loff_t remapped_bytes = 0; xfs_extlen_tcowextsize; - ssize_t ret; + int ret; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return -EOPNOTSUPP; @@ -1415,11 +1419,17 @@ xfs_reflink_remap_range( trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + if (len == 0) { + ret = 0; + goto out_unlock; + } + dfsbno = XFS_B_TO_FSBT(mp, pos_out); sfsbno = XFS_B_TO_FSBT(mp, pos_in); fsblen = XFS_B_TO_FSB(mp, len); ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, - pos_out + len); + &remappedfsb, pos_out + len); + remapped_bytes = min_t(loff_t, len, XFS_FSB_TO_B(mp, remappedfsb)); if (ret) goto out_unlock; @@ -1442,7 +1452,7 @@ xfs_reflink_remap_range( xfs_reflink_remap_unlock(file_in, file_out); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); - return ret; + return remapped_bytes > 0 ? remapped_bytes : ret; } /* diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index c3c46c276fe1..cbc26ff79a8f 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -27,7 +27,7 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); -extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, +extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,
[PATCH 24/26] xfs: fix pagecache truncation prior to reflink
From: Darrick J. Wong Prior to remapping blocks, it is necessary to remove pages from the destination file's page cache. Unfortunately, the truncation is not aggressive enough -- if page size > block size, we'll end up zeroing subpage blocks instead of removing them. So, round the start offset down and the end offset up to page boundaries. We already wrote all the dirty data so the larger range shouldn't be a problem. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_reflink.c |5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 9b1ea42c81d1..e8e86646bb4b 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1369,8 +1369,9 @@ xfs_reflink_remap_prep( goto out_unlock; /* Zap any page cache for the destination file's range. */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + *len) - 1); + truncate_inode_pages_range(&inode_out->i_data, + round_down(pos_out, PAGE_SIZE), + round_up(pos_out + *len, PAGE_SIZE) - 1); return 1; out_unlock:
[PATCH 26/26] xfs: remove redundant remap partial EOF block checks
From: Darrick J. Wong Now that we've moved the partial EOF block checks to the VFS helpers, we can remove the redundant functionality from XFS. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_reflink.c | 19 --- 1 file changed, 19 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index af3368862c56..755d4a9446e3 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1307,7 +1307,6 @@ xfs_reflink_remap_prep( struct inode*inode_out = file_inode(file_out); struct xfs_inode*dest = XFS_I(inode_out); boolsame_inode = (inode_in == inode_out); - u64 blkmask = i_blocksize(inode_in) - 1; ssize_t ret; /* Lock both files against IO */ @@ -1335,24 +1334,6 @@ xfs_reflink_remap_prep( if (ret < 0 || *len == 0) goto out_unlock; - /* -* If the dedupe data matches, chop off the partial EOF block -* from the source file so we don't try to dedupe the partial -* EOF block. -*/ - if (remap_flags & REMAP_FILE_DEDUP) { - *len &= ~blkmask; - } else if (*len & blkmask) { - /* -* The user is attempting to share a partial EOF block, -* if it's inside the destination EOF then reject it. -*/ - if (pos_out + *len < i_size_read(inode_out)) { - ret = -EINVAL; - goto out_unlock; - } - } - /* Attach dquots to dest inode before changing block map */ ret = xfs_qm_dqattach(dest); if (ret)
[PATCH 23/26] ocfs2: remove ocfs2_reflink_remap_range
From: Darrick J. Wong Since ocfs2_remap_file_range is a thin shell around ocfs2_remap_remap_range, move everything from the latter into the former. Signed-off-by: Darrick J. Wong --- fs/ocfs2/file.c | 68 +++- fs/ocfs2/refcounttree.c | 113 +++ fs/ocfs2/refcounttree.h | 24 +++--- 3 files changed, 102 insertions(+), 103 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8125c5ccf821..fe570824b991 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2531,11 +2531,75 @@ static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags) { + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb); + struct buffer_head *in_bh = NULL, *out_bh = NULL; + bool same_inode = (inode_in == inode_out); + loff_t remapped = 0; + ssize_t ret; + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; + if (!ocfs2_refcount_tree(osb)) + return -EOPNOTSUPP; + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; - return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, remap_flags); + /* Lock both files against IO */ + ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh); + if (ret) + return ret; + + /* Check file eligibility and prepare for block sharing. */ + ret = -EINVAL; + if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) || + (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) + goto out_unlock; + + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + &len, remap_flags); + if (ret < 0 || len == 0) + goto out_unlock; + + /* Lock out changes to the allocation maps and remap. */ + down_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem, + SINGLE_DEPTH_NESTING); + + /* Zap any page cache for the destination file's range. */ + truncate_inode_pages_range(&inode_out->i_data, + round_down(pos_out, PAGE_SIZE), + round_up(pos_out + len, PAGE_SIZE) - 1); + + remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, + inode_out, out_bh, pos_out, len); + up_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + up_write(&OCFS2_I(inode_out)->ip_alloc_sem); + if (remapped < 0) { + ret = remapped; + mlog_errno(ret); + goto out_unlock; + } + + /* +* Empty the extent map so that we may get the right extent +* record from the disk. +*/ + ocfs2_extent_map_trunc(inode_in, 0); + ocfs2_extent_map_trunc(inode_out, 0); + + ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + +out_unlock: + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return remapped > 0 ? remapped : ret; } const struct inode_operations ocfs2_file_iops = { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index c7409578657b..dc66b80585ec 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4468,9 +4468,9 @@ int ocfs2_reflink_ioctl(struct inode *inode, } /* Update destination inode size, if necessary. */ -static int ocfs2_reflink_update_dest(struct inode *dest, -struct buffer_head *d_bh, -loff_t newlen) +int ocfs2_reflink_update_dest(struct inode *dest, + struct buffer_head *d_bh, + loff_t newlen) { handle_t *handle; int ret; @@ -4621,13 +4621,13 @@ static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode, } /* Set up refcount tree and remap s_inode to t_inode. */ -static loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode, -struct buffer_head *s_bh, -loff_t pos_in, -struct inode *t_inode, -struct buffer_head *t_bh, -loff_t pos_out, -loff_t len) +loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode, +
[PATCH 22/26] ocfs2: support partial clone range and dedupe range
From: Darrick J. Wong Change the ocfs2 remap code to allow for returning partial results. Signed-off-by: Darrick J. Wong --- fs/ocfs2/file.c |7 + fs/ocfs2/refcounttree.c | 72 +-- fs/ocfs2/refcounttree.h | 12 3 files changed, 46 insertions(+), 45 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index fbaeafe44b5f..8125c5ccf821 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2531,14 +2531,11 @@ static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags) { - int ret; - if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - ret = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, remap_flags); - return ret < 0 ? ret : len; + return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, remap_flags); } const struct inode_operations ocfs2_file_iops = { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7c709229e108..c7409578657b 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4507,14 +4507,14 @@ static int ocfs2_reflink_update_dest(struct inode *dest, } /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */ -static int ocfs2_reflink_remap_extent(struct inode *s_inode, - struct buffer_head *s_bh, - loff_t pos_in, - struct inode *t_inode, - struct buffer_head *t_bh, - loff_t pos_out, - loff_t len, - struct ocfs2_cached_dealloc_ctxt *dealloc) +static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode, +struct buffer_head *s_bh, +loff_t pos_in, +struct inode *t_inode, +struct buffer_head *t_bh, +loff_t pos_out, +loff_t len, +struct ocfs2_cached_dealloc_ctxt *dealloc) { struct ocfs2_extent_tree s_et; struct ocfs2_extent_tree t_et; @@ -4522,8 +4522,9 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode, struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *ref_tree; struct ocfs2_super *osb; + loff_t remapped_bytes = 0; loff_t pstart, plen; - u32 p_cluster, num_clusters, slast, spos, tpos; + u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0; unsigned int ext_flags; int ret = 0; @@ -4605,30 +4606,34 @@ static int ocfs2_reflink_remap_extent(struct inode *s_inode, next_loop: spos += num_clusters; tpos += num_clusters; + remapped_clus += num_clusters; } -out: - return ret; + goto out; out_unlock_refcount: ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_root_bh); - return ret; +out: + remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus); + remapped_bytes = min_t(loff_t, len, remapped_bytes); + + return remapped_bytes > 0 ? remapped_bytes : ret; } /* Set up refcount tree and remap s_inode to t_inode. */ -static int ocfs2_reflink_remap_blocks(struct inode *s_inode, - struct buffer_head *s_bh, - loff_t pos_in, - struct inode *t_inode, - struct buffer_head *t_bh, - loff_t pos_out, - loff_t len) +static loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode, +struct buffer_head *s_bh, +loff_t pos_in, +struct inode *t_inode, +struct buffer_head *t_bh, +loff_t pos_out, +loff_t len) { struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_super *osb; struct ocfs2_dinode *dis; struct ocfs2_dinode *dit; - int ret; + loff_t ret; osb = OCFS2_SB(s_inode->i_sb); dis = (struct ocfs2_dinode *)s_bh->b_data; @@ -4700,7 +4705,7 @@ static int ocfs2_reflink_remap_blocks(struct inode *s_inode, /* Actually remap extents now. */ ret = ocfs2_
[PATCH 19/26] vfs: clean up generic_remap_file_range_prep return value
From: Darrick J. Wong Since the remap prep function can update the length of the remap request, we can change this function to return the usual return status instead of the odd behavior it has now. Signed-off-by: Darrick J. Wong --- fs/ocfs2/refcounttree.c |2 +- fs/read_write.c |6 +++--- fs/xfs/xfs_reflink.c|4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 6a42c04ac0ab..46bbd315c39f 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4852,7 +4852,7 @@ int ocfs2_reflink_remap_range(struct file *file_in, ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, &len, remap_flags); - if (ret <= 0) + if (ret < 0 || len == 0) goto out_unlock; /* Lock out changes to the allocation maps and remap. */ diff --git a/fs/read_write.c b/fs/read_write.c index 450e038e8617..37a7d3fe35d8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1872,8 +1872,8 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, * sense, and then flush all dirty data. Caller must ensure that the * inodes have been locked against any other modifications. * - * Returns: 0 for "nothing to clone", 1 for "something to clone", or - * the usual negative error code. + * If there's an error, then the usual negative error code is returned. + * Otherwise returns 0 with *len set to the request length. */ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, @@ -1954,7 +1954,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (ret) return ret; - return 1; + return 0; } EXPORT_SYMBOL(generic_remap_file_range_prep); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 3dbe5fb7e9c0..9b1ea42c81d1 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1329,7 +1329,7 @@ xfs_reflink_remap_prep( ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags); - if (ret <= 0) + if (ret < 0 || *len == 0) goto out_unlock; /* @@ -1409,7 +1409,7 @@ xfs_reflink_remap_range( /* Prepare and then clone file data. */ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, &len, remap_flags); - if (ret <= 0) + if (ret < 0 || len == 0) return ret; trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
[PATCH 15/26] vfs: plumb remap flags through the vfs clone functions
From: Darrick J. Wong Plumb a remap_flags argument through the {do,vfs}_clone_file_range functions so that clone can take advantage of it. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein --- fs/ioctl.c |2 +- fs/nfsd/vfs.c |2 +- fs/overlayfs/copy_up.c |2 +- fs/overlayfs/file.c|6 +++--- fs/read_write.c| 13 + include/linux/fs.h |4 ++-- 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/ioctl.c b/fs/ioctl.c index 72537b68c272..505275ec5596 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -232,7 +232,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, if (src_file.file->f_path.mnt != dst_file->f_path.mnt) goto fdput; cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff, - olen); + olen, 0); if (cloned < 0) ret = cloned; else if (olen && cloned != olen) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ac6cb6101cbe..726fc5b2b27a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -543,7 +543,7 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, { loff_t cloned; - cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count); + cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0); if (count && cloned != count) cloned = -EINVAL; return nfserrno(cloned < 0 ? cloned : 0); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 8750b7235516..5f82fece64a0 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -142,7 +142,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) } /* Try to use clone_file_range to clone up within the same fs */ - cloned = do_clone_file_range(old_file, 0, new_file, 0, len); + cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); if (cloned == len) goto out; /* Couldn't clone, so now we try to copy the data */ diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 6c3fec6168e9..0393815c8971 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -462,7 +462,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, case OVL_CLONE: ret = vfs_clone_file_range(real_in.file, pos_in, - real_out.file, pos_out, len); + real_out.file, pos_out, len, flags); break; case OVL_DEDUPE: @@ -512,8 +512,8 @@ static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in, !ovl_inode_upper(file_inode(file_out return -EPERM; - return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0, - op); + return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, + remap_flags, op); } const struct file_operations ovl_file_operations = { diff --git a/fs/read_write.c b/fs/read_write.c index 906e78be5001..791b406e8264 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1857,12 +1857,15 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, EXPORT_SYMBOL(generic_remap_file_range_prep); loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, loff_t len) + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); loff_t ret; + WARN_ON_ONCE(remap_flags); + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) @@ -1893,7 +1896,7 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, return ret; ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, len, 0); + file_out, pos_out, len, remap_flags); if (ret < 0) return ret; @@ -1904,12 +1907,14 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, EXPORT_SYMBOL(do_clone_file_range); loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, loff_t len) + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) { loff_t ret; file_start_write(file_out); - ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len); + ret = do_clone_file_range(file_in, pos_in, file_out, pos_out,
[PATCH 21/26] ocfs2: fix pagecache truncation prior to reflink
From: Darrick J. Wong Prior to remapping blocks, it is necessary to remove pages from the destination file's page cache. Unfortunately, the truncation is not aggressive enough -- if page size > block size, we'll end up zeroing subpage blocks instead of removing them. So, round the start offset down and the end offset up to page boundaries. We already wrote all the dirty data so the larger range should be fine. Signed-off-by: Darrick J. Wong --- fs/ocfs2/refcounttree.c |5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 2a5c96bc9677..7c709229e108 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4862,8 +4862,9 @@ int ocfs2_reflink_remap_range(struct file *file_in, SINGLE_DEPTH_NESTING); /* Zap any page cache for the destination file's range. */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + len) - 1); + truncate_inode_pages_range(&inode_out->i_data, + round_down(pos_out, PAGE_SIZE), + round_up(pos_out + len, PAGE_SIZE) - 1); ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out, out_bh, pos_out, len);
[PATCH 16/26] vfs: plumb remap flags through the vfs dedupe functions
From: Darrick J. Wong Plumb a remap_flags argument through the vfs_dedupe_file_range_one functions so that dedupe can take advantage of it. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein --- fs/overlayfs/file.c |3 ++- fs/read_write.c |9 ++--- include/linux/fs.h |2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 0393815c8971..84dd957efa24 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -467,7 +467,8 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, case OVL_DEDUPE: ret = vfs_dedupe_file_range_one(real_in.file, pos_in, - real_out.file, pos_out, len); + real_out.file, pos_out, len, + flags); break; } revert_creds(old_cred); diff --git a/fs/read_write.c b/fs/read_write.c index 791b406e8264..f6ab5beb935a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -2019,10 +2019,12 @@ EXPORT_SYMBOL(vfs_dedupe_file_range_compare); loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, -loff_t len) +loff_t len, unsigned int remap_flags) { loff_t ret; + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP)); + ret = mnt_want_write_file(dst_file); if (ret) return ret; @@ -2053,7 +2055,7 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, } ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, - dst_pos, len, REMAP_FILE_DEDUP); + dst_pos, len, remap_flags | REMAP_FILE_DEDUP); out_drop_write: mnt_drop_write_file(dst_file); @@ -2121,7 +2123,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) } deduped = vfs_dedupe_file_range_one(file, off, dst_file, - info->dest_offset, len); + info->dest_offset, len, + 0); if (deduped == -EBADE) info->status = FILE_DEDUPE_RANGE_DIFFERS; else if (deduped < 0) diff --git a/include/linux/fs.h b/include/linux/fs.h index bc353a5224a4..c0ae85a7bd9d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1854,7 +1854,7 @@ extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, - loff_t len); + loff_t len, unsigned int remap_flags); struct super_operations {
[PATCH 17/26] vfs: enable remap callers that can handle short operations
From: Darrick J. Wong Plumb in a remap flag that enables the filesystem remap handler to shorten remapping requests for callers that can handle it. Now copy_file_range can report partial success (in case we run up against alignment problems, resource limits, etc.). We also enable CAN_SHORTEN for fideduperange to maintain existing userspace-visible behavior where xfs/btrfs shorten the dedupe range to avoid stale post-eof data exposure. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein --- fs/read_write.c| 28 include/linux/fs.h |7 +-- mm/filemap.c | 16 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index f6ab5beb935a..ee9314b7bfc3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1593,7 +1593,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, cloned = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, - min_t(loff_t, MAX_RW_COUNT, len), 0); + min_t(loff_t, MAX_RW_COUNT, len), + REMAP_FILE_CAN_SHORTEN); if (cloned > 0) { ret = cloned; goto done; @@ -1721,6 +1722,8 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len, * can't meaningfully compare post-EOF contents. * * For clone we only link a partial EOF block above the destination file's EOF. + * + * Shorten the request if possible. */ static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, @@ -1729,16 +1732,24 @@ static int generic_remap_check_len(struct inode *inode_in, unsigned int remap_flags) { u64 blkmask = i_blocksize(inode_in) - 1; + loff_t new_len = *len; if ((*len & blkmask) == 0) return 0; - if (remap_flags & REMAP_FILE_DEDUP) - *len &= ~blkmask; - else if (pos_out + *len < i_size_read(inode_out)) - return -EINVAL; + if ((remap_flags & REMAP_FILE_DEDUP) || + pos_out + *len < i_size_read(inode_out)) + new_len &= ~blkmask; - return 0; + if (new_len == *len) + return 0; + + if (remap_flags & REMAP_FILE_CAN_SHORTEN) { + *len = new_len; + return 0; + } + + return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; } /* Update inode timestamps and remove security privileges when remapping. */ @@ -2023,7 +2034,8 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, { loff_t ret; - WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP)); + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | +REMAP_FILE_CAN_SHORTEN)); ret = mnt_want_write_file(dst_file); if (ret) @@ -2124,7 +2136,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) deduped = vfs_dedupe_file_range_one(file, off, dst_file, info->dest_offset, len, - 0); + REMAP_FILE_CAN_SHORTEN); if (deduped == -EBADE) info->status = FILE_DEDUPE_RANGE_DIFFERS; else if (deduped < 0) diff --git a/include/linux/fs.h b/include/linux/fs.h index c0ae85a7bd9d..594fe4ba0b15 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1726,14 +1726,17 @@ struct block_device_operations; * If it is called with len == 0 that means "remap to end of source file". * * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate) + * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request */ #define REMAP_FILE_DEDUP (1 << 0) +#define REMAP_FILE_CAN_SHORTEN (1 << 1) /* All valid REMAP_FILE flags */ -#define REMAP_FILE_VALID_FLAGS (REMAP_FILE_DEDUP) +#define REMAP_FILE_VALID_FLAGS (REMAP_FILE_DEDUP | \ +REMAP_FILE_CAN_SHORTEN) /* REMAP_FILE flags taken care of by the vfs. */ -#define REMAP_FILE_ADVISORY(0) +#define REMAP_FILE_ADVISORY(REMAP_FILE_CAN_SHORTEN) struct iov_iter; diff --git a/mm/filemap.c b/mm/filemap.c index 1e93269efafe..898eb358f7d2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3051,8 +3051,12 @@ int generic_remap_checks(struct file *file_in, loff_t pos_in, if (pos_in + count == size_in) { bcount = ALIGN(size_in, bs) - pos_in; } else { - if (!IS_ALIGNED(count, bs)) - return -EINVAL; + if (!IS_ALIGNED(count, bs)) { + if (remap_flags & REMAP_F
[PATCH 20/26] ocfs2: truncate page cache for clone destination file before remapping
From: Darrick J. Wong When cloning blocks into another file, truncate the page cache before we start remapping blocks so that concurrent reads wait for us to finish. Signed-off-by: Darrick J. Wong --- fs/ocfs2/refcounttree.c | 10 -- 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 46bbd315c39f..2a5c96bc9677 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4861,14 +4861,12 @@ int ocfs2_reflink_remap_range(struct file *file_in, down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem, SINGLE_DEPTH_NESTING); - ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out, -out_bh, pos_out, len); - /* Zap any page cache for the destination file's range. */ - if (!ret) - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + len) - 1); + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); + ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out, +out_bh, pos_out, len); up_write(&OCFS2_I(inode_in)->ip_alloc_sem); if (!same_inode) up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
[PATCH 18/26] vfs: hide file range comparison function
From: Darrick J. Wong There are no callers of vfs_dedupe_file_range_compare, so we might as well make it a static helper and remove the export. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Reviewed-by: Christoph Hellwig --- fs/read_write.c| 187 +--- include/linux/fs.h |3 - 2 files changed, 91 insertions(+), 99 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index ee9314b7bfc3..450e038e8617 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1776,6 +1776,97 @@ static int generic_remap_file_range_target(struct file *file, return file_remove_privs(file); } +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct page *page; + + page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. + * Caller must have locked both inodes to prevent write races. + */ +static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, +struct inode *dest, loff_t destoff, +loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} + /* * Check that the two inodes are eligible for cloning, the ranges make * sense, and then flush all dirty data. Caller must ensure that the @@ -1932,102 +2023,6 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(vfs_clone_file_range); -/* - * Read a page's worth of file data into the page cache. Return the page - * locked. - */ -static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) -{ - struct address_space *mapping; - struct page *page; - pgoff_t n; - - n = offset >> PAGE_SHIFT; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - lock_page(page); - return page; -} - -/* - * Compare extents of two files to see if they are the same. - * Caller must have locked both inodes to prevent write races. - */ -int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, - loff_t len, bool *is_same) -{ - loff_t src_poff; - loff_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - loff_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; -
[PATCH 14/26] vfs: make remap_file_range functions take and return bytes completed
From: Darrick J. Wong Change the remap_file_range functions to take a number of bytes to operate upon and return the number of bytes they operated on. This is a requirement for allowing fs implementations to return short clone/dedupe results to the user, which will enable us to obey resource limits in a graceful manner. A subsequent patch will enable copy_file_range to signal to the ->clone_file_range implementation that it can handle a short length, which will be returned in the function's return value. For now the short return is not implemented anywhere so the behavior won't change -- either copy_file_range manages to clone the entire range or it tries an alternative. Neither clone ioctl can take advantage of this, alas. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein --- Documentation/filesystems/vfs.txt |6 ++--- fs/btrfs/ctree.h |6 ++--- fs/btrfs/ioctl.c | 13 ++ fs/cifs/cifsfs.c |6 ++--- fs/ioctl.c| 10 +++- fs/nfs/nfs4file.c |6 ++--- fs/nfsd/vfs.c |8 +- fs/ocfs2/file.c | 16 ++-- fs/ocfs2/refcounttree.c |2 +- fs/ocfs2/refcounttree.h |2 +- fs/overlayfs/copy_up.c|6 ++--- fs/overlayfs/file.c | 12 + fs/read_write.c | 49 - fs/xfs/xfs_file.c |9 +-- fs/xfs/xfs_reflink.c |4 ++- fs/xfs/xfs_reflink.h |2 +- include/linux/fs.h| 27 +++- mm/filemap.c |2 +- 18 files changed, 106 insertions(+), 80 deletions(-) diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index bb3183334ab9..8ba47d9d6cae 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -883,9 +883,9 @@ struct file_operations { unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); - int (*remap_file_range)(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 len, unsigned int remap_flags); + loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); }; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 124a05662fc2..771a961d77ad 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3247,9 +3247,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_remap_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len, - unsigned int remap_flags); +loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bfd99c66723e..b0c513e10977 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4328,10 +4328,12 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, return ret; } -int btrfs_remap_file_range(struct file *src_file, loff_t off, - struct file *dst_file, loff_t destoff, u64 len, +loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { + int ret; + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; @@ -4349,10 +4351,11 @@ int btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; } - return btrfs_extent_same(src, off, len, dst, destoff); + ret = btrfs_extent_same(src, off, len, dst, destoff); + } else { + ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); } - - return btrfs_clone_files(dst_file, src_file, off, len, destoff); + return ret < 0 ? ret : len; } static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e8144d0dcde2..5ca71c6c8be2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/c
[PATCH 13/26] vfs: create generic_remap_file_range_touch to update inode metadata
From: Darrick J. Wong Create a new VFS helper to handle inode metadata updates when remapping into a file. If the operation can possibly alter the file contents, we must update the ctime and mtime and remove security privileges, just like we do for regular file writes. Wire up ocfs2 to ensure consistent behavior. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein --- fs/read_write.c | 28 fs/xfs/xfs_reflink.c | 23 --- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index ebcbfc4f2907..3f6392f1d5d4 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1737,6 +1737,30 @@ static int generic_remap_check_len(struct inode *inode_in, return 0; } +/* Update inode timestamps and remove security privileges when remapping. */ +static int generic_remap_file_range_target(struct file *file, + unsigned int remap_flags) +{ + int ret; + + /* If can't alter the file contents, we're done. */ + if (remap_flags & REMAP_FILE_DEDUP) + return 0; + + /* Update the timestamps, since we can alter file contents. */ + if (!(file->f_mode & FMODE_NOCMTIME)) { + ret = file_update_time(file); + if (ret) + return ret; + } + + /* +* Clear the security bits if the process is not being run by root. +* This keeps people from modifying setuid and setgid binaries. +*/ + return file_remove_privs(file); +} + /* * Check that the two inodes are eligible for cloning, the ranges make * sense, and then flush all dirty data. Caller must ensure that the @@ -1820,6 +1844,10 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (ret) return ret; + ret = generic_remap_file_range_target(file_out, remap_flags); + if (ret) + return ret; + return 1; } EXPORT_SYMBOL(generic_remap_file_range_prep); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 29aab196ce7e..2d7dd8b28d7c 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1372,29 +1372,6 @@ xfs_reflink_remap_prep( truncate_inode_pages_range(&inode_out->i_data, pos_out, PAGE_ALIGN(pos_out + *len) - 1); - /* If we're altering the file contents... */ - if (!(remap_flags & REMAP_FILE_DEDUP)) { - /* -* ...update the timestamps (which will grab the ilock again -* from xfs_fs_dirty_inode, so we have to call it before we -* take the ilock). -*/ - if (!(file_out->f_mode & FMODE_NOCMTIME)) { - ret = file_update_time(file_out); - if (ret) - goto out_unlock; - } - - /* -* ...clear the security bits if the process is not being run -* by root. This keeps people from modifying setuid and setgid -* binaries. -*/ - ret = file_remove_privs(file_out); - if (ret) - goto out_unlock; - } - return 1; out_unlock: xfs_reflink_remap_unlock(file_in, file_out);
[PATCH 09/26] vfs: rename clone_verify_area to remap_verify_area
From: Darrick J. Wong Since we use clone_verify_area for both clone and dedupe range checks, rename the function to make it clear that it's for both. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein --- fs/read_write.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index aca75a97a695..734c5661fb69 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1686,7 +1686,7 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, return ret; } -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) +static int remap_verify_area(struct file *file, loff_t pos, u64 len, bool write) { struct inode *inode = file_inode(file); @@ -1852,11 +1852,11 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, if (!file_in->f_op->clone_file_range) return -EOPNOTSUPP; - ret = clone_verify_area(file_in, pos_in, len, false); + ret = remap_verify_area(file_in, pos_in, len, false); if (ret) return ret; - ret = clone_verify_area(file_out, pos_out, len, true); + ret = remap_verify_area(file_out, pos_out, len, true); if (ret) return ret; @@ -1989,7 +1989,7 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, if (ret) return ret; - ret = clone_verify_area(dst_file, dst_pos, len, true); + ret = remap_verify_area(dst_file, dst_pos, len, true); if (ret < 0) goto out_drop_write; @@ -2051,7 +2051,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) if (!S_ISREG(src->i_mode)) goto out; - ret = clone_verify_area(file, off, len, false); + ret = remap_verify_area(file, off, len, false); if (ret < 0) goto out; ret = 0;
[PATCH 07/26] vfs: skip zero-length dedupe requests
From: Darrick J. Wong Don't bother calling the filesystem for a zero-length dedupe request; we can return zero and exit. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Amir Goldstein --- fs/read_write.c |5 + 1 file changed, 5 insertions(+) diff --git a/fs/read_write.c b/fs/read_write.c index 0f0a6efdd502..f5395d8da741 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -2009,6 +2009,11 @@ int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, if (!dst_file->f_op->dedupe_file_range) goto out_drop_write; + if (len == 0) { + ret = 0; + goto out_drop_write; + } + ret = dst_file->f_op->dedupe_file_range(src_file, src_pos, dst_file, dst_pos, len); out_drop_write:
[PATCH 11/26] vfs: pass remap flags to generic_remap_file_range_prep
From: Darrick J. Wong Plumb the remap flags through the filesystem from the vfs function dispatcher all the way to the prep function to prepare for behavior changes in subsequent patches. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein --- fs/ocfs2/file.c |2 +- fs/ocfs2/refcounttree.c |4 ++-- fs/ocfs2/refcounttree.h |2 +- fs/read_write.c | 14 +++--- fs/xfs/xfs_file.c |2 +- fs/xfs/xfs_reflink.c| 21 +++-- fs/xfs/xfs_reflink.h|3 ++- include/linux/fs.h |2 +- 8 files changed, 26 insertions(+), 24 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0b757a24567c..9809b0e5746f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2538,7 +2538,7 @@ static int ocfs2_remap_file_range(struct file *file_in, return -EINVAL; return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, -len, remap_flags & REMAP_FILE_DEDUP); +len, remap_flags); } const struct inode_operations ocfs2_file_iops = { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 36c56dfbe485..df9781567ec0 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4825,7 +4825,7 @@ int ocfs2_reflink_remap_range(struct file *file_in, struct file *file_out, loff_t pos_out, u64 len, - bool is_dedupe) + unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -4851,7 +4851,7 @@ int ocfs2_reflink_remap_range(struct file *file_in, goto out_unlock; ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, - &len, is_dedupe); + &len, remap_flags); if (ret <= 0) goto out_unlock; diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 4af55bf4b35b..d2c5f526edff 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -120,6 +120,6 @@ int ocfs2_reflink_remap_range(struct file *file_in, struct file *file_out, loff_t pos_out, u64 len, - bool is_dedupe); + unsigned int remap_flags); #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/read_write.c b/fs/read_write.c index 766bdcb381f3..201381689284 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1722,14 +1722,14 @@ static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, loff_t pos_out, u64 *len, - bool is_dedupe) + unsigned int remap_flags) { u64 blkmask = i_blocksize(inode_in) - 1; if ((*len & blkmask) == 0) return 0; - if (is_dedupe) + if (remap_flags & REMAP_FILE_DEDUP) *len &= ~blkmask; else if (pos_out + *len < i_size_read(inode_out)) return -EINVAL; @@ -1747,7 +1747,7 @@ static int generic_remap_check_len(struct inode *inode_in, */ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 *len, bool is_dedupe) + u64 *len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -1771,7 +1771,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (*len == 0) { loff_t isize = i_size_read(inode_in); - if (is_dedupe || pos_in == isize) + if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) return 0; if (pos_in > isize) return -EINVAL; @@ -1782,7 +1782,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* Check that we don't violate system file offset limits. */ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - is_dedupe); + (remap_flags & REMAP_FILE_DEDUP)); if (ret) return ret; @@ -1804,7 +1804,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* * Check that the extents are the same. */ - if (is_dedupe) { + if (remap_flags & REMAP_FILE_DEDUP) { boolis_same = false; ret = vfs_dedupe_file_range_compare(inode_in, pos_in, @@ -1816,7
[PATCH 10/26] vfs: combine the clone and dedupe into a single remap_file_range
From: Darrick J. Wong Combine the clone_file_range and dedupe_file_range operations into a single remap_file_range file operation dispatch since they're fundamentally the same operation. The differences between the two can be made in the prep functions. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein --- Documentation/filesystems/vfs.txt | 13 +-- fs/btrfs/ctree.h |8 ++- fs/btrfs/file.c |3 +- fs/btrfs/ioctl.c | 45 +++-- fs/cifs/cifsfs.c | 22 +++--- fs/nfs/nfs4file.c | 10 ++-- fs/ocfs2/file.c | 24 +++- fs/overlayfs/file.c | 30 ++--- fs/read_write.c | 18 +++ fs/xfs/xfs_file.c | 23 ++- include/linux/fs.h| 20 +--- 11 files changed, 110 insertions(+), 106 deletions(-) diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index a6c6a8af48a2..bb3183334ab9 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -883,8 +883,9 @@ struct file_operations { unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); - int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); - int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64); + int (*remap_file_range)(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); }; @@ -960,11 +961,9 @@ otherwise noted. copy_file_range: called by the copy_file_range(2) system call. - clone_file_range: called by the ioctl(2) system call for FICLONERANGE and - FICLONE commands. - - dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE - command. + remap_file_range: called by the ioctl(2) system call for FICLONERANGE and + FICLONE and FIDEDUPERANGE commands to remap file ranges. Note that + a zero length implies "remap to end of source file". fadvise: possibly called by the fadvise64() system call. diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2cddfe7806a4..124a05662fc2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3218,9 +3218,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); -int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, - struct file *dst_file, loff_t dst_loff, - u64 olen); /* file.c */ int __init btrfs_auto_defrag_init(void); @@ -3250,8 +3247,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len); +int btrfs_remap_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len, + unsigned int remap_flags); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2be00e873e92..9a963f061393 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3269,8 +3269,7 @@ const struct file_operations btrfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, #endif - .clone_file_range = btrfs_clone_file_range, - .dedupe_file_range = btrfs_dedupe_file_range, + .remap_file_range = btrfs_remap_file_range, }; void __cold btrfs_auto_defrag_exit(void) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d60b6caf09e8..bfd99c66723e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3627,26 +3627,6 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, return ret; } -int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, - struct file *dst_file, loff_t dst_loff, - u64 olen) -{ - struct inode *src = file_inode(src_file); - struct inode *dst = file_inode(dst_file); - u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - - if (WARN_ON_ONCE(bs < PAGE_SIZE)) { - /* -
[PATCH 08/26] vfs: rename vfs_clone_file_prep to be more descriptive
From: Darrick J. Wong The vfs_clone_file_prep is a generic function to be called by filesystem implementations only. Rename the prefix to generic_ and make it more clear that it applies to remap operations, not just clones. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein --- fs/ocfs2/refcounttree.c |2 +- fs/read_write.c |8 fs/xfs/xfs_reflink.c|2 +- include/linux/fs.h |6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 19e03936c5e1..36c56dfbe485 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4850,7 +4850,7 @@ int ocfs2_reflink_remap_range(struct file *file_in, (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) goto out_unlock; - ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out, + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, &len, is_dedupe); if (ret <= 0) goto out_unlock; diff --git a/fs/read_write.c b/fs/read_write.c index f5395d8da741..aca75a97a695 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1745,9 +1745,9 @@ static int generic_remap_check_len(struct inode *inode_in, * Returns: 0 for "nothing to clone", 1 for "something to clone", or * the usual negative error code. */ -int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 *len, bool is_dedupe) +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 *len, bool is_dedupe) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -1822,7 +1822,7 @@ int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, return 1; } -EXPORT_SYMBOL(vfs_clone_file_prep); +EXPORT_SYMBOL(generic_remap_file_range_prep); int do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 281d5f53f2ec..a7757a128a78 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1326,7 +1326,7 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out, + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, is_dedupe); if (ret <= 0) goto out_unlock; diff --git a/include/linux/fs.h b/include/linux/fs.h index ba93a6e7dac4..55729e1c2e75 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1825,9 +1825,9 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *, unsigned long, loff_t *, rwf_t); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); -extern int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 *count, bool is_dedupe); +extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, +struct file *file_out, loff_t pos_out, +u64 *count, bool is_dedupe); extern int do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
[PATCH 06/26] vfs: avoid problematic remapping requests into partial EOF block
From: Darrick J. Wong A deduplication data corruption is exposed in XFS and btrfs. It is caused by extending the block match range to include the partial EOF block, but then allowing unknown data beyond EOF to be considered a "match" to data in the destination file because the comparison is only made to the end of the source file. This corrupts the destination file when the source extent is shared with it. The VFS remapping prep functions only support whole block dedupe, but we still need to appear to support whole file dedupe correctly. Hence if the dedupe request includes the last block of the souce file, don't include it in the actual dedupe operation. If the rest of the range dedupes successfully, then reject the entire request. A subsequent patch will enable us to shorten dedupe requests correctly. When reflinking sub-file ranges, a data corruption can occur when the source file range includes a partial EOF block. This shares the unknown data beyond EOF into the second file at a position inside EOF, exposing stale data in the second file. If the reflink request includes the last block of the souce file, only proceed with the reflink operation if it lands at or past the destination file's current EOF. If it lands within the destination file EOF, reject the entire request with -EINVAL and make the caller go the hard way. A subsequent patch will enable us to shorten reflink requests correctly. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/read_write.c | 33 + 1 file changed, 33 insertions(+) diff --git a/fs/read_write.c b/fs/read_write.c index 2456da3f8a41..0f0a6efdd502 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1708,6 +1708,34 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } +/* + * Ensure that we don't remap a partial EOF block in the middle of something + * else. Assume that the offsets have already been checked for block + * alignment. + * + * For deduplication we always scale down to the previous block because we + * can't meaningfully compare post-EOF contents. + * + * For clone we only link a partial EOF block above the destination file's EOF. + */ +static int generic_remap_check_len(struct inode *inode_in, + struct inode *inode_out, + loff_t pos_out, + u64 *len, + bool is_dedupe) +{ + u64 blkmask = i_blocksize(inode_in) - 1; + + if ((*len & blkmask) == 0) + return 0; + + if (is_dedupe) + *len &= ~blkmask; + else if (pos_out + *len < i_size_read(inode_out)) + return -EINVAL; + + return 0; +} /* * Check that the two inodes are eligible for cloning, the ranges make @@ -1787,6 +1815,11 @@ int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, return -EBADE; } + ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, + is_dedupe); + if (ret) + return ret; + return 1; } EXPORT_SYMBOL(vfs_clone_file_prep);
[PATCH 05/26] vfs: strengthen checking of file range inputs to generic_remap_checks
From: Darrick J. Wong File range remapping, if allowed to run past the destination file's EOF, is an optimization on a regular file write. Regular file writes that extend the file length are subject to various constraints which are not checked by range cloning. This is a correctness problem because we're never allowed to touch ranges that the page cache can't support (s_maxbytes); we're not supposed to deal with large offsets (MAX_NON_LFS) if O_LARGEFILE isn't set; and we must obey resource limits (RLIMIT_FSIZE). Therefore, add these checks to the new generic_remap_checks function so that we curtail unexpected behavior. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein Reviewed-by: Christoph Hellwig --- mm/filemap.c | 91 ++ 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 47e6bfd45a91..08ad210fee49 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2915,6 +2915,49 @@ struct page *read_cache_page_gfp(struct address_space *mapping, } EXPORT_SYMBOL(read_cache_page_gfp); +static int generic_access_check_limits(struct file *file, loff_t pos, + loff_t *count) +{ + struct inode *inode = file->f_mapping->host; + + /* Don't exceed the LFS limits. */ + if (unlikely(pos + *count > MAX_NON_LFS && + !(file->f_flags & O_LARGEFILE))) { + if (pos >= MAX_NON_LFS) + return -EFBIG; + *count = min(*count, (loff_t)MAX_NON_LFS - pos); + } + + /* +* Don't operate on ranges the page cache doesn't support. +* +* If we have written data it becomes a short write. If we have +* exceeded without writing data we send a signal and return EFBIG. +* Linus frestrict idea will clean these up nicely.. +*/ + if (unlikely(pos >= inode->i_sb->s_maxbytes)) + return -EFBIG; + + *count = min(*count, inode->i_sb->s_maxbytes - pos); + return 0; +} + +static int generic_write_check_limits(struct file *file, loff_t pos, + loff_t *count) +{ + unsigned long limit = rlimit(RLIMIT_FSIZE); + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + *count = min(*count, (loff_t)limit - pos); + } + + return generic_access_check_limits(file, pos, count); +} + /* * Performs necessary checks before doing a write * @@ -2926,8 +2969,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - unsigned long limit = rlimit(RLIMIT_FSIZE); - loff_t pos; + loff_t count; + int ret; if (!iov_iter_count(from)) return 0; @@ -2936,40 +2979,15 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); - pos = iocb->ki_pos; - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) return -EINVAL; - if (limit != RLIM_INFINITY) { - if (iocb->ki_pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - iov_iter_truncate(from, limit - (unsigned long)pos); - } + count = iov_iter_count(from); + ret = generic_write_check_limits(file, iocb->ki_pos, &count); + if (ret) + return ret; - /* -* LFS rule -*/ - if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && - !(file->f_flags & O_LARGEFILE))) { - if (pos >= MAX_NON_LFS) - return -EFBIG; - iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); - } - - /* -* Are we about to exceed the fs block limit ? -* -* If we have written data it becomes a short write. If we have -* exceeded without writing data we send a signal and return EFBIG. -* Linus frestrict idea will clean these up nicely.. -*/ - if (unlikely(pos >= inode->i_sb->s_maxbytes)) - return -EFBIG; - - iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); + iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); @@ -2991,6 +3009,7 @@ int generic_remap_checks(struct file *file_in, loff_t pos_in, uint64_t bcount; loff_t size_in, size_out; loff_t bs = inode_out->i_sb->s_blocksize; + int ret; /* The start of both ranges must be aligned to an fs block. */
[PATCH 12/26] vfs: pass remap flags to generic_remap_checks
From: Darrick J. Wong Pass the same remap flags to generic_remap_checks for consistency. Signed-off-by: Darrick J. Wong Reviewed-by: Amir Goldstein --- fs/read_write.c|2 +- include/linux/fs.h |2 +- mm/filemap.c |4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 201381689284..ebcbfc4f2907 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1782,7 +1782,7 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* Check that we don't violate system file offset limits. */ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - (remap_flags & REMAP_FILE_DEDUP)); + remap_flags); if (ret) return ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index c2800953937a..1aa3bc1bb092 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2981,7 +2981,7 @@ extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); extern int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - uint64_t *count, bool is_dedupe); + uint64_t *count, unsigned int remap_flags); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/filemap.c b/mm/filemap.c index 08ad210fee49..b0f1f6d93d9c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3001,7 +3001,7 @@ EXPORT_SYMBOL(generic_write_checks); */ int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, -uint64_t *req_count, bool is_dedupe) +uint64_t *req_count, unsigned int remap_flags) { struct inode *inode_in = file_in->f_mapping->host; struct inode *inode_out = file_out->f_mapping->host; @@ -3023,7 +3023,7 @@ int generic_remap_checks(struct file *file_in, loff_t pos_in, size_out = i_size_read(inode_out); /* Dedupe requires both ranges to be within EOF. */ - if (is_dedupe && + if ((remap_flags & REMAP_FILE_DEDUP) && (pos_in >= size_in || pos_in + count > size_in || pos_out >= size_out || pos_out + count > size_out)) return -EINVAL;
[PATCH 03/26] vfs: check file ranges before cloning files
From: Darrick J. Wong Move the file range checks from vfs_clone_file_prep into a separate generic_remap_checks function so that all the checks are collected in a central location. This forms the basis for adding more checks from generic_write_checks that will make cloning's input checking more consistent with write input checking. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Amir Goldstein --- fs/ocfs2/refcounttree.c |2 + fs/read_write.c | 55 + fs/xfs/xfs_reflink.c|2 + include/linux/fs.h |9 -- mm/filemap.c| 69 +++ 5 files changed, 90 insertions(+), 47 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7a5ee145c733..19e03936c5e1 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4850,7 +4850,7 @@ int ocfs2_reflink_remap_range(struct file *file_in, (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) goto out_unlock; - ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, + ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out, &len, is_dedupe); if (ret <= 0) goto out_unlock; diff --git a/fs/read_write.c b/fs/read_write.c index 260797b01851..d6e8e242a15f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1717,13 +1717,12 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) * Returns: 0 for "nothing to clone", 1 for "something to clone", or * the usual negative error code. */ -int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, - struct inode *inode_out, loff_t pos_out, - u64 *len, bool is_dedupe) +int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + u64 *len, bool is_dedupe) { - loff_t bs = inode_out->i_sb->s_blocksize; - loff_t blen; - loff_t isize; + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); bool same_inode = (inode_in == inode_out); int ret; @@ -1740,10 +1739,10 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; - isize = i_size_read(inode_in); - /* Zero length dedupe exits immediately; reflink goes to EOF. */ if (*len == 0) { + loff_t isize = i_size_read(inode_in); + if (is_dedupe || pos_in == isize) return 0; if (pos_in > isize) @@ -1751,36 +1750,11 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, *len = isize - pos_in; } - /* Ensure offsets don't wrap and the input is inside i_size */ - if (pos_in + *len < pos_in || pos_out + *len < pos_out || - pos_in + *len > isize) - return -EINVAL; - - /* Don't allow dedupe past EOF in the dest file */ - if (is_dedupe) { - loff_t disize; - - disize = i_size_read(inode_out); - if (pos_out >= disize || pos_out + *len > disize) - return -EINVAL; - } - - /* If we're linking to EOF, continue to the block boundary. */ - if (pos_in + *len == isize) - blen = ALIGN(isize, bs) - pos_in; - else - blen = *len; - - /* Only reflink if we're aligned to block boundaries */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || - !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - return -EINVAL; - - /* Don't allow overlapped reflink within the same file */ - if (same_inode) { - if (pos_out + blen > pos_in && pos_out < pos_in + blen) - return -EINVAL; - } + /* Check that we don't violate system file offset limits. */ + ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, + is_dedupe); + if (ret) + return ret; /* Wait for the completion of any pending IOs on both files */ inode_dio_wait(inode_in); @@ -1813,7 +1787,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, return 1; } -EXPORT_SYMBOL(vfs_clone_file_prep_inodes); +EXPORT_SYMBOL(vfs_clone_file_prep); int do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) @@ -1851,9 +1825,6 @@ int do_clone_file_range(struct file *file_in, loff_t pos_in, if (ret) return ret; - if (pos_in + len > i_size_read(inode_in)) -
[PATCH 01/26] xfs: add a per-xfs trace_printk macro
From: Darrick J. Wong Add a "xfs_tprintk" macro so that developers can use trace_printk to print out arbitrary debugging information with the XFS device name attached to the trace output. Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_error.h |6 ++ 1 file changed, 6 insertions(+) diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 246d3e989c6c..5caa8bdf6c38 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -76,6 +76,11 @@ extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, unsigned int tag_value); extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); extern int xfs_errortag_clearall(struct xfs_mount *mp); + +/* trace printk version of xfs_err and friends */ +#define xfs_tprintk(mp, fmt, args...) \ + trace_printk("dev %d:%d " fmt, MAJOR((mp)->m_super->s_dev), \ + MINOR((mp)->m_super->s_dev), ##args) #else #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) @@ -83,6 +88,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define xfs_errortag_set(mp, tag, val) (ENOSYS) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) +#define xfs_tprintk(mp, fmt, args...) do { } while (0) #endif /* DEBUG */ /*
[PATCH 04/26] vfs: exit early from zero length remap operations
From: Darrick J. Wong If a remap caller asks us to remap to the source file's EOF and the source file has zero bytes, exit early. Signed-off-by: Darrick J. Wong --- fs/read_write.c |2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/read_write.c b/fs/read_write.c index d6e8e242a15f..2456da3f8a41 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1748,6 +1748,8 @@ int vfs_clone_file_prep(struct file *file_in, loff_t pos_in, if (pos_in > isize) return -EINVAL; *len = isize - pos_in; + if (*len == 0) + return 0; } /* Check that we don't violate system file offset limits. */
[PATCH v5 00/26] fs: fixes for serious clone/dedupe problems
Hi all, Dave, Eric, and I have been chasing a stale data exposure bug in the XFS reflink implementation, and tracked it down to reflink forgetting to do some of the file-extending activities that must happen for regular writes. We then started auditing the clone, dedupe, and copyfile code and realized that from a file contents perspective, clonerange isn't any different from a regular file write. Unfortunately, we also noticed that *unlike* a regular write, clonerange skips a ton of overflow checks, such as validating the ranges against s_maxbytes, MAX_NON_LFS, and RLIMIT_FSIZE. We also observed that cloning into a file did not strip security privileges (suid, capabilities) like a regular write would. I also noticed that xfs and ocfs2 need to dump the page cache before remapping blocks, not after. In fixing the range checking problems I also realized that both dedupe and copyfile tell userspace how much of the requested operation was acted upon. Since the range validation can shorten a clone request (or we can ENOSPC midway through), we might as well plumb the short operation reporting back through the VFS indirection code to userspace. So, here's the whole giant pile of patches[1] that fix all the problems. This branch is against current upstream (4.19-rc8). The patch "generic: test reflink side effects" recently sent to fstests exercises the fixes in this series. Tests are in [2]. --D [1] https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=djwong-devel [2] https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/log/?h=djwong-devel
[PATCH 02/26] vfs: vfs_clone_file_prep_inodes should return EINVAL for a clone from beyond EOF
From: Darrick J. Wong vfs_clone_file_prep_inodes cannot return 0 if it is asked to remap from a zero byte file because that's what btrfs does. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/read_write.c |3 --- 1 file changed, 3 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 8a2737f0d61d..260797b01851 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1740,10 +1740,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; - /* Are we going all the way to the end? */ isize = i_size_read(inode_in); - if (isize == 0) - return 0; /* Zero length dedupe exits immediately; reflink goes to EOF. */ if (*len == 0) {
Re: Spurious mount point
Chris Murphy: > I wasn't aware that SUSE was now using the @ location for > snapshots, or that it was using Btrfs for /home. For a > while it's been XFS with a Btrfs sysroot. Ours does use btrfs for `/' and xfs for `/home' *except* `/home/hana', which is strange and wonderful, because the standard usage is there in fstab and hana mount is created somewhere else. -- () ascii ribbon campaign -- against html e-mail /\ http://preview.tinyurl.com/qcy6mjc [archived]
Re: Spurious mount point
On Mon, Oct 15, 2018 at 3:26 PM, Anton Shepelev wrote: > Chris Murphy to Anton Shepelev: > >> > How can I track down the origin of this mount point: >> > >> > /dev/sda2 on /home/hana type btrfs >> > (rw,relatime,space_cache,subvolid=259,subvol=/@/.snapshots/1/snapshot/hana) >> > >> > if it is not present in /etc/fstab? I shouldn't like to >> > find/grep thoughout the whole filesystem. >> >> Sounds like some service taking snapshots regularly is >> managing this. Maybe this is Mint or Ubuntu and you're >> using Timeshift? > > It is SUSE Linux and (probably) its tool called `snapper', > but I have not found a clue in its documentation. I wasn't aware that SUSE was now using the @ location for snapshots, or that it was using Btrfs for /home. For a while it's been XFS with a Btrfs sysroot. -- Chris Murphy
Re: Spurious mount point
Chris Murphy to Anton Shepelev: > > How can I track down the origin of this mount point: > > > > /dev/sda2 on /home/hana type btrfs > > (rw,relatime,space_cache,subvolid=259,subvol=/@/.snapshots/1/snapshot/hana) > > > > if it is not present in /etc/fstab? I shouldn't like to > > find/grep thoughout the whole filesystem. > > Sounds like some service taking snapshots regularly is > managing this. Maybe this is Mint or Ubuntu and you're > using Timeshift? It is SUSE Linux and (probably) its tool called `snapper', but I have not found a clue in its documentation. > Maybe it'll show up in the journal if you add boot > parameter 'systemd.log_level=debug' and reboot; then use > 'journalctl -b | grep mount' and it should show all > instances logged instances of mount events: systemd, > udisks2, maybe others? I will try that as soon as the client lets me reboot that machine. -- () ascii ribbon campaign -- against html e-mail /\ http://preview.tinyurl.com/qcy6mjc [archived]
Re: Spurious mount point
On Mon, Oct 15, 2018 at 9:05 AM, Anton Shepelev wrote: > Hello, all > > How can I track down the origin of this mount point: > >/dev/sda2 on /home/hana type btrfs > (rw,relatime,space_cache,subvolid=259,subvol=/@/.snapshots/1/snapshot/hana) > > if it is not present in /etc/fstab? I shouldn't like to > find/grep thoughout the whole filesystem. > > -- > () ascii ribbon campaign - against html e-mail > /\ http://preview.tinyurl.com/qcy6mjc [archived] Sounds like some service taking snapshots regularly is managing this. Maybe this is Mint or Ubuntu and you're using Timeshift? Maybe it'll show up in the journal if you add boot parameter 'systemd.log_level=debug' and reboot; then use 'journalctl -b | grep mount' and it should show all instances logged instances of mount events: systemd, udisks2, maybe others? -- Chris Murphy
Re: reproducible builds with btrfs seed feature
On Mon, Oct 15, 2018 at 6:29 AM, Austin S. Hemmelgarn wrote: > On 2018-10-13 18:28, Chris Murphy wrote: >> The end result is creating two Btrfs volumes would yield image files >> with matching hashes. > > So in other words, you care about matching the block layout _exactly_. Only because that's the easiest way to verify reproducibility without any ambiguity. If someone's compromised a build system such that everyone is getting the malicious payload, but they can hide it behind a subvolume or reflink that's not used by default, could someone plausibly cause selective use of their malicious payload? I dunno I leave that for more crafty people. But even if it's a tiny bit of ambiguity, it's non-zero. Hashing a file that contains the entire file system is unambiguous. I think populating the image with --rootdir at mkfs time should be pretty deterministic. One stream in and out. No generations, no snapshot, no delayed allocation. It'd be quite similar to mksquashfs. I guess I'd have to try it a few times, and see if really the only differences are uuids and times, and not allocation related things. -- Chris Murphy
Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range
On Mon, Oct 15, 2018 at 10:13:17AM -0700, Darrick J. Wong wrote: > > RFR_TO_SRC_EOF is checked in generic_remap_file_range_prep, > > so the file system should know about it Also looking at it again now > > it seems entirely superflous - we can just pass down then len == we > > use in higher level code instead of having a flag and will side step > > the issue here. > > I'm not a fan of hidden behaviors like that, particularly when we > already have a flags field where callers can explicitly ask for the > to-eof behavior. This just means we have a flag to mean ken is 0 and needs to be filled, rather than encoding that in the field itself. If you fell better we can replace 0 with 0x and still encode it in the field. > > RFR_CAN_SHORTEN is advisory as no one has to shorten, but that can > > easily be solved by including it everywhere. > > CAN_SHORTEN isn't included everywhere Include it everywhere as in allow it in ever ->remap_file instance. > I sort of thought about introducing a new copy_file_range flag that > would just do deduplication and allow for opportunistic "dedup as much > as you can" but ... meh. Maybe I'll just drop the patch instead; we can > revisit that when anyone wants a better dedupe interface. Sounds fine to me. The btrfs ioctl is really ugly, but then again there is no pressing need for something better.
Re: [PATCH 16/25] vfs: make remapping to source file eof more explicit
On Mon, Oct 15, 2018 at 08:32:19AM -0700, Darrick J. Wong wrote: > > Why not put that in the name to make it more descriptive? > > I'm all ears for better suggestions. :) I think the best idea is no flag - just carry through the special zero..
Re: [PATCH 10/25] vfs: create generic_remap_file_range_touch to update inode metadata
On Mon, Oct 15, 2018 at 09:30:01AM -0700, Darrick J. Wong wrote: > I originally thought "touch" because it updates [cm]time. :) > > Though looking at the final code, I think this can just be called from > the end of generic_remap_file_range_prep, so we can skip the export and > all that other stuff. I though about that, but the locking didn't seem to quite work out between xfs and ocfs. Nevermind the big elephant of actually converting btrfs to the "VFS" helper - I think if that doesn't work out it is rather questionable how generic they actually are.
Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range
On Mon, Oct 15, 2018 at 05:47:19AM -0700, Christoph Hellwig wrote: > On Mon, Oct 15, 2018 at 09:04:13AM +0300, Amir Goldstein wrote: > > I supposed you figured out the reason already. > > No, I hadn't. > > > It makes it appearance in patch 16/25 as RFR_VFS_FLAGS. > > All those "advisory" flags, we want to pass them in to filesystem as FYI, > > but we don't want to explicitly add support for e.g. RFR_CAN_SHORTEN > > to every filesystem, when vfs has already taken care of the advice. > > I don't think this model makes sense. If they really are purely > handled in the VFS we can mask them before passing them to the file > system, if not we need to check them, or the they are avisory and > we can have a simple #define instead of the helper. > > RFR_TO_SRC_EOF is checked in generic_remap_file_range_prep, > so the file system should know about it Also looking at it again now > it seems entirely superflous - we can just pass down then len == we > use in higher level code instead of having a flag and will side step > the issue here. I'm not a fan of hidden behaviors like that, particularly when we already have a flags field where callers can explicitly ask for the to-eof behavior. > RFR_CAN_SHORTEN is advisory as no one has to shorten, but that can > easily be solved by including it everywhere. CAN_SHORTEN isn't included everywhere -- FICLONE{,RANGE} don't enable it because they have no way to communicate the number of bytes cloned back to userspace. Either we can clone every byte the user asked for, or we send back -EINVAL. (Maybe I'm misinterpreting what you meant by 'solved by including it everywhere'?) > RFR_SHORT_DEDUPE is as far as I can tell entirely superflous to > start with, as RFR_CAN_SHORTEN can be used instead. For now it's superfluous. At first I was thinking that we could return a short bytes_deduped if, say, the first part of the range actually did match, but it became pretty obvious via shared/010 that duperemove can't handle that, so we really must stick to the existing btrfs behavior. The existing btrfs behavior is that we can round the length down to avoid deduping partial EOF blocks, but we return the original length (i.e. lie) in bytes_deduped when we do that. I sort of thought about introducing a new copy_file_range flag that would just do deduplication and allow for opportunistic "dedup as much as you can" but ... meh. Maybe I'll just drop the patch instead; we can revisit that when anyone wants a better dedupe interface. > So something like this in fs.h: > > #define REMAP_FILE_ADVISORY_FLAGS REMAP_FILE_CAN_SHORTEN > > And then in the file system: > > if (flags & ~REMAP_FILE_ADVISORY_FLAGS) > -EINVAL; > > or > > if (flags & ~(REMAP_FILE_ADVISORY_FLAGS | REMAP_FILE_DEDUP)) > -EINVAL; > > should be all that is needed. Sounds good to me. --D
Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range
On Sun, Oct 14, 2018 at 10:19:27AM -0700, Christoph Hellwig wrote: > > unsigned (*mmap_capabilities)(struct file *); > > #endif > > ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, > > loff_t, size_t, unsigned int); > > - int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, > > u64); > > - int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, > > u64); > > + int (*remap_file_range)(struct file *file_in, loff_t pos_in, > > + struct file *file_out, loff_t pos_out, > > + u64 len, unsigned int remap_flags); > > None of the other methods in this file name their parameters. While > I generally don't like people leaving them out, in the end consistency > is even more important. > > > +int btrfs_remap_file_range(struct file *src_file, loff_t off, > > + struct file *dst_file, loff_t destoff, u64 len, > > + unsigned int remap_flags) > > { > > + if (!remap_check_flags(remap_flags, RFR_SAME_DATA)) > > + return -EINVAL; > > + > > + if (remap_flags & RFR_SAME_DATA) { > > So at least for btrfs there seems to be no shared code at all below > the function calls. This kinda speaks against the argument that > they fundamentally are the same.. They /do/ share/ code -- eventually both btrfs_extent_same and btrfs_clone_files call btrfs_clone. xfs and ocfs2 call the same paths internally too; it's only the vfs helpers that have the extra page cache comparisons if it's a dedup operation. > > +/* > > + * These flags control the behavior of the remap_file_range function > > pointer. > > + * > > + * RFR_SAME_DATA: only remap if contents identical (i.e. deduplicate) > > + */ > > +#define RFR_SAME_DATA (1 << 0) > > + > > +#define RFR_VALID_FLAGS(RFR_SAME_DATA) > > RFR? Why not REMAP_FILE_* Also why not the well understood > REMAP_FILE_DEDUP instead of the odd SAME_DATA? Sure. I had begin to dislike typing RFR anyway. > > + > > +/* > > + * Filesystem remapping implementations should call this helper on their > > + * remap flags to filter out flags that the implementation doesn't support. > > + * > > + * Returns true if the flags are ok, false otherwise. > > + */ > > +static inline bool remap_check_flags(unsigned int remap_flags, > > +unsigned int supported_flags) > > +{ > > + return (remap_flags & ~(supported_flags & RFR_VALID_FLAGS)) == 0; > > +} > > Any reason to even bother with a helper for this? ->fallocate > seems to be doing fine without the helper, and the resulting code > seems a lot easier to understand to me. (Will respond to these at the current end of the flags thread.) > > @@ -1759,10 +1779,9 @@ struct file_operations { > > #endif > > ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, > > loff_t, size_t, unsigned int); > > - int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, > > - u64); > > - int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, > > - u64); > > + int (*remap_file_range)(struct file *file_in, loff_t pos_in, > > + struct file *file_out, loff_t pos_out, > > + u64 len, unsigned int remap_flags); > > Same comment here. Didn't we have some nice doc tools to avoid this > duplication? :) We do, but vfs.txt hasn't been ported to any of that. --D
Re: [PATCH 10/25] vfs: create generic_remap_file_range_touch to update inode metadata
On Sun, Oct 14, 2018 at 10:21:31AM -0700, Christoph Hellwig wrote: > > +/* Update inode timestamps and remove security privileges when remapping. > > */ > > +int generic_remap_file_range_touch(struct file *file, bool is_dedupe) > > +{ > > + int ret; > > + > > + /* If can't alter the file contents, we're done. */ > > + if (is_dedupe) > > + return 0; > > + > > + /* Update the timestamps, since we can alter file contents. */ > > + if (!(file->f_mode & FMODE_NOCMTIME)) { > > + ret = file_update_time(file); > > + if (ret) > > + return ret; > > + } > > + > > + /* > > +* Clear the security bits if the process is not being run by root. > > +* This keeps people from modifying setuid and setgid binaries. > > +*/ > > + return file_remove_privs(file); > > +} > > +EXPORT_SYMBOL(generic_remap_file_range_touch); > > The name seems a little out of touch with what it actually does. I originally thought "touch" because it updates [cm]time. :) Though looking at the final code, I think this can just be called from the end of generic_remap_file_range_prep, so we can skip the export and all that other stuff. > Also why a bool argument instead of the more descriptive flags which > introduced a few patches ago? Hmm, yes, the remap_flags transition can move up to before this patch. --D
Re: [PATCH 24/25] xfs: support returning partial reflink results
On Mon, Oct 15, 2018 at 10:05:36AM +1100, Dave Chinner wrote: > On Sun, Oct 14, 2018 at 10:35:46AM -0700, Christoph Hellwig wrote: > > On Fri, Oct 12, 2018 at 05:08:32PM -0700, Darrick J. Wong wrote: > > > From: Darrick J. Wong > > > > > > Back when the XFS reflink code only supported clone_file_range, we were > > > only able to return zero or negative error codes to userspace. However, > > > now that copy_file_range (which returns bytes copied) can use XFS' > > > clone_file_range, we have the opportunity to return partial results. > > > For example, if userspace sends a 1GB clone request and we run out of > > > space halfway through, we at least can tell userspace that we completed > > > 512M of that request like a regular write. > > > > > > Signed-off-by: Darrick J. Wong > > > --- > > > fs/xfs/xfs_file.c|5 + > > > fs/xfs/xfs_reflink.c | 20 +++- > > > fs/xfs/xfs_reflink.h |2 +- > > > 3 files changed, 17 insertions(+), 10 deletions(-) > > > > > > > > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > > > index bc9e94bcb7a3..b2b15b8dc4a1 100644 > > > --- a/fs/xfs/xfs_file.c > > > +++ b/fs/xfs/xfs_file.c > > > @@ -928,14 +928,11 @@ xfs_file_remap_range( > > > loff_t len, > > > unsigned intremap_flags) > > > { > > > - int ret; > > > - > > > if (!remap_check_flags(remap_flags, RFR_SAME_DATA)) > > > return -EINVAL; > > > > > > - ret = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, > > > + return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, > > > len, remap_flags); > > > > Is there any reason not to merge xfs_file_remap_range and > > xfs_reflink_remap_range at this point? > > Yeah, that seems like a good idea to me - pulling all the > vfs/generic code interactions back up into xfs_file.c would match > how the rest of the file operations are layered w.r.t. external and > internal XFS code... Yeah, ditto ocfs2. I'll look into throwing a few more refactors onto the end of this series... 8-D --D > Cheers, > > Dave. > -- > Dave Chinner > da...@fromorbit.com
Re: Interpreting `btrfs filesystem show'
Hugo Mills - 15.10.18, 16:26: > On Mon, Oct 15, 2018 at 05:24:08PM +0300, Anton Shepelev wrote: > > Hello, all > > > > While trying to resolve free space problems, and found that > > > > I cannot interpret the output of: > > > btrfs filesystem show > > > > Label: none uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8 > > Total devices 1 FS bytes used 34.06GiB > > devid1 size 40.00GiB used 37.82GiB path /dev/sda2 > > > > How come the total used value is less than the value listed > > for the only device? > >"Used" on the device is the mount of space allocated. "Used" on the > FS is the total amount of actual data and metadata in that > allocation. > >You will also need to look at the output of "btrfs fi df" to see > the breakdown of the 37.82 GiB into data, metadata and currently > unused. I usually use btrfs fi usage -T, cause 1. It has all the information. 2. It differentiates between used and allocated. % btrfs fi usage -T / Overall: Device size: 100.00GiB Device allocated: 54.06GiB Device unallocated: 45.94GiB Device missing: 0.00B Used: 46.24GiB Free (estimated): 25.58GiB (min: 25.58GiB) Data ratio: 2.00 Metadata ratio: 2.00 Global reserve: 70.91MiB (used: 0.00B) Data Metadata System Id Path RAID1RAID1 RAID1Unallocated -- - --- 2 /dev/mapper/msata-debian 25.00GiB 2.00GiB 32.00MiB22.97GiB 1 /dev/mapper/sata-debian 25.00GiB 2.00GiB 32.00MiB22.97GiB -- - --- Total25.00GiB 2.00GiB 32.00MiB45.94GiB Used 22.38GiB 754.66MiB 16.00KiB For RAID it in some place reports the raw size and sometimes the logical size. Especially in the "Total" line I find this a bit inconsistent. "RAID1" columns show logical size, "Unallocated" shows raw size. Also "Used:" in the global section shows raw size and "Free (estimated):" shows logical size. Thanks -- Martin
Re: [PATCH 11/25] vfs: pass remap flags to generic_remap_file_range_prep
On Sun, Oct 14, 2018 at 10:37:38AM -0700, Christoph Hellwig wrote: > > + bool is_dedupe = (remap_flags & RFR_SAME_DATA); > > Btw, I think the code would be cleaner if we dropped this variable. Ok to both. I'll move up the patch to replace is_dedupe with remap_flags to avoid churning the _touch function too, btw. --D
Re: [PATCH 16/25] vfs: make remapping to source file eof more explicit
On Sun, Oct 14, 2018 at 10:24:33AM -0700, Christoph Hellwig wrote: > On Fri, Oct 12, 2018 at 05:07:37PM -0700, Darrick J. Wong wrote: > > From: Darrick J. Wong > > > > Create a RFR_TO_SRC_EOF flag to explicitly declare that the caller wants > > the remap implementation to remap to the end of the source file, once > > the files are locked. > > The name looks like a cat threw up on your keyboard :) Yeah... :( > From reading the code this seems to ask for a whole file remap, right? Nope. In the original btrfs clonerange ioctl, length == 0 meant "to EOF". If you made a call like this: struct btrfs_ioctl_clone_range_args x = { .src_offset = 16384, .src_length = 0, .dest_offset = 0, .src_fd = whatever, }; ftruncate(dest_fd, 0); ioctl(dest_fd, BTRFS_IOC_CLONE, &x); Then dest_fd ends up with the contents of [16k...EOF] from src_fd. It's annoying to have the magic length number (and no flags?) but we're stuck with this quirk of the interface. > Why not put that in the name to make it more descriptive? I'm all ears for better suggestions. :) --D
Spurious mount point
Hello, all How can I track down the origin of this mount point: /dev/sda2 on /home/hana type btrfs (rw,relatime,space_cache,subvolid=259,subvol=/@/.snapshots/1/snapshot/hana) if it is not present in /etc/fstab? I shouldn't like to find/grep thoughout the whole filesystem. -- () ascii ribbon campaign - against html e-mail /\ http://preview.tinyurl.com/qcy6mjc [archived]
Re: Interpreting `btrfs filesystem show'
On 2018-10-15 10:42, Anton Shepelev wrote: Hugo Mills to Anton Shepelev: While trying to resolve free space problems, and found that I cannot interpret the output of: btrfs filesystem show Label: none uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8 Total devices 1 FS bytes used 34.06GiB devid1 size 40.00GiB used 37.82GiB path /dev/sda2 How come the total used value is less than the value listed for the only device? "Used" on the device is the mount of space allocated. "Used" on the FS is the total amount of actual data and metadata in that allocation. You will also need to look at the output of "btrfs fi df" to see the breakdown of the 37.82 GiB into data, metadata and currently unused. See https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools for the details Thank you, Hugo, understood. mount/amount is a very fitting typo :-) Does the standard `du' tool work correctly for btfrfs? For the default 'physical usage' mode, it functionally does not work correctly, because it does not know about reflinks. The easiest way to see this is to create a couple of snapshots of a subvolume alongside the subvolume, and then run `du -s --totals` on those snapshots and the subvolume. It will report the total space usage to be equal to the sum of the values reported for each snapshot and the subvolume, when it should instead only count the space usage for shared data once. For the 'apparent usage' mode provided by the GNU implementation, it does work correctly.
Re: Interpreting `btrfs filesystem show'
On Mon, Oct 15, 2018 at 05:40:40PM +0300, Anton Shepelev wrote: > Hugo Mills to Anton Shepelev: > > >>While trying to resolve free space problems, and found > >>that I cannot interpret the output of: > >> > >>> btrfs filesystem show > >> > >>Label: none uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8 > >>Total devices 1 FS bytes used 34.06GiB > >>devid1 size 40.00GiB used 37.82GiB path /dev/sda2 > >> > >>How come the total used value is less than the value > >>listed for the only device? > > > > "Used" on the device is the mount of space allocated. > >"Used" on the FS is the total amount of actual data and > >metadata in that allocation. > > > > You will also need to look at the output of "btrfs fi > >df" to see the breakdown of the 37.82 GiB into data, > >metadata and currently unused. > > > >See > >https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools > > for the details > > Thank you, Hugo, understood. mount/amount is a very fitting > typo :-) > > Do the standard `du' and `du' tools report correct values > with btrfs? Well... du will tell you the size of the files you asked it about, but it doesn't know about reflinks, so it'll double-count if you've got a reflink copy of something. Other than that, it should be accurate, I think. There's also a "btrfs fi du" which can tell you the amount of shared and unique data as well, so you can know, for example, how much space you'll reclaim if you delete those files. df should be mostly OK, but it does sometimes get its estimate of the total usable size of the FS wrong, particularly if the FS is unbalanced. However, as the FS fills up, the estimate gets better, because it gets more evenly balanced across devices over time. Hugo. -- Hugo Mills | "Your problem is that you have a negative hugo@... carfax.org.uk | personality." http://carfax.org.uk/ | "No, I don't!" PGP: E2AB1DE4 | Londo and Vir, Babylon 5 signature.asc Description: Digital signature
Re: Interpreting `btrfs filesystem show'
Hugo Mills to Anton Shepelev: >>While trying to resolve free space problems, and found >>that I cannot interpret the output of: >> >>> btrfs filesystem show >> >>Label: none uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8 >>Total devices 1 FS bytes used 34.06GiB >>devid1 size 40.00GiB used 37.82GiB path /dev/sda2 >> >>How come the total used value is less than the value >>listed for the only device? > > "Used" on the device is the mount of space allocated. >"Used" on the FS is the total amount of actual data and >metadata in that allocation. > > You will also need to look at the output of "btrfs fi >df" to see the breakdown of the 37.82 GiB into data, >metadata and currently unused. > >See >https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools > for the details Thank you, Hugo, understood. mount/amount is a very fitting typo :-) Does the standard `du' tool work correctly for btfrfs? -- () ascii ribbon campaign - against html e-mail /\ http://preview.tinyurl.com/qcy6mjc [archived]
Re: Interpreting `btrfs filesystem show'
On Mon, Oct 15, 2018 at 02:26:41PM +, Hugo Mills wrote: > On Mon, Oct 15, 2018 at 05:24:08PM +0300, Anton Shepelev wrote: > > Hello, all > > > > While trying to resolve free space problems, and found that > > I cannot interpret the output of: > > > > > btrfs filesystem show > > > > Label: none uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8 > > Total devices 1 FS bytes used 34.06GiB > > devid1 size 40.00GiB used 37.82GiB path /dev/sda2 > > > > How come the total used value is less than the value listed > > for the only device? > >"Used" on the device is the mount of space allocated. "Used" on the s/mount/amount/ > FS is the total amount of actual data and metadata in that allocation. > >You will also need to look at the output of "btrfs fi df" to see > the breakdown of the 37.82 GiB into data, metadata and currently > unused. > >See > https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools > for the details. > >Hugo. > -- Hugo Mills | "Your problem is that you have a negative hugo@... carfax.org.uk | personality." http://carfax.org.uk/ | "No, I don't!" PGP: E2AB1DE4 | Londo and Vir, Babylon 5 signature.asc Description: Digital signature
Re: Interpreting `btrfs filesystem show'
On Mon, Oct 15, 2018 at 05:24:08PM +0300, Anton Shepelev wrote: > Hello, all > > While trying to resolve free space problems, and found that > I cannot interpret the output of: > > > btrfs filesystem show > > Label: none uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8 > Total devices 1 FS bytes used 34.06GiB > devid1 size 40.00GiB used 37.82GiB path /dev/sda2 > > How come the total used value is less than the value listed > for the only device? "Used" on the device is the mount of space allocated. "Used" on the FS is the total amount of actual data and metadata in that allocation. You will also need to look at the output of "btrfs fi df" to see the breakdown of the 37.82 GiB into data, metadata and currently unused. See https://btrfs.wiki.kernel.org/index.php/FAQ#Understanding_free_space.2C_using_the_original_tools for the details. Hugo. -- Hugo Mills | "Your problem is that you have a negative hugo@... carfax.org.uk | personality." http://carfax.org.uk/ | "No, I don't!" PGP: E2AB1DE4 | Londo and Vir, Babylon 5 signature.asc Description: Digital signature
Interpreting `btrfs filesystem show'
Hello, all While trying to resolve free space problems, and found that I cannot interpret the output of: > btrfs filesystem show Label: none uuid: 8971ce5b-71d9-4e46-ab25-ca37485784c8 Total devices 1 FS bytes used 34.06GiB devid1 size 40.00GiB used 37.82GiB path /dev/sda2 How come the total used value is less than the value listed for the only device? -- () ascii ribbon campaign - against html e-mail /\ http://preview.tinyurl.com/qcy6mjc [archived]
[PATCH] btrfs: Adjust loop in free_extent_buffer
The loop construct in free_extent_buffer was added in 242e18c7c1a8 ("Btrfs: reduce lock contention on extent buffer locks") as means of reducing the times the eb lock is taken, the non-last ref count is decremented and lock is released. As the special handling of UNMAPPED extent buffers was removed now there is only one decrement op which is happening for EXTENT_BUFFER_UNMAPPED case. This commit modifies the loop condition so that in case of UNMAPPED buffers the eb's lock is taken only if we are 100% sure the eb is going to be freed by the current executor of the code. Additionally, remove superfluous ref count ops in btrfs test. Signed-off-by: Nikolay Borisov --- This was tested with xfstest multiple times + unloading of btrfs module to ensure no eb leaks are present. fs/btrfs/extent_io.c | 4 +++- fs/btrfs/tests/btrfs-tests.c | 2 -- fs/btrfs/tests/inode-tests.c | 6 -- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0e9d4f28379d..544a1a5fd416 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5095,7 +5095,9 @@ void free_extent_buffer(struct extent_buffer *eb) while (1) { refs = atomic_read(&eb->refs); - if (refs <= 3) + if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) + || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && + refs == 1)) break; old = atomic_cmpxchg(&eb->refs, refs, refs - 1); if (old == refs) diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 4cb8fcfd3ec4..8a59597f1883 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -177,8 +177,6 @@ void btrfs_free_dummy_root(struct btrfs_root *root) if (root->node) { /* One for allocate_extent_buffer */ free_extent_buffer(root->node); - /* One for get_exent_buffer */ - free_extent_buffer(root->node); } kfree(root); } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 64043f028820..af0c8e30d9e2 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -254,11 +254,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } - /* -* We will just free a dummy node if it's ref count is 2 so we need an -* extra ref so our searches don't accidentally release our page. -*/ - extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); btrfs_set_header_level(root->node, 0); ret = -EINVAL; @@ -860,7 +855,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) goto out; } - extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); btrfs_set_header_level(root->node, 0); BTRFS_I(inode)->root = root; -- 2.7.4
Re: [PATCH 0/6] Some trivail cleanup about dealyed-refs
On Mon, Oct 15, 2018 at 10:39:19AM +0800, Lu Fengqi wrote: > On Thu, Oct 11, 2018 at 01:51:37PM +0200, David Sterba wrote: > >On Thu, Oct 11, 2018 at 01:40:32PM +0800, Lu Fengqi wrote: > >> There is no functional change. Just improve readablity. > >> > >> PATCH 1-4 parameter cleanup patches > >> PATCH 5 cleanup about btrfs_select_ref_head > >> PATCH 6 switch int to bool; add some comment > >> > >> Lu Fengqi (6): > >> btrfs: delayed-ref: pass delayed_refs directly to > >> btrfs_select_ref_head() > >> btrfs: delayed-ref: pass delayed_refs directly to > >> btrfs_delayed_ref_lock() > >> btrfs: remove fs_info from btrfs_check_space_for_delayed_refs > >> btrfs: remove fs_info from btrfs_should_throttle_delayed_refs > >> btrfs: simplify btrfs_select_ref_head and cleanup some local variables > >> btrfs: switch return_bigger to bool in find_ref_head > > > >1-4 and 6 added to misc-next, thanks. > > There is not patch 2 at the misc-next branch. So it was forgotten? You're right, got lost somewhere, now in misc-next. Thanks for checking.
Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range
On Sun, Oct 14, 2018 at 10:19:27AM -0700, Christoph Hellwig wrote: > > unsigned (*mmap_capabilities)(struct file *); > > #endif > > ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, > > loff_t, size_t, unsigned int); > > - int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, > > u64); > > - int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, > > u64); > > + int (*remap_file_range)(struct file *file_in, loff_t pos_in, > > + struct file *file_out, loff_t pos_out, > > + u64 len, unsigned int remap_flags); > > None of the other methods in this file name their parameters. While > I generally don't like people leaving them out, in the end consistency > is even more important. I would agree with you *except* that the parameters do not follow memcpy() traditional order (dst, src, len). Instead they are (src, dst, len), so we should probably name them to advise the poor sod who has to implement this that we've chosen an inconsistent API. Or we could fix it.
Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range
On Mon, Oct 15, 2018 at 3:47 PM Christoph Hellwig wrote: > > On Mon, Oct 15, 2018 at 09:04:13AM +0300, Amir Goldstein wrote: > > I supposed you figured out the reason already. > > No, I hadn't. > > > It makes it appearance in patch 16/25 as RFR_VFS_FLAGS. > > All those "advisory" flags, we want to pass them in to filesystem as FYI, > > but we don't want to explicitly add support for e.g. RFR_CAN_SHORTEN > > to every filesystem, when vfs has already taken care of the advice. > > I don't think this model makes sense. If they really are purely > handled in the VFS we can mask them before passing them to the file > system, if not we need to check them, or the they are avisory and > we can have a simple #define instead of the helper. > > RFR_TO_SRC_EOF is checked in generic_remap_file_range_prep, > so the file system should know about it Also looking at it again now > it seems entirely superflous - we can just pass down then len == we > use in higher level code instead of having a flag and will side step > the issue here. > > RFR_CAN_SHORTEN is advisory as no one has to shorten, but that can > easily be solved by including it everywhere. > > RFR_SHORT_DEDUPE is as far as I can tell entirely superflous to > start with, as RFR_CAN_SHORTEN can be used instead. > > So something like this in fs.h: > > #define REMAP_FILE_ADVISORY_FLAGS REMAP_FILE_CAN_SHORTEN > > And then in the file system: > > if (flags & ~REMAP_FILE_ADVISORY_FLAGS) > -EINVAL; > > or > > if (flags & ~(REMAP_FILE_ADVISORY_FLAGS | REMAP_FILE_DEDUP)) > -EINVAL; > > should be all that is needed. Yeh, I suppose that makes sense. Thanks, Amir.
Re: [PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range
On Mon, Oct 15, 2018 at 09:04:13AM +0300, Amir Goldstein wrote: > I supposed you figured out the reason already. No, I hadn't. > It makes it appearance in patch 16/25 as RFR_VFS_FLAGS. > All those "advisory" flags, we want to pass them in to filesystem as FYI, > but we don't want to explicitly add support for e.g. RFR_CAN_SHORTEN > to every filesystem, when vfs has already taken care of the advice. I don't think this model makes sense. If they really are purely handled in the VFS we can mask them before passing them to the file system, if not we need to check them, or the they are avisory and we can have a simple #define instead of the helper. RFR_TO_SRC_EOF is checked in generic_remap_file_range_prep, so the file system should know about it Also looking at it again now it seems entirely superflous - we can just pass down then len == we use in higher level code instead of having a flag and will side step the issue here. RFR_CAN_SHORTEN is advisory as no one has to shorten, but that can easily be solved by including it everywhere. RFR_SHORT_DEDUPE is as far as I can tell entirely superflous to start with, as RFR_CAN_SHORTEN can be used instead. So something like this in fs.h: #define REMAP_FILE_ADVISORY_FLAGS REMAP_FILE_CAN_SHORTEN And then in the file system: if (flags & ~REMAP_FILE_ADVISORY_FLAGS) -EINVAL; or if (flags & ~(REMAP_FILE_ADVISORY_FLAGS | REMAP_FILE_DEDUP)) -EINVAL; should be all that is needed.
Re: reproducible builds with btrfs seed feature
On 2018-10-13 18:28, Chris Murphy wrote: Is it practical and desirable to make Btrfs based OS installation images reproducible? Or is Btrfs simply too complex and non-deterministic? [1] The main three problems with Btrfs right now for reproducibility are: a. many objects have uuids other than the volume uuid; and mkfs only lets us set the volume uuid b. atime, ctime, mtime, otime; and no way to make them all the same c. non-deterministic allocation of file extents, compression, inode assignment, logical and physical address allocation I'm imagining reproducible image creation would be a mkfs feature that builds on Btrfs seed and --rootdir concepts to constrain Btrfs features to maybe make reproducible Btrfs volumes possible: - No raid - Either all objects needing uuids can have those uuids specified by switch, or possibly a defined set of uuids expressly for this use case, or possibly all of them can just be zeros (eek? not sure) - A flag to set all times the same - Possibly require that target block device is zero filled before creation of the Btrfs - Possibly disallow subvolumes and snapshots - Require the resulting image is seed/ro and maybe also a new compat_ro flag to enforce that such Btrfs file systems cannot be modified after the fact. - Enforce a consistent means of allocation and compression The end result is creating two Btrfs volumes would yield image files with matching hashes. So in other words, you care about matching the block layout _exactly_. This is a great idea for paranoid people, but it's usually overkill. Realistically, almost nothing in userspace cares about the block layout, worrying about it just makes verifying the reproduced image a bit easier (there's no reason you can't verify all the relevant data without doing a checksum or HMAC of the image as a whole). If I had to guess, the biggest challenge would be allocation. But it's also possible that such an image may have problems with "sprouts". A non-removable sprout seems fairly straightforward and safe; but if a "reproducible build" type of seed is removed, it seems like removal needs to be smart enough to refresh *all* uuids found in the sprout: a hard break from the seed. Competing file systems, ext4 with make_ext4 fork, and squashfs. At the moment I'm thinking it might be easier to teach squashfs integrity checking than to make Btrfs reproducible. But then I also think restricting Btrfs features, and applying some requirements to constrain Btrfs to make it reproducible, really enhances the Btrfs seed-sprout feature. Any thoughts? Useful? Difficult to implement? Squashfs might be a better fit for this use case *if* it can be taught about integrity checking. It does per file checksums for the purpose of deduplication but those checksums aren't retained for later integrity checking. I've seen projects with SquashFS that store integrity data separately but leverage other infrastructure. Methods I've seen so far include: * GPG-signed SquashFS images, usually with detached signatures * SquashFS with PAR2 integrity checking data * SquashFS on top of dm-verity * SquashFS on top of dm-integrity The first two need to be externally checked prior to mount, but doing so is not hard. The fourth is tricky to set up right, but provides better integration with encrypted images. The third does exactly what's needed though. You just use the embedded data variant of dm-verity, bind the resultant image to a loop device, activate dm-verity on the loop device, and mount the resultant mapped device like any other SquashFS image. I've also seen some talk of using SquashFS with IMA and IMA appraisal, but I've not seen anybody actually _do_ that, and it wouldn't be on quite the level you seem to want (it verifies the files in the image, but not the image as a whole).
Re: BTRFS bad block management. Does it exist?
On 2018-10-14 07:08, waxhead wrote: In case BTRFS fails to WRITE to a disk. What happens? Does the bad area get mapped out somehow? Does it try again until it succeed or until it "times out" or reach a threshold counter? Does it eventually try to write to a different disk (in case of using the raid1/10 profile?) Building on Qu's answer (which is absolutely correct), BTRFS makes the perfectly reasonable assumption that you're not trying to use known bad hardware. It's not alone in this respect either, pretty much every Linux filesystem makes the exact same assumption (and almost all non-Linux ones too), because it really is a perfectly reasonable assumption. The only exception is ext[234], but they only support it statically (you can set the bad block list at mkfs time, but not afterwards, and they don't update it at runtime), and it's a holdover from earlier filesystems which originated at a time when storage was sufficiently expensive _and_ unreliable that you kept using disks until they were essentially completely dead. The reality is that with modern storage hardware, if you have persistently bad sectors the device is either defective (and should be returned under warranty), or it's beyond expected EOL (and should just be replaced). Most people know about SSD's doing block remapping to avoid bad blocks, but hard drives do it to, and they're actually rather good at it. In both cases, enough spare blocks are provided that the device can handle average rates of media errors through the entirety of it's average life expectancy without running out of spare blocks. On top of all of that though, it's fully possible to work around bad blocks in the block layer if you take the time to actually do it. With a bit of reasonably simple math, you can easily set up an LVM volume that actively avoids all the bad blocks on a disk while still fully utilizing the rest of the volume. Similarly, with a bit of work (and a partition table that supports _lots_ of partitions) you can work around bad blocks with an MD concatenated device.
[PATCH] btrfs: fix test btrfs/007 to not leave temporary files in /tmp
From: Filipe Manana This test was using the "mktemp -d" command to create a temporary directory for storing send streams and computations from fssum, without ever deleting them when it finishes. Therefore after running it for many times it filled up all space from /tmp. Fix this by using a temporary directory in TEST_DEV instead, as all the more recent send/receive tests do, to store these files, and making sure they get deleted when the test finishes. On average the sum of the size of those files is between 5.5Mb to 6Mb, but changing the number of operations for fsstress makes it even bigger. Signed-off-by: Filipe Manana --- tests/btrfs/007 | 33 +++-- tests/btrfs/007.out | 1 - 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/tests/btrfs/007 b/tests/btrfs/007 index 09f2f011..438f2f27 100755 --- a/tests/btrfs/007 +++ b/tests/btrfs/007 @@ -16,14 +16,14 @@ seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" -tmp=`mktemp -d` +tmp=/tmp/$$ status=1 _cleanup() { - echo "*** unmount" - _scratch_unmount 2>/dev/null + cd / rm -f $tmp.* + rm -fr $send_files_dir } trap "_cleanup; exit \$status" 0 1 2 3 15 @@ -38,7 +38,11 @@ _require_scratch _require_fssum _require_seek_data_hole +send_files_dir=$TEST_DIR/btrfs-test-$seq + rm -f $seqres.full +rm -fr $send_files_dir +mkdir $send_files_dir workout() { @@ -57,19 +61,20 @@ workout() _run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/incr - echo "# $BTRFS_UTIL_PROG send $SCRATCH_MNT/base > $tmp/base.snap" \ + echo "# $BTRFS_UTIL_PROG send $SCRATCH_MNT/base > ${send_files_dir}/base.snap" \ >> $seqres.full - $BTRFS_UTIL_PROG send $SCRATCH_MNT/base > $tmp/base.snap 2>> $seqres.full \ + $BTRFS_UTIL_PROG send $SCRATCH_MNT/base > $send_files_dir/base.snap 2>> $seqres.full \ || _fail "failed: '$@'" echo "# $BTRFS_UTIL_PROG send -p $SCRATCH_MNT/base\ - $SCRATCH_MNT/incr > $tmp/incr.snap" >> $seqres.full + $SCRATCH_MNT/incr > ${send_files_dir}/incr.snap" >> $seqres.full $BTRFS_UTIL_PROG send -p $SCRATCH_MNT/base \ - $SCRATCH_MNT/incr > $tmp/incr.snap 2>> $seqres.full \ + $SCRATCH_MNT/incr > $send_files_dir/incr.snap 2>> $seqres.full \ || _fail "failed: '$@'" - run_check $FSSUM_PROG -A -f -w $tmp/base.fssum $SCRATCH_MNT/base - run_check $FSSUM_PROG -A -f -w $tmp/incr.fssum -x $SCRATCH_MNT/incr/base \ - $SCRATCH_MNT/incr + run_check $FSSUM_PROG -A -f -w $send_files_dir/base.fssum \ + $SCRATCH_MNT/base + run_check $FSSUM_PROG -A -f -w $send_files_dir/incr.fssum \ + -x $SCRATCH_MNT/incr/base $SCRATCH_MNT/incr _scratch_unmount >/dev/null 2>&1 echo "*** mkfs -dsize=$fsz">>$seqres.full @@ -78,11 +83,11 @@ workout() || _fail "size=$fsz mkfs failed" _scratch_mount "-o noatime" - _run_btrfs_util_prog receive $SCRATCH_MNT < $tmp/base.snap - run_check $FSSUM_PROG -r $tmp/base.fssum $SCRATCH_MNT/base + _run_btrfs_util_prog receive $SCRATCH_MNT < $send_files_dir/base.snap + run_check $FSSUM_PROG -r $send_files_dir/base.fssum $SCRATCH_MNT/base - _run_btrfs_util_prog receive $SCRATCH_MNT < $tmp/incr.snap - run_check $FSSUM_PROG -r $tmp/incr.fssum $SCRATCH_MNT/incr + _run_btrfs_util_prog receive $SCRATCH_MNT < $send_files_dir/incr.snap + run_check $FSSUM_PROG -r $send_files_dir/incr.fssum $SCRATCH_MNT/incr } echo "*** test send / receive" diff --git a/tests/btrfs/007.out b/tests/btrfs/007.out index 8f8cec7d..5d029ceb 100644 --- a/tests/btrfs/007.out +++ b/tests/btrfs/007.out @@ -1,4 +1,3 @@ QA output created by 007 *** test send / receive *** done -*** unmount -- 2.11.0
[PATCH] generic: test fsync after fallocate on a very small file
From: Filipe Manana Test that if we have a very small file, with a size smaller than the block size, then fallocate a very small range within the block size but past the file's current size, fsync the file and then power fail, after mounting the filesystem all the file data is there and the file size is correct. This test is motivated by a failure in btrfs where it triggered an assertion when using the no-holes feature, that is, when running with MKFS_OPTIONS="-O no-holes". The btrfs issue is fixed by a patch for the linux kernel titled: "Btrfs: fix assertion on fsync of regular file when using no-holes feature" Signed-off-by: Filipe Manana --- tests/generic/512 | 61 +++ tests/generic/512.out | 9 tests/generic/group | 1 + 3 files changed, 71 insertions(+) create mode 100755 tests/generic/512 create mode 100644 tests/generic/512.out diff --git a/tests/generic/512 b/tests/generic/512 new file mode 100755 index ..f4e13c68 --- /dev/null +++ b/tests/generic/512 @@ -0,0 +1,61 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved. +# +# FS QA Test No. 512 +# +# Test that if we have a very small file, with a size smaller than the block +# size, then fallocate a very small range within the block size but past the +# file's current size, fsync the file and then power fail, after mounting the +# filesystem all the file data is there and the file size is correct. +# +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo "QA output created by $seq" +tmp=/tmp/$$ +status=1 # failure is the default! +trap "_cleanup; exit \$status" 0 1 2 3 15 + +_cleanup() +{ + _cleanup_flakey + cd / + rm -f $tmp.* +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter +. ./common/dmflakey + +# real QA test starts here +_supported_fs generic +_supported_os Linux +_require_scratch +_require_xfs_io_command "falloc" +_require_dm_target flakey + +rm -f $seqres.full + +_scratch_mkfs >>$seqres.full 2>&1 +_require_metadata_journaling $SCRATCH_DEV +_init_flakey +_mount_flakey + +$XFS_IO_PROG -f \ +-c "pwrite -S 0xb6 0 21" \ +-c "falloc 40 40" \ +-c "fsync" \ +$SCRATCH_MNT/foobar | _filter_xfs_io + +# Simulate a power failure and mount the filesystem. We expect no data loss +# and a correct file size. +_flakey_drop_and_remount + +echo "File content after power failure:" +od -t x1 -A d $SCRATCH_MNT/foobar + +_unmount_flakey + +status=0 +exit diff --git a/tests/generic/512.out b/tests/generic/512.out new file mode 100644 index ..19a0a1b1 --- /dev/null +++ b/tests/generic/512.out @@ -0,0 +1,9 @@ +QA output created by 512 +wrote 21/21 bytes at offset 0 +XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +File content after power failure: +000 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 b6 +016 b6 b6 b6 b6 b6 00 00 00 00 00 00 00 00 00 00 00 +032 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 +* +080 diff --git a/tests/generic/group b/tests/generic/group index 348214ac..d17a0248 100644 --- a/tests/generic/group +++ b/tests/generic/group @@ -514,3 +514,4 @@ 509 auto quick log 510 auto quick log 511 auto quick rw zero +512 auto quick log prealloc -- 2.11.0
[PATCH] Btrfs: fix assertion on fsync of regular file when using no-holes feature
From: Filipe Manana When using the NO_HOLES feature and logging a regular file, we were expecting that if we find an inline extent, that either its size in ram (uncompressed and unenconded) matches the size of the file or if it does not, that it matches the sector size and it represents compressed data. This assertion does not cover a case where the length of the inline extent is smaller then the sector size and also smaller the file's size, such case is possible through fallocate. Example: $ mkfs.btrfs -f -O no-holes /dev/sdb $ mount /dev/sdb /mnt $ xfs_io -f -c "pwrite -S 0xb60 0 21" /mnt/foobar $ xfs_io -c "falloc 40 40" /mnt/foobar $ xfs_io -c "fsync" /mnt/foobar In the abobe example we trigger the assertion because the inline extent's length is 21 bytes while the file size is 80 bytes. The fallocate() call merely updated the file's size and did not touch the existing inline extent, as expected. So fix this by adjusting the assertion so that an inline extent length smaller then the file size is valid if the file size is smaller then the filesystem's sector size. A test case for fstests follows soon. Reported-by: Anatoly Trosinenko Link: https://lore.kernel.org/linux-btrfs/CAE5jQCfRSBC7n4pUTFJcmHh109=gwyT9mFkCOL+NKfzswmR=_...@mail.gmail.com/ Signed-off-by: Filipe Manana --- fs/btrfs/tree-log.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ed455dba..1673dccc76c2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4652,7 +4652,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, ASSERT(len == i_size || (len == fs_info->sectorsize && btrfs_file_extent_compression(leaf, extent) != - BTRFS_COMPRESS_NONE)); + BTRFS_COMPRESS_NONE) || + (len < i_size && i_size < fs_info->sectorsize)); return 0; } -- 2.11.0
Re: How to recover from btrfs scrub errors? (uncorrectable errors, checksum error at logical)
On 2018/10/15 下午3:50, Otto Kekäläinen wrote: > Hello! > > I am trying to figure out how to recover from errors detected by btrfs scrub. > > Scrub status reports: > > scrub status for 4f4479d5-648a-45b9-bcbf-978c766aeb41 > scrub started at Mon Oct 15 10:02:28 2018, running for 00:35:39 > total bytes scrubbed: 791.15GiB with 18 errors > error details: csum=18 > corrected errors: 0, uncorrectable errors: 18, unverified errors: 0 > > Kernel log contains lines like > > BTRFS warning (device dm-8): checksum error at logical 7351706472448 on dev > /dev/mapper/disk6tb, sector 61412648, root 12725, inode 152358265, > offset 483328: > path resolving failed with ret=-2 > > I've tried so far: > - deleting the files (when path is visible) Please ensure there are no other subvolumes/snapshots containing the same file or reflink to it. If path is not visible, please use the root and inode number to locate the culprit file. "find" command support to search using inode number. And "btrfs subvolume list" command will show the subvolume number. Also it's recommended to sync the fs before scrub, in case culprit inode only get orphaned but not deleted from disk. > - overwriting the files with new data If you're only overwriting the culprit sector, it could get CoWed and the original data extent is still there. You need to ensure the old data is not referred by any other root/inode. Please ensure there is no reflink/snapshot first. Then delete the file or overwrite the whole culprit file. Thanks, Qu > - changed disk (with btrfs replace) > > The checksum errors however persist. > How do I get rid of them? > > > The files are logs and other non-vital information. I am fine by > deleting the corrupted files. It is OK to recover so that I loose a > few gigabytes of data, but not the entire filesystem. > > Setup is a multi-disk btrfs filesystem, data single, metadata RAID-1 > Mounted with: > > /dev/mapper/wdc3td on /data type btrfs > (rw,noatime,compress=lzo,space_cache,subvolid=5,subvol=/) > > I've read lots of online sources on the topic but none of these help > me on how to recover from the current state: > > https://btrfs.wiki.kernel.org/index.php/Btrfsck > http://marc.merlins.org/perso/btrfs/post_2014-03-19_Btrfs-Tips_-Btrfs-Scrub-and-Btrfs-Filesystem-Repair.html > https://wiki.archlinux.org/index.php/Identify_damaged_files#Find_damaged_files > signature.asc Description: OpenPGP digital signature
How to recover from btrfs scrub errors? (uncorrectable errors, checksum error at logical)
Hello! I am trying to figure out how to recover from errors detected by btrfs scrub. Scrub status reports: scrub status for 4f4479d5-648a-45b9-bcbf-978c766aeb41 scrub started at Mon Oct 15 10:02:28 2018, running for 00:35:39 total bytes scrubbed: 791.15GiB with 18 errors error details: csum=18 corrected errors: 0, uncorrectable errors: 18, unverified errors: 0 Kernel log contains lines like BTRFS warning (device dm-8): checksum error at logical 7351706472448 on dev /dev/mapper/disk6tb, sector 61412648, root 12725, inode 152358265, offset 483328: path resolving failed with ret=-2 I've tried so far: - deleting the files (when path is visible) - overwriting the files with new data - changed disk (with btrfs replace) The checksum errors however persist. How do I get rid of them? The files are logs and other non-vital information. I am fine by deleting the corrupted files. It is OK to recover so that I loose a few gigabytes of data, but not the entire filesystem. Setup is a multi-disk btrfs filesystem, data single, metadata RAID-1 Mounted with: /dev/mapper/wdc3td on /data type btrfs (rw,noatime,compress=lzo,space_cache,subvolid=5,subvol=/) I've read lots of online sources on the topic but none of these help me on how to recover from the current state: https://btrfs.wiki.kernel.org/index.php/Btrfsck http://marc.merlins.org/perso/btrfs/post_2014-03-19_Btrfs-Tips_-Btrfs-Scrub-and-Btrfs-Filesystem-Repair.html https://wiki.archlinux.org/index.php/Identify_damaged_files#Find_damaged_files