Re: [PATCH v3] generic/077 fix min size for btrfs

2018-10-10 Thread Darrick J. Wong
On Thu, Oct 11, 2018 at 11:26:00AM +0800, Anand Jain wrote:
> If btrfs needs to be tested at its default blockgroup which is non-mixed,
> then it needs at least 256mb.
> 
> Signed-off-by: Anand Jain 
> ---
> v2->v3:
>   separated from the patch set of 9.
>   notrun for the cases where filler is not big enough to fill the
>   fssize.
> v1->v2: ref the cover-letter of the set.
> 
>  tests/generic/077 | 11 +++
>  1 file changed, 7 insertions(+), 4 deletions(-)
> 
> diff --git a/tests/generic/077 b/tests/generic/077
> index ef6af18c83e3..784afe448940 100755
> --- a/tests/generic/077
> +++ b/tests/generic/077
> @@ -13,7 +13,7 @@ echo "QA output created by $seq"
>  here=`pwd`
>  tmp=/tmp/$$
>  status=1
> -# Something w/ enough data to fill 50M of fs...
> +# Something w/ enough data to fill 256M of fs...
>  filler=/lib/modules/
>  
>  # fall back in case /lib/modules doesn't exist
> @@ -38,6 +38,11 @@ _supported_os Linux
>  
>  [ ! -d $filler ] && _notrun "No directory to source files from"
>  
> +# check if two iterations of the assigned filler is big enough to fill fssize
> +fs_size=$((256 * 1024 * 1024))
> +[ $(( $(du -h -m /usr | tail -1| cut -f1) * 2 )) -lt 256 ] && \

Err... what does measuring /usr have to do with /lib/modules?

Also, /lib/modules is 58M on my test VM, which means that a 256M
filesystem isn't going to ENOSPC.

(Though weirdly it doesn't fail despite the lack of ENOSPC even at the
50M size, so I'm not sure what this test is actually supposed to do...)

--D

> + _notrun "filler $filler isn't big enough to fill fssize $fssize"
> +
>  _require_scratch
>  _require_attrs
>  _require_acls
> @@ -49,9 +54,7 @@ rm -f $seqres.full
>  _scratch_unmount >/dev/null 2>&1
>  echo "*** MKFS ***" >>$seqres.full
>  echo "" >>$seqres.full
> -SIZE=`expr 50 \* 1024 \* 1024`
> -_scratch_mkfs_sized $SIZE   >>$seqres.full 2>&1 \
> - || _fail "mkfs failed"
> +_scratch_mkfs_sized $fs_size >> $seqres.full 2>&1 || _fail "mkfs failed"
>  _scratch_mount
>  mkdir $SCRATCH_MNT/subdir
>  
> -- 
> 1.8.3.1
> 


[PATCH 24/25] xfs: support returning partial reflink results

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Back when the XFS reflink code only supported clone_file_range, we were
only able to return zero or negative error codes to userspace.  However,
now that copy_file_range (which returns bytes copied) can use XFS'
clone_file_range, we have the opportunity to return partial results.
For example, if userspace sends a 1GB clone request and we run out of
space halfway through, we at least can tell userspace that we completed
512M of that request like a regular write.

Signed-off-by: Darrick J. Wong 
---
 fs/xfs/xfs_file.c|5 +
 fs/xfs/xfs_reflink.c |   19 ++-
 fs/xfs/xfs_reflink.h |2 +-
 3 files changed, 16 insertions(+), 10 deletions(-)


diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index bc9e94bcb7a3..b2b15b8dc4a1 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -928,14 +928,11 @@ xfs_file_remap_range(
loff_t  len,
unsigned intremap_flags)
 {
-   int ret;
-
if (!remap_check_flags(remap_flags, RFR_SAME_DATA))
return -EINVAL;
 
-   ret = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+   return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
len, remap_flags);
-   return ret < 0 ? ret : len;
 }
 
 STATIC int
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e1592e751cc2..12a1fe92454e 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1123,6 +1123,7 @@ xfs_reflink_remap_blocks(
struct xfs_inode*dest,
xfs_fileoff_t   destoff,
xfs_filblks_t   len,
+   xfs_filblks_t   *remapped,
xfs_off_t   new_isize)
 {
struct xfs_bmbt_irecimap;
@@ -1130,6 +1131,7 @@ xfs_reflink_remap_blocks(
int error = 0;
xfs_filblks_t   range_len;
 
+   *remapped = 0;
/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
while (len) {
uintlock_mode;
@@ -1168,6 +1170,7 @@ xfs_reflink_remap_blocks(
srcoff += range_len;
destoff += range_len;
len -= range_len;
+   *remapped += range_len;
}
 
return 0;
@@ -1391,7 +1394,7 @@ xfs_reflink_remap_prep(
 /*
  * Link a range of blocks from one file to another.
  */
-int
+loff_t
 xfs_reflink_remap_range(
struct file *file_in,
loff_t  pos_in,
@@ -1406,9 +1409,9 @@ xfs_reflink_remap_range(
struct xfs_inode*dest = XFS_I(inode_out);
struct xfs_mount*mp = src->i_mount;
xfs_fileoff_t   sfsbno, dfsbno;
-   xfs_filblks_t   fsblen;
+   xfs_filblks_t   fsblen, remapped = 0;
xfs_extlen_tcowextsize;
-   ssize_t ret;
+   int ret;
 
if (!xfs_sb_version_hasreflink(&mp->m_sb))
return -EOPNOTSUPP;
@@ -1424,11 +1427,17 @@ xfs_reflink_remap_range(
 
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
 
+   if (len == 0) {
+   ret = 0;
+   goto out_unlock;
+   }
+
dfsbno = XFS_B_TO_FSBT(mp, pos_out);
sfsbno = XFS_B_TO_FSBT(mp, pos_in);
fsblen = XFS_B_TO_FSB(mp, len);
ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
-   pos_out + len);
+   &remapped, pos_out + len);
+   remapped = min_t(int64_t, len, XFS_FSB_TO_B(mp, remapped));
if (ret)
goto out_unlock;
 
@@ -1451,7 +1460,7 @@ xfs_reflink_remap_range(
xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
-   return ret;
+   return remapped > 0 ? remapped : ret;
 }
 
 /*
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index c3c46c276fe1..cbc26ff79a8f 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -27,7 +27,7 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, 
xfs_off_t offset,
 extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
 extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
-extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t len,
unsigned int remap_flags);
 extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,



[PATCH 25/25] xfs: remove redundant remap partial EOF block checks

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Now that we've moved the partial EOF block checks to the VFS helpers, we
can remove the redundant functionality from XFS.

Signed-off-by: Darrick J. Wong 
---
 fs/xfs/xfs_reflink.c |   20 
 1 file changed, 20 deletions(-)


diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 12a1fe92454e..4450443f1148 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1307,8 +1307,6 @@ xfs_reflink_remap_prep(
struct inode*inode_out = file_inode(file_out);
struct xfs_inode*dest = XFS_I(inode_out);
boolsame_inode = (inode_in == inode_out);
-   boolis_dedupe = (remap_flags & RFR_SAME_DATA);
-   u64 blkmask = i_blocksize(inode_in) - 1;
ssize_t ret;
 
/* Lock both files against IO */
@@ -1336,24 +1334,6 @@ xfs_reflink_remap_prep(
if (ret <= 0)
goto out_unlock;
 
-   /*
-* If the dedupe data matches, chop off the partial EOF block
-* from the source file so we don't try to dedupe the partial
-* EOF block.
-*/
-   if (is_dedupe) {
-   *len &= ~blkmask;
-   } else if (*len & blkmask) {
-   /*
-* The user is attempting to share a partial EOF block,
-* if it's inside the destination EOF then reject it.
-*/
-   if (pos_out + *len < i_size_read(inode_out)) {
-   ret = -EINVAL;
-   goto out_unlock;
-   }
-   }
-
/* Attach dquots to dest inode before changing block map */
ret = xfs_qm_dqattach(dest);
if (ret)



[PATCH 22/25] ocfs2: support partial clone range and dedupe range

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Change the ocfs2 remap code to allow for returning partial results.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/file.c |7 +
 fs/ocfs2/refcounttree.c |   73 ++-
 fs/ocfs2/refcounttree.h |   12 
 3 files changed, 48 insertions(+), 44 deletions(-)


diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e6ffed70398e..061ae2c4bd4a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2531,14 +2531,11 @@ static loff_t ocfs2_remap_file_range(struct file 
*file_in, loff_t pos_in,
 struct file *file_out, loff_t pos_out,
 loff_t len, unsigned int remap_flags)
 {
-   int ret;
-
if (!remap_check_flags(remap_flags, RFR_SAME_DATA))
return -EINVAL;
 
-   ret = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
-   len, remap_flags);
-   return ret < 0 ? ret : len;
+   return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+   len, remap_flags);
 }
 
 const struct inode_operations ocfs2_file_iops = {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b9e0418a1974..4eacdd703874 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4507,14 +4507,14 @@ static int ocfs2_reflink_update_dest(struct inode *dest,
 }
 
 /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
-static int ocfs2_reflink_remap_extent(struct inode *s_inode,
- struct buffer_head *s_bh,
- loff_t pos_in,
- struct inode *t_inode,
- struct buffer_head *t_bh,
- loff_t pos_out,
- loff_t len,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
+struct buffer_head *s_bh,
+loff_t pos_in,
+struct inode *t_inode,
+struct buffer_head *t_bh,
+loff_t pos_out,
+loff_t len,
+struct ocfs2_cached_dealloc_ctxt 
*dealloc)
 {
struct ocfs2_extent_tree s_et;
struct ocfs2_extent_tree t_et;
@@ -4522,6 +4522,7 @@ static int ocfs2_reflink_remap_extent(struct inode 
*s_inode,
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *ref_tree;
struct ocfs2_super *osb;
+   loff_t remapped = 0;
loff_t pstart, plen;
u32 p_cluster, num_clusters, slast, spos, tpos;
unsigned int ext_flags;
@@ -4605,30 +4606,32 @@ static int ocfs2_reflink_remap_extent(struct inode 
*s_inode,
 next_loop:
spos += num_clusters;
tpos += num_clusters;
+   remapped += ocfs2_clusters_to_bytes(t_inode->i_sb,
+   num_clusters);
}
 
-out:
-   return ret;
+   return remapped;
 out_unlock_refcount:
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh);
-   return ret;
+out:
+   return remapped > 0 ? remapped : ret;
 }
 
 /* Set up refcount tree and remap s_inode to t_inode. */
-static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
- struct buffer_head *s_bh,
- loff_t pos_in,
- struct inode *t_inode,
- struct buffer_head *t_bh,
- loff_t pos_out,
- loff_t len)
+static loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+struct buffer_head *s_bh,
+loff_t pos_in,
+struct inode *t_inode,
+struct buffer_head *t_bh,
+loff_t pos_out,
+loff_t len)
 {
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_super *osb;
struct ocfs2_dinode *dis;
struct ocfs2_dinode *dit;
-   int ret;
+   loff_t ret;
 
osb = OCFS2_SB(s_inode->i_sb);
dis = (struct ocfs2_dinode *)s_bh->b_data;
@@ -4700,7 +4703,7 @@ static int ocfs2_reflink_remap_blocks(struct inode 
*s_inode,
/* Actually remap extents now. */
ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
 pos_out, len, &dealloc);
-   if (ret) {
+   if (ret < 0) {

[PATCH 23/25] xfs: fix pagecache truncation prior to reflink

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Prior to remapping blocks, it is necessary to remove pages from the
destination file's page cache.  Unfortunately, the truncation is not
aggressive enough -- if page size > block size, we'll end up zeroing
subpage blocks instead of removing them.  So, round the start offset
down and the end offset up to page boundaries.  We already wrote all
the dirty data so the larger range shouldn't be a problem.

Signed-off-by: Darrick J. Wong 
---
 fs/xfs/xfs_reflink.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)


diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b24a2a1c4db1..e1592e751cc2 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1370,8 +1370,9 @@ xfs_reflink_remap_prep(
goto out_unlock;
 
/* Zap any page cache for the destination file's range. */
-   truncate_inode_pages_range(_out->i_data, pos_out,
-  PAGE_ALIGN(pos_out + *len) - 1);
+   truncate_inode_pages_range(_out->i_data,
+   round_down(pos_out, PAGE_SIZE),
+   round_up(pos_out + *len, PAGE_SIZE) - 1);
 
/*
 * Update inode timestamps and remove security privileges before we



[PATCH 20/25] ocfs2: truncate page cache for clone destination file before remapping

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

When cloning blocks into another file, truncate the page cache before we
start remapping blocks so that concurrent reads wait for us to finish.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/refcounttree.c |   10 --
 1 file changed, 4 insertions(+), 6 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index a3df118bf3b9..851ba3ae7ce8 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4869,14 +4869,12 @@ int ocfs2_reflink_remap_range(struct file *file_in,
down_write_nested(_I(inode_out)->ip_alloc_sem,
  SINGLE_DEPTH_NESTING);
 
-   ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
-out_bh, pos_out, len);
-
/* Zap any page cache for the destination file's range. */
-   if (!ret)
-   truncate_inode_pages_range(_out->i_data, pos_out,
-  PAGE_ALIGN(pos_out + len) - 1);
+   truncate_inode_pages_range(_out->i_data, pos_out,
+  PAGE_ALIGN(pos_out + len) - 1);
 
+   ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
+out_bh, pos_out, len);
up_write(_I(inode_in)->ip_alloc_sem);
if (!same_inode)
up_write(_I(inode_out)->ip_alloc_sem);



[PATCH 21/25] ocfs2: fix pagecache truncation prior to reflink

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Prior to remapping blocks, it is necessary to remove pages from the
destination file's page cache.  Unfortunately, the truncation is not
aggressive enough -- if page size > block size, we'll end up zeroing
subpage blocks instead of removing them.  So, round the start offset
down and the end offset up to page boundaries.  We already wrote all
the dirty data so the larger range should be fine.

Signed-off-by: Darrick J. Wong 
---
 fs/ocfs2/refcounttree.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 851ba3ae7ce8..b9e0418a1974 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4870,8 +4870,9 @@ int ocfs2_reflink_remap_range(struct file *file_in,
  SINGLE_DEPTH_NESTING);
 
/* Zap any page cache for the destination file's range. */
-   truncate_inode_pages_range(_out->i_data, pos_out,
-  PAGE_ALIGN(pos_out + len) - 1);
+   truncate_inode_pages_range(_out->i_data,
+  round_down(pos_out, PAGE_SIZE),
+  round_up(pos_out + len, PAGE_SIZE) - 1);
 
ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
 out_bh, pos_out, len);



[PATCH 19/25] vfs: implement opportunistic short dedupe

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

For a given dedupe request, the bytes_deduped field in the control
structure tells userspace if we managed to deduplicate some, but not all
of, the requested regions starting from the file offsets supplied.
However, due to sloppy coding, the current dedupe code returns
FILE_DEDUPE_RANGE_DIFFERS if any part of the range is different.
Fix this so that we can actually support partial request completion.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c|   48 ++--
 include/linux/fs.h |7 +--
 2 files changed, 43 insertions(+), 12 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index c88a443d9eb2..de055cb9c5ae 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1737,13 +1737,26 @@ static struct page *vfs_dedupe_get_page(struct inode 
*inode, loff_t offset)
return page;
 }
 
+static unsigned int vfs_dedupe_memcmp(const char *s1, const char *s2,
+ unsigned int len)
+{
+   const char *orig_s1;
+
+   for (orig_s1 = s1; len > 0; s1++, s2++, len--)
+   if (*s1 != *s2)
+   break;
+
+   return s1 - orig_s1;
+}
+
 /*
  * Compare extents of two files to see if they are the same.
  * Caller must have locked both inodes to prevent write races.
  */
 static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 struct inode *dest, loff_t destoff,
-loff_t len, bool *is_same)
+loff_t *req_len,
+unsigned int remap_flags)
 {
loff_t src_poff;
loff_t dest_poff;
@@ -1751,8 +1764,11 @@ static int vfs_dedupe_file_range_compare(struct inode 
*src, loff_t srcoff,
void *dest_addr;
struct page *src_page;
struct page *dest_page;
-   loff_t cmp_len;
+   loff_t len = *req_len;
+   loff_t same_len = 0;
bool same;
+   unsigned int cmp_len;
+   unsigned int cmp_same;
int error;
 
error = -EINVAL;
@@ -1762,7 +1778,7 @@ static int vfs_dedupe_file_range_compare(struct inode 
*src, loff_t srcoff,
dest_poff = destoff & (PAGE_SIZE - 1);
cmp_len = min(PAGE_SIZE - src_poff,
  PAGE_SIZE - dest_poff);
-   cmp_len = min(cmp_len, len);
+   cmp_len = min_t(loff_t, cmp_len, len);
if (cmp_len <= 0)
goto out_error;
 
@@ -1784,7 +1800,10 @@ static int vfs_dedupe_file_range_compare(struct inode 
*src, loff_t srcoff,
flush_dcache_page(src_page);
flush_dcache_page(dest_page);
 
-   if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+   cmp_same = vfs_dedupe_memcmp(src_addr + src_poff,
+dest_addr + dest_poff, cmp_len);
+   same_len += cmp_same;
+   if (cmp_same != cmp_len)
same = false;
 
kunmap_atomic(dest_addr);
@@ -1802,7 +1821,17 @@ static int vfs_dedupe_file_range_compare(struct inode 
*src, loff_t srcoff,
len -= cmp_len;
}
 
-   *is_same = same;
+   /*
+* If less than the whole range matched, we have to back down to the
+* nearest block boundary.
+*/
+   if (*req_len != same_len) {
+   if (!(remap_flags & RFR_SHORT_DEDUPE))
+   return -EBADE;
+
+   *req_len = ALIGN_DOWN(same_len, dest->i_sb->s_blocksize);
+   }
+
return 0;
 
 out_error:
@@ -1881,13 +1910,11 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
 * Check that the extents are the same.
 */
if (is_dedupe) {
-   boolis_same = false;
-
ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
-   inode_out, pos_out, *len, &is_same);
+   inode_out, pos_out, len, remap_flags);
if (ret)
return ret;
-   if (!is_same)
+   if (*len == 0)
return -EBADE;
}
 
@@ -2013,7 +2040,8 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, 
loff_t src_pos,
 {
loff_t ret;
 
-   WARN_ON_ONCE(remap_flags & ~(RFR_SAME_DATA));
+   WARN_ON_ONCE(remap_flags & ~(RFR_SAME_DATA | RFR_CAN_SHORTEN |
+RFR_SHORT_DEDUPE));
 
ret = mnt_want_write_file(dst_file);
if (ret)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f0603ed007e9..18b6db85ab64 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1727,16 +1727,19 @@ struct block_device_operations;
  * RFR_SAME_DATA: only remap if contents identical (i.e. ded

[PATCH 18/25] vfs: hide file range comparison function

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

There are no callers of vfs_dedupe_file_range_compare, so we might as
well make it a static helper and remove the export.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c|  191 ++--
 include/linux/fs.h |3 -
 2 files changed, 95 insertions(+), 99 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index 3713893b7e38..c88a443d9eb2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1714,6 +1714,101 @@ static int remap_verify_area(struct file *file, loff_t 
pos, loff_t len,
return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
 }
 
+/*
+ * Read a page's worth of file data into the page cache.  Return the page
+ * locked.
+ */
+static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+{
+   struct address_space *mapping;
+   struct page *page;
+   pgoff_t n;
+
+   n = offset >> PAGE_SHIFT;
+   mapping = inode->i_mapping;
+   page = read_mapping_page(mapping, n, NULL);
+   if (IS_ERR(page))
+   return page;
+   if (!PageUptodate(page)) {
+   put_page(page);
+   return ERR_PTR(-EIO);
+   }
+   lock_page(page);
+   return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ * Caller must have locked both inodes to prevent write races.
+ */
+static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+struct inode *dest, loff_t destoff,
+loff_t len, bool *is_same)
+{
+   loff_t src_poff;
+   loff_t dest_poff;
+   void *src_addr;
+   void *dest_addr;
+   struct page *src_page;
+   struct page *dest_page;
+   loff_t cmp_len;
+   bool same;
+   int error;
+
+   error = -EINVAL;
+   same = true;
+   while (len) {
+   src_poff = srcoff & (PAGE_SIZE - 1);
+   dest_poff = destoff & (PAGE_SIZE - 1);
+   cmp_len = min(PAGE_SIZE - src_poff,
+ PAGE_SIZE - dest_poff);
+   cmp_len = min(cmp_len, len);
+   if (cmp_len <= 0)
+   goto out_error;
+
+   src_page = vfs_dedupe_get_page(src, srcoff);
+   if (IS_ERR(src_page)) {
+   error = PTR_ERR(src_page);
+   goto out_error;
+   }
+   dest_page = vfs_dedupe_get_page(dest, destoff);
+   if (IS_ERR(dest_page)) {
+   error = PTR_ERR(dest_page);
+   unlock_page(src_page);
+   put_page(src_page);
+   goto out_error;
+   }
+   src_addr = kmap_atomic(src_page);
+   dest_addr = kmap_atomic(dest_page);
+
+   flush_dcache_page(src_page);
+   flush_dcache_page(dest_page);
+
+   if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+   same = false;
+
+   kunmap_atomic(dest_addr);
+   kunmap_atomic(src_addr);
+   unlock_page(dest_page);
+   unlock_page(src_page);
+   put_page(dest_page);
+   put_page(src_page);
+
+   if (!same)
+   break;
+
+   srcoff += cmp_len;
+   destoff += cmp_len;
+   len -= cmp_len;
+   }
+
+   *is_same = same;
+   return 0;
+
+out_error:
+   return error;
+}
+
 /*
  * Check that the two inodes are eligible for cloning, the ranges make
  * sense, and then flush all dirty data.  Caller must ensure that the
@@ -1912,102 +2007,6 @@ loff_t vfs_clone_file_range(struct file *file_in, 
loff_t pos_in,
 }
 EXPORT_SYMBOL(vfs_clone_file_range);
 
-/*
- * Read a page's worth of file data into the page cache.  Return the page
- * locked.
- */
-static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
-{
-   struct address_space *mapping;
-   struct page *page;
-   pgoff_t n;
-
-   n = offset >> PAGE_SHIFT;
-   mapping = inode->i_mapping;
-   page = read_mapping_page(mapping, n, NULL);
-   if (IS_ERR(page))
-   return page;
-   if (!PageUptodate(page)) {
-   put_page(page);
-   return ERR_PTR(-EIO);
-   }
-   lock_page(page);
-   return page;
-}
-
-/*
- * Compare extents of two files to see if they are the same.
- * Caller must have locked both inodes to prevent write races.
- */
-int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t destoff,
- loff_t len, bool *is_same)
-{
-   loff_t src_poff;
-   loff_t dest_poff;
-   void *src_addr;
-   void *dest_addr;
-   struct page *src_page;
-

[PATCH 16/25] vfs: make remapping to source file eof more explicit

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Create a RFR_TO_SRC_EOF flag to explicitly declare that the caller wants
the remap implementation to remap to the end of the source file, once
the files are locked.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/ioctl.c |3 ++-
 fs/nfsd/vfs.c  |4 +++-
 fs/read_write.c|   13 -
 include/linux/fs.h |8 +++-
 4 files changed, 20 insertions(+), 8 deletions(-)


diff --git a/fs/ioctl.c b/fs/ioctl.c
index 505275ec5596..088cf240ca10 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -224,6 +224,7 @@ static long ioctl_file_clone(struct file *dst_file, 
unsigned long srcfd,
 {
struct fd src_file = fdget(srcfd);
loff_t cloned;
+   unsigned int remap_flags = olen == 0 ? RFR_TO_SRC_EOF : 0;
int ret;
 
if (!src_file.file)
@@ -232,7 +233,7 @@ static long ioctl_file_clone(struct file *dst_file, 
unsigned long srcfd,
if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
goto fdput;
cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
- olen, 0);
+ olen, remap_flags);
if (cloned < 0)
ret = cloned;
else if (olen && cloned != olen)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 726fc5b2b27a..0dc65047df1a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -542,8 +542,10 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 
src_pos, struct file *dst,
u64 dst_pos, u64 count)
 {
loff_t cloned;
+   unsigned int remap_flags = count == 0 ? RFR_TO_SRC_EOF : 0;
 
-   cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
+   cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count,
+ remap_flags);
if (count && cloned != count)
cloned = -EINVAL;
return nfserrno(cloned < 0 ? cloned : 0);
diff --git a/fs/read_write.c b/fs/read_write.c
index a360274b0cdc..6ec908f9a69b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1746,15 +1746,18 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
 
-   /* Zero length dedupe exits immediately; reflink goes to EOF. */
-   if (*len == 0) {
+   /*
+* If the caller asked to go all the way to the end of the source file,
+* set *len now that we have the file locked.
+*/
+   if (remap_flags & RFR_TO_SRC_EOF) {
loff_t isize = i_size_read(inode_in);
 
-   if (is_dedupe || pos_in == isize)
-   return 0;
if (pos_in > isize)
return -EINVAL;
*len = isize - pos_in;
+   if (*len == 0)
+   return 0;
}
 
/* Check that we don't violate system file offset limits. */
@@ -1849,7 +1852,7 @@ loff_t do_clone_file_range(struct file *file_in, loff_t 
pos_in,
struct inode *inode_out = file_inode(file_out);
loff_t ret;
 
-   WARN_ON_ONCE(remap_flags);
+   WARN_ON_ONCE(remap_flags & ~(RFR_TO_SRC_EOF));
 
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d77b8d90d65e..b9c314f9d5a4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1725,10 +1725,15 @@ struct block_device_operations;
  * These flags control the behavior of the remap_file_range function pointer.
  *
  * RFR_SAME_DATA: only remap if contents identical (i.e. deduplicate)
+ * RFR_TO_SRC_EOF: remap to the end of the source file
  */
 #define RFR_SAME_DATA  (1 << 0)
+#define RFR_TO_SRC_EOF (1 << 1)
 
-#define RFR_VALID_FLAGS(RFR_SAME_DATA)
+#define RFR_VALID_FLAGS(RFR_SAME_DATA | RFR_TO_SRC_EOF)
+
+/* Implemented by the VFS, so these are advisory. */
+#define RFR_VFS_FLAGS  (RFR_TO_SRC_EOF)
 
 /*
  * Filesystem remapping implementations should call this helper on their
@@ -1739,6 +1744,7 @@ struct block_device_operations;
 static inline bool remap_check_flags(unsigned int remap_flags,
 unsigned int supported_flags)
 {
+   remap_flags &= ~RFR_VFS_FLAGS;
return (remap_flags & ~(supported_flags & RFR_VALID_FLAGS)) == 0;
 }
 



[PATCH 17/25] vfs: enable remap callers that can handle short operations

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Plumb in a remap flag that enables the filesystem remap handler to
shorten remapping requests for callers that can handle it.  Now
copy_file_range can report partial success (in case we run up against
alignment problems, resource limits, etc.).

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c|   15 +--
 include/linux/fs.h |7 +--
 mm/filemap.c   |   16 
 3 files changed, 26 insertions(+), 12 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index 6ec908f9a69b..3713893b7e38 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1593,7 +1593,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t 
pos_in,
 
cloned = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out,
-   min_t(loff_t, MAX_RW_COUNT, len), 0);
+   min_t(loff_t, MAX_RW_COUNT, len),
+   RFR_CAN_SHORTEN);
if (cloned > 0) {
ret = cloned;
goto done;
@@ -1804,16 +1805,18 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
 * If the user is attempting to remap a partial EOF block and
 * it's inside the destination EOF then reject it.
 *
-* We don't support shortening requests, so we can only reject
-* them.
+* If possible, shorten the request instead of rejecting it.
 */
if (is_dedupe)
ret = -EBADE;
else if (pos_out + *len < i_size_read(inode_out))
ret = -EINVAL;
 
-   if (ret)
-   return ret;
+   if (ret) {
+   if (!(remap_flags & RFR_CAN_SHORTEN))
+   return ret;
+   *len &= ~blkmask;
+   }
}
 
return 1;
@@ -2112,7 +2115,7 @@ int vfs_dedupe_file_range(struct file *file, struct 
file_dedupe_range *same)
 
deduped = vfs_dedupe_file_range_one(file, off, dst_file,
info->dest_offset, len,
-   0);
+   RFR_CAN_SHORTEN);
if (deduped == -EBADE)
info->status = FILE_DEDUPE_RANGE_DIFFERS;
else if (deduped < 0)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b9c314f9d5a4..57cb56bbc30a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1726,14 +1726,17 @@ struct block_device_operations;
  *
  * RFR_SAME_DATA: only remap if contents identical (i.e. deduplicate)
  * RFR_TO_SRC_EOF: remap to the end of the source file
+ * RFR_CAN_SHORTEN: caller can handle a shortened request
  */
 #define RFR_SAME_DATA  (1 << 0)
 #define RFR_TO_SRC_EOF (1 << 1)
+#define RFR_CAN_SHORTEN(1 << 2)
 
-#define RFR_VALID_FLAGS(RFR_SAME_DATA | RFR_TO_SRC_EOF)
+#define RFR_VALID_FLAGS(RFR_SAME_DATA | RFR_TO_SRC_EOF | \
+RFR_CAN_SHORTEN)
 
 /* Implemented by the VFS, so these are advisory. */
-#define RFR_VFS_FLAGS  (RFR_TO_SRC_EOF)
+#define RFR_VFS_FLAGS  (RFR_TO_SRC_EOF | RFR_CAN_SHORTEN)
 
 /*
  * Filesystem remapping implementations should call this helper on their
diff --git a/mm/filemap.c b/mm/filemap.c
index 369cfd164e90..bccbd3621238 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3051,8 +3051,12 @@ int generic_remap_checks(struct file *file_in, loff_t 
pos_in,
if (pos_in + count == size_in) {
bcount = ALIGN(size_in, bs) - pos_in;
} else {
-   if (!IS_ALIGNED(count, bs))
-   return -EINVAL;
+   if (!IS_ALIGNED(count, bs)) {
+   if (remap_flags & RFR_CAN_SHORTEN)
+   count = ALIGN_DOWN(count, bs);
+   else
+   return -EINVAL;
+   }
 
bcount = count;
}
@@ -3063,10 +3067,14 @@ int generic_remap_checks(struct file *file_in, loff_t 
pos_in,
pos_out < pos_in + bcount)
return -EINVAL;
 
-   /* For now we don't support changing the length. */
-   if (*req_count != count)
+   /*
+* We shortened the request but the caller can't deal with that, so
+* bounce the request back to userspace.
+*/
+   if (*req_count != count && !(remap_flags & RFR_CAN_SHORTEN))
return -EINVAL;
 
+   *req_count = count;
return 0;
 }
 



[PATCH 15/25] vfs: plumb RFR_* remap flags through the vfs dedupe functions

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Plumb a remap_flags argument through the vfs_dedupe_file_range_one
functions so that dedupe can take advantage of it.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/overlayfs/file.c |3 ++-
 fs/read_write.c |9 ++---
 include/linux/fs.h  |2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)


diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index e5cc17281d0b..8f7a162768f2 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -467,7 +467,8 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t 
pos_in,
 
case OVL_DEDUPE:
ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
-   real_out.file, pos_out, len);
+   real_out.file, pos_out, len,
+   flags);
break;
}
revert_creds(old_cred);
diff --git a/fs/read_write.c b/fs/read_write.c
index b3f8b4a2bdfc..a360274b0cdc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -2004,10 +2004,12 @@ EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
 
 loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
 struct file *dst_file, loff_t dst_pos,
-loff_t len)
+loff_t len, unsigned int remap_flags)
 {
loff_t ret;
 
+   WARN_ON_ONCE(remap_flags & ~(RFR_SAME_DATA));
+
ret = mnt_want_write_file(dst_file);
if (ret)
return ret;
@@ -2038,7 +2040,7 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, 
loff_t src_pos,
}
 
ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
-   dst_pos, len, RFR_SAME_DATA);
+   dst_pos, len, remap_flags | RFR_SAME_DATA);
 out_drop_write:
mnt_drop_write_file(dst_file);
 
@@ -2106,7 +2108,8 @@ int vfs_dedupe_file_range(struct file *file, struct 
file_dedupe_range *same)
}
 
deduped = vfs_dedupe_file_range_one(file, off, dst_file,
-   info->dest_offset, len);
+   info->dest_offset, len,
+   0);
if (deduped == -EBADE)
info->status = FILE_DEDUPE_RANGE_DIFFERS;
else if (deduped < 0)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4acda4809027..d77b8d90d65e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1863,7 +1863,7 @@ extern int vfs_dedupe_file_range(struct file *file,
 struct file_dedupe_range *same);
 extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
struct file *dst_file, loff_t dst_pos,
-   loff_t len);
+   loff_t len, unsigned int remap_flags);
 
 
 struct super_operations {



[PATCH 12/25] vfs: pass remap flags to generic_remap_checks

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Pass the same remap flags to generic_remap_checks for consistency.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c|2 +-
 include/linux/fs.h |2 +-
 mm/filemap.c   |4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index bd5f8d724b13..5de5d102ef4d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1755,7 +1755,7 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
 
/* Check that we don't violate system file offset limits. */
ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
-   is_dedupe);
+   remap_flags);
if (ret)
return ret;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b67f108932a5..b59637b2f484 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2990,7 +2990,7 @@ extern int generic_file_readonly_mmap(struct file *, 
struct vm_area_struct *);
 extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
 extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
-   uint64_t *count, bool is_dedupe);
+   uint64_t *count, unsigned int remap_flags);
 extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 08ad210fee49..c34a89a35d5a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3001,7 +3001,7 @@ EXPORT_SYMBOL(generic_write_checks);
  */
 int generic_remap_checks(struct file *file_in, loff_t pos_in,
 struct file *file_out, loff_t pos_out,
-uint64_t *req_count, bool is_dedupe)
+uint64_t *req_count, unsigned int remap_flags)
 {
struct inode *inode_in = file_in->f_mapping->host;
struct inode *inode_out = file_out->f_mapping->host;
@@ -3023,7 +3023,7 @@ int generic_remap_checks(struct file *file_in, loff_t 
pos_in,
size_out = i_size_read(inode_out);
 
/* Dedupe requires both ranges to be within EOF. */
-   if (is_dedupe &&
+   if ((remap_flags & RFR_SAME_DATA) &&
(pos_in >= size_in || pos_in + count > size_in ||
 pos_out >= size_out || pos_out + count > size_out))
return -EINVAL;



[PATCH 13/25] vfs: make remap_file_range functions take and return bytes completed

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Change the remap_file_range functions to take a number of bytes to
operate upon and return the number of bytes they operated on.  This is a
requirement for allowing fs implementations to return short clone/dedupe
results to the user, which will enable us to obey resource limits in a
graceful manner.

A subsequent patch will enable copy_file_range to signal to the
->clone_file_range implementation that it can handle a short length,
which will be returned in the function's return value.  For now the
short return is not implemented anywhere so the behavior won't change --
either copy_file_range manages to clone the entire range or it tries an
alternative.

Neither clone ioctl can take advantage of this, alas.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 Documentation/filesystems/vfs.txt |6 ++---
 fs/btrfs/ctree.h  |6 ++---
 fs/btrfs/ioctl.c  |   13 ++
 fs/cifs/cifsfs.c  |6 ++---
 fs/ioctl.c|   10 +++-
 fs/nfs/nfs4file.c |6 ++---
 fs/nfsd/vfs.c |8 +-
 fs/ocfs2/file.c   |   16 ++---
 fs/ocfs2/refcounttree.c   |2 +-
 fs/ocfs2/refcounttree.h   |2 +-
 fs/overlayfs/copy_up.c|6 ++---
 fs/overlayfs/file.c   |   12 +
 fs/read_write.c   |   47 -
 fs/xfs/xfs_file.c |9 +--
 fs/xfs/xfs_reflink.c  |4 ++-
 fs/xfs/xfs_reflink.h  |2 +-
 include/linux/fs.h|   27 -
 mm/filemap.c  |2 +-
 18 files changed, 105 insertions(+), 79 deletions(-)


diff --git a/Documentation/filesystems/vfs.txt 
b/Documentation/filesystems/vfs.txt
index 2ec27203e4a6..393909585bd8 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -883,9 +883,9 @@ struct file_operations {
unsigned (*mmap_capabilities)(struct file *);
 #endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, 
loff_t, size_t, unsigned int);
-   int (*remap_file_range)(struct file *file_in, loff_t pos_in,
-   struct file *file_out, loff_t pos_out,
-   u64 len, unsigned int remap_flags);
+   loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
+  struct file *file_out, loff_t pos_out,
+  loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
 };
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 124a05662fc2..771a961d77ad 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3247,9 +3247,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page 
**pages,
  size_t num_pages, loff_t pos, size_t write_bytes,
  struct extent_state **cached);
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-int btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
-  struct file *file_out, loff_t pos_out, u64 len,
-  unsigned int remap_flags);
+loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bed5b8f9ec09..3e0aaca9e072 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4328,10 +4328,12 @@ static noinline int btrfs_clone_files(struct file 
*file, struct file *file_src,
return ret;
 }
 
-int btrfs_remap_file_range(struct file *src_file, loff_t off,
-   struct file *dst_file, loff_t destoff, u64 len,
+loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+   struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
 {
+   int ret;
+
if (!remap_check_flags(remap_flags, RFR_SAME_DATA))
return -EINVAL;
 
@@ -4349,10 +4351,11 @@ int btrfs_remap_file_range(struct file *src_file, 
loff_t off,
return -EINVAL;
}
 
-   return btrfs_extent_same(src, off, len, dst, destoff);
+   ret = btrfs_extent_same(src, off, len, dst, destoff);
+   } else {
+   ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
}
-
-   return btrfs_clone_files(dst_file, src_file, off, len, destoff);
+   return ret < 0 ? ret : len;
 }
 
 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 06b2587fcc77..816a1b52767e 100644
--- a/fs/cifs/cifsfs.c
+++ b/f

[PATCH 14/25] vfs: plumb RFR_* remap flags through the vfs clone functions

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Plumb a remap_flags argument through the {do,vfs}_clone_file_range
functions so that clone can take advantage of it.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/ioctl.c |2 +-
 fs/nfsd/vfs.c  |2 +-
 fs/overlayfs/copy_up.c |2 +-
 fs/overlayfs/file.c|6 +++---
 fs/read_write.c|   13 +
 include/linux/fs.h |4 ++--
 6 files changed, 17 insertions(+), 12 deletions(-)


diff --git a/fs/ioctl.c b/fs/ioctl.c
index 72537b68c272..505275ec5596 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -232,7 +232,7 @@ static long ioctl_file_clone(struct file *dst_file, 
unsigned long srcfd,
if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
goto fdput;
cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
- olen);
+ olen, 0);
if (cloned < 0)
ret = cloned;
else if (olen && cloned != olen)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ac6cb6101cbe..726fc5b2b27a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -543,7 +543,7 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 
src_pos, struct file *dst,
 {
loff_t cloned;
 
-   cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count);
+   cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
if (count && cloned != count)
cloned = -EINVAL;
return nfserrno(cloned < 0 ? cloned : 0);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 8750b7235516..5f82fece64a0 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -142,7 +142,7 @@ static int ovl_copy_up_data(struct path *old, struct path 
*new, loff_t len)
}
 
/* Try to use clone_file_range to clone up within the same fs */
-   cloned = do_clone_file_range(old_file, 0, new_file, 0, len);
+   cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
if (cloned == len)
goto out;
/* Couldn't clone, so now we try to copy the data */
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 177731b21bad..e5cc17281d0b 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -462,7 +462,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t 
pos_in,
 
case OVL_CLONE:
ret = vfs_clone_file_range(real_in.file, pos_in,
-  real_out.file, pos_out, len);
+  real_out.file, pos_out, len, flags);
break;
 
case OVL_DEDUPE:
@@ -512,8 +512,8 @@ static loff_t ovl_remap_file_range(struct file *file_in, 
loff_t pos_in,
 !ovl_inode_upper(file_inode(file_out))))
return -EPERM;
 
-   return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, 0,
-   op);
+   return ovl_copyfile(file_in, pos_in, file_out, pos_out, len,
+   remap_flags, op);
 }
 
 const struct file_operations ovl_file_operations = {
diff --git a/fs/read_write.c b/fs/read_write.c
index 461acd5fcc4a..b3f8b4a2bdfc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1842,12 +1842,15 @@ int generic_remap_file_range_touch(struct file *file, 
unsigned int remap_flags)
 EXPORT_SYMBOL(generic_remap_file_range_touch);
 
 loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
-  struct file *file_out, loff_t pos_out, loff_t len)
+  struct file *file_out, loff_t pos_out,
+  loff_t len, unsigned int remap_flags)
 {
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
loff_t ret;
 
+   WARN_ON_ONCE(remap_flags);
+
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
@@ -1878,7 +1881,7 @@ loff_t do_clone_file_range(struct file *file_in, loff_t 
pos_in,
return ret;
 
ret = file_in->f_op->remap_file_range(file_in, pos_in,
-   file_out, pos_out, len, 0);
+   file_out, pos_out, len, remap_flags);
if (ret < 0)
return ret;
 
@@ -1889,12 +1892,14 @@ loff_t do_clone_file_range(struct file *file_in, loff_t 
pos_in,
 EXPORT_SYMBOL(do_clone_file_range);
 
 loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
-   struct file *file_out, loff_t pos_out, loff_t len)
+   struct file *file_out, loff_t pos_out,
+   loff_t len, unsigned int remap_flags)
 {
loff_t ret;
 
file_start_write(file_out);
-   ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);

[PATCH 09/25] vfs: rename clone_verify_area to remap_verify_area

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Since we use clone_verify_area for both clone and dedupe range checks,
rename the function to make it clear that it's for both.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c |   10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index ebf62ffca57b..60cdfb576d81 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1686,7 +1686,7 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t 
__user *, off_in,
return ret;
 }
 
-static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool 
write)
+static int remap_verify_area(struct file *file, loff_t pos, u64 len, bool 
write)
 {
struct inode *inode = file_inode(file);
 
@@ -1839,11 +1839,11 @@ int do_clone_file_range(struct file *file_in, loff_t 
pos_in,
if (!file_in->f_op->remap_file_range)
return -EOPNOTSUPP;
 
-   ret = clone_verify_area(file_in, pos_in, len, false);
+   ret = remap_verify_area(file_in, pos_in, len, false);
if (ret)
return ret;
 
-   ret = clone_verify_area(file_out, pos_out, len, true);
+   ret = remap_verify_area(file_out, pos_out, len, true);
if (ret)
return ret;
 
@@ -1976,7 +1976,7 @@ int vfs_dedupe_file_range_one(struct file *src_file, 
loff_t src_pos,
if (ret)
return ret;
 
-   ret = clone_verify_area(dst_file, dst_pos, len, true);
+   ret = remap_verify_area(dst_file, dst_pos, len, true);
if (ret < 0)
goto out_drop_write;
 
@@ -2038,7 +2038,7 @@ int vfs_dedupe_file_range(struct file *file, struct 
file_dedupe_range *same)
if (!S_ISREG(src->i_mode))
goto out;
 
-   ret = clone_verify_area(file, off, len, false);
+   ret = remap_verify_area(file, off, len, false);
if (ret < 0)
goto out;
ret = 0;



[PATCH 06/25] vfs: skip zero-length dedupe requests

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Don't bother calling the filesystem for a zero-length dedupe request;
we can return zero and exit.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
Reviewed-by: Amir Goldstein 
---
 fs/read_write.c |5 +
 1 file changed, 5 insertions(+)


diff --git a/fs/read_write.c b/fs/read_write.c
index 8498991e2f33..48d83231968f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1996,6 +1996,11 @@ int vfs_dedupe_file_range_one(struct file *src_file, 
loff_t src_pos,
if (!dst_file->f_op->dedupe_file_range)
goto out_drop_write;
 
+   if (len == 0) {
+   ret = 0;
+   goto out_drop_write;
+   }
+
ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
dst_file, dst_pos, len);
 out_drop_write:



[PATCH 08/25] vfs: rename vfs_clone_file_prep to be more descriptive

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

vfs_clone_file_prep is a generic function to be called by filesystem
implementations only.  Rename the prefix to generic_ and make it
clearer that it applies to remap operations, not just clones.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/ocfs2/refcounttree.c |2 +-
 fs/read_write.c |8 
 fs/xfs/xfs_reflink.c|2 +-
 include/linux/fs.h  |6 +++---
 4 files changed, 9 insertions(+), 9 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 19e03936c5e1..36c56dfbe485 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4850,7 +4850,7 @@ int ocfs2_reflink_remap_range(struct file *file_in,
(OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
goto out_unlock;
 
-   ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out,
+   ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
&len, is_dedupe);
if (ret <= 0)
goto out_unlock;
diff --git a/fs/read_write.c b/fs/read_write.c
index 4d6855671bf3..ebf62ffca57b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1717,9 +1717,9 @@ static int clone_verify_area(struct file *file, loff_t 
pos, u64 len, bool write)
  * Returns: 0 for "nothing to clone", 1 for "something to clone", or
  * the usual negative error code.
  */
-int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
-   struct file *file_out, loff_t pos_out,
-   u64 *len, bool is_dedupe)
+int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ u64 *len, bool is_dedupe)
 {
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -1809,7 +1809,7 @@ int vfs_clone_file_prep(struct file *file_in, loff_t 
pos_in,
 
return 1;
 }
-EXPORT_SYMBOL(vfs_clone_file_prep);
+EXPORT_SYMBOL(generic_remap_file_range_prep);
 
 int do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 281d5f53f2ec..a7757a128a78 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1326,7 +1326,7 @@ xfs_reflink_remap_prep(
if (IS_DAX(inode_in) || IS_DAX(inode_out))
goto out_unlock;
 
-   ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out,
+   ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
len, is_dedupe);
if (ret <= 0)
goto out_unlock;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11fe36576d34..686905be04c0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1844,9 +1844,9 @@ extern ssize_t vfs_readv(struct file *, const struct 
iovec __user *,
unsigned long, loff_t *, rwf_t);
 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
   loff_t, size_t, unsigned int);
-extern int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
-  struct file *file_out, loff_t pos_out,
-  u64 *count, bool is_dedupe);
+extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+struct file *file_out, loff_t pos_out,
+u64 *count, bool is_dedupe);
 extern int do_clone_file_range(struct file *file_in, loff_t pos_in,
   struct file *file_out, loff_t pos_out, u64 len);
 extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,



[PATCH 07/25] vfs: combine the clone and dedupe into a single remap_file_range

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Combine the clone_file_range and dedupe_file_range operations into a
single remap_file_range file operation dispatch since they're
fundamentally the same operation.  The differences between the two can
be made in the prep functions.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 Documentation/filesystems/vfs.txt |   12 --
 fs/btrfs/ctree.h  |8 ++-
 fs/btrfs/file.c   |3 +-
 fs/btrfs/ioctl.c  |   45 +++--
 fs/cifs/cifsfs.c  |   22 +++---
 fs/nfs/nfs4file.c |   10 ++--
 fs/ocfs2/file.c   |   24 +++-
 fs/overlayfs/file.c   |   30 ++---
 fs/read_write.c   |   18 +++
 fs/xfs/xfs_file.c |   23 ++-
 include/linux/fs.h|   27 +++---
 11 files changed, 116 insertions(+), 106 deletions(-)


diff --git a/Documentation/filesystems/vfs.txt 
b/Documentation/filesystems/vfs.txt
index a6c6a8af48a2..2ec27203e4a6 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -883,8 +883,9 @@ struct file_operations {
unsigned (*mmap_capabilities)(struct file *);
 #endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, 
loff_t, size_t, unsigned int);
-   int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, 
u64);
-   int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, 
u64);
+   int (*remap_file_range)(struct file *file_in, loff_t pos_in,
+   struct file *file_out, loff_t pos_out,
+   u64 len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
 };
 
@@ -960,11 +961,8 @@ otherwise noted.
 
   copy_file_range: called by the copy_file_range(2) system call.
 
-  clone_file_range: called by the ioctl(2) system call for FICLONERANGE and
-   FICLONE commands.
-
-  dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE
-   command.
+  remap_file_range: called by the ioctl(2) system call for FICLONERANGE and
+   FICLONE and FIDEDUPERANGE commands to remap file ranges.
 
   fadvise: possibly called by the fadvise64() system call.
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..124a05662fc2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3218,9 +3218,6 @@ void btrfs_get_block_group_info(struct list_head 
*groups_list,
struct btrfs_ioctl_space_info *space);
 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
   struct btrfs_ioctl_balance_args *bargs);
-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
-   struct file *dst_file, loff_t dst_loff,
-   u64 olen);
 
 /* file.c */
 int __init btrfs_auto_defrag_init(void);
@@ -3250,8 +3247,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page 
**pages,
  size_t num_pages, loff_t pos, size_t write_bytes,
  struct extent_state **cached);
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
-int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
-  struct file *file_out, loff_t pos_out, u64 len);
+int btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+  struct file *file_out, loff_t pos_out, u64 len,
+  unsigned int remap_flags);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2be00e873e92..9a963f061393 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3269,8 +3269,7 @@ const struct file_operations btrfs_file_operations = {
 #ifdef CONFIG_COMPAT
.compat_ioctl   = btrfs_compat_ioctl,
 #endif
-   .clone_file_range = btrfs_clone_file_range,
-   .dedupe_file_range = btrfs_dedupe_file_range,
+   .remap_file_range = btrfs_remap_file_range,
 };
 
 void __cold btrfs_auto_defrag_exit(void)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d60b6caf09e8..bed5b8f9ec09 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3627,26 +3627,6 @@ static int btrfs_extent_same(struct inode *src, u64 
loff, u64 olen,
return ret;
 }
 
-int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
-   struct file *dst_file, loff_t dst_loff,
-   u64 olen)
-{
-   struct inode *src = file_inode(src_file);
-   struct inode *dst = file_inode(dst_file);
-   u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
-
-   if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
-   /*
-* Btrfs does not support blocks

[PATCH 10/25] vfs: create generic_remap_file_range_touch to update inode metadata

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Create a new VFS helper to handle inode metadata updates when remapping
into a file.  If the operation can possibly alter the file contents, we
must update the ctime and mtime and remove security privileges, just
like we do for regular file writes.  Wire up ocfs2 to ensure consistent
behavior.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 fs/ocfs2/refcounttree.c |8 
 fs/read_write.c |   24 
 fs/xfs/xfs_reflink.c|   29 +++--
 include/linux/fs.h  |1 +
 4 files changed, 40 insertions(+), 22 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 36c56dfbe485..ee1ed11379b3 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4855,6 +4855,14 @@ int ocfs2_reflink_remap_range(struct file *file_in,
if (ret <= 0)
goto out_unlock;
 
+   /*
+* Update inode timestamps and remove security privileges before we
+* take the ilock.
+*/
+   ret = generic_remap_file_range_touch(file_out, is_dedupe);
+   if (ret)
+   goto out_unlock;
+
/* Lock out changes to the allocation maps and remap. */
down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
if (!same_inode)
diff --git a/fs/read_write.c b/fs/read_write.c
index 60cdfb576d81..b233fe019fae 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1811,6 +1811,30 @@ int generic_remap_file_range_prep(struct file *file_in, 
loff_t pos_in,
 }
 EXPORT_SYMBOL(generic_remap_file_range_prep);
 
+/* Update inode timestamps and remove security privileges when remapping. */
+int generic_remap_file_range_touch(struct file *file, bool is_dedupe)
+{
+   int ret;
+
+   /* If can't alter the file contents, we're done. */
+   if (is_dedupe)
+   return 0;
+
+   /* Update the timestamps, since we can alter file contents. */
+   if (!(file->f_mode & FMODE_NOCMTIME)) {
+   ret = file_update_time(file);
+   if (ret)
+   return ret;
+   }
+
+   /*
+* Clear the security bits if the process is not being run by root.
+* This keeps people from modifying setuid and setgid binaries.
+*/
+   return file_remove_privs(file);
+}
+EXPORT_SYMBOL(generic_remap_file_range_touch);
+
 int do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len)
 {
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index a7757a128a78..99f2ea4fcaba 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1371,28 +1371,13 @@ xfs_reflink_remap_prep(
truncate_inode_pages_range(&inode_out->i_data, pos_out,
   PAGE_ALIGN(pos_out + *len) - 1);
 
-   /* If we're altering the file contents... */
-   if (!is_dedupe) {
-   /*
-* ...update the timestamps (which will grab the ilock again
-* from xfs_fs_dirty_inode, so we have to call it before we
-* take the ilock).
-*/
-   if (!(file_out->f_mode & FMODE_NOCMTIME)) {
-   ret = file_update_time(file_out);
-   if (ret)
-   goto out_unlock;
-   }
-
-   /*
-* ...clear the security bits if the process is not being run
-* by root.  This keeps people from modifying setuid and setgid
-* binaries.
-*/
-   ret = file_remove_privs(file_out);
-   if (ret)
-   goto out_unlock;
-   }
+   /*
+* Update inode timestamps and remove security privileges before we
+* take the ilock.
+*/
+   ret = generic_remap_file_range_touch(file_out, is_dedupe);
+   if (ret)
+   goto out_unlock;
 
return 1;
 out_unlock:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 686905be04c0..91fd3c77763b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1847,6 +1847,7 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t 
, struct file *,
 extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 struct file *file_out, loff_t pos_out,
 u64 *count, bool is_dedupe);
+extern int generic_remap_file_range_touch(struct file *file, bool is_dedupe);
 extern int do_clone_file_range(struct file *file_in, loff_t pos_in,
   struct file *file_out, loff_t pos_out, u64 len);
 extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,



[PATCH 05/25] vfs: avoid problematic remapping requests into partial EOF block

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

A deduplication data corruption is exposed by fstests generic/505 on
XFS. It is caused by extending the block match range to include the
partial EOF block, but then allowing unknown data beyond EOF to be
considered a "match" to data in the destination file because the
comparison is only made to the end of the source file. This corrupts the
destination file when the source extent is shared with it.

The VFS remapping prep functions only support whole block dedupe, but
we still need to appear to support whole file dedupe correctly.  Hence
if the dedupe request includes the last block of the source file, don't
include it in the actual dedupe operation. If the rest of the range
dedupes successfully, then reject the entire request.  A subsequent
patch will enable us to shorten dedupe requests correctly.

When reflinking sub-file ranges, a data corruption can occur when the
source file range includes a partial EOF block. This shares the unknown
data beyond EOF into the second file at a position inside EOF, exposing
stale data in the second file.

If the reflink request includes the last block of the source file, only
proceed with the reflink operation if it lands at or past the
destination file's current EOF. If it lands within the destination file
EOF, reject the entire request with -EINVAL and make the caller go the
hard way.  A subsequent patch will enable us to shorten reflink requests
correctly.

Signed-off-by: Darrick J. Wong 
---
 fs/read_write.c |   22 ++
 1 file changed, 22 insertions(+)


diff --git a/fs/read_write.c b/fs/read_write.c
index d6e8e242a15f..8498991e2f33 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1723,6 +1723,7 @@ int vfs_clone_file_prep(struct file *file_in, loff_t 
pos_in,
 {
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
+   u64 blkmask = i_blocksize(inode_in) - 1;
bool same_inode = (inode_in == inode_out);
int ret;
 
@@ -1785,6 +1786,27 @@ int vfs_clone_file_prep(struct file *file_in, loff_t 
pos_in,
return -EBADE;
}
 
+   /* Are we doing a partial EOF block remapping of some kind? */
+   if (*len & blkmask) {
+   /*
+* If the dedupe data matches, don't try to dedupe the partial
+* EOF block.
+*
+* If the user is attempting to remap a partial EOF block and
+* it's inside the destination EOF then reject it.
+*
+* We don't support shortening requests, so we can only reject
+* them.
+*/
+   if (is_dedupe)
+   ret = -EBADE;
+   else if (pos_out + *len < i_size_read(inode_out))
+   ret = -EINVAL;
+
+   if (ret)
+   return ret;
+   }
+
return 1;
 }
 EXPORT_SYMBOL(vfs_clone_file_prep);



[PATCH 03/25] vfs: check file ranges before cloning files

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Move the file range checks from vfs_clone_file_prep into a separate
generic_remap_checks function so that all the checks are collected in a
central location.  This forms the basis for adding more checks from
generic_write_checks that will make cloning's input checking more
consistent with write input checking.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
Reviewed-by: Amir Goldstein 
---
 fs/ocfs2/refcounttree.c |2 +
 fs/read_write.c |   55 +
 fs/xfs/xfs_reflink.c|2 +
 include/linux/fs.h  |9 --
 mm/filemap.c|   69 +++
 5 files changed, 90 insertions(+), 47 deletions(-)


diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7a5ee145c733..19e03936c5e1 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4850,7 +4850,7 @@ int ocfs2_reflink_remap_range(struct file *file_in,
(OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
goto out_unlock;
 
-   ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
+   ret = vfs_clone_file_prep(file_in, pos_in, file_out, pos_out,
&len, is_dedupe);
if (ret <= 0)
goto out_unlock;
diff --git a/fs/read_write.c b/fs/read_write.c
index 260797b01851..d6e8e242a15f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1717,13 +1717,12 @@ static int clone_verify_area(struct file *file, loff_t 
pos, u64 len, bool write)
  * Returns: 0 for "nothing to clone", 1 for "something to clone", or
  * the usual negative error code.
  */
-int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
-  struct inode *inode_out, loff_t pos_out,
-  u64 *len, bool is_dedupe)
+int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
+   struct file *file_out, loff_t pos_out,
+   u64 *len, bool is_dedupe)
 {
-   loff_t bs = inode_out->i_sb->s_blocksize;
-   loff_t blen;
-   loff_t isize;
+   struct inode *inode_in = file_inode(file_in);
+   struct inode *inode_out = file_inode(file_out);
bool same_inode = (inode_in == inode_out);
int ret;
 
@@ -1740,10 +1739,10 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
 
-   isize = i_size_read(inode_in);
-
/* Zero length dedupe exits immediately; reflink goes to EOF. */
if (*len == 0) {
+   loff_t isize = i_size_read(inode_in);
+
if (is_dedupe || pos_in == isize)
return 0;
if (pos_in > isize)
@@ -1751,36 +1750,11 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
*len = isize - pos_in;
}
 
-   /* Ensure offsets don't wrap and the input is inside i_size */
-   if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
-   pos_in + *len > isize)
-   return -EINVAL;
-
-   /* Don't allow dedupe past EOF in the dest file */
-   if (is_dedupe) {
-   loff_t  disize;
-
-   disize = i_size_read(inode_out);
-   if (pos_out >= disize || pos_out + *len > disize)
-   return -EINVAL;
-   }
-
-   /* If we're linking to EOF, continue to the block boundary. */
-   if (pos_in + *len == isize)
-   blen = ALIGN(isize, bs) - pos_in;
-   else
-   blen = *len;
-
-   /* Only reflink if we're aligned to block boundaries */
-   if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
-   !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
-   return -EINVAL;
-
-   /* Don't allow overlapped reflink within the same file */
-   if (same_inode) {
-   if (pos_out + blen > pos_in && pos_out < pos_in + blen)
-   return -EINVAL;
-   }
+   /* Check that we don't violate system file offset limits. */
+   ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
+   is_dedupe);
+   if (ret)
+   return ret;
 
/* Wait for the completion of any pending IOs on both files */
inode_dio_wait(inode_in);
@@ -1813,7 +1787,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
 
return 1;
 }
-EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
+EXPORT_SYMBOL(vfs_clone_file_prep);
 
 int do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, u64 len)
@@ -1851,9 +1825,6 @@ int do_clone_file_range(struct file *file_in, loff_t 
pos_in,
if (ret)
   

[PATCH 02/25] vfs: vfs_clone_file_prep_inodes should return EINVAL for a clone from beyond EOF

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

vfs_clone_file_prep_inodes cannot return 0 if it is asked to remap from
a zero byte file because that's what btrfs does.

Signed-off-by: Darrick J. Wong 
---
 fs/read_write.c |3 ---
 1 file changed, 3 deletions(-)


diff --git a/fs/read_write.c b/fs/read_write.c
index 8a2737f0d61d..260797b01851 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1740,10 +1740,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
 
-   /* Are we going all the way to the end? */
isize = i_size_read(inode_in);
-   if (isize == 0)
-   return 0;
 
/* Zero length dedupe exits immediately; reflink goes to EOF. */
if (*len == 0) {



[PATCH 04/25] vfs: strengthen checking of file range inputs to generic_remap_checks

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

File range remapping, if allowed to run past the destination file's EOF,
is an optimization on a regular file write.  Regular file writes that
extend the file length are subject to various constraints which are not
checked by range cloning.

This is a correctness problem because we're never allowed to touch
ranges that the page cache can't support (s_maxbytes); we're not
supposed to deal with large offsets (MAX_NON_LFS) if O_LARGEFILE isn't
set; and we must obey resource limits (RLIMIT_FSIZE).

Therefore, add these checks to the new generic_remap_checks function so
that we curtail unexpected behavior.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Amir Goldstein 
---
 mm/filemap.c |   91 ++
 1 file changed, 59 insertions(+), 32 deletions(-)


diff --git a/mm/filemap.c b/mm/filemap.c
index 47e6bfd45a91..08ad210fee49 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2915,6 +2915,49 @@ struct page *read_cache_page_gfp(struct address_space 
*mapping,
 }
 EXPORT_SYMBOL(read_cache_page_gfp);
 
+static int generic_access_check_limits(struct file *file, loff_t pos,
+  loff_t *count)
+{
+   struct inode *inode = file->f_mapping->host;
+
+   /* Don't exceed the LFS limits. */
+   if (unlikely(pos + *count > MAX_NON_LFS &&
+   !(file->f_flags & O_LARGEFILE))) {
+   if (pos >= MAX_NON_LFS)
+   return -EFBIG;
+   *count = min(*count, (loff_t)MAX_NON_LFS - pos);
+   }
+
+   /*
+* Don't operate on ranges the page cache doesn't support.
+*
+* If we have written data it becomes a short write.  If we have
+* exceeded without writing data we send a signal and return EFBIG.
+* Linus frestrict idea will clean these up nicely..
+*/
+   if (unlikely(pos >= inode->i_sb->s_maxbytes))
+   return -EFBIG;
+
+   *count = min(*count, inode->i_sb->s_maxbytes - pos);
+   return 0;
+}
+
+static int generic_write_check_limits(struct file *file, loff_t pos,
+ loff_t *count)
+{
+   unsigned long limit = rlimit(RLIMIT_FSIZE);
+
+   if (limit != RLIM_INFINITY) {
+   if (pos >= limit) {
+   send_sig(SIGXFSZ, current, 0);
+   return -EFBIG;
+   }
+   *count = min(*count, (loff_t)limit - pos);
+   }
+
+   return generic_access_check_limits(file, pos, count);
+}
+
 /*
  * Performs necessary checks before doing a write
  *
@@ -2926,8 +2969,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, 
struct iov_iter *from)
 {
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
-   unsigned long limit = rlimit(RLIMIT_FSIZE);
-   loff_t pos;
+   loff_t count;
+   int ret;
 
if (!iov_iter_count(from))
return 0;
@@ -2936,40 +2979,15 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, 
struct iov_iter *from)
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
 
-   pos = iocb->ki_pos;
-
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EINVAL;
 
-   if (limit != RLIM_INFINITY) {
-   if (iocb->ki_pos >= limit) {
-   send_sig(SIGXFSZ, current, 0);
-   return -EFBIG;
-   }
-   iov_iter_truncate(from, limit - (unsigned long)pos);
-   }
+   count = iov_iter_count(from);
+   ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+   if (ret)
+   return ret;
 
-   /*
-* LFS rule
-*/
-   if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
-   !(file->f_flags & O_LARGEFILE))) {
-   if (pos >= MAX_NON_LFS)
-   return -EFBIG;
-   iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
-   }
-
-   /*
-* Are we about to exceed the fs block limit ?
-*
-* If we have written data it becomes a short write.  If we have
-* exceeded without writing data we send a signal and return EFBIG.
-* Linus frestrict idea will clean these up nicely..
-*/
-   if (unlikely(pos >= inode->i_sb->s_maxbytes))
-   return -EFBIG;
-
-   iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
+   iov_iter_truncate(from, count);
return iov_iter_count(from);
 }
 EXPORT_SYMBOL(generic_write_checks);
@@ -2991,6 +3009,7 @@ int generic_remap_checks(struct file *file_in, loff_t 
pos_in,
uint64_t bcount;
loff_t size_in, size_out;
loff_t bs = inode_out->i_sb-&

[PATCH v3 00/25] fs: fixes for serious clone/dedupe problems

2018-10-10 Thread Darrick J. Wong
Hi all,

Dave, Eric, and I have been chasing a stale data exposure bug in the XFS
reflink implementation, and tracked it down to reflink forgetting to do
some of the file-extending activities that must happen for regular
writes.

We then started auditing the clone, dedupe, and copyfile code and
realized that from a file contents perspective, clonerange isn't any
different from a regular file write.  Unfortunately, we also noticed
that *unlike* a regular write, clonerange skips a ton of overflow
checks, such as validating the ranges against s_maxbytes, MAX_NON_LFS,
and RLIMIT_FSIZE.  We also observed that cloning into a file did not
strip security privileges (suid, capabilities) like a regular write
would.  I also noticed that xfs and ocfs2 need to dump the page cache
before remapping blocks, not after.

In fixing the range checking problems I also realized that both dedupe
and copyfile tell userspace how much of the requested operation was
acted upon.  Since the range validation can shorten a clone request (or
we can ENOSPC midway through), we might as well plumb the short
operation reporting back through the VFS indirection code to userspace.

So, here's the whole giant pile of patches[1] that fix all the problems.
This branch is against 4.19-rc7 with Dave Chinner's XFS for-next branch.
The patch "generic: test reflink side effects" recently sent to fstests
exercises the fixes in this series.  Tests are in [2].

--D

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git/log/?h=djwong-devel
[2] 
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/log/?h=djwong-devel


[PATCH 01/25] xfs: add a per-xfs trace_printk macro

2018-10-10 Thread Darrick J. Wong
From: Darrick J. Wong 

Add a "xfs_tprintk" macro so that developers can use trace_printk to
print out arbitrary debugging information with the XFS device name
attached to the trace output.

Signed-off-by: Darrick J. Wong 
---
 fs/xfs/xfs_error.h |6 ++
 1 file changed, 6 insertions(+)


diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 246d3e989c6c..5caa8bdf6c38 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -76,6 +76,11 @@ extern int xfs_errortag_set(struct xfs_mount *mp, unsigned 
int error_tag,
unsigned int tag_value);
 extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
 extern int xfs_errortag_clearall(struct xfs_mount *mp);
+
+/* trace printk version of xfs_err and friends */
+#define xfs_tprintk(mp, fmt, args...) \
+   trace_printk("dev %d:%d " fmt, MAJOR((mp)->m_super->s_dev), \
+   MINOR((mp)->m_super->s_dev), ##args)
 #else
 #define xfs_errortag_init(mp)  (0)
 #define xfs_errortag_del(mp)
@@ -83,6 +88,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
 #define xfs_errortag_set(mp, tag, val) (ENOSYS)
 #define xfs_errortag_add(mp, tag)  (ENOSYS)
 #define xfs_errortag_clearall(mp)  (ENOSYS)
+#define xfs_tprintk(mp, fmt, args...)  do { } while (0)
 #endif /* DEBUG */
 
 /*



Re: cross-fs copy support

2018-10-02 Thread Darrick J. Wong
On Tue, Oct 02, 2018 at 10:15:44AM +0200, David Sterba wrote:
> On Mon, Oct 01, 2018 at 01:51:09PM -0600, Andreas Dilger wrote:
> > > Yes, I would expect there to be problems with his modified kernel
> > > for a filesystem that supports clone_file_range, because
> > > vfs_copy_file_range() will clone if possible, and this should fail across
> > > filesystems.
> > > 
> > > In general, though, I don't know for sure why we don't fall back to
> > > do_splice_direct() across filesystems, although the filesystems that
> > > implement their own ->copy_file_range ops may have their own,
> > > further restrictions within their implementations.
> > > 
> > > This call /is/ documented in the manpage as only being valid for
> > > files on the same filesystem, though:
> > > http://man7.org/linux/man-pages/man2/copy_file_range.2.html
> > 
> > There was a patch to allow cross-mount copy for NFS, but it hasn't landed
> > yet.
> 
> I found https://marc.info/?l=linux-nfs&m=144138779721907&w=2 that lifts
> the VFS check (part of a series that can't be easily linked to).
> 
> The lack of cross-mount reflink (based on the copy_file_range) is often
> confusing users, there are common setups that mount subvolumes
> separately and reflinking between them would require mount of the
> toplevel subvolume.
> 
> If there are 2 in-kernel users of the relaxed cross-mount copy, I think
> this would help to push the series forward.

I don't have any objection to cross-mountpoint same-filesystem clones,
though obviously we need to all agree that from now on the vfs /does/
support certain IO operations across mountpoints.

(I haven't any opinion on cross-filesystem copies, as XFS is incapable
of such things.)

--D


Re: [patch] file dedupe (and maybe clone) data corruption (was Re: [PATCH] generic: test for deduplication between different files)

2018-08-20 Thread Darrick J. Wong
On Mon, Aug 20, 2018 at 11:09:32AM +1000, Dave Chinner wrote:
> [cc linux-fsdevel now, too]
> 
> On Mon, Aug 20, 2018 at 09:11:26AM +1000, Dave Chinner wrote:
> > [cc linux-...@vger.kernel.org]
> > 
> > On Fri, Aug 17, 2018 at 09:39:24AM +0100, fdman...@kernel.org wrote:
> > > From: Filipe Manana 
> > > 
> > > Test that deduplication of an entire file that has a size that is not
> > > aligned to the filesystem's block size into a different file does not
> > > corrupt the destination's file data.
> 
> Ok, I've looked at this now. My first question is where did all the
> magic offsets in this test come from? i.e. how was this bug
> found and who is it affecting?
> 
> > > This test is motivated by a bug found in Btrfs which is fixed by the
> > > following patch for the linux kernel:
> > > 
> > >   "Btrfs: fix data corruption when deduplicating between different files"
> > > 
> > > XFS also fails this test, at least as of linux kernel 4.18-rc7, exactly
> > > with the same corruption as in Btrfs - some bytes of a block get replaced
> > > with zeroes after the deduplication.
> > 
> > Filipe, in future can please report XFS bugs you find to the XFS
> > list the moment you find them. We shouldn't ever find out about a
> > data corruption bug we need to fix via a "oh, by the way" comment in
> > a commit message for a regression test
> 
> This becomes much more relevant because of what I've just found
> 
> .
> 
> > > +# The first byte with a value of 0xae starts at an offset (2518890) 
> > > which is not
> > > +# a multiple of the block size.
> > > +$XFS_IO_PROG -f \
> > > + -c "pwrite -S 0x6b 0 2518890" \
> > > + -c "pwrite -S 0xae 2518890 102398" \
> > > + $SCRATCH_MNT/foo | _filter_xfs_io
> > > +
> > > +# Create a second file with a length not aligned to the block size, 
> > > whose bytes
> > > +# all have the value 0x6b, so that its extent(s) can be deduplicated 
> > > with the
> > > +# first file.
> > > +$XFS_IO_PROG -f -c "pwrite -S 0x6b 0 557771" $SCRATCH_MNT/bar | 
> > > _filter_xfs_io
> > > +
> > > +# The file is filled with bytes having the value 0x6b from offset 0 to 
> > > offset
> > > +# 2518889 and with the value 0xae from offset 2518890 to offset 2621287.
> > > +echo "File content before deduplication:"
> > > +od -t x1 $SCRATCH_MNT/foo
> 
> Please use "od -Ad -t x1 " so the file offsets reported by od
> match the offsets used in the test (i.e. in decimal, not octal).
> 
> > > +
> > > +# Now deduplicate the entire second file into a range of the first file 
> > > that
> > > +# also has all bytes with the value 0x6b. The destination range's end 
> > > offset
> > > +# must not be aligned to the block size and must be less than the offset 
> > > of
> > > +# the first byte with the value 0xae (byte at offset 2518890).
> > > +$XFS_IO_PROG -c "dedupe $SCRATCH_MNT/bar 0 1957888 557771" 
> > > $SCRATCH_MNT/foo \
> > > + | _filter_xfs_io
> 
> Ok, now it gets fun. dedupe to non-block aligned ranges is supposed
> to be rejected by the kernel in vfs_clone_file_prep_inodes(). i.e
> this check:
> 
> /* Only reflink if we're aligned to block boundaries */
> if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
> !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
> return -EINVAL;
> 
> And it's pretty clear that a length of 557771 is not block aligned
> (being an odd number).
> 
> So why was this dedupe request even accepted by the kernel? Well,
> I think there's a bug in the check just above this:
> 
> /* If we're linking to EOF, continue to the block boundary. */
> if (pos_in + *len == isize)
> blen = ALIGN(isize, bs) - pos_in;
> else
> blen = *len;
> 
> basically, when the "in" file dedupe/reflink range is to EOF, it
> expands the range to include /all/ of the block that contains the
> EOF byte. IOWs, it now requests us to dedupe /undefined data beyond
> EOF/. But when we go to compare the data in these ranges, it
> truncates the comparison to the length that the user asked for
> (i.e. *len) and not the extended block length.
> 
> IOWs, it doesn't compare the bytes beyond EOF in the source block to
> the data in the destination block it would replace, and so doesn't
> fail the compare like it should.
> 
> And, well, btrfs has the same bug. extent_same_check_offsets()
> extends the range for alignment so it passes alignment checks, but
> then /explicitly/ uses the original length for the data compare
> and dedupe. i.e:
> 
>/* pass original length for comparison so we stay within i_size */
> ret = btrfs_cmp_data(olen, cmp);
> if (ret == 0)
> ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
> 
> This is what we should see if someone tried to dedupe the EOF block
> of a file:
> 
> generic/505 - output mismatch (see 
> /home/dave/src/xfstests-dev/results//xfs/generic/505.out.bad)
> --- tests/generic/505.out   2018-08-20 

Re: [RESEND][PATCH v4 0/2] vfs: better dedupe permission check

2018-07-17 Thread Darrick J. Wong
On Tue, Jul 17, 2018 at 12:09:04PM -0700, Mark Fasheh wrote:
> Hi Al,
> 
> The following patches fix a couple of issues with the permission check
> we do in vfs_dedupe_file_range(). I sent them out for a few times now,
> a changelog is attached. If they look ok to you, I'd appreciate them
> being pushed upstream.
> 
> You can get them from git if you like:
> 
> git pull https://github.com/markfasheh/linux dedupe-perms
> 
> I also have a set of patches against 4.17 if you prefer. The code and
> testing are identical:
> 
> git pull https://github.com/markfasheh/linux dedupe-perms-v4.17
> 
> 
> The first patch expands our check to allow dedupe of a file if the
> user owns it or otherwise would be allowed to write to it.
> 
> Current behavior is that we'll allow dedupe only if:
> 
> - the user is an admin (root)
> - the user has the file open for write
> 
> This makes it impossible for a user to dedupe their own file set
> unless they do it as root, or ensure that all files have write
> permission. There's a couple of duperemove bugs open for this:
> 
> https://github.com/markfasheh/duperemove/issues/129
> https://github.com/markfasheh/duperemove/issues/86
> 
> The other problem we have is also related to forcing the user to open
> target files for write - A process trying to exec a file currently
> being deduped gets ETXTBUSY. The answer (as above) is to allow them to
> open the targets ro - root can already do this. There was a patch from
> Adam Borowski to fix this back in 2016:
> 
> https://lkml.org/lkml/2016/7/17/130
> 
> which I have incorporated into my changes.
> 
> 
> The 2nd patch fixes our return code for permission denied to be
> EPERM. For some reason we're returning EINVAL - I think that's
> probably my fault. At any rate, we need to be returning something
> descriptive of the actual problem, otherwise callers see EINVAL and
> can't really make a valid determination of what's gone wrong.
> 
> This has also popped up in duperemove, mostly in the form of cryptic
> error messages. Because this is a code returned to userspace, I did
> check the other users of extent-same that I could find. Both 'bees'
> and 'rust-btrfs' do the same as duperemove and simply report the error
> (as they should).
> 
> Please apply.
> 
> Thanks,
>   --Mark
> 
> Changes from V3 to V4:
> - Add a patch (below) to ioctl_fideduperange.2 explaining our
>   changes. I will send this patch once the kernel update is
>   accepted. Thanks to Darrick Wong for this suggestion.
> - V3 discussion: https://www.spinics.net/lists/linux-btrfs/msg79135.html
> 
> Changes from V2 to V3:
> - Return bool from allow_file_dedupe
> - V2 discussion: https://www.spinics.net/lists/linux-btrfs/msg78421.html
> 
> Changes from V1 to V2:
> - Add inode_permission check as suggested by Adam Borowski
> - V1 discussion: https://marc.info/?l=linux-xfs&m=152606684017965&w=2
> 
> 
> From: Mark Fasheh 
> 
> [PATCH] ioctl_fideduperange.2: clarify permission requirements
> 
> dedupe permission checks were recently relaxed - update our man page to
> reflect those changes.
> 
> Signed-off-by: Mark Fasheh 
> ---
>  man2/ioctl_fideduperange.2 | 8 +---

Mmm, man page update, thank you for editing the documentation too!

Please cc linux-api and Michael Kerrisk so this can go upstream.

>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/man2/ioctl_fideduperange.2 b/man2/ioctl_fideduperange.2
> index 84d20a276..7dea0323d 100644
> --- a/man2/ioctl_fideduperange.2
> +++ b/man2/ioctl_fideduperange.2
> @@ -105,9 +105,11 @@ The field
>  must be zero.
>  During the call,
>  .IR src_fd
> -must be open for reading and
> +must be open for reading.
>  .IR dest_fd
> -must be open for writing.
> +can be open for writing, or reading. If

Manpages usually start each new sentence on its own line (though I defer
to mkerrisk on that).

> +.IR dest_fd
> +is open for reading, the user should be have write access to the file.

"...the user must have write access..."

>  The combined size of the struct
>  .IR file_dedupe_range
>  and the struct
> @@ -185,8 +187,8 @@ This can appear if the filesystem does not support 
> deduplicating either file
>  descriptor, or if either file descriptor refers to special inodes.
>  .TP
>  .B EPERM
> +This will be returned if the user lacks permission to dedupe the file 
> referenced by
>  .IR dest_fd
> -is immutable.

(Did the period fall off the end of the sentence here?  I am bad at
reading manpage markup...)

--D

>  .TP
>  .B ETXTBSY
>  One of the files is a swap file.
> -- 
> 2.15.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 1/2] vfs: allow dedupe of user owned read-only files

2018-06-07 Thread Darrick J. Wong
On Thu, Jun 07, 2018 at 10:38:53AM -0700, Mark Fasheh wrote:
> The permission check in vfs_dedupe_file_range() is too coarse - We
> only allow dedupe of the destination file if the user is root, or
> they have the file open for write.
> 
> This effectively limits a non-root user from deduping their own read-only
> files. In addition, the write file descriptor that the user is forced to
> hold open can prevent execution of files. As file data during a dedupe
> does not change, the behavior is unexpected and this has caused a number of
> issue reports. For an example, see:
> 
> https://github.com/markfasheh/duperemove/issues/129
> 
> So change the check so we allow dedupe on the target if:
> 
> - the root or admin is asking for it
> - the process has write access
> - the owner of the file is asking for the dedupe
> - the process could get write access
> 
> That way users can open read-only and still get dedupe.
> 
> Signed-off-by: Mark Fasheh 

Looks ok, but could you please update the manpage for
ioctl_fideduperange to elaborate on when userspace can expect EPERM?

Acked-by: Darrick J. Wong 

--D

> ---
>  fs/read_write.c | 17 +++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/read_write.c b/fs/read_write.c
> index e83bd9744b5d..71e9077f8bc1 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1964,6 +1964,20 @@ int vfs_dedupe_file_range_compare(struct inode *src, 
> loff_t srcoff,
>  }
>  EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
>  
> +/* Check whether we are allowed to dedupe the destination file */
> +static bool allow_file_dedupe(struct file *file)
> +{
> + if (capable(CAP_SYS_ADMIN))
> + return true;
> + if (file->f_mode & FMODE_WRITE)
> + return true;
> + if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
> + return true;
> + if (!inode_permission(file_inode(file), MAY_WRITE))
> + return true;
> + return false;
> +}
> +
>  int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
>  {
>   struct file_dedupe_range_info *info;
> @@ -1972,7 +1986,6 @@ int vfs_dedupe_file_range(struct file *file, struct 
> file_dedupe_range *same)
>   u64 len;
>   int i;
>   int ret;
> - bool is_admin = capable(CAP_SYS_ADMIN);
>   u16 count = same->dest_count;
>   struct file *dst_file;
>   loff_t dst_off;
> @@ -2036,7 +2049,7 @@ int vfs_dedupe_file_range(struct file *file, struct 
> file_dedupe_range *same)
>  
>   if (info->reserved) {
>   info->status = -EINVAL;
> - } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
> + } else if (!allow_file_dedupe(dst_file)) {
>   info->status = -EINVAL;
>   } else if (file->f_path.mnt != dst_file->f_path.mnt) {
>   info->status = -EXDEV;
> -- 
> 2.15.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 11/12] xfs: convert to bioset_init()/mempool_init()

2018-05-21 Thread Darrick J. Wong
On Sun, May 20, 2018 at 06:25:57PM -0400, Kent Overstreet wrote:
> Signed-off-by: Kent Overstreet <kent.overstr...@gmail.com>

Looks ok, I guess...
Acked-by: Darrick J. Wong <darrick.w...@oracle.com>

--D

> ---
>  fs/xfs/xfs_aops.c  |  2 +-
>  fs/xfs/xfs_aops.h  |  2 +-
>  fs/xfs/xfs_super.c | 11 +--
>  3 files changed, 7 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 0ab824f574..102463543d 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -594,7 +594,7 @@ xfs_alloc_ioend(
>   struct xfs_ioend*ioend;
>   struct bio  *bio;
>  
> - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
> + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
>   xfs_init_bio_from_bh(bio, bh);
>  
>   ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
> diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> index 69346d460d..694c85b038 100644
> --- a/fs/xfs/xfs_aops.h
> +++ b/fs/xfs/xfs_aops.h
> @@ -18,7 +18,7 @@
>  #ifndef __XFS_AOPS_H__
>  #define __XFS_AOPS_H__
>  
> -extern struct bio_set *xfs_ioend_bioset;
> +extern struct bio_set xfs_ioend_bioset;
>  
>  /*
>   * Types of I/O for bmap clustering and I/O completion tracking.
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index d714240529..f643d76db5 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -63,7 +63,7 @@
>  #include 
>  
>  static const struct super_operations xfs_super_operations;
> -struct bio_set *xfs_ioend_bioset;
> +struct bio_set xfs_ioend_bioset;
>  
>  static struct kset *xfs_kset;/* top-level xfs sysfs dir */
>  #ifdef DEBUG
> @@ -1845,10 +1845,9 @@ MODULE_ALIAS_FS("xfs");
>  STATIC int __init
>  xfs_init_zones(void)
>  {
> - xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE,
> + if (bioset_init(&xfs_ioend_bioset, 4 * MAX_BUF_PER_PAGE,
>   offsetof(struct xfs_ioend, io_inline_bio),
> - BIOSET_NEED_BVECS);
> - if (!xfs_ioend_bioset)
> + BIOSET_NEED_BVECS))
>   goto out;
>  
>   xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
> @@ -1997,7 +1996,7 @@ xfs_init_zones(void)
>   out_destroy_log_ticket_zone:
>   kmem_zone_destroy(xfs_log_ticket_zone);
>   out_free_ioend_bioset:
> - bioset_free(xfs_ioend_bioset);
> + bioset_exit(&xfs_ioend_bioset);
>   out:
>   return -ENOMEM;
>  }
> @@ -2029,7 +2028,7 @@ xfs_destroy_zones(void)
>   kmem_zone_destroy(xfs_btree_cur_zone);
>   kmem_zone_destroy(xfs_bmap_free_item_zone);
>   kmem_zone_destroy(xfs_log_ticket_zone);
> - bioset_free(xfs_ioend_bioset);
> + bioset_exit(&xfs_ioend_bioset);
>  }
>  
>  STATIC int __init
> -- 
> 2.17.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 2/2] vfs: dedupe should return EPERM if permission is not granted

2018-05-18 Thread Darrick J. Wong
On Fri, May 18, 2018 at 02:57:27PM -0700, Mark Fasheh wrote:
> Right now we return EINVAL if a process does not have permission to dedupe a
> file. This was an oversight on my part. EPERM gives a true description of
> the nature of our error, and EINVAL is already used for the case that the
> filesystem does not support dedupe.
> 
> Signed-off-by: Mark Fasheh <mfas...@suse.de>

Looks ok what with all the okays after I squawked last time,
Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>

--D

> ---
>  fs/read_write.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/read_write.c b/fs/read_write.c
> index cbea4ce58ad1..2238928ca819 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -2050,7 +2050,7 @@ int vfs_dedupe_file_range(struct file *file, struct 
> file_dedupe_range *same)
>   if (info->reserved) {
>   info->status = -EINVAL;
>   } else if (!allow_file_dedupe(dst_file)) {
> - info->status = -EINVAL;
> + info->status = -EPERM;
>   } else if (file->f_path.mnt != dst_file->f_path.mnt) {
>   info->status = -EXDEV;
>   } else if (S_ISDIR(dst->i_mode)) {
> -- 
> 2.15.1
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/2] vfs: allow dedupe of user owned read-only files

2018-05-18 Thread Darrick J. Wong
On Fri, May 18, 2018 at 02:57:26PM -0700, Mark Fasheh wrote:
> The permission check in vfs_dedupe_file_range() is too coarse - We
> only allow dedupe of the destination file if the user is root, or
> they have the file open for write.
> 
> This effectively limits a non-root user from deduping their own read-only
> files. In addition, the write file descriptor that the user is forced to
> hold open can prevent execution of files. As file data during a dedupe
> does not change, the behavior is unexpected and this has caused a number of
> issue reports. For an example, see:
> 
> https://github.com/markfasheh/duperemove/issues/129
> 
> So change the check so we allow dedupe on the target if:
> 
> - the root or admin is asking for it
> - the process has write access
> - the owner of the file is asking for the dedupe
> - the process could get write access
> 
> That way users can open read-only and still get dedupe.
> 
> Signed-off-by: Mark Fasheh 
> ---
>  fs/read_write.c | 17 +++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/read_write.c b/fs/read_write.c
> index c4eabbfc90df..cbea4ce58ad1 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1964,6 +1964,20 @@ int vfs_dedupe_file_range_compare(struct inode *src, 
> loff_t srcoff,
>  }
>  EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
>  
> +/* Check whether we are allowed to dedupe the destination file */
> +static int allow_file_dedupe(struct file *file)

Shouldn't this return bool?  It's a predicate, after all...

--D

> +{
> + if (capable(CAP_SYS_ADMIN))
> + return 1;
> + if (file->f_mode & FMODE_WRITE)
> + return 1;
> + if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
> + return 1;
> + if (!inode_permission(file_inode(file), MAY_WRITE))
> + return 1;
> + return 0;
> +}
> +
>  int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
>  {
>   struct file_dedupe_range_info *info;
> @@ -1972,7 +1986,6 @@ int vfs_dedupe_file_range(struct file *file, struct 
> file_dedupe_range *same)
>   u64 len;
>   int i;
>   int ret;
> - bool is_admin = capable(CAP_SYS_ADMIN);
>   u16 count = same->dest_count;
>   struct file *dst_file;
>   loff_t dst_off;
> @@ -2036,7 +2049,7 @@ int vfs_dedupe_file_range(struct file *file, struct 
> file_dedupe_range *same)
>  
>   if (info->reserved) {
>   info->status = -EINVAL;
> - } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
> + } else if (!allow_file_dedupe(dst_file)) {
>   info->status = -EINVAL;
>   } else if (file->f_path.mnt != dst_file->f_path.mnt) {
>   info->status = -EXDEV;
> -- 
> 2.15.1
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 5/5] generic: test invalid swap file activation

2018-05-18 Thread Darrick J. Wong
On Wed, May 16, 2018 at 01:38:49PM -0700, Omar Sandoval wrote:
> From: Omar Sandoval 
> 
> Swap files cannot have holes, and they must be at least two pages.
> swapon(8) and mkswap(8) have stricter restrictions, so add versions of
> those commands without any restrictions.
> 
> Signed-off-by: Omar Sandoval 
> ---
>  .gitignore|  2 ++
>  src/Makefile  |  2 +-
>  src/mkswap.c  | 83 +++
>  src/swapon.c  | 24 +
>  tests/generic/490 | 77 +++
>  tests/generic/490.out |  5 +++
>  tests/generic/group   |  1 +
>  7 files changed, 193 insertions(+), 1 deletion(-)
>  create mode 100644 src/mkswap.c
>  create mode 100644 src/swapon.c
>  create mode 100755 tests/generic/490
>  create mode 100644 tests/generic/490.out
> 
> diff --git a/.gitignore b/.gitignore
> index 53029e24..efc73a7c 100644
> --- a/.gitignore
> +++ b/.gitignore
> @@ -92,6 +92,7 @@
>  /src/lstat64
>  /src/makeextents
>  /src/metaperf
> +/src/mkswap
>  /src/mmapcat
>  /src/multi_open_unlink
>  /src/nametest
> @@ -111,6 +112,7 @@
>  /src/seek_sanity_test
>  /src/stale_handle
>  /src/stat_test
> +/src/swapon
>  /src/t_access_root
>  /src/t_dir_offset
>  /src/t_dir_offset2
> diff --git a/src/Makefile b/src/Makefile
> index c42d3bb1..01fe99ef 100644
> --- a/src/Makefile
> +++ b/src/Makefile
> @@ -26,7 +26,7 @@ LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize 
> preallo_rw_pattern_reader \
>   renameat2 t_getcwd e4compact test-nextquota punch-alternating \
>   attr-list-by-handle-cursor-test listxattr dio-interleaved t_dir_type \
>   dio-invalidate-cache stat_test t_encrypted_d_revalidate \
> - attr_replace_test
> + attr_replace_test swapon mkswap
>  
>  SUBDIRS = log-writes perf
>  
> diff --git a/src/mkswap.c b/src/mkswap.c
> new file mode 100644
> index ..d0bce2bd
> --- /dev/null
> +++ b/src/mkswap.c
> @@ -0,0 +1,83 @@
> +/* mkswap(8) without any sanity checks */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +struct swap_header {
> + charbootbits[1024];
> + uint32_tversion;
> + uint32_tlast_page;
> + uint32_tnr_badpages;
> + unsigned char   sws_uuid[16];
> + unsigned char   sws_volume[16];
> + uint32_tpadding[117];
> + uint32_tbadpages[1];
> +};
> +
> +int main(int argc, char **argv)
> +{
> + struct swap_header *hdr;
> + FILE *file;
> + struct stat st;
> + long page_size;
> + int ret;
> +
> + if (argc != 2) {
> + fprintf(stderr, "usage: %s PATH\n", argv[0]);
> + return EXIT_FAILURE;
> + }
> +
> + page_size = sysconf(_SC_PAGESIZE);
> + if (page_size == -1) {
> + perror("sysconf");
> + return EXIT_FAILURE;
> + }
> +
> + hdr = calloc(1, page_size);
> + if (!hdr) {
> + perror("calloc");
> + return EXIT_FAILURE;
> + }
> +
> + file = fopen(argv[1], "r+");
> + if (!file) {
> + perror("fopen");
> + free(hdr);
> + return EXIT_FAILURE;
> + }
> +
> + ret = fstat(fileno(file), &st);
> + if (ret) {
> + perror("fstat");
> + free(hdr);
> + fclose(file);
> + return EXIT_FAILURE;
> + }
> +
> + hdr->version = 1;
> + hdr->last_page = st.st_size / page_size - 1;
> + memset(&hdr->sws_uuid, 0x99, sizeof(hdr->sws_uuid));
> + memcpy((char *)hdr + page_size - 10, "SWAPSPACE2", 10);
> +
> + if (fwrite(hdr, page_size, 1, file) != 1) {
> + perror("fwrite");
> + free(hdr);
> + fclose(file);
> + return EXIT_FAILURE;
> + }
> +
> + if (fclose(file) == EOF) {
> + perror("fwrite");
> + free(hdr);
> + return EXIT_FAILURE;
> + }
> +
> + free(hdr);
> +
> + return EXIT_SUCCESS;
> +}
> diff --git a/src/swapon.c b/src/swapon.c
> new file mode 100644
> index ..0cb7108a
> --- /dev/null
> +++ b/src/swapon.c
> @@ -0,0 +1,24 @@
> +/* swapon(8) without any sanity checks; simply calls swapon(2) directly. */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +int main(int argc, char **argv)
> +{
> + int ret;
> +
> + if (argc != 2) {
> + fprintf(stderr, "usage: %s PATH\n", argv[0]);
> + return EXIT_FAILURE;
> + }
> +
> + ret = swapon(argv[1], 0);
> + if (ret) {
> + perror("swapon");
> + return EXIT_FAILURE;
> + }
> +
> + return EXIT_SUCCESS;
> +}
> diff --git a/tests/generic/490 b/tests/generic/490
> new file mode 100755
> index ..6ba2ecb3
> --- /dev/null
> +++ b/tests/generic/490
> @@ -0,0 +1,77 @@
> +#! /bin/bash
> +# FS QA Test 490
> +#
> +# Test invalid swap files.
> +#
> 

Re: [PATCH 14/14] mm: turn on vm_fault_t type checking

2018-05-16 Thread Darrick J. Wong
On Wed, May 16, 2018 at 07:43:48AM +0200, Christoph Hellwig wrote:
> Switch vm_fault_t to point to an unsigned int with __bitwise annotations.
> This both catches any old ->fault or ->page_mkwrite instance with plain
> compiler type checking, as well as finding more intricate problems with
> sparse.
> 
> Signed-off-by: Christoph Hellwig <h...@lst.de>
> ---



For the iomap and xfs parts,
Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>

That said...

> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 54f1e05ecf3e..da2b77a19911 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -22,7 +22,8 @@
>  #endif
>  #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
>  
> -typedef int vm_fault_t;
> +typedef unsigned __bitwise vm_fault_t;
> +
>  
>  struct address_space;
>  struct mem_cgroup;
> @@ -619,7 +620,7 @@ struct vm_special_mapping {
>* If non-NULL, then this is called to resolve page faults
>* on the special mapping.  If used, .pages is not checked.
>*/
> - int (*fault)(const struct vm_special_mapping *sm,
> + vm_fault_t (*fault)(const struct vm_special_mapping *sm,

Uh, we're changing function signatures /and/ redefining vm_fault_t?
All in the same 90K patch?

I /was/ expecting a series of "convert X and all callers/users"
patches followed by a trivial one to switch the definition, not a giant
pile of change.  FWIW I don't mind so much if you make a patch
containing a change for some super-common primitive and a hojillion
little diff hunks tree-wide, but only one logical change at a time for a
big patch, please...

I quite prefer seeing the whole series from start to finish all packaged
up in one series, but wow this was overwhelming. :/

--D


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] vfs: dedupe should return EPERM if permission is not granted

2018-05-13 Thread Darrick J. Wong
On Sun, May 13, 2018 at 06:21:52PM +, Mark Fasheh wrote:
> On Fri, May 11, 2018 at 05:06:34PM -0700, Darrick J. Wong wrote:
> > On Fri, May 11, 2018 at 12:26:51PM -0700, Mark Fasheh wrote:
> > > Right now we return EINVAL if a process does not have permission to 
> > > dedupe a
> > > file. This was an oversight on my part. EPERM gives a true description of
> > > the nature of our error, and EINVAL is already used for the case that the
> > > filesystem does not support dedupe.
> > > 
> > > Signed-off-by: Mark Fasheh <mfas...@suse.de>
> > > ---
> > >  fs/read_write.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/read_write.c b/fs/read_write.c
> > > index 77986a2e2a3b..8edef43a182c 100644
> > > --- a/fs/read_write.c
> > > +++ b/fs/read_write.c
> > > @@ -2038,7 +2038,7 @@ int vfs_dedupe_file_range(struct file *file, struct 
> > > file_dedupe_range *same)
> > >   info->status = -EINVAL;
> > >   } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE) ||
> > >uid_eq(current_fsuid(), dst->i_uid))) {
> > > - info->status = -EINVAL;
> > > + info->status = -EPERM;
> > 
> > Hmm, are we allowed to change this aspect of the kabi after the fact?
> > 
> > Granted, we're only trading one error code for another, but will the
> > existing users of this care?  xfs_io won't and I assume duperemove won't
> > either, but what about bees? :)
> 
> Yeah if you see my initial e-mail I check bees and also rust-btrfs. I think
> this is fine as we're simply expanding on an error code return. There's no
> magic behavior expected with respect to these error codes either.

Ok.  No objections from me, then.

Acked-by: Darrick J. Wong <darrick.w...@oracle.com>

--D

>   --Mark
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2 V2] hoist BTRFS_IOC_[SG]ET_FSLABEL to vfs

2018-05-11 Thread Darrick J. Wong
On Fri, May 11, 2018 at 04:41:45PM +0200, David Sterba wrote:
> On Fri, May 11, 2018 at 09:36:09AM -0500, Eric Sandeen wrote:
> > On 5/11/18 9:32 AM, Chris Mason wrote:
> > > On 11 May 2018, at 10:10, David Sterba wrote:
> > > 
> > >> On Thu, May 10, 2018 at 08:16:09PM +0100, Al Viro wrote:
> > >>> On Thu, May 10, 2018 at 01:13:57PM -0500, Eric Sandeen wrote:
> >  Move the btrfs label ioctls up to the vfs for general use.
> > 
> >  This retains 256 chars as the maximum size through the interface, which
> >  is the btrfs limit and AFAIK exceeds any other filesystem's maximum
> >  label size.
> > 
> >  Signed-off-by: Eric Sandeen 
> >  Reviewed-by: Andreas Dilger 
> >  Reviewed-by: David Sterba 
> > >>>
> > >>> No objections (and it obviously ought to go through btrfs tree).
> > >>
> > >> I can take it through my tree, but Eric mentioned that there's a patch
> > >> for xfs that depends on it. In this case it would make sense to take
> > >> both patches at once via the xfs tree. There are no pending conflicting
> > >> changes in btrfs.
> > > 
> > > Probably easiest to just have a separate pull dedicated just for this 
> > > series.  That way it doesn't really matter which tree it goes through.
> > 
> > Actually, I just realized that the changes to include/uapi/linux/fs.h are 
> > completely
> > independent of any btrfs changes, right - there's nothing wrong w/ 
> > redefining
> > the common ioctl under a different name in btrfs.  So the fs.h patch could 
> > go first,
> > through the xfs tree since it'll be using it.
> > 
> > Once the common ioctl definition goes in, then btrfs can change to define 
> > its ioctls to
> > the common ioctls, or act on them directly as my patch did, etc.  Would 
> > that be
> > a better plan?  IOWs there's no urgent need to coordinate a btrfs change.
> 
> Agreed, I like that plan.

Ok, I'll await a new series with all the patches that Eric wants to
squeeze through the xfs tree.  I don't mind carrying the btrfs changes
too, so long as they're one-liners and the btrfs maintainers ack/rvb it.

--D
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] vfs: dedupe should return EPERM if permission is not granted

2018-05-11 Thread Darrick J. Wong
On Fri, May 11, 2018 at 12:26:51PM -0700, Mark Fasheh wrote:
> Right now we return EINVAL if a process does not have permission to dedupe a
> file. This was an oversight on my part. EPERM gives a true description of
> the nature of our error, and EINVAL is already used for the case that the
> filesystem does not support dedupe.
> 
> Signed-off-by: Mark Fasheh 
> ---
>  fs/read_write.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 77986a2e2a3b..8edef43a182c 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -2038,7 +2038,7 @@ int vfs_dedupe_file_range(struct file *file, struct 
> file_dedupe_range *same)
>   info->status = -EINVAL;
>   } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE) ||
>uid_eq(current_fsuid(), dst->i_uid))) {
> - info->status = -EINVAL;
> + info->status = -EPERM;

Hmm, are we allowed to change this aspect of the kabi after the fact?

Granted, we're only trading one error code for another, but will the
existing users of this care?  xfs_io won't and I assume duperemove won't
either, but what about bees? :)

--D

>   } else if (file->f_path.mnt != dst_file->f_path.mnt) {
>   info->status = -EXDEV;
>   } else if (S_ISDIR(dst->i_mode)) {
> -- 
> 2.15.1
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] vfs: allow dedupe of user owned read-only files

2018-05-11 Thread Darrick J. Wong
On Fri, May 11, 2018 at 12:26:50PM -0700, Mark Fasheh wrote:
> The permission check in vfs_dedupe_file_range() is too coarse - We
> only allow dedupe of the destination file if the user is root, or
> they have the file open for write.
> 
> This effectively limits a non-root user from deduping their own
> read-only files. As file data during a dedupe does not change,
> this is unexpected behavior and this has caused a number of issue
> reports. For an example, see:
> 
> https://github.com/markfasheh/duperemove/issues/129
> 
> So change the check so we allow dedupe on the target if:
> 
> - the root or admin is asking for it
> - the owner of the file is asking for the dedupe
> - the process has write access
> 
> Signed-off-by: Mark Fasheh <mfas...@suse.de>

Sounds fine I guess
Acked-by: Darrick J. Wong <darrick.w...@oracle.com>

--D

> ---
>  fs/read_write.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/read_write.c b/fs/read_write.c
> index c4eabbfc90df..77986a2e2a3b 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -2036,7 +2036,8 @@ int vfs_dedupe_file_range(struct file *file, struct 
> file_dedupe_range *same)
>  
>   if (info->reserved) {
>   info->status = -EINVAL;
> - } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
> + } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE) ||
> +  uid_eq(current_fsuid(), dst->i_uid))) {
>   info->status = -EINVAL;
>   } else if (file->f_path.mnt != dst_file->f_path.mnt) {
>   info->status = -EXDEV;
> -- 
> 2.15.1
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC v2 2/4] xfs: add verifier check for symlink with append/immutable flags

2018-05-10 Thread Darrick J. Wong
On Thu, May 10, 2018 at 04:13:57PM -0700, Luis R. Rodriguez wrote:
> The Linux VFS does not allow a way to set append/immutable attributes
> to symlinks, this is just not possible. If this is detected we can
> correct this with xfs_repair, so inform the user.
> 
> Signed-off-by: Luis R. Rodriguez 
> ---
>  fs/xfs/libxfs/xfs_symlink_remote.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c 
> b/fs/xfs/libxfs/xfs_symlink_remote.c
> index 5ef5f354587e..42dd81ede3d6 100644
> --- a/fs/xfs/libxfs/xfs_symlink_remote.c
> +++ b/fs/xfs/libxfs/xfs_symlink_remote.c
> @@ -242,5 +242,10 @@ xfs_symlink_shortform_verify(
>   /* We /did/ null-terminate the buffer, right? */
>   if (*endp != 0)
>   return __this_address;
> +
> + /* Immutable and append flags are not allowed on symlinks */
> + if (ip->i_d.di_flags & (XFS_DIFLAG_APPEND | XFS_DIFLAG_IMMUTABLE))
> + return __this_address;

This belongs in xfs_dinode_verify so that it checks all symlinks, not
just the shortform ones.

--D

> +
>   return NULL;
>  }
> -- 
> 2.17.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] fs: hoist BTRFS_IOC_[SG]ET_FSLABEL to vfs

2018-05-09 Thread Darrick J. Wong
On Wed, May 09, 2018 at 11:15:46AM -0600, Andreas Dilger wrote:
> On May 9, 2018, at 10:10 AM, Darrick J. Wong <darrick.w...@oracle.com> wrote:
> > 
> > On Wed, May 09, 2018 at 11:01:21AM -0500, Eric Sandeen wrote:
> >> Move the btrfs label ioctls up to the vfs for general use.
> >> 
> >> This retains 256 chars as the maximum size through the interface, which
> >> is the btrfs limit and AFAIK exceeds any other filesystem's maximum
> >> label size.
> >> 
> >> Signed-off-by: Eric Sandeen <sand...@redhat.com>
> >> ---
> >> 
> >> Let the bikeshedding on the exact ioctl name begin ;)
> >> 
> >> fs/btrfs/ioctl.c   | 8 
> >> include/uapi/linux/btrfs.h | 6 ++
> >> include/uapi/linux/fs.h| 8 ++--
> >> 3 files changed, 12 insertions(+), 10 deletions(-)
> >> 
> >> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> >> index 632e26d..2dd4cdf 100644
> >> --- a/fs/btrfs/ioctl.c
> >> +++ b/fs/btrfs/ioctl.c
> >> @@ -5444,6 +5444,10 @@ long btrfs_ioctl(struct file *file, unsigned int
> >>return btrfs_ioctl_setflags(file, argp);
> >>case FS_IOC_GETVERSION:
> >>return btrfs_ioctl_getversion(file, argp);
> >> +  case FS_IOC_GETFSLABEL:
> >> +  return btrfs_ioctl_get_fslabel(file, argp);
> >> +  case FS_IOC_SETFSLABEL:
> >> +  return btrfs_ioctl_set_fslabel(file, argp);
> >>case FITRIM:
> >>return btrfs_ioctl_fitrim(file, argp);
> >>case BTRFS_IOC_SNAP_CREATE:
> >> @@ -5555,10 +5559,6 @@ long btrfs_ioctl(struct file *file, unsigned int
> >>return btrfs_ioctl_quota_rescan_wait(file, argp);
> >>case BTRFS_IOC_DEV_REPLACE:
> >>return btrfs_ioctl_dev_replace(fs_info, argp);
> >> -  case BTRFS_IOC_GET_FSLABEL:
> >> -  return btrfs_ioctl_get_fslabel(file, argp);
> >> -  case BTRFS_IOC_SET_FSLABEL:
> >> -  return btrfs_ioctl_set_fslabel(file, argp);
> >>case BTRFS_IOC_GET_SUPPORTED_FEATURES:
> >>return btrfs_ioctl_get_supported_features(argp);
> >>case BTRFS_IOC_GET_FEATURES:
> >> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> >> index c8d99b9..ec611c8 100644
> >> --- a/include/uapi/linux/btrfs.h
> >> +++ b/include/uapi/linux/btrfs.h
> >> @@ -823,10 +823,8 @@ enum btrfs_err_code {
> >> #define BTRFS_IOC_QUOTA_RESCAN_STATUS _IOR(BTRFS_IOCTL_MAGIC, 45, \
> >>   struct btrfs_ioctl_quota_rescan_args)
> >> #define BTRFS_IOC_QUOTA_RESCAN_WAIT _IO(BTRFS_IOCTL_MAGIC, 46)
> >> -#define BTRFS_IOC_GET_FSLABEL _IOR(BTRFS_IOCTL_MAGIC, 49, \
> >> - char[BTRFS_LABEL_SIZE])
> >> -#define BTRFS_IOC_SET_FSLABEL _IOW(BTRFS_IOCTL_MAGIC, 50, \
> >> - char[BTRFS_LABEL_SIZE])
> >> +#define BTRFS_IOC_GET_FSLABEL FS_IOC_GETFSLABEL
> >> +#define BTRFS_IOC_SET_FSLABEL FS_IOC_SETFSLABEL
> >> #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
> >>  struct btrfs_ioctl_get_dev_stats)
> >> #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
> >> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> >> index d2a8313..1df3707 100644
> >> --- a/include/uapi/linux/fs.h
> >> +++ b/include/uapi/linux/fs.h
> >> @@ -242,6 +242,8 @@ struct fsxattr {
> >> #define FICLONERANGE   _IOW(0x94, 13, struct file_clone_range)
> >> #define FIDEDUPERANGE  _IOWR(0x94, 54, struct file_dedupe_range)
> >> 
> >> +#define FSLABEL_MAX 256   /* Max chars for the interface; each fs may 
> >> differ */
> >> +
> >> #defineFS_IOC_GETFLAGS _IOR('f', 1, long)
> >> #defineFS_IOC_SETFLAGS _IOW('f', 2, long)
> >> #defineFS_IOC_GETVERSION   _IOR('v', 1, long)
> >> @@ -251,8 +253,10 @@ struct fsxattr {
> >> #define FS_IOC32_SETFLAGS  _IOW('f', 2, int)
> >> #define FS_IOC32_GETVERSION_IOR('v', 1, int)
> >> #define FS_IOC32_SETVERSION_IOW('v', 2, int)
> >> -#define FS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr)
> >> -#define FS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr)
> >> +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr)
> >> +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct f

Re: [PATCH 2/2] man2: New page documenting filesystem get/set label ioctls

2018-05-09 Thread Darrick J. Wong
On Wed, May 09, 2018 at 11:04:03AM -0500, Eric Sandeen wrote:
> This documents the proposed new vfs-level ioctls which can
> get or set a mounted filesystem's label.
> 
> Signed-off-by: Eric Sandeen 
> ---
> 
> btrfs folks, please verify that this accurately describes your
> current behavior, thanks.
> 
> diff --git a/man2/ioctl_fslabel.2 b/man2/ioctl_fslabel.2
> new file mode 100644
> index 000..150aa53
> --- /dev/null
> +++ b/man2/ioctl_fslabel.2
> @@ -0,0 +1,83 @@
> +.\" Copyright (c) 2018, Red Hat, Inc.  All rights reserved.
> +.\"
> +.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
> +.\" This is free documentation; you can redistribute it and/or
> +.\" modify it under the terms of the GNU General Public License as
> +.\" published by the Free Software Foundation; either version 2 of
> +.\" the License, or (at your option) any later version.
> +.\"
> +.\" The GNU General Public License's references to "object code"
> +.\" and "executables" are to be interpreted as the output of any
> +.\" document formatting or typesetting system, including
> +.\" intermediate and printed output.
> +.\"
> +.\" This manual is distributed in the hope that it will be useful,
> +.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
> +.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +.\" GNU General Public License for more details.
> +.\"
> +.\" You should have received a copy of the GNU General Public
> +.\" License along with this manual; if not, see
> +.\" .
> +.\" %%%LICENSE_END
> +.TH IOCTL-FSLABEL 2 2018-05-02 "Linux" "Linux Programmer's Manual"
> +.SH NAME
> +ioctl_fslabel \- get or set a filesystem label
> +.SH SYNOPSIS
> +.br
> +.B #include 
> +.br
> +.B #include 
> +.sp
> +.BI "int ioctl(int " fd ", FS_IOC_GETFSLABEL, char " label [FSLABEL_MAX]);
> +.br
> +.BI "int ioctl(int " fd ", FS_IOC_SETFSLABEL, char " label [FSLABEL_MAX]);
> +.SH DESCRIPTION
> +If a filesystem supports online label manipulation, these
> +.BR ioctl (2)
> +operations can be used to get or set the filesystem label for the filesystem
> +on which
> +.B fd
> +resides.

Does the calling process need special capabilities or permissions?  If
so, those should be listed here.

> +.SH RETURN VALUE
> +On success zero is returned.  On error, \-1 is returned, and
> +.I errno
> +is set to indicate the error.
> +.PP
> +.SH ERRORS
> +Error codes can be one of, but are not limited to, the following:
> +.TP
> +.B EINVAL
> +The specified label exceeds the maximum label length for the filesystem.
> +.TP
> +.B ENOTTY
> +This can appear if the filesystem does not support online label manipulation.
> +.TP
> +.B EPERM
> +The calling process does not have sufficient permissions to set the label.
> +.TP
> +.B EFAULT
> +.I label
> +references an inaccessible memory area.
> +.SH VERSIONS
> +These ioctl operations first appeared in Linux 4.18.
> +They were previously known as
> +.B BTRFS_IOC_GET_FSLABEL
> +and
> +.B BTRFS_IOC_SET_FSLABEL
> +and were private to Btrfs.
> +.SH CONFORMING TO
> +This API is Linux-specific.
> +.SH NOTES
> +The maximum string length for this interface is
> +.BR FSLABEL_MAX ,
> +including the terminating null byte (\(aq\\0\(aq).
> +Filesystems have differing maximum label lengths, which may or
> +may not include the terminating null.  The string provided to
> +.B FS_IOC_SETFSLABEL
> +must always be null-terminated, and the string returned by
> +.B FS_IOC_GETFSLABEL
> +will always be null-terminated.
> +.SH SEE ALSO
> +.BR ioctl (2),
> +.BR blkid (8)
> diff --git a/man2/ioctl_getfslabel.2 b/man2/ioctl_getfslabel.2
> new file mode 100644
> index 000..bfa8dca
> --- /dev/null
> +++ b/man2/ioctl_getfslabel.2
> @@ -0,0 +1 @@
> +.so man2/ioctl_fslabel.2
> diff --git a/man2/ioctl_setfslabel.2 b/man2/ioctl_setfslabel.2
> new file mode 100644
> index 000..bfa8dca
> --- /dev/null
> +++ b/man2/ioctl_setfslabel.2
> @@ -0,0 +1 @@
> +.so man2/ioctl_fslabel.2

Put all the manpage content into ioctl_getfslabel.2 and have
ioctl_setfslabel.2 point to it, rather than three files.

--D

> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] fs: hoist BTRFS_IOC_[SG]ET_FSLABEL to vfs

2018-05-09 Thread Darrick J. Wong
On Wed, May 09, 2018 at 11:01:21AM -0500, Eric Sandeen wrote:
> Move the btrfs label ioctls up to the vfs for general use.
> 
> This retains 256 chars as the maximum size through the interface, which
> is the btrfs limit and AFAIK exceeds any other filesystem's maximum
> label size.
> 
> Signed-off-by: Eric Sandeen <sand...@redhat.com>
> ---
> 
> Let the bikeshedding on the exact ioctl name begin ;)
> 
>  fs/btrfs/ioctl.c   | 8 
>  include/uapi/linux/btrfs.h | 6 ++
>  include/uapi/linux/fs.h| 8 ++--
>  3 files changed, 12 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 632e26d..2dd4cdf 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -5444,6 +5444,10 @@ long btrfs_ioctl(struct file *file, unsigned int
>   return btrfs_ioctl_setflags(file, argp);
>   case FS_IOC_GETVERSION:
>   return btrfs_ioctl_getversion(file, argp);
> + case FS_IOC_GETFSLABEL:
> + return btrfs_ioctl_get_fslabel(file, argp);
> + case FS_IOC_SETFSLABEL:
> + return btrfs_ioctl_set_fslabel(file, argp);
>   case FITRIM:
>   return btrfs_ioctl_fitrim(file, argp);
>   case BTRFS_IOC_SNAP_CREATE:
> @@ -5555,10 +5559,6 @@ long btrfs_ioctl(struct file *file, unsigned int
>   return btrfs_ioctl_quota_rescan_wait(file, argp);
>   case BTRFS_IOC_DEV_REPLACE:
>   return btrfs_ioctl_dev_replace(fs_info, argp);
> - case BTRFS_IOC_GET_FSLABEL:
> - return btrfs_ioctl_get_fslabel(file, argp);
> - case BTRFS_IOC_SET_FSLABEL:
> - return btrfs_ioctl_set_fslabel(file, argp);
>   case BTRFS_IOC_GET_SUPPORTED_FEATURES:
>   return btrfs_ioctl_get_supported_features(argp);
>   case BTRFS_IOC_GET_FEATURES:
> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> index c8d99b9..ec611c8 100644
> --- a/include/uapi/linux/btrfs.h
> +++ b/include/uapi/linux/btrfs.h
> @@ -823,10 +823,8 @@ enum btrfs_err_code {
>  #define BTRFS_IOC_QUOTA_RESCAN_STATUS _IOR(BTRFS_IOCTL_MAGIC, 45, \
>  struct btrfs_ioctl_quota_rescan_args)
>  #define BTRFS_IOC_QUOTA_RESCAN_WAIT _IO(BTRFS_IOCTL_MAGIC, 46)
> -#define BTRFS_IOC_GET_FSLABEL _IOR(BTRFS_IOCTL_MAGIC, 49, \
> -char[BTRFS_LABEL_SIZE])
> -#define BTRFS_IOC_SET_FSLABEL _IOW(BTRFS_IOCTL_MAGIC, 50, \
> -char[BTRFS_LABEL_SIZE])
> +#define BTRFS_IOC_GET_FSLABELFS_IOC_GETFSLABEL
> +#define BTRFS_IOC_SET_FSLABELFS_IOC_SETFSLABEL
>  #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
> struct btrfs_ioctl_get_dev_stats)
>  #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index d2a8313..1df3707 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -242,6 +242,8 @@ struct fsxattr {
>  #define FICLONERANGE _IOW(0x94, 13, struct file_clone_range)
>  #define FIDEDUPERANGE_IOWR(0x94, 54, struct file_dedupe_range)
>  
> +#define FSLABEL_MAX 256  /* Max chars for the interface; each fs may 
> differ */
> +
>  #define  FS_IOC_GETFLAGS _IOR('f', 1, long)
>  #define  FS_IOC_SETFLAGS _IOW('f', 2, long)
>  #define  FS_IOC_GETVERSION   _IOR('v', 1, long)
> @@ -251,8 +253,10 @@ struct fsxattr {
>  #define FS_IOC32_SETFLAGS_IOW('f', 2, int)
>  #define FS_IOC32_GETVERSION  _IOR('v', 1, int)
>  #define FS_IOC32_SETVERSION  _IOW('v', 2, int)
> -#define FS_IOC_FSGETXATTR_IOR ('X', 31, struct fsxattr)
> -#define FS_IOC_FSSETXATTR_IOW ('X', 32, struct fsxattr)
> +#define FS_IOC_FSGETXATTR_IOR('X', 31, struct fsxattr)
> +#define FS_IOC_FSSETXATTR_IOW('X', 32, struct fsxattr)

Separate patch for whitespace cleanup.

> +#define FS_IOC_GETFSLABEL_IOR(0x94, 49, char[FSLABEL_MAX])
> +#define FS_IOC_SETFSLABEL_IOW(0x94, 50, char[FSLABEL_MAX])

Looks ok otherwise,
Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>

--D

>  
>  /*
>   * File system encryption support
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs-cleaner / snapshot performance analysis

2018-02-13 Thread Darrick J. Wong
On Sun, Feb 11, 2018 at 02:40:16PM +0800, Qu Wenruo wrote:
> 
> 
> On 2018年02月10日 00:45, Ellis H. Wilson III wrote:
> > Hi all,
> > 
> > I am trying to better understand how the cleaner kthread (btrfs-cleaner)
> > impacts foreground performance, specifically during snapshot deletion.
> > My experience so far has been that it can be dramatically disruptive to
> > foreground I/O.
> > 
> > Looking through the wiki at kernel.org I have not yet stumbled onto any
> > analysis that would shed light on this specific problem.  I have found
> > numerous complaints about btrfs-cleaner online, especially relating to
> > quotas being enabled.  This has proven thus far less than helpful, as
> > the response tends to be "use less snapshots," or "disable quotas," both
> > of which strike me as intellectually unsatisfying answers, especially
> > the former in a filesystem where snapshots are supposed to be
> > "first-class citizens."
> 
> Yes, snapshots of btrfs is really "first-class citizen".
> Tons of designs are all biased to snapshot.
> 
> But one should be clear about one thing:
> Snapshot creation and backref walk (used in qgroup, relocation and
> extent deletion), are two conflicting workload in fact.
> 
> Btrfs puts snapshot creation on a very high priority, so that it greatly
> degrades the performance of backref walk (used in snapshot deletion,
> relocation and extent exclusive/shared calculation of qgroup).
> 
> Let me explain this problem in detail.
> 
> Just as explained by Peter Grandi, for any snapshot system (or any
> system supports reflink) there must be a reserved mapping tree, to tell
> which extent is used by who.
> 
> It's very critical, to determine if an extent is shared so we determine
> if we need to do CoW.
> 
> There are several different ways to implement it, and this hugely
> affects snapshot creation performance.
> 
> 1) Direct mapping record
>Just records exactly which extent is used by who, directly.
>So when we needs to check the owner, just search the tree ONCE, then
>we get it.
> 
>This is simple and it seems that LVM thin-provision and LVM
>traditional targets are all using them.
>(Maybe XFS also follows this way?)

Yes, it does.

>Pros:
>*FAST* backref walk, which means quick extent deletion and CoW
>condition check.
> 
> 
>Cons:
>*SLOW* snapshot creation.
>Each snapshot creation needs to insert new owner relationship into
>the tree. This modification grow with the size of snapshot source.

...of course xfs also doesn't support snapshots. :)

--D

> 2) Indirect mapping record
>Records upper level referencer only.
> 
>To get all direct owner of an extent, it will needs multiple lookup
>in the reserved mapping tree.
> 
>And obviously, btrfs is using this method.
> 
>Pros:
>*FAST* owner inheritance, which means snapshot creation.
>(Well, the only advantage I can think of)
> 
>Cons:
>*VERY SLOW* backref walk, used by extent deletion, relocation, qgroup
>and Cow condition check.
>(That may also be why btrfs default to CoW data, so that it can skip
> the costy backref walk)
> 
> And a more detailed example of the difference between them will be:
> 
> [Basic tree layout]
>           Tree X
>           node A
>          /      \
>     node B      node C
>     /    \      /    \
> leaf D  leaf E  leaf F  leaf G
> 
> Use above tree X as snapshot source.
> 
> [Snapshot creation: Direct mapping]
> Then for direct mapping record, if we are going to create snapshot Y
> then we would get:
> 
>      Tree X        Tree Y
>      node A        node H
>        |    \   /    |
>        |      X      |
>        |    /   \    |
>      node B        node C
>      /    \        /    \
>  leaf D  leaf E  leaf F  leaf G
> 
> We need to create new node H, and update the owner for node B/C/D/E/F/G.
> 
> That's to say, we need to create 1 new node, and update 6 references of
> existing nodes/leaves.
> And this will grow rapidly if the tree is large, but still should be a
> linear increase.
> 
> 
> [Snapshot creation: Indirect mapping]
> And if using indirect mapping tree, firstly, reserved mapping tree
> doesn't record exactly the owner for each leaf/node, but only records
> its parent(s).
> 
> So even when tree X exists alone, without snapshot Y, if we need to know
> the owner of leaf D, we only know its only parent is node B.
> And do the same query on node B until we read node A and know it's
> owned by tree X.
> 
>  Tree X ^
>  node A ^ Look upward until
>/| we reach tree root
> node B  | to search the owner
> /   | of a leaf/node
>  

Re: [PATCH RFC] Btrfs: expose bad chunks in sysfs

2018-02-08 Thread Darrick J. Wong
On Mon, Feb 05, 2018 at 04:15:02PM -0700, Liu Bo wrote:
> Btrfs tries its best to tolerate write errors, but kind of silently
> (except some messages in kernel log).
> 
> For raid1 and raid10, this is usually not a problem because there is a
> copy as backup, while for parity based raid setup, i.e. raid5 and
> raid6, the problem is that, if a write error occurs due to some bad
> sectors, one horizontal stripe becomes degraded and the number of write
> errors it can tolerate gets reduced by one; now if two disks fail,
> data may be lost forever.
> 
> One way to mitigate the data loss pain is to expose 'bad chunks',
> i.e. degraded chunks, to users, so that they can use 'btrfs balance'
> to relocate the whole chunk and get the full raid6 protection again
> (if the relocation works).
> 
> This introduces 'bad_chunks' in btrfs's per-fs sysfs directory.  Once
> a chunk of raid5 or raid6 becomes degraded, it will appear in
> 'bad_chunks'.
> 
> Signed-off-by: Liu Bo 
> ---
> - In this patch, 'bad chunks' is not persistent on disk, but it can be
>   added if it's thought to be a good idea.
> - This is lightly tested, comments are very welcome.

Hmmm... sorry to be late to the party and dump a bunch of semirelated
work suggestions, but what if you implemented GETFSMAP for btrfs?  Then
you could define a new 'defective' fsmap type/flag/whatever and export
it for whatever metadata/filedata/whatever is now screwed up?  Existing
interface, you don't have to kludge sysfs data, none of this string
interpretation stuff...

--D

> 
>  fs/btrfs/ctree.h   |  8 +++
>  fs/btrfs/disk-io.c |  2 ++
>  fs/btrfs/extent-tree.c | 13 +++
>  fs/btrfs/raid56.c  | 59 
> --
>  fs/btrfs/sysfs.c   | 26 ++
>  fs/btrfs/volumes.c | 15 +++--
>  fs/btrfs/volumes.h |  2 ++
>  7 files changed, 121 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 13c260b..08aad65 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1101,6 +1101,9 @@ struct btrfs_fs_info {
>   spinlock_t ref_verify_lock;
>   struct rb_root block_tree;
>  #endif
> +
> + struct list_head bad_chunks;
> + seqlock_t bc_lock;
>  };
>  
>  static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
> @@ -2568,6 +2571,11 @@ static inline gfp_t btrfs_alloc_write_mask(struct 
> address_space *mapping)
>  
>  /* extent-tree.c */
>  
> +struct btrfs_bad_chunk {
> + u64 chunk_offset;
> + struct list_head list;
> +};
> +
>  enum btrfs_inline_ref_type {
>   BTRFS_REF_TYPE_INVALID = 0,
>   BTRFS_REF_TYPE_BLOCK =   1,
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index a8ecccf..061e7f94 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2568,6 +2568,8 @@ int open_ctree(struct super_block *sb,
>   init_waitqueue_head(_info->async_submit_wait);
>  
>   INIT_LIST_HEAD(_info->pinned_chunks);
> + INIT_LIST_HEAD(_info->bad_chunks);
> + seqlock_init(_info->bc_lock);
>  
>   /* Usable values until the real ones are cached from the superblock */
>   fs_info->nodesize = 4096;
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 2f43285..3ca7cb4 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -9903,6 +9903,19 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
>   kobject_del(_info->kobj);
>   kobject_put(_info->kobj);
>   }
> +
> + /* Clean up bad chunks. */
> + write_seqlock_irq(>bc_lock);
> + while (!list_empty(>bad_chunks)) {
> + struct btrfs_bad_chunk *bc;
> +
> + bc = list_first_entry(>bad_chunks,
> +   struct btrfs_bad_chunk, list);
> + list_del_init(>list);
> + kfree(bc);
> + }
> + write_sequnlock_irq(>bc_lock);
> +
>   return 0;
>  }
>  
> diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
> index a7f7925..e960247 100644
> --- a/fs/btrfs/raid56.c
> +++ b/fs/btrfs/raid56.c
> @@ -888,14 +888,19 @@ static void rbio_orig_end_io(struct btrfs_raid_bio 
> *rbio, blk_status_t err)
>  }
>  
>  /*
> - * end io function used by finish_rmw.  When we finally
> - * get here, we've written a full stripe
> + * end io function used by finish_rmw.  When we finally get here, we've 
> written
> + * a full stripe.
> + *
> + * Note that this is not under interrupt context as we queued endio to 
> workers.
>   */
>  static void raid_write_end_io(struct bio *bio)
>  {
>   struct btrfs_raid_bio *rbio = bio->bi_private;
>   blk_status_t err = bio->bi_status;
>   int max_errors;
> + u64 stripe_start = rbio->bbio->raid_map[0];
> + struct btrfs_fs_info *fs_info = rbio->fs_info;
> + int err_cnt;
>  
>   if (err)
>   fail_bio_stripe(rbio, bio);
> @@ -908,12 +913,58 @@ static void raid_write_end_io(struct bio *bio)
>   

Re: [PATCH v5 01/78] xfs: Rename xa_ elements to ail_

2018-01-02 Thread Darrick J. Wong
On Fri, Dec 15, 2017 at 02:03:33PM -0800, Matthew Wilcox wrote:
> From: Matthew Wilcox <mawil...@microsoft.com>
> 
> This is a simple rename, except that xa_ail becomes ail_head.
> 
> Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>

That was an eyeful,
Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>

> ---
>  fs/xfs/xfs_buf_item.c|  10 ++--
>  fs/xfs/xfs_dquot.c   |   4 +-
>  fs/xfs/xfs_dquot_item.c  |  11 ++--
>  fs/xfs/xfs_inode_item.c  |  22 +++
>  fs/xfs/xfs_log.c |   6 +-
>  fs/xfs/xfs_log_recover.c |  80 -
>  fs/xfs/xfs_trans.c   |  18 +++---
>  fs/xfs/xfs_trans_ail.c   | 152 
> +++
>  fs/xfs/xfs_trans_buf.c   |   4 +-
>  fs/xfs/xfs_trans_priv.h  |  42 ++---
>  10 files changed, 175 insertions(+), 174 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index e0a0af0946f2..6c5035544a93 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
> @@ -459,7 +459,7 @@ xfs_buf_item_unpin(
>   bp->b_fspriv = NULL;
>   bp->b_iodone = NULL;
>   } else {
> - spin_lock(>xa_lock);
> + spin_lock(>ail_lock);
>   xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR);
>   xfs_buf_item_relse(bp);
>   ASSERT(bp->b_fspriv == NULL);
> @@ -1056,13 +1056,13 @@ xfs_buf_do_callbacks_fail(
>   struct xfs_log_item *lip = bp->b_fspriv;
>   struct xfs_ail  *ailp = lip->li_ailp;
>  
> - spin_lock(>xa_lock);
> + spin_lock(>ail_lock);
>   for (; lip; lip = next) {
>   next = lip->li_bio_list;
>   if (lip->li_ops->iop_error)
>   lip->li_ops->iop_error(lip, bp);
>   }
> - spin_unlock(>xa_lock);
> + spin_unlock(>ail_lock);
>  }
>  
>  static bool
> @@ -1215,7 +1215,7 @@ xfs_buf_iodone(
>*
>* Either way, AIL is useless if we're forcing a shutdown.
>*/
> - spin_lock(>xa_lock);
> + spin_lock(>ail_lock);
>   xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
>   xfs_buf_item_free(BUF_ITEM(lip));
>  }
> @@ -1236,7 +1236,7 @@ xfs_buf_resubmit_failed_buffers(
>   /*
>* Clear XFS_LI_FAILED flag from all items before resubmit
>*
> -  * XFS_LI_FAILED set/clear is protected by xa_lock, caller  this
> +  * XFS_LI_FAILED set/clear is protected by ail_lock, caller  this
>* function already have it acquired
>*/
>   for (; lip; lip = next) {
> diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
> index f248708c10ff..e2a466df5dd1 100644
> --- a/fs/xfs/xfs_dquot.c
> +++ b/fs/xfs/xfs_dquot.c
> @@ -974,7 +974,7 @@ xfs_qm_dqflush_done(
>(lip->li_flags & XFS_LI_FAILED))) {
>  
>   /* xfs_trans_ail_delete() drops the AIL lock. */
> - spin_lock(>xa_lock);
> + spin_lock(>ail_lock);
>   if (lip->li_lsn == qip->qli_flush_lsn) {
>   xfs_trans_ail_delete(ailp, lip, 
> SHUTDOWN_CORRUPT_INCORE);
>   } else {
> @@ -984,7 +984,7 @@ xfs_qm_dqflush_done(
>*/
>   if (lip->li_flags & XFS_LI_FAILED)
>   xfs_clear_li_failed(lip);
> - spin_unlock(>xa_lock);
> + spin_unlock(>ail_lock);
>   }
>   }
>  
> diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
> index 664dea105e76..62637a226601 100644
> --- a/fs/xfs/xfs_dquot_item.c
> +++ b/fs/xfs/xfs_dquot_item.c
> @@ -160,8 +160,9 @@ xfs_dquot_item_error(
>  STATIC uint
>  xfs_qm_dquot_logitem_push(
>   struct xfs_log_item *lip,
> - struct list_head*buffer_list) __releases(>li_ailp->xa_lock)
> -   __acquires(>li_ailp->xa_lock)
> + struct list_head*buffer_list)
> + __releases(>li_ailp->ail_lock)
> + __acquires(>li_ailp->ail_lock)
>  {
>   struct xfs_dquot*dqp = DQUOT_ITEM(lip)->qli_dquot;
>   struct xfs_buf  *bp = lip->li_buf;
> @@ -208,7 +209,7 @@ xfs_qm_dquot_logitem_push(
>   goto out_unlock;
>   }
>  
> - spin_unlock(>li_ailp->xa_lock);
> + spin_unlock(>li_ailp->ail_lock);
>  
>   error = xfs_qm_dqflush(dqp, );
>   if (error) {
> @@ -220,7 +221,7 @@ xfs_qm_dquot_logitem_push(
&

Re: [PATCH v5 03/78] xarray: Add the xa_lock to the radix_tree_root

2018-01-02 Thread Darrick J. Wong
On Tue, Dec 26, 2017 at 07:58:15PM -0800, Matthew Wilcox wrote:
> On Tue, Dec 26, 2017 at 07:43:40PM -0800, Matthew Wilcox wrote:
> > Also add the xa_lock() and xa_unlock() family of wrappers to make it
> > easier to use the lock.  If we could rely on -fplan9-extensions in
> > the compiler, we could avoid all of this syntactic sugar, but that
> > wasn't added until gcc 4.6.
> 
> Oh, in case anyone's wondering, here's how I'd do it with plan9 extensions:
> 
> struct xarray {
> spinlock_t;
> int xa_flags;
> void *xa_head;
> };
> 
> ...
> spin_lock_irqsave(>pages, flags);
> __delete_from_page_cache(page, NULL);
> spin_unlock_irqrestore(>pages, flags);
> ...
> 
> The plan9 extensions permit passing a pointer to a struct which has an
> unnamed element to a function which is expecting a pointer to the type
> of that element.  The compiler does any necessary arithmetic to produce 
> a pointer.  It's exactly as if I had written:
> 
> spin_lock_irqsave(>pages.xa_lock, flags);
> __delete_from_page_cache(page, NULL);
> spin_unlock_irqrestore(>pages.xa_lock, flags);
> 
> More details here: https://9p.io/sys/doc/compiler.html

I read the link, and I understand (from section 3.3) that replacing
foo.bar.baz.goo with foo.goo is less typing, but otoh the first time I
read your example above I thought "we're passing (an array of pages |
something that doesn't have the word 'lock' in the name) to
spin_lock_irqsave? wtf?"

I suppose it does force me to go dig into whatever mapping->pages is to
figure out that there's an unnamed spinlock_t and that the compiler can
insert the appropriate pointer arithmetic, but now my brain trips over
'pages' being at the end of the selector for parameter 1 which slows
down my review reading...

OTOH I guess it /did/ motivate me to click the link, so well played,
sir. :)

--D

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 14/19] xfs: convert to new i_version API

2017-12-22 Thread Darrick J. Wong
On Fri, Dec 22, 2017 at 07:05:51AM -0500, Jeff Layton wrote:
> From: Jeff Layton <jlay...@redhat.com>
> 
> Signed-off-by: Jeff Layton <jlay...@redhat.com>
> ---
>  fs/xfs/libxfs/xfs_inode_buf.c | 7 +--
>  fs/xfs/xfs_icache.c   | 5 +++--
>  fs/xfs/xfs_inode.c| 3 ++-
>  fs/xfs/xfs_inode_item.c   | 3 ++-
>  fs/xfs/xfs_trans_inode.c  | 4 +++-
>  5 files changed, 15 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
> index 6b7989038d75..b9c0bf80669c 100644
> --- a/fs/xfs/libxfs/xfs_inode_buf.c
> +++ b/fs/xfs/libxfs/xfs_inode_buf.c
> @@ -32,6 +32,8 @@
>  #include "xfs_ialloc.h"
>  #include "xfs_dir2.h"
>  
> +#include 

/me wonders if these ought to be in fs/xfs/xfs_linux.h since this is
libxfs, but seeing as I already let that horse escape I might as well
clean it up separately.

Looks ok,
Acked-by: Darrick J. Wong <darrick.w...@oracle.com>

--D

> +
>  /*
>   * Check that none of the inode's in the buffer have a next
>   * unlinked field of 0.
> @@ -264,7 +266,8 @@ xfs_inode_from_disk(
>   to->di_flags= be16_to_cpu(from->di_flags);
>  
>   if (to->di_version == 3) {
> - inode->i_version = be64_to_cpu(from->di_changecount);
> + inode_set_iversion_queried(inode,
> +be64_to_cpu(from->di_changecount));
>   to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
>   to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
>   to->di_flags2 = be64_to_cpu(from->di_flags2);
> @@ -314,7 +317,7 @@ xfs_inode_to_disk(
>   to->di_flags = cpu_to_be16(from->di_flags);
>  
>   if (from->di_version == 3) {
> - to->di_changecount = cpu_to_be64(inode->i_version);
> + to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
>   to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
>   to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
>   to->di_flags2 = cpu_to_be64(from->di_flags2);
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index 43005fbe8b1e..4c315adb05e6 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -37,6 +37,7 @@
>  
>  #include 
>  #include 
> +#include 
>  
>  /*
>   * Allocate and initialise an xfs_inode.
> @@ -293,14 +294,14 @@ xfs_reinit_inode(
>   int error;
>   uint32_tnlink = inode->i_nlink;
>   uint32_tgeneration = inode->i_generation;
> - uint64_tversion = inode->i_version;
> + uint64_tversion = inode_peek_iversion(inode);
>   umode_t mode = inode->i_mode;
>  
>   error = inode_init_always(mp->m_super, inode);
>  
>   set_nlink(inode, nlink);
>   inode->i_generation = generation;
> - inode->i_version = version;
> + inode_set_iversion_queried(inode, version);
>   inode->i_mode = mode;
>   return error;
>  }
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 801274126648..dfc5e60d8af3 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -16,6 +16,7 @@
>   * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
>   */
>  #include 
> +#include 
>  
>  #include "xfs.h"
>  #include "xfs_fs.h"
> @@ -833,7 +834,7 @@ xfs_ialloc(
>   ip->i_d.di_flags = 0;
>  
>   if (ip->i_d.di_version == 3) {
> - inode->i_version = 1;
> + inode_set_iversion(inode, 1);
>   ip->i_d.di_flags2 = 0;
>   ip->i_d.di_cowextsize = 0;
>   ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
> index 6ee5c3bf19ad..7571abf5dfb3 100644
> --- a/fs/xfs/xfs_inode_item.c
> +++ b/fs/xfs/xfs_inode_item.c
> @@ -30,6 +30,7 @@
>  #include "xfs_buf_item.h"
>  #include "xfs_log.h"
>  
> +#include 
>  
>  kmem_zone_t  *xfs_ili_zone;  /* inode log item zone */
>  
> @@ -354,7 +355,7 @@ xfs_inode_to_log_dinode(
>   to->di_next_unlinked = NULLAGINO;
>  
>   if (from->di_version == 3) {
> - to->di_changecount = inode->i_version;
> + to->di_changecount = inode_peek_iversion(inode);
>   to->di_crtime.t_sec = from->di_crtime.t_sec;
>   to->di_crtime.t_nsec = from->di_crtime.t_nsec;
>   to->di_flags2 = from->di_flags2;
> diff --git

Re: [PATCH v4 17/19] xfs: avoid setting XFS_ILOG_CORE if i_version doesn't need incrementing

2017-12-22 Thread Darrick J. Wong
On Fri, Dec 22, 2017 at 07:05:54AM -0500, Jeff Layton wrote:
> From: Jeff Layton <jlay...@redhat.com>
> 
> If XFS_ILOG_CORE is already set then go ahead and increment it.
> 
> Signed-off-by: Jeff Layton <jlay...@redhat.com>

Looks ok,
Acked-by: Darrick J. Wong <darrick.w...@oracle.com>

> ---
>  fs/xfs/xfs_trans_inode.c | 14 --
>  1 file changed, 8 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
> index 225544327c4f..4a89da4b6fe7 100644
> --- a/fs/xfs/xfs_trans_inode.c
> +++ b/fs/xfs/xfs_trans_inode.c
> @@ -112,15 +112,17 @@ xfs_trans_log_inode(
>  
>   /*
>* First time we log the inode in a transaction, bump the inode change
> -  * counter if it is configured for this to occur. We don't use
> -  * inode_inc_version() because there is no need for extra locking around
> -  * i_version as we already hold the inode locked exclusively for
> -  * metadata modification.
> +  * counter if it is configured for this to occur. While we have the
> +  * inode locked exclusively for metadata modification, we can usually
> +  * avoid setting XFS_ILOG_CORE if no one has queried the value since
> +  * the last time it was incremented. If we have XFS_ILOG_CORE already
> +  * set however, then go ahead and bump the i_version counter
> +  * unconditionally.
>*/
>   if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
>   IS_I_VERSION(VFS_I(ip))) {
> - inode_inc_iversion(VFS_I(ip));
> - flags |= XFS_ILOG_CORE;
> + if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE))
> + flags |= XFS_ILOG_CORE;
>   }
>  
>   tp->t_flags |= XFS_TRANS_DIRTY;
> -- 
> 2.14.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] Btrfs: btrfs_dedupe_file_range() ioctl, remove 16MiB restriction

2017-12-19 Thread Darrick J. Wong
On Tue, Dec 19, 2017 at 01:02:44PM +0300, Timofey Titovets wrote:
> Currently btrfs_dedupe_file_range() is restricted to a 16MiB range to
> limit locking time and memory requirements for the dedupe ioctl().
> 
> For a too-big input range, the code silently sets the range to 16MiB.
> 
> Let's remove that restriction by iterating over the dedupe range.
> That's backward compatible and will not change anything for requests
> less than 16MiB.
> 
> Changes:
>   v1 -> v2:
> - Refactor btrfs_cmp_data_prepare and btrfs_extent_same
> - Store memory of pages array between iterations
> - Lock inodes once, not on each iteration
> - Small inplace cleanups

/me wonders if you could take advantage of vfs_clone_file_prep_inodes,
which takes care of the content comparison (and flushing files, and inode
checks, etc.) ?

(ISTR Qu Wenruo(??) or someone remarking that this might not work well
with btrfs locking model, but I could be mistaken about all that...)

--D

> 
> Signed-off-by: Timofey Titovets 
> ---
>  fs/btrfs/ioctl.c | 160 
> ---
>  1 file changed, 94 insertions(+), 66 deletions(-)
> 
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index be5bd81b3669..45a47d0891fc 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -2965,8 +2965,8 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
>   put_page(pg);
>   }
>   }
> - kfree(cmp->src_pages);
> - kfree(cmp->dst_pages);
> +
> + cmp->num_pages = 0;
>  }
>  
>  static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
> @@ -2974,41 +2974,22 @@ static int btrfs_cmp_data_prepare(struct inode *src, 
> u64 loff,
> u64 len, struct cmp_pages *cmp)
>  {
>   int ret;
> - int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
> - struct page **src_pgarr, **dst_pgarr;
> -
> - /*
> -  * We must gather up all the pages before we initiate our
> -  * extent locking. We use an array for the page pointers. Size
> -  * of the array is bounded by len, which is in turn bounded by
> -  * BTRFS_MAX_DEDUPE_LEN.
> -  */
> - src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
> - dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
> - if (!src_pgarr || !dst_pgarr) {
> - kfree(src_pgarr);
> - kfree(dst_pgarr);
> - return -ENOMEM;
> - }
> - cmp->num_pages = num_pages;
> - cmp->src_pages = src_pgarr;
> - cmp->dst_pages = dst_pgarr;
>  
>   /*
>* If deduping ranges in the same inode, locking rules make it mandatory
>* to always lock pages in ascending order to avoid deadlocks with
>* concurrent tasks (such as starting writeback/delalloc).
>*/
> - if (src == dst && dst_loff < loff) {
> - swap(src_pgarr, dst_pgarr);
> + if (src == dst && dst_loff < loff)
>   swap(loff, dst_loff);
> - }
>  
> - ret = gather_extent_pages(src, src_pgarr, cmp->num_pages, loff);
> + cmp->num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
> +
> + ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff);
>   if (ret)
>   goto out;
>  
> - ret = gather_extent_pages(dst, dst_pgarr, cmp->num_pages, dst_loff);
> + ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, 
> dst_loff);
>  
>  out:
>   if (ret)
> @@ -3078,31 +3059,23 @@ static int extent_same_check_offsets(struct inode 
> *inode, u64 off, u64 *plen,
>   return 0;
>  }
>  
> -static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
> -  struct inode *dst, u64 dst_loff)
> +static int __btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
> +struct inode *dst, u64 dst_loff,
> +struct cmp_pages *cmp)
>  {
>   int ret;
>   u64 len = olen;
> - struct cmp_pages cmp;
>   bool same_inode = (src == dst);
>   u64 same_lock_start = 0;
>   u64 same_lock_len = 0;
>  
> - if (len == 0)
> - return 0;
> -
> - if (same_inode)
> - inode_lock(src);
> - else
> - btrfs_double_inode_lock(src, dst);
> -
>   ret = extent_same_check_offsets(src, loff, , olen);
>   if (ret)
> - goto out_unlock;
> + return ret;
>  
>   ret = extent_same_check_offsets(dst, dst_loff, , olen);
>   if (ret)
> - goto out_unlock;
> + return ret;
>  
>   if (same_inode) {
>   /*
> @@ -3119,32 +3092,21 @@ static int btrfs_extent_same(struct inode *src, u64 
> loff, u64 olen,
>* allow an unaligned length so long as it ends at
>* i_size.
>*/
> - if (len != olen) {
> - ret = -EINVAL;
> - goto out_unlock;
> - }
> + if (len != olen)
> + 

Re: [PATCH v9 0/5] Add the ability to do BPF directed error injection

2017-12-13 Thread Darrick J. Wong
On Wed, Dec 13, 2017 at 01:03:57PM -0500, Josef Bacik wrote:
> On Tue, Dec 12, 2017 at 03:11:50PM -0800, Darrick J. Wong wrote:
> > On Mon, Dec 11, 2017 at 11:36:45AM -0500, Josef Bacik wrote:
> > > This is the same as v8, just rebased onto the bpf tree.
> > > 
> > > v8->v9:
> > > - rebased onto the bpf tree.
> > > 
> > > v7->v8:
> > > - removed the _ASM_KPROBE_ERROR_INJECT since it was not needed.
> > > 
> > > v6->v7:
> > > - moved the opt-in macro to bpf.h out of kprobes.h.
> > > 
> > > v5->v6:
> > > - add BPF_ALLOW_ERROR_INJECTION() tagging for functions that will support 
> > > this
> > >   feature.  This way only functions that opt-in will be allowed to be
> > >   overridden.
> > > - added a btrfs patch to allow error injection for open_ctree() so that 
> > > the bpf
> > >   sample actually works.
> > > 
> > > v4->v5:
> > > - disallow kprobe_override programs from being put in the prog map array 
> > > so we
> > >   don't tail call into something we didn't check.  This allows us to make 
> > > the
> > >   normal path still fast without a bunch of percpu operations.
> > > 
> > > v3->v4:
> > > - fix a build error found by kbuild test bot (I didn't wait long enough
> > >   apparently.)
> > > > - Added a warning message as per Daniel's suggestion.
> > > 
> > > v2->v3:
> > > - added a ->kprobe_override flag to bpf_prog.
> > > - added some sanity checks to disallow attaching bpf progs that have
> > >   ->kprobe_override set that aren't for ftrace kprobes.
> > > - added the trace_kprobe_ftrace helper to check if the trace_event_call 
> > > is a
> > >   ftrace kprobe.
> > > - renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only 
> > > read this
> > >   value in the kprobe path, and thus only write to it if we're overriding 
> > > or
> > >   clearing the override.
> > > 
> > > v1->v2:
> > > - moved things around to make sure that bpf_override_return could really 
> > > only be
> > >   used for an ftrace kprobe.
> > > - killed the special return values from trace_call_bpf.
> > > - renamed pc_modified to bpf_kprobe_state so bpf_override_return could 
> > > tell if
> > >   it was being called from an ftrace kprobe context.
> > > - reworked the logic in kprobe_perf_func to take advantage of 
> > > bpf_kprobe_state.
> > > - updated the test as per Alexei's review.
> > > 
> > > - Original message -
> > > 
> > > A lot of our error paths are not well tested because we have no good way 
> > > of
> > > > injecting errors generically.  Some subsystems (block, memory) have ways to
> > > > inject errors, but they are random so it's hard to get reproducible
> > > > results.
> > > 
> > > With BPF we can add determinism to our error injection.  We can use 
> > > kprobes and
> > > other things to verify we are injecting errors at the exact case we are 
> > > trying
> > > > to test.  This patch gives us the tool to actually do the error injection
> > > > part.
> > > It is very simple, we just set the return value of the pt_regs we're 
> > > given to
> > > whatever we provide, and then override the PC with a dummy function that 
> > > simply
> > > returns.
> > 
> > Heh, this looks cool.  I decided to try it to see what happens, and saw
> > a bunch of dmesg pasted in below.  Is that supposed to happen?  Or am I
> > the only fs developer still running with lockdep enabled? :)
> > 
> > It looks like bpf_override_return has some sort of side effect such that
> > we get the splat, since commenting it out makes the symptom go away.
> > 
> > 
> > 
> > --D
> > 
> > [ 1847.769183] BTRFS error (device (null)): open_ctree failed
> > [ 1847.770130] BUG: sleeping function called from invalid context at 
> > /storage/home/djwong/cdev/work/linux-xfs/kernel/locking/rwsem.c:69
> > [ 1847.771976] in_atomic(): 1, irqs_disabled(): 0, pid: 1524, name: mount
> > [ 1847.773016] 1 lock held by mount/1524:
> > [ 1847.773530]  #0:  (>s_umount_key#34/1){+.+.}, at: 
> > [<653a9bb4>] sget_userns+0x302/0x4f0
> > [ 1847.774731] Preemption disabled at:
> > [ 1847.774735] [<  (null)>]   (null)
> > [ 1847.777009] CPU: 2 PID: 1524 Comm: mount Tainted: GW
> >

Re: [PATCH] fs/*/Kconfig: drop links to 404-compliant http://acl.bestbits.at

2017-12-12 Thread Darrick J. Wong
ttp://acl.bestbits.at/> for details).
> +   the kernel or by users (see the attr(5) manual page for details).
>  
> If unsure, say N.
>  
> @@ -49,9 +48,6 @@ config F2FS_FS_POSIX_ACL
> Posix Access Control Lists (ACLs) support permissions for users and
> groups beyond the owner/group/world scheme.
>  
> -   To learn more about Access Control Lists, visit the POSIX ACLs for
> -   Linux website <http://acl.bestbits.at/>.
> -
> If you don't know what Access Control Lists are, say N
>  
>  config F2FS_FS_SECURITY
> diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
> index 24bc20fd42f7..7cc8b4acf66a 100644
> --- a/fs/hfsplus/Kconfig
> +++ b/fs/hfsplus/Kconfig
> @@ -20,9 +20,6 @@ config HFSPLUS_FS_POSIX_ACL
> POSIX Access Control Lists (ACLs) support permissions for users and
> groups beyond the owner/group/world scheme.
>  
> -   To learn more about Access Control Lists, visit the POSIX ACLs for
> -   Linux website <http://acl.bestbits.at/>.
> -
> It needs to understand that POSIX ACLs are treated only under
> Linux. POSIX ACLs doesn't mean something under Mac OS X.
> Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs,
> diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig
> index d8bb6c411e96..ad850c5bf2ca 100644
> --- a/fs/jffs2/Kconfig
> +++ b/fs/jffs2/Kconfig
> @@ -68,8 +68,7 @@ config JFFS2_FS_XATTR
>   default n
>   help
> Extended attributes are name:value pairs associated with inodes by
> -   the kernel or by users (see the attr(5) manual page, or visit
> -   <http://acl.bestbits.at/> for details).
> +   the kernel or by users (see the attr(5) manual page for details).
>  
> If unsure, say N.
>  
> @@ -82,9 +81,6 @@ config JFFS2_FS_POSIX_ACL
> Posix Access Control Lists (ACLs) support permissions for users and
> groups beyond the owner/group/world scheme.
>  
> -   To learn more about Access Control Lists, visit the Posix ACLs for
> -   Linux website <http://acl.bestbits.at/>.
> -
> If you don't know what Access Control Lists are, say N
>  
>  config JFFS2_FS_SECURITY
> diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
> index 57cef19951db..851de78fdabb 100644
> --- a/fs/jfs/Kconfig
> +++ b/fs/jfs/Kconfig
> @@ -16,9 +16,6 @@ config JFS_POSIX_ACL
> Posix Access Control Lists (ACLs) support permissions for users and
> groups beyond the owner/group/world scheme.
>  
> -   To learn more about Access Control Lists, visit the Posix ACLs for
> -   Linux website <http://acl.bestbits.at/>.
> -
> If you don't know what Access Control Lists are, say N
>  
>  config JFS_SECURITY
> diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
> index 7cd4ba2c..86e71c0caf48 100644
> --- a/fs/reiserfs/Kconfig
> +++ b/fs/reiserfs/Kconfig
> @@ -57,8 +57,7 @@ config REISERFS_FS_XATTR
>   depends on REISERFS_FS
>   help
> Extended attributes are name:value pairs associated with inodes by
> -   the kernel or by users (see the attr(5) manual page, or visit
> -   <http://acl.bestbits.at/> for details).
> +   the kernel or by users (see the attr(5) manual page for details).
>  
> If unsure, say N.
>  
> @@ -70,9 +69,6 @@ config REISERFS_FS_POSIX_ACL
> Posix Access Control Lists (ACLs) support permissions for users and
> groups beyond the owner/group/world scheme.
>  
> -   To learn more about Access Control Lists, visit the Posix ACLs for
> -   Linux website <http://acl.bestbits.at/>.
> -
> If you don't know what Access Control Lists are, say N
>  
>  config REISERFS_FS_SECURITY
> diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
> index f42fcf1b5465..46bcf0e649f5 100644
> --- a/fs/xfs/Kconfig
> +++ b/fs/xfs/Kconfig
> @@ -48,9 +48,6 @@ config XFS_POSIX_ACL
> POSIX Access Control Lists (ACLs) support permissions for users and
> groups beyond the owner/group/world scheme.
>  
> -   To learn more about Access Control Lists, visit the POSIX ACLs for
> -   Linux website <http://acl.bestbits.at/>.
> -

The XFS bits look ok,
Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>

--D

> If you don't know what Access Control Lists are, say N.
>  
>  config XFS_RT
> -- 
> 2.15.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v9 0/5] Add the ability to do BPF directed error injection

2017-12-12 Thread Darrick J. Wong
On Mon, Dec 11, 2017 at 11:36:45AM -0500, Josef Bacik wrote:
> This is the same as v8, just rebased onto the bpf tree.
> 
> v8->v9:
> - rebased onto the bpf tree.
> 
> v7->v8:
> - removed the _ASM_KPROBE_ERROR_INJECT since it was not needed.
> 
> v6->v7:
> - moved the opt-in macro to bpf.h out of kprobes.h.
> 
> v5->v6:
> - add BPF_ALLOW_ERROR_INJECTION() tagging for functions that will support this
>   feature.  This way only functions that opt-in will be allowed to be
>   overridden.
> - added a btrfs patch to allow error injection for open_ctree() so that the
>   bpf sample actually works.
> 
> v4->v5:
> - disallow kprobe_override programs from being put in the prog map array so we
>   don't tail call into something we didn't check.  This allows us to make the
>   normal path still fast without a bunch of percpu operations.
> 
> v3->v4:
> - fix a build error found by kbuild test bot (I didn't wait long enough
>   apparently.)
> - Added a warning message as per Daniel's suggestion.
> 
> v2->v3:
> - added a ->kprobe_override flag to bpf_prog.
> - added some sanity checks to disallow attaching bpf progs that have
>   ->kprobe_override set that aren't for ftrace kprobes.
> - added the trace_kprobe_ftrace helper to check if the trace_event_call is a
>   ftrace kprobe.
> - renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only read
>   this value in the kprobe path, and thus only write to it if we're
>   overriding or clearing the override.
> 
> v1->v2:
> - moved things around to make sure that bpf_override_return could really
>   only be used for an ftrace kprobe.
> - killed the special return values from trace_call_bpf.
> - renamed pc_modified to bpf_kprobe_state so bpf_override_return could tell if
>   it was being called from an ftrace kprobe context.
> - reworked the logic in kprobe_perf_func to take advantage of
>   bpf_kprobe_state.
> - updated the test as per Alexei's review.
> 
> - Original message -
> 
> A lot of our error paths are not well tested because we have no good way of
> injecting errors generically.  Some subsystems (block, memory) have ways to
> inject errors, but they are random so it's hard to get reproducible results.
> 
> With BPF we can add determinism to our error injection.  We can use kprobes
> and other things to verify we are injecting errors at the exact case we are
> trying to test.  This patch gives us the tool to actually do the error
> injection part.  It is very simple, we just set the return value of the
> pt_regs we're given to whatever we provide, and then override the PC with a
> dummy function that simply returns.

Heh, this looks cool.  I decided to try it to see what happens, and saw
a bunch of dmesg pasted in below.  Is that supposed to happen?  Or am I
the only fs developer still running with lockdep enabled? :)

It looks like bpf_override_return has some sort of side effect such that
we get the splat, since commenting it out makes the symptom go away.



--D

[ 1847.769183] BTRFS error (device (null)): open_ctree failed
[ 1847.770130] BUG: sleeping function called from invalid context at 
/storage/home/djwong/cdev/work/linux-xfs/kernel/locking/rwsem.c:69
[ 1847.771976] in_atomic(): 1, irqs_disabled(): 0, pid: 1524, name: mount
[ 1847.773016] 1 lock held by mount/1524:
[ 1847.773530]  #0:  (>s_umount_key#34/1){+.+.}, at: [<653a9bb4>] 
sget_userns+0x302/0x4f0
[ 1847.774731] Preemption disabled at:
[ 1847.774735] [<  (null)>]   (null)
[ 1847.777009] CPU: 2 PID: 1524 Comm: mount Tainted: GW
4.15.0-rc3-xfsx #3
[ 1847.778800] Call Trace:
[ 1847.779047]  dump_stack+0x7c/0xbe
[ 1847.779361]  ___might_sleep+0x1f7/0x260
[ 1847.779720]  down_write+0x29/0xb0
[ 1847.780046]  unregister_shrinker+0x15/0x70
[ 1847.780427]  deactivate_locked_super+0x2e/0x60
[ 1847.780935]  btrfs_mount+0xbb6/0x1000 [btrfs]
[ 1847.781353]  ? __lockdep_init_map+0x5c/0x1d0
[ 1847.781750]  ? mount_fs+0xf/0x80
[ 1847.782065]  ? alloc_vfsmnt+0x1a1/0x230
[ 1847.782429]  mount_fs+0xf/0x80
[ 1847.782733]  vfs_kern_mount+0x62/0x160
[ 1847.783128]  btrfs_mount+0x3d3/0x1000 [btrfs]
[ 1847.783493]  ? __lockdep_init_map+0x5c/0x1d0
[ 1847.783849]  ? __lockdep_init_map+0x5c/0x1d0
[ 1847.784207]  ? mount_fs+0xf/0x80
[ 1847.784502]  mount_fs+0xf/0x80
[ 1847.784835]  vfs_kern_mount+0x62/0x160
[ 1847.785235]  do_mount+0x1b1/0xd50
[ 1847.785594]  ? _copy_from_user+0x5b/0x90
[ 1847.786028]  ? memdup_user+0x4b/0x70
[ 1847.786501]  SyS_mount+0x85/0xd0
[ 1847.786835]  entry_SYSCALL_64_fastpath+0x1f/0x96
[ 1847.787311] RIP: 0033:0x7f6ebecc1b5a
[ 1847.787691] RSP: 002b:7ffc7bd1c958 EFLAGS: 0202 ORIG_RAX: 
00a5
[ 1847.788383] RAX: ffda RBX: 7f6ebefba63a RCX: 7f6ebecc1b5a
[ 1847.789106] RDX: 00bfd010 RSI: 00bfa230 RDI: 00bfa210
[ 1847.789807] RBP: 00bfa0f0 R08:  R09: 0014
[ 1847.790511] R10: c0ed R11: 0202 R12: 7f6ebf1ca83c
[ 

Re: [PATCH v2] iomap: report collisions between directio and buffered writes to userspace

2017-11-21 Thread Darrick J. Wong
On Wed, Nov 22, 2017 at 09:28:06AM +1100, Dave Chinner wrote:
> On Tue, Nov 21, 2017 at 04:52:53AM -0800, Matthew Wilcox wrote:
> > On Tue, Nov 21, 2017 at 05:48:15PM +1100, Dave Chinner wrote:
> > > On Mon, Nov 20, 2017 at 08:32:40PM -0800, Matthew Wilcox wrote:
> > > > On Mon, Nov 20, 2017 at 05:37:53PM -0800, Darrick J. Wong wrote:
> > > > > On Tue, Nov 21, 2017 at 09:27:49AM +1100, Dave Chinner wrote:
> > > > > > First thing I noticed was that "xa" as a prefix is already quite
> > > > > > widely used in XFS - it's shorthand for "XFS AIL". Indeed, xa_lock
> > > >
> > > > The X stands for 'eXpandable' or 'eXtending'.  I really don't want to
> > > > use more than a two-letter acronym for whatever the XArray ends up being
> > > > called.  One of the problems with the radix tree is that everything has
> > > > that 11-character 'radix_tree_' prefix; just replacing that with 'xa_'
> > > > makes a huge improvement to readability.
> > > 
> > > Yeah, understood. That's why
> > > we have very little clear
> > > prefix namespace left :/
> > > 
> > > [ somedays I write something that looks sorta like a haiku, and from
> > > that point on everything just starts falling out of my brain that
> > > way. I blame Eric for this today! :P ]
> > 
> > When the namespace is
> > tight we must consider the
> > competing users.
> > 
> > The earliest us'r
> > has a claim to a prefix
> > we are used to it.
> > 
> > Also a more wide-
> > spread user has a claim to
> > a shorter prefix.
> > 
> > Would you mind changing
> > your prefix to one only
> > one letter longer?
> 
> We can do something
> like that, though Darrick has the
> final say in it.

Keep this up and soon
I'll require patch changelogs
Written in Haiku. :P

(j/k)

Everyone in the US, have a happy Thanksgiving!

--D

> > ... ok, I give up ;-)
> 
> Top marks for effort :)
> 
> > All your current usage of the xa_ prefix looks somewhat like this:
> > 
> > fs/xfs/xfs_trans_ail.c: spin_lock(&ailp->xa_lock);
> > 
> > with honourable mentions to:
> > fs/xfs/xfs_log.c:   spin_lock(&mp->m_ail->xa_lock);
> > 
> > Would you mind if I bolt a patch on to the front of the series called
> > something like "free up xa_ namespace" that renamed your xa_* to ail_*?
> > There are no uses of the 'ail_' prefix in the kernel today.
> > 
> > I don't think that
> > spin_lock(&ailp->ail_lock);
> > loses any readability.
> 
> Not sure that's going to work - there's an "xa_ail" member for the
> AIL list head. That would now read in places:
> 
>   if (list_empty(&ailp->ail_ail))
> 
> I'd be inclined to just drop the "xa_" prefix from the XFS
> structure.  There is no information loss by removing the prefix in
> the XFS code because the pointer name tells us what structure it is
> pointing at.
> 
> > 
> > By the way, what does AIL stand for?  It'd be nice if it were spelled out
> > in at least one of the header files, maybe fs/xfs/xfs_trans_priv.h?
> 
> Active Item List. See the section "Tracking Changes" in
> Documentation/filesystems/xfs-delayed-logging-design.txt for the
> full rundown, but in short:
> 
> "The log item is already used to track the log items that have been
> written to the log but not yet written to disk. Such log items are
> considered "active" and as such are stored in the Active Item List
> (AIL) which is a LSN-ordered double linked list. Items are inserted
> into this list during log buffer IO completion, after which they are
> unpinned and can be written to disk."
> 
> The lack of comments describing the AIL is historic - it's never
> been documented in the code, nor has the whole relogging concept it
> implements. I wrote the document above when introducing the CIL
> (Committed Item List) because almost no-one actively working on XFS
> understood how the whole journalling subsystem worked in any detail.
> 
> > > Zoetrope Array.
> > > Labyrinth of illusion.
> > > Structure never ends.
> > 
> > Thank you for making me look up zoetrope ;-)
> 
> My pleasure :)
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> da...@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3] iomap: report collisions between directio and buffered writes to userspace

2017-11-21 Thread Darrick J. Wong
From: Darrick J. Wong <darrick.w...@oracle.com>

If two programs simultaneously try to write to the same part of a file
via direct IO and buffered IO, there's a chance that the post-diowrite
pagecache invalidation will fail on the dirty page.  When this happens,
the dio write succeeded, which means that the page cache is no longer
coherent with the disk!

Programs are not supposed to mix IO types and this is a clear case of
data corruption, so store an EIO which will be reflected to userspace
during the next fsync.  Replace the WARN_ON with a ratelimited pr_crit
so that the developers have /some/ kind of breadcrumb to track down the
offending program(s) and file(s) involved.

Signed-off-by: Darrick J. Wong <darrick.w...@oracle.com>
Reviewed-by: Liu Bo <bo.li@oracle.com>
---
v3: increase ratelimit period to 30s
v2: fix old and new dio paths
---
 fs/direct-io.c |   24 +++-
 fs/iomap.c |   12 ++--
 include/linux/fs.h |1 +
 3 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 98fe132..ef5d12a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -219,6 +219,27 @@ static inline struct page *dio_get_page(struct dio *dio,
return dio->pages[sdio->head];
 }
 
+/*
+ * Warn about a page cache invalidation failure during a direct io write.
+ */
+void dio_warn_stale_pagecache(struct file *filp)
+{
+   static DEFINE_RATELIMIT_STATE(_rs, 30 * HZ, DEFAULT_RATELIMIT_BURST);
+   char pathname[128];
+   struct inode *inode = file_inode(filp);
+   char *path;
+
+   errseq_set(&inode->i_mapping->wb_err, -EIO);
+   if (__ratelimit(&_rs)) {
+   path = file_path(filp, pathname, sizeof(pathname));
+   if (IS_ERR(path))
+   path = "(unknown)";
+   pr_crit("Page cache invalidation failure on direct I/O.  
Possible data corruption due to collision with buffered I/O!\n");
+   pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
+   current->comm);
+   }
+}
+
 /**
  * dio_complete() - called when all DIO BIO I/O has been completed
  * @offset: the byte offset in the file of the completed operation
@@ -290,7 +311,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, 
unsigned int flags)
err = invalidate_inode_pages2_range(dio->inode->i_mapping,
offset >> PAGE_SHIFT,
(offset + ret - 1) >> PAGE_SHIFT);
-   WARN_ON_ONCE(err);
+   if (err)
+   dio_warn_stale_pagecache(dio->iocb->ki_filp);
}
 
if (!(dio->flags & DIO_SKIP_DIO_COUNT))
diff --git a/fs/iomap.c b/fs/iomap.c
index 5011a96..028f329 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -753,7 +753,8 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
err = invalidate_inode_pages2_range(inode->i_mapping,
offset >> PAGE_SHIFT,
(offset + dio->size - 1) >> PAGE_SHIFT);
-   WARN_ON_ONCE(err);
+   if (err)
+   dio_warn_stale_pagecache(iocb->ki_filp);
}
 
inode_dio_end(file_inode(iocb->ki_filp));
@@ -1012,9 +1013,16 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret)
goto out_free_dio;
 
+   /*
+* Try to invalidate cache pages for the range we're direct
+* writing.  If this invalidation fails, tough, the write will
+* still work, but racing two incompatible write paths is a
+* pretty crazy thing to do, so we don't support it 100%.
+*/
ret = invalidate_inode_pages2_range(mapping,
start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-   WARN_ON_ONCE(ret);
+   if (ret)
+   dio_warn_stale_pagecache(iocb->ki_filp);
ret = 0;
 
if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2690864..0e5f060 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2976,6 +2976,7 @@ enum {
 };
 
 void dio_end_io(struct bio *bio);
+void dio_warn_stale_pagecache(struct file *filp);
 
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 struct block_device *bdev, struct iov_iter *iter,
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] iomap: report collisions between directio and buffered writes to userspace

2017-11-21 Thread Darrick J. Wong
On Tue, Nov 21, 2017 at 09:23:47AM -0800, Matthew Wilcox wrote:
> On Tue, Nov 21, 2017 at 09:27:49AM +1100, Dave Chinner wrote:
> > On Mon, Nov 20, 2017 at 01:51:00PM -0800, Matthew Wilcox wrote:
> > > If you want an example of it in use, I'm pretty happy with this patch
> > > that switches the brd driver entirely from the radix tree API to the
> > > xarray API:
> > > 
> > > http://git.infradead.org/users/willy/linux-dax.git/commitdiff/dbf96ae943e43563cbbaa26e21b656b6fe8f4b0f
> > 
> > Looks pretty neat, but I'll reserve judgement for when I see the
> > conversion of the XFS radix tree code
> 
> Challenge accepted.  This was a good thing for me to do because I found
> some opportunities to improve the XArray API.
> 
> Changes since yesterday:
> 
>  - Added XA_NO_TAG
>  - Added 'max' parameters to xa_get_entries() and xa_get_tagged()
>  - Changed the order of the arguments of xa_get_entries() and xa_get_tagged()
>to match the radix tree equivalents
> 
> You can see the patches on that (rebased) branch:
> 
> xfs: Convert mru cache to XArray
> xfs: Convert xfs dquot to XArray
> xfs: Convert pag_ici_root to XArray
> xfs: Convert m_perag_tree to XArray
> 
> or did you want me to send them by email?

Yes please.

--D

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] iomap: report collisions between directio and buffered writes to userspace

2017-11-20 Thread Darrick J. Wong
On Tue, Nov 21, 2017 at 09:27:49AM +1100, Dave Chinner wrote:
> On Mon, Nov 20, 2017 at 01:51:00PM -0800, Matthew Wilcox wrote:
> > On Tue, Nov 21, 2017 at 07:26:06AM +1100, Dave Chinner wrote:
> > > On Mon, Nov 20, 2017 at 08:18:29AM -0800, Matthew Wilcox wrote:
> > > > On Fri, Nov 17, 2017 at 11:39:25AM -0800, Darrick J. Wong wrote:
> > > > > If two programs simultaneously try to write to the same part of a file
> > > > > via direct IO and buffered IO, there's a chance that the post-diowrite
> > > > > pagecache invalidation will fail on the dirty page.  When this happens,
> > > > > the dio write succeeded, which means that the page cache is no longer
> > > > > coherent with the disk!
> > > > 
> > > > This seems like a good opportunity to talk about what I've been working
> > > > on for solving this problem.  The XArray is going to introduce a set
> > > > of entries which can be stored to locations in the page cache that I'm
> > > > calling 'wait entries'.
> > > 
> > > What's this XArray thing you speak of?
> > 
> > Ah, right, you were on sabbatical at LSFMM this year where I talked
> > about it.  Briefly, it's a new API for the radix tree.  The data structure
> > is essentially unchanged (minor enhancements), but I'm rationalising
> > existing functionality and adding new abilities.  And getting rid of
> > misfeatures like the preload API and implicit GFP flags.
> > 
> > My current working tree is here:
> > 
> > http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/xarray-2017-11-20
> 
> First thing I noticed was that "xa" as a prefix is already quite
> widely used in XFS - it's shorthand for "XFS AIL". Indeed, xa_lock
> already exists and is quite widely used, so having a generic
> interface using the same prefixes and lock names is going to be
> quite confusing in the XFS code. Especially considering there's
> fair bit of radix tree use in XFS (e.g. the internal inode and
> dquot caches).
> 
> FYI, from fs/xfs/xfs_trans_priv.h:
> 
> /*
>  * Private AIL structures.
>  *
>  * Eventually we need to drive the locking in here as well.
>  */
> struct xfs_ail {
> struct xfs_mount*xa_mount;
> struct task_struct  *xa_task;
> struct list_headxa_ail;
> xfs_lsn_t   xa_target;
> xfs_lsn_t   xa_target_prev;
> struct list_headxa_cursors;
> spinlock_t  xa_lock;
> xfs_lsn_t   xa_last_pushed_lsn;
> int xa_log_flush;
> struct list_headxa_buf_list;
> wait_queue_head_t   xa_empty;
> };
> 
> 
> > Ignoring the prep patches, the excitement is all to be found with the
> > commits which start 'xarray:'
> 
> FWIW, why is it named "XArray"?  "X" stands for what?  It still
> looks like a tree structure to me, but without a design doc I'm a
> bit lost to how it differs to the radix tree (apart from the API)
> and why it's considered an "array".

/me nominates 'xarr' for the prefix because pirates. :P

--D

> > If you want an example of it in use, I'm pretty happy with this patch
> > that switches the brd driver entirely from the radix tree API to the
> > xarray API:
> > 
> > http://git.infradead.org/users/willy/linux-dax.git/commitdiff/dbf96ae943e43563cbbaa26e21b656b6fe8f4b0f
> 
> Looks pretty neat, but I'll reserve judgement for when I see the
> conversion of the XFS radix tree code
> 
> > I've been pretty liberal with the kernel-doc, but I haven't written out
> > a good .rst file to give an overview of how to use it.
> 
> Let me know when you've written it :)
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> da...@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] iomap: report collisions between directio and buffered writes to userspace

2017-11-17 Thread Darrick J. Wong
From: Darrick J. Wong <darrick.w...@oracle.com>

If two programs simultaneously try to write to the same part of a file
via direct IO and buffered IO, there's a chance that the post-diowrite
pagecache invalidation will fail on the dirty page.  When this happens,
the dio write succeeded, which means that the page cache is no longer
coherent with the disk!

Programs are not supposed to mix IO types and this is a clear case of
data corruption, so store an EIO which will be reflected to userspace
during the next fsync.  Replace the WARN_ON with a ratelimited pr_crit
so that the developers have /some/ kind of breadcrumb to track down the
offending program(s) and file(s) involved.

Signed-off-by: Darrick J. Wong <darrick.w...@oracle.com>
---
 fs/direct-io.c |   24 +++-
 fs/iomap.c |   12 ++--
 include/linux/fs.h |1 +
 3 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 98fe132..ef5d12a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -219,6 +219,27 @@ static inline struct page *dio_get_page(struct dio *dio,
return dio->pages[sdio->head];
 }
 
+/*
+ * Warn about a page cache invalidation failure during a direct io write.
+ */
+void dio_warn_stale_pagecache(struct file *filp)
+{
+   static DEFINE_RATELIMIT_STATE(_rs, 30 * HZ, DEFAULT_RATELIMIT_BURST);
+   char pathname[128];
+   struct inode *inode = file_inode(filp);
+   char *path;
+
+   errseq_set(&inode->i_mapping->wb_err, -EIO);
+   if (__ratelimit(&_rs)) {
+   path = file_path(filp, pathname, sizeof(pathname));
+   if (IS_ERR(path))
+   path = "(unknown)";
+   pr_crit("Page cache invalidation failure on direct I/O.  
Possible data corruption due to collision with buffered I/O!\n");
+   pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
+   current->comm);
+   }
+}
+
 /**
  * dio_complete() - called when all DIO BIO I/O has been completed
  * @offset: the byte offset in the file of the completed operation
@@ -290,7 +311,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, 
unsigned int flags)
err = invalidate_inode_pages2_range(dio->inode->i_mapping,
offset >> PAGE_SHIFT,
(offset + ret - 1) >> PAGE_SHIFT);
-   WARN_ON_ONCE(err);
+   if (err)
+   dio_warn_stale_pagecache(dio->iocb->ki_filp);
}
 
if (!(dio->flags & DIO_SKIP_DIO_COUNT))
diff --git a/fs/iomap.c b/fs/iomap.c
index 5011a96..028f329 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -753,7 +753,8 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
err = invalidate_inode_pages2_range(inode->i_mapping,
offset >> PAGE_SHIFT,
(offset + dio->size - 1) >> PAGE_SHIFT);
-   WARN_ON_ONCE(err);
+   if (err)
+   dio_warn_stale_pagecache(iocb->ki_filp);
}
 
inode_dio_end(file_inode(iocb->ki_filp));
@@ -1012,9 +1013,16 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret)
goto out_free_dio;
 
+   /*
+* Try to invalidate cache pages for the range we're direct
+* writing.  If this invalidation fails, tough, the write will
+* still work, but racing two incompatible write paths is a
+* pretty crazy thing to do, so we don't support it 100%.
+*/
ret = invalidate_inode_pages2_range(mapping,
start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-   WARN_ON_ONCE(ret);
+   if (ret)
+   dio_warn_stale_pagecache(iocb->ki_filp);
ret = 0;
 
if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2690864..0e5f060 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2976,6 +2976,7 @@ enum {
 };
 
 void dio_end_io(struct bio *bio);
+void dio_warn_stale_pagecache(struct file *filp);
 
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 struct block_device *bdev, struct iov_iter *iter,
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2

2017-09-21 Thread Darrick J. Wong
On Thu, Sep 21, 2017 at 04:16:35PM -0400, Zygo Blaxell wrote:
> On Thu, Sep 21, 2017 at 12:59:42PM -0700, Darrick J. Wong wrote:
> > On Thu, Sep 21, 2017 at 12:10:15AM -0400, Zygo Blaxell wrote:
> > > Now that check_extent_in_eb()'s extent offset filter can be turned off,
> > > we need a way to do it from userspace.
> > > 
> > > Add a 'flags' field to the btrfs_logical_ino_args structure to disable
> > > extent
> > > offset filtering, taking the place of one of the reserved[] fields.
> > > 
> > > Previous versions of LOGICAL_INO neglected to check whether any of the
> > > reserved fields have non-zero values.  Assigning meaning to those fields
> > > now may change the behavior of existing programs that left these fields
> > > uninitialized.
> > > 
> > > To avoid any surprises, define a new ioctl LOGICAL_INO_V2 which uses
> > > the same argument layout as LOGICAL_INO, but uses one of the reserved
> > > fields for flags.  The V2 ioctl explicitly checks that unsupported flag
> > > bits are zero so that userspace can probe for future feature bits as
> > > they are defined.  If the other reserved fields are used in the future,
> > > one of the remaining flag bits could specify that the other reserved
> > > fields are valid, so we don't need to check those for now.
> > > 
> > > Since the memory layouts and behavior of the two ioctls' arguments
> > > are almost identical, there is no need for a separate function for
> > > logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search).
> > > A version parameter and an 'if' statement will suffice.
> > > 
> > > Now that we have a flags field in logical_ino_args, add a flag
> > > BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
> > > and pass it down the stack to iterate_inodes_from_logical.
> > > 
> > > Signed-off-by: Zygo Blaxell <ce3g8...@umail.furryterror.org>
> > > ---
> > >  fs/btrfs/ioctl.c   | 21 ++---
> > >  include/uapi/linux/btrfs.h |  8 +++-
> > >  2 files changed, 25 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> > > index b7de32568082..2bc3a9588d1d 100644
> > > --- a/fs/btrfs/ioctl.c
> > > +++ b/fs/btrfs/ioctl.c
> > > @@ -4536,13 +4536,14 @@ static int build_ino_list(u64 inum, u64 offset, 
> > > u64 root, void *ctx)
> > >  }
> > >  
> > >  static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
> > > - void __user *arg)
> > > + void __user *arg, int version)
> > >  {
> > >   int ret = 0;
> > >   int size;
> > >   struct btrfs_ioctl_logical_ino_args *loi;
> > >   struct btrfs_data_container *inodes = NULL;
> > >   struct btrfs_path *path = NULL;
> > > + bool ignore_offset;
> > >  
> > >   if (!capable(CAP_SYS_ADMIN))
> > >   return -EPERM;
> > > @@ -4551,6 +4552,17 @@ static long btrfs_ioctl_logical_to_ino(struct 
> > > btrfs_fs_info *fs_info,
> > >   if (IS_ERR(loi))
> > >   return PTR_ERR(loi);
> > >  
> > > + if (version == 1) {
> > > + ignore_offset = false;
> > > + } else {
> > > + /* Only accept flags we have defined so far */
> > > + if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
> > > + ret = -EINVAL;
> > > + goto out_loi;
> > > + }
> > > + ignore_offset = loi->flags & 
> > > BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
> > 
> > Please check loi->reserved[3] for zeroness so that the next person who
> > wants to add a field to btrfs_ioctl_logical_ino_args doesn't have to
> > create LOGICAL_INO_V3 for the same reason you're creating V2.
> 
> OK now I'm confused, in several distinct ways.
> 
> I wonder if you meant reserved[1] and reserved[2] there, since I'm not
> checking them (for reasons stated in the commit log--we can use flags
> to indicate whether and what values are present there).

You can do that, though that means you have to burn flag bits to light
up the remaining reserved area, which means you can't in the future
decide that a non-zero field value will turn on some new feature.  You
retain the ability to use flag bits to turn on the new field, if it's
the case that zero has a meaning.

> But that's not the bigger problem.  Maybe you did mean reserved

Re: [PATCH 2/3] btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2

2017-09-21 Thread Darrick J. Wong
On Thu, Sep 21, 2017 at 12:10:15AM -0400, Zygo Blaxell wrote:
> Now that check_extent_in_eb()'s extent offset filter can be turned off,
> we need a way to do it from userspace.
> 
> Add a 'flags' field to the btrfs_logical_ino_args structure to disable extent
> offset filtering, taking the place of one of the reserved[] fields.
> 
> Previous versions of LOGICAL_INO neglected to check whether any of the
> reserved fields have non-zero values.  Assigning meaning to those fields
> now may change the behavior of existing programs that left these fields
> uninitialized.
> 
> To avoid any surprises, define a new ioctl LOGICAL_INO_V2 which uses
> the same argument layout as LOGICAL_INO, but uses one of the reserved
> fields for flags.  The V2 ioctl explicitly checks that unsupported flag
> bits are zero so that userspace can probe for future feature bits as
> they are defined.  If the other reserved fields are used in the future,
> one of the remaining flag bits could specify that the other reserved
> fields are valid, so we don't need to check those for now.
> 
> Since the memory layouts and behavior of the two ioctls' arguments
> are almost identical, there is no need for a separate function for
> logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search).
> A version parameter and an 'if' statement will suffice.
> 
> Now that we have a flags field in logical_ino_args, add a flag
> BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
> and pass it down the stack to iterate_inodes_from_logical.
> 
> Signed-off-by: Zygo Blaxell 
> ---
>  fs/btrfs/ioctl.c   | 21 ++---
>  include/uapi/linux/btrfs.h |  8 +++-
>  2 files changed, 25 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index b7de32568082..2bc3a9588d1d 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -4536,13 +4536,14 @@ static int build_ino_list(u64 inum, u64 offset, u64 
> root, void *ctx)
>  }
>  
>  static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
> - void __user *arg)
> + void __user *arg, int version)
>  {
>   int ret = 0;
>   int size;
>   struct btrfs_ioctl_logical_ino_args *loi;
>   struct btrfs_data_container *inodes = NULL;
>   struct btrfs_path *path = NULL;
> + bool ignore_offset;
>  
>   if (!capable(CAP_SYS_ADMIN))
>   return -EPERM;
> @@ -4551,6 +4552,17 @@ static long btrfs_ioctl_logical_to_ino(struct 
> btrfs_fs_info *fs_info,
>   if (IS_ERR(loi))
>   return PTR_ERR(loi);
>  
> + if (version == 1) {
> + ignore_offset = false;
> + } else {
> + /* Only accept flags we have defined so far */
> + if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
> + ret = -EINVAL;
> + goto out_loi;
> + }
> + ignore_offset = loi->flags & 
> BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;

Please check loi->reserved[3] for zeroness so that the next person who
wants to add a field to btrfs_ioctl_logical_ino_args doesn't have to
create LOGICAL_INO_V3 for the same reason you're creating V2.

--D

> + }
> +
>   path = btrfs_alloc_path();
>   if (!path) {
>   ret = -ENOMEM;
> @@ -4566,7 +4578,7 @@ static long btrfs_ioctl_logical_to_ino(struct 
> btrfs_fs_info *fs_info,
>   }
>  
>   ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
> -   build_ino_list, inodes, false);
> +   build_ino_list, inodes, 
> ignore_offset);
>   if (ret == -EINVAL)
>   ret = -ENOENT;
>   if (ret < 0)
> @@ -4580,6 +4592,7 @@ static long btrfs_ioctl_logical_to_ino(struct 
> btrfs_fs_info *fs_info,
>  out:
>   btrfs_free_path(path);
>   kvfree(inodes);
> +out_loi:
>   kfree(loi);
>  
>   return ret;
> @@ -5550,7 +5563,9 @@ long btrfs_ioctl(struct file *file, unsigned int
>   case BTRFS_IOC_INO_PATHS:
>   return btrfs_ioctl_ino_to_path(root, argp);
>   case BTRFS_IOC_LOGICAL_INO:
> - return btrfs_ioctl_logical_to_ino(fs_info, argp);
> + return btrfs_ioctl_logical_to_ino(fs_info, argp, 1);
> + case BTRFS_IOC_LOGICAL_INO_V2:
> + return btrfs_ioctl_logical_to_ino(fs_info, argp, 2);
>   case BTRFS_IOC_SPACE_INFO:
>   return btrfs_ioctl_space_info(fs_info, argp);
>   case BTRFS_IOC_SYNC: {
> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> index 378230c163d5..0b3de597e04f 100644
> --- a/include/uapi/linux/btrfs.h
> +++ b/include/uapi/linux/btrfs.h
> @@ -608,10 +608,14 @@ struct btrfs_ioctl_ino_path_args {
>  struct btrfs_ioctl_logical_ino_args {
>   __u64   logical;/* in */
>   __u64   size;   /* in 

Re: [PATCH] btrfs: copy fsid to super_block s_uuid

2017-08-02 Thread Darrick J. Wong
On Wed, Aug 02, 2017 at 02:02:11PM +0800, Anand Jain wrote:
> 
> Hi Darrick,
> 
>  Thanks for commenting..
> 
> >>+   memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE);
> >
> >uuid_copy()?
> 
>   It requires a larger migration to use uuid_t, IMO it can be done all
>   together, in a separate patch ?
> 
>   Just as an experiment, starting with struct btrfs_fs_info.fsid and
>   to check its footprint, I just renamed fsid to fs_id, and compiled.
>   It reports 73 "has no member named ‘fsid’" errors.
>   So it looks like redefining u8 fsid[] to uuid_t fsid and further updating
>   all its users has to be simplified. Any suggestions ?

Coccinelle script?

 It was a fairly simple transition for xfs and others, though
from a simple grep it looks like btrfs uses open coded u8 arrays in a
few more places.

--D

> 
> Thanks, Anand
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: copy fsid to super_block s_uuid

2017-08-01 Thread Darrick J. Wong
On Tue, Aug 01, 2017 at 06:35:08PM +0800, Anand Jain wrote:
> We didn't copy fsid to struct super_block.s_uuid so Overlay disables
> index feature with btrfs as the lower FS.
> 
> kernel: overlayfs: fs on '/lower' does not support file handles, falling back 
> to index=off.
> 
> Fix this by publishing the fsid through struct super_block.s_uuid.
> 
> Signed-off-by: Anand Jain 
> ---
> I tried to know if in case did we deliberately missed this for some reason,
> however there is no information on that. If we mount a non-default subvol in
> the next mount/remount, its still the same FS, so publishing the FSID
> instead of subvol uuid is correct, OR I can't think any other reason for
> not using s_uuid for btrfs.
> 
> 
>  fs/btrfs/disk-io.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 080e2ebb8aa0..b7e72d040442 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2899,6 +2899,7 @@ int open_ctree(struct super_block *sb,
>  
>   sb->s_blocksize = sectorsize;
>   sb->s_blocksize_bits = blksize_bits(sectorsize);
> + memcpy(>s_uuid, fs_info->fsid, BTRFS_FSID_SIZE);

uuid_copy()?

--D

>  
>   mutex_lock(_info->chunk_mutex);
>   ret = btrfs_read_sys_array(fs_info);
> -- 
> 2.13.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 17/18] xfs: minimal conversion to errseq_t writeback error reporting

2017-06-29 Thread Darrick J. Wong
On Thu, Jun 29, 2017 at 09:19:53AM -0400, jlay...@kernel.org wrote:
> From: Jeff Layton <jlay...@redhat.com>
> 
> Just check and advance the data errseq_t in struct file before
> returning from fsync on normal files. Internal filemap_*
> callers are left as-is.
> 
> Signed-off-by: Jeff Layton <jlay...@redhat.com>

Looks ok,
Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>

--D

> ---
>  fs/xfs/xfs_file.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 5fb5a0958a14..6600b264b0b6 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -140,7 +140,7 @@ xfs_file_fsync(
>  
>   trace_xfs_file_fsync(ip);
>  
> - error = filemap_write_and_wait_range(inode->i_mapping, start, end);
> + error = file_write_and_wait_range(file, start, end);
>   if (error)
>   return error;
>  
> -- 
> 2.13.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 12/18] Documentation: flesh out the section in vfs.txt on storing and reporting writeback errors

2017-06-29 Thread Darrick J. Wong
On Thu, Jun 29, 2017 at 09:19:48AM -0400, jlay...@kernel.org wrote:
> From: Jeff Layton 
> 
> Let's try to make this extra clear for fs authors.
> 
> Cc: Jan Kara 
> Signed-off-by: Jeff Layton 
> ---
>  Documentation/filesystems/vfs.txt | 43 
> ---
>  1 file changed, 40 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/filesystems/vfs.txt 
> b/Documentation/filesystems/vfs.txt
> index f42b90687d40..1366043b3942 100644
> --- a/Documentation/filesystems/vfs.txt
> +++ b/Documentation/filesystems/vfs.txt
> @@ -576,7 +576,42 @@ should clear PG_Dirty and set PG_Writeback.  It can be 
> actually
>  written at any point after PG_Dirty is clear.  Once it is known to be
>  safe, PG_Writeback is cleared.
>  
> -Writeback makes use of a writeback_control structure...
> +Writeback makes use of a writeback_control structure to direct the
> +operations.  This gives the writepage and writepages operations some
> +information about the nature of and reason for the writeback request,
> +and the constraints under which it is being done.  It is also used to
> +return information back to the caller about the result of a writepage or
> +writepages request.
> +
> +Handling errors during writeback
> +
> +Most applications that utilize the pagecache will periodically call
> +fsync to ensure that data written has made it to the backing store.

/me wonders if this sentence ought to be worded more strongly, e.g.

"Applications that utilize the pagecache must call a data
synchronization syscall such as fsync, fdatasync, or msync to ensure
that data written has made it to the backing store."

I'm also wondering -- fdatasync and msync will also report any writeback
errors that have happened anywhere (like fsync), since they all map to
vfs_fsync_range, correct?  If so, I think it's worth stating
explicitly that the other *sync methods behave the same as fsync w.r.t.
writeback error reporting.

--D

> +When there is an error during writeback, they expect that error to be
> +reported when fsync is called.  After an error has been reported on one
> +fsync, subsequent fsync calls on the same file descriptor should return
> +0, unless further writeback errors have occurred since the previous
> +fsync.
> +
> +Ideally, the kernel would report an error only on file descriptions on
> +which writes were done that subsequently failed to be written back.  The
> +generic pagecache infrastructure does not track the file descriptions
> +that have dirtied each individual page however, so determining which
> +file descriptors should get back an error is not possible.
> +
> +Instead, the generic writeback error tracking infrastructure in the
> +kernel settles for reporting errors to fsync on all file descriptions
> +that were open at the time that the error occurred.  In a situation with
> +multiple writers, all of them will get back an error on a subsequent fsync,
> +even if all of the writes done through that particular file descriptor
> +succeeded (or even if there were no writes on that file descriptor at all).
> +
> +Filesystems that wish to use this infrastructure should call
> +mapping_set_error to record the error in the address_space when it
> +occurs.  Then, at the end of their fsync operation, they should call
> +file_check_and_advance_wb_err to ensure that the struct file's error
> +cursor has advanced to the correct point in the stream of errors emitted
> +by the backing device(s).
>  
>  struct address_space_operations
>  ---
> @@ -804,7 +839,8 @@ struct address_space_operations {
>  The File Object
>  ===
>  
> -A file object represents a file opened by a process.
> +A file object represents a file opened by a process. This is also known
> +as an "open file description" in POSIX parlance.
>  
>  
>  struct file_operations
> @@ -887,7 +923,8 @@ otherwise noted.
>  
>release: called when the last reference to an open file is closed
>  
> -  fsync: called by the fsync(2) system call
> +  fsync: called by the fsync(2) system call. Also see the section above
> +  entitled "Handling errors during writeback".
>  
>fasync: called by the fcntl(2) system call when asynchronous
>   (non-blocking) mode is enabled for a file
> -- 
> 2.13.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: generic/015 run failed with btrfs

2017-06-27 Thread Darrick J. Wong
[cc linux-btrfs, drop linux-xfs cc]
On Tue, Jun 27, 2017 at 07:42:16AM +, Dai, XiangX wrote:
> Hi! I test on my host and find this issue, i want to know how can fix it?
> 
> root@localhost ~/xfstests# ./check generic/015
> FSTYP -- btrfs

   ^ please contact the btrfs list, not the xfs list.

--D

> PLATFORM  -- Linux/x86_64 localhost 4.12.0-rc6
> MKFS_OPTIONS  -- /dev/sda6
> MOUNT_OPTIONS -- /dev/sda6 /fs/scratch
> 
> generic/015 - output mismatch (see 
> ~/xfstests/results//generic/015.out.bad)
> --- tests/generic/015.out2017-05-31 18:15:23.0 +0800
> +++ ~/xfstests/results//generic/015.out.bad2017-06-26 
> 14:41:53.265215409 +0800
> @@ -1,7 +1,778 @@
> +++ basename ./tests/generic/015
> ++ seq=015
> ++ seqres=~/xfstests/results//generic/015
> ++ echo 'QA output created by 015'
>  QA output created by 015
> +++ pwd
> ++ here=~/xfstests
> ...
> (Run 'diff -u tests/generic/015.out 
> ~/xfstests/results//generic/015.out.bad'  to see the entire diff)
> Ran: generic/015
> Failures: generic/015
> Failed 1 of 1 tests
> 
> The output is like below:
> QA output created by 015
> fill disk:
>!!! disk full (expected)
> check free space:
> delete fill:
> check free space:
>!!! free space has value of 17348 <===
> free space is NOT in range 45591.48 .. 46512.52
> 
> 
> I copy the cmd and write to log:
> 
> root@localhost ~/xfstests# cat log
> _df_dir /fs/scratch | /usr/bin/awk '{ print  }'
> Filesystem Type  1024-blocks  Used Available Capacity Mounted on
> /dev/sda6  btrfs   51200 46144 17348  73% /fs/scratch   <===
> 17348
> 
> 
> Does it means i need to test on a larger machine?
> 
> Thanks
> Dai Xiang
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 21/22] xfs: minimal conversion to errseq_t writeback error reporting

2017-06-26 Thread Darrick J. Wong
On Mon, Jun 26, 2017 at 01:58:32PM -0400, jlay...@redhat.com wrote:
> On Mon, 2017-06-26 at 08:22 -0700, Darrick J. Wong wrote:
> > On Fri, Jun 16, 2017 at 03:34:26PM -0400, Jeff Layton wrote:
> > > Just check and advance the data errseq_t in struct file before
> > > returning from fsync on normal files. Internal filemap_*
> > > callers are left as-is.
> > > 
> > > Signed-off-by: Jeff Layton <jlay...@redhat.com>
> > > ---
> > >  fs/xfs/xfs_file.c | 15 +++
> > >  1 file changed, 11 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > > index 5fb5a0958a14..bc3b1575e8db 100644
> > > --- a/fs/xfs/xfs_file.c
> > > +++ b/fs/xfs/xfs_file.c
> > > @@ -134,7 +134,7 @@ xfs_file_fsync(
> > >   struct inode*inode = file->f_mapping-
> > > >host;
> > >   struct xfs_inode*ip = XFS_I(inode);
> > >   struct xfs_mount*mp = ip->i_mount;
> > > - int error = 0;
> > > + int error = 0, err2;
> > >   int log_flushed = 0;
> > >   xfs_lsn_t   lsn = 0;
> > >  
> > > @@ -142,10 +142,12 @@ xfs_file_fsync(
> > >  
> > >   error = filemap_write_and_wait_range(inode->i_mapping,
> > > start, end);
> > >   if (error)
> > > - return error;
> > > + goto out;
> > >  
> > > - if (XFS_FORCED_SHUTDOWN(mp))
> > > - return -EIO;
> > > + if (XFS_FORCED_SHUTDOWN(mp)) {
> > > + error = -EIO;
> > > + goto out;
> > > + }
> > >  
> > >   xfs_iflags_clear(ip, XFS_ITRUNCATED);
> > >  
> > > @@ -197,6 +199,11 @@ xfs_file_fsync(
> > >   mp->m_logdev_targp == mp->m_ddev_targp)
> > >   xfs_blkdev_issue_flush(mp->m_ddev_targp);
> > >  
> > > +out:
> > > + err2 = filemap_report_wb_err(file);
> > 
> > Could we have a comment here to remind anyone reading the code a year
> > from now that filemap_report_wb_err has side effects?  Pre-coffee me
> > was
> > wondering why we'd bother calling filemap_report_wb_err in the
> > XFS_FORCED_SHUTDOWN case, then remembered that it touches data
> > structures.
> > 
> > The first sentence of the commit message (really, the word 'advance')
> > added as a comment was adequate to remind me of the side effects.
> > 
> > Once that's added,
> > Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>
> > 
> > --D
> > 
> 
> Yeah, definitely. I'm working on a respin of the series now to
> incorporate HCH's suggestion too. I'll add that in as well.
> 
> Maybe I should rename that function to file_check_and_advance_wb_err()
> ? It would be good to make it clear that it does advance the errseq_t
> cursor.

Seems like a good idea.

--D

> 
> > > + if (!error)
> > > + error = err2;
> > > +
> > >   return error;
> > >  }
> > >  
> > > -- 
> > > 2.13.0
> > > 
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-
> > > xfs" in
> > > the body of a message to majord...@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-
> > btrfs" in
> > the body of a message to majord...@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 21/22] xfs: minimal conversion to errseq_t writeback error reporting

2017-06-26 Thread Darrick J. Wong
On Fri, Jun 16, 2017 at 03:34:26PM -0400, Jeff Layton wrote:
> Just check and advance the data errseq_t in struct file before
> returning from fsync on normal files. Internal filemap_*
> callers are left as-is.
> 
> Signed-off-by: Jeff Layton <jlay...@redhat.com>
> ---
>  fs/xfs/xfs_file.c | 15 +++
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 5fb5a0958a14..bc3b1575e8db 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -134,7 +134,7 @@ xfs_file_fsync(
>   struct inode*inode = file->f_mapping->host;
>   struct xfs_inode*ip = XFS_I(inode);
>   struct xfs_mount*mp = ip->i_mount;
> - int error = 0;
> + int error = 0, err2;
>   int log_flushed = 0;
>   xfs_lsn_t   lsn = 0;
>  
> @@ -142,10 +142,12 @@ xfs_file_fsync(
>  
>   error = filemap_write_and_wait_range(inode->i_mapping, start, end);
>   if (error)
> - return error;
> + goto out;
>  
> - if (XFS_FORCED_SHUTDOWN(mp))
> - return -EIO;
> + if (XFS_FORCED_SHUTDOWN(mp)) {
> + error = -EIO;
> + goto out;
> + }
>  
>   xfs_iflags_clear(ip, XFS_ITRUNCATED);
>  
> @@ -197,6 +199,11 @@ xfs_file_fsync(
>   mp->m_logdev_targp == mp->m_ddev_targp)
>   xfs_blkdev_issue_flush(mp->m_ddev_targp);
>  
> +out:
> + err2 = filemap_report_wb_err(file);

Could we have a comment here to remind anyone reading the code a year
from now that filemap_report_wb_err has side effects?  Pre-coffee me was
wondering why we'd bother calling filemap_report_wb_err in the
XFS_FORCED_SHUTDOWN case, then remembered that it touches data
structures.

The first sentence of the commit message (really, the word 'advance')
added as a comment was adequate to remind me of the side effects.

Once that's added,
Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>

--D

> + if (!error)
> + error = err2;
> +
>   return error;
>  }
>  
> -- 
> 2.13.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 19/20] xfs: minimal conversion to errseq_t writeback error reporting

2017-06-12 Thread Darrick J. Wong
On Mon, Jun 12, 2017 at 08:23:15AM -0400, Jeff Layton wrote:
> Just set the FS_WB_ERRSEQ flag to indicate that we want to use errseq_t
> based error reporting. Internal filemap_* calls are left as-is for now.
> 
> Signed-off-by: Jeff Layton 
> ---
>  fs/xfs/xfs_super.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 455a575f101d..28d3be187025 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1758,7 +1758,7 @@ static struct file_system_type xfs_fs_type = {
>   .name   = "xfs",
>   .mount  = xfs_fs_mount,
>   .kill_sb= kill_block_super,
> - .fs_flags   = FS_REQUIRES_DEV,
> + .fs_flags   = FS_REQUIRES_DEV | FS_WB_ERRSEQ,

Huh?  Why are there two patches with the same subject line?  And this
same bit of code too?  Or ... 11/13, 11/20?  What's going on here?



--D

>  };
>  MODULE_ALIAS_FS("xfs");
>  
> -- 
> 2.13.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: crypto: Work around deallocated stack frame reference gcc bug on sparc.

2017-06-02 Thread Darrick J. Wong
[add ext4 list to cc]

On Fri, Jun 02, 2017 at 11:28:54AM -0400, David Miller wrote:
> 
> On sparc, if we have an alloca() like situation, as is the case with
> SHASH_DESC_ON_STACK(), we can end up referencing deallocated stack
> memory.  The result can be that the value is clobbered if a trap
> or interrupt arrives at just the right instruction.
> 
> It only occurs if the function ends returning a value from that
> alloca() area and that value can be placed into the return value
> register using a single instruction.
> 
> For example, in lib/libcrc32c.c:crc32c() we end up with a return
> sequence like:
> 
> return  %i7+8
>  lduw   [%o5+16], %o0   ! MEM[(u32 *)__shash_desc.1_10 + 16B],
> 
> %o5 holds the base of the on-stack area allocated for the shash
> descriptor.  But the return released the stack frame and the
> register window.
> 
> So if an interrupt arrives between 'return' and 'lduw', then
> the value read at %o5+16 can be corrupted.
> 
> Add a data compiler barrier to work around this problem.  This is
> exactly what the gcc fix will end up doing as well, and it absolutely
> should not change the code generated for other cpus (unless gcc
> on them has the same bug :-)
> 
> With crucial insight from Eric Sandeen.
> 
> Reported-by: Anatoly Pugachev 
> Signed-off-by: David S. Miller 
> ---
> 
> See the thread anchored at:
> 
>   http://marc.info/?l=linux-sparc&m=149623182616944&w=2
> 
> for discussion, it has a reproducer module.  The problem was
> first noticed as occasional XFS checksum corruptions.
> 
> Herbert, I don't expect you to like this but it is the best we can do
> I think.  It should not pessimize code on other architectures at all.
> I will work on fixing the gcc bug but it's been around forever and all
> versions are affected.
> 
> I noticed while working on this that at least btrfs duplicates the
> facilities provided by lib/libcrc32c.c and therefore should probably
> be converted over to straight crc32c() calls if possible.

ext4/jbd2's crc32c implementations will also need a fix like this for
{ext4,jbd2}_chksum.  Note that both of these modules call the crypto api
directly to avoid a static dependence on libcrc32c; this was done to
reduce kernel footprint for applications that don't need it.  (ext2,
ext3, and ext4 before the metadata_csum feature existed).

--D

> 
> Thanks!
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
> index ecdba2f..1ac5b85 100644
> --- a/drivers/infiniband/sw/rxe/rxe.h
> +++ b/drivers/infiniband/sw/rxe/rxe.h
> @@ -68,6 +68,7 @@
>  static inline u32 rxe_crc32(struct rxe_dev *rxe,
>   u32 crc, void *next, size_t len)
>  {
> + u32 retval;
>   int err;
>  
>   SHASH_DESC_ON_STACK(shash, rxe->tfm);
> @@ -81,7 +82,9 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe,
>   return crc32_le(crc, next, len);
>   }
>  
> - return *(u32 *)shash_desc_ctx(shash);
> + retval = *(u32 *)shash_desc_ctx(shash);
> + barrier_data(shash_desc_ctx(shash));
> + return retval;
>  }
>  
>  int rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
> diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
> index a97fdc1..baacc18 100644
> --- a/fs/btrfs/hash.c
> +++ b/fs/btrfs/hash.c
> @@ -38,6 +38,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int 
> length)
>  {
>   SHASH_DESC_ON_STACK(shash, tfm);
>   u32 *ctx = (u32 *)shash_desc_ctx(shash);
> + u32 retval;
>   int err;
>  
>   shash->tfm = tfm;
> @@ -47,5 +48,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int 
> length)
>   err = crypto_shash_update(shash, address, length);
>   BUG_ON(err);
>  
> - return *ctx;
> + retval = *ctx;
> + barrier_data(ctx);
> + return retval;
>  }
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 2185c7a..fd2e651 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -1078,6 +1078,7 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, 
> const void *address,
>  {
>   SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver);
>   u32 *ctx = (u32 *)shash_desc_ctx(shash);
> + u32 retval;
>   int err;
>  
>   shash->tfm = sbi->s_chksum_driver;
> @@ -1087,7 +1088,9 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, 
> const void *address,
>   err = crypto_shash_update(shash, address, length);
>   BUG_ON(err);
>  
> - return *ctx;
> + retval = *ctx;
> + barrier_data(ctx);
> + return retval;
>  }
>  
>  static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
> diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
> index 74a54b7..9f79547 100644
> --- a/lib/libcrc32c.c
> +++ b/lib/libcrc32c.c
> @@ -43,7 +43,7 @@ static struct crypto_shash *tfm;
>  u32 crc32c(u32 crc, const void *address, unsigned int length)
>  {
>   SHASH_DESC_ON_STACK(shash, tfm);
> - u32 *ctx = (u32 *)shash_desc_ctx(shash);
> + u32 ret, *ctx 

Re: [PATCH 09/10] xfs: nowait aio support

2017-05-24 Thread Darrick J. Wong
On Wed, May 24, 2017 at 11:41:49AM -0500, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgold...@suse.com>
> 
> If IOCB_NOWAIT is set, bail if the i_rwsem is not lockable
> immediately.
> 
> IF IOMAP_NOWAIT is set, return EAGAIN in xfs_file_iomap_begin
> if it needs allocation either due to file extension, writing to a hole,
> or COW or waiting for other DIOs to finish.
> 
> Signed-off-by: Goldwyn Rodrigues <rgold...@suse.com>
> Reviewed-by: Christoph Hellwig <h...@lst.de>

Looks good,
Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>

--D

> ---
>  fs/xfs/xfs_file.c  | 19 ++-
>  fs/xfs/xfs_iomap.c | 17 +
>  2 files changed, 31 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 35703a801372..b307940e7d56 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -541,8 +541,11 @@ xfs_file_dio_aio_write(
>   iolock = XFS_IOLOCK_SHARED;
>   }
>  
> - xfs_ilock(ip, iolock);
> -
> + if (!xfs_ilock_nowait(ip, iolock)) {
> + if (iocb->ki_flags & IOCB_NOWAIT)
> + return -EAGAIN;
> + xfs_ilock(ip, iolock);
> + }
>   ret = xfs_file_aio_write_checks(iocb, from, );
>   if (ret)
>   goto out;
> @@ -553,9 +556,15 @@ xfs_file_dio_aio_write(
>* otherwise demote the lock if we had to take the exclusive lock
>* for other reasons in xfs_file_aio_write_checks.
>*/
> - if (unaligned_io)
> - inode_dio_wait(inode);
> - else if (iolock == XFS_IOLOCK_EXCL) {
> + if (unaligned_io) {
> + /* If we are going to wait for other DIO to finish, bail */
> + if (iocb->ki_flags & IOCB_NOWAIT) {
> + if (atomic_read(>i_dio_count))
> + return -EAGAIN;
> + } else {
> + inode_dio_wait(inode);
> + }
> + } else if (iolock == XFS_IOLOCK_EXCL) {
>   xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
>   iolock = XFS_IOLOCK_SHARED;
>   }
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 94e5bdf7304c..8b0e3c1e086d 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1016,6 +1016,15 @@ xfs_file_iomap_begin(
>  
>   if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
>   if (flags & IOMAP_DIRECT) {
> + /*
> +  * A reflinked inode will result in CoW alloc.
> +  * FIXME: It could still overwrite on unshared extents
> +  * and not need allocation.
> +  */
> + if (flags & IOMAP_NOWAIT) {
> + error = -EAGAIN;
> + goto out_unlock;
> + }
>   /* may drop and re-acquire the ilock */
>   error = xfs_reflink_allocate_cow(ip, , ,
>   );
> @@ -1033,6 +1042,14 @@ xfs_file_iomap_begin(
>  
>   if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, , nimaps)) {
>   /*
> +  * If nowait is set bail since we are going to make
> +  * allocations.
> +  */
> + if (flags & IOMAP_NOWAIT) {
> + error = -EAGAIN;
> + goto out_unlock;
> + }
> + /*
>* We cap the maximum length we map here to MAX_WRITEBACK_PAGES
>* pages to keep the chunks of work done where somewhat 
> symmetric
>* with the work writeback does. This is a completely arbitrary
> -- 
> 2.12.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] vfs: add mount umount logs

2017-05-18 Thread Darrick J. Wong
On Thu, May 18, 2017 at 06:08:04PM +0800, Anand Jain wrote:
> By looking at the logs we should be able to know when was the FS
> mounted and unmounted and the options used, so to help forensic
> investigations.
> 
> Signed-off-by: Anand Jain 
> ---
> You may want to know that, during boot and shutdown this
> adds roughly 25 lines more logs depending on the config, and it
> logs even for non block device FS, such as proc, sysfs ..etc.
> And blockdev FS only check will eliminate overlay as well, which
> is kind of defeats the purpose.
> Further, just to highlight if your test script involves mount and
> umount, which probably all of fstests does, it will add logs when
> FS is mounted and umounted.
> Still IMO, these logs are useful for the end purpose as mentioned
> above. Its for your feedback. Thanks.

XFS already logs its own unmounts.  I prefer to let each filesystem log
its own unmount, because then the mount/unmount messages also have the
same prefix as all other messages coming from that filesystem driver.

>  fs/namespace.c | 15 +++
>  1 file changed, 15 insertions(+)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index b3b115bd4e1e..78375b6f8330 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1686,6 +1686,8 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
>   struct mount *mnt;
>   int retval;
>   int lookup_flags = 0;
> + struct super_block *sb;
> + char umntlog[256] = {0};

Kind of a lot of stack space...

--D

>  
>   if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
>   return -EINVAL;
> @@ -1711,7 +1713,15 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, 
> flags)
>   if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
>   goto dput_and_out;
>  
> + sb = mnt->mnt.mnt_sb;
> + snprintf(umntlog, sizeof(umntlog), "umount %s dev:%s flags:%d",
> + sb->s_type->name, sb->s_id, flags);
> +
>   retval = do_umount(mnt, flags);
> +
> + if (!retval)
> + printk(KERN_NOTICE "%s\n", umntlog);
> +
>  dput_and_out:
>   /* we mustn't call path_put() as that would clear mnt_expiry_mark */
>   dput(path.dentry);
> @@ -2833,6 +2843,11 @@ long do_mount(const char *dev_name, const char __user 
> *dir_name,
>   else
>   retval = do_new_mount(, type_page, flags, mnt_flags,
> dev_name, data_page);
> +
> + if (!retval)
> + printk(KERN_NOTICE "mount %s dev:%s dir:%pd flags:0x%lX 
> opt:%s\n",
> + type_page, dev_name, path.dentry, flags, (char 
> *)data_page);
> +
>  dput_out:
>   path_put();
>   return retval;
> -- 
> 2.10.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] ioctl_getfsmap.2: document the GETFSMAP ioctl

2017-05-17 Thread Darrick J. Wong
Document the new GETFSMAP ioctl that returns the physical layout of a
(disk-based) filesystem.

Signed-off-by: Darrick J. Wong <darrick.w...@oracle.com>
---
v2: emphasize that filesystems are not obligated to return inode numbers
---
 man2/ioctl_getfsmap.2 |  375 +
 1 file changed, 375 insertions(+)
 create mode 100644 man2/ioctl_getfsmap.2

diff --git a/man2/ioctl_getfsmap.2 b/man2/ioctl_getfsmap.2
new file mode 100644
index 000..b451950
--- /dev/null
+++ b/man2/ioctl_getfsmap.2
@@ -0,0 +1,375 @@
+.\" Copyright (c) 2017, Oracle.  All rights reserved.
+.\"
+.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
+.\" This is free documentation; you can redistribute it and/or
+.\" modify it under the terms of the GNU General Public License as
+.\" published by the Free Software Foundation; either version 2 of
+.\" the License, or (at your option) any later version.
+.\"
+.\" The GNU General Public License's references to "object code"
+.\" and "executables" are to be interpreted as the output of any
+.\" document formatting or typesetting system, including
+.\" intermediate and printed output.
+.\"
+.\" This manual is distributed in the hope that it will be useful,
+.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
+.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+.\" GNU General Public License for more details.
+.\"
+.\" You should have received a copy of the GNU General Public
+.\" License along with this manual; if not, see
+.\" <http://www.gnu.org/licenses/>.
+.\" %%%LICENSE_END
+.TH IOCTL-GETFSMAP 2 2017-02-10 "Linux" "Linux Programmer's Manual"
+.SH NAME
+ioctl_getfsmap \- retrieve the physical layout of the filesystem
+.SH SYNOPSIS
+.br
+.B #include <sys/ioctl.h>
+.br
+.B #include <linux/fs.h>
+.br
+.B #include <linux/fsmap.h>
+.sp
+.BI "int ioctl(int " fd ", FS_IOC_GETFSMAP, struct fsmap_head * " arg );
+.SH DESCRIPTION
+This
+.BR ioctl (2)
+retrieves physical extent mappings for a filesystem.
+This information can be used to discover which files are mapped to a physical
+block, examine free space, or find known bad blocks, among other things.
+
+The sole argument to this ioctl should be a pointer to a single
+.BR "struct fsmap_head" ":"
+.in +4n
+.nf
+
+struct fsmap {
+   __u32   fmr_device; /* device id */
+   __u32   fmr_flags;  /* mapping flags */
+   __u64   fmr_physical;   /* device offset of segment */
+   __u64   fmr_owner;  /* owner id */
+   __u64   fmr_offset; /* file offset of segment */
+   __u64   fmr_length; /* length of segment */
+   __u64   fmr_reserved[3];/* must be zero */
+};
+
+struct fsmap_head {
+   __u32   fmh_iflags; /* control flags */
+   __u32   fmh_oflags; /* output flags */
+   __u32   fmh_count;  /* # of entries in array incl. input */
+   __u32   fmh_entries;/* # of entries filled in (output). */
+   __u64   fmh_reserved[6];/* must be zero */
+
+   struct fsmap    fmh_keys[2];    /* low and high keys for the mapping search */
+   struct fsmap    fmh_recs[];     /* returned records */
+};
+
+.fi
+.in
+The two
+.I fmh_keys
+array elements specify the lowest and highest reverse-mapping
+keys, respectively, for which userspace would like physical mapping
+information.
+A reverse mapping key consists of the tuple (device, block, owner, offset).
+The owner and offset fields are part of the key because some filesystems
+support sharing physical blocks between multiple files and
+therefore may return multiple mappings for a given physical block.
+.PP
+Filesystem mappings are copied into the
+.I fmh_recs
+array, which immediately follows the header data.
+.SS Fields of struct fsmap_head
+.PP
+The
+.I fmh_iflags
+field is a bitmask passed to the kernel to alter the output.
+There are no flags defined, so callers must set this value to zero.
+
+.PP
+The
+.I fmh_oflags
+field is a bitmask of flags set by the kernel concerning the returned mappings.
+If
+.B FMH_OF_DEV_T
+is set, then the
+.I fmr_device
+field represents a
+.B dev_t
+structure containing the major and minor numbers of the block device.
+
+.PP
+The
+.I fmh_count
+field contains the number of elements in the array being passed to the
+kernel.
+If this value is 0,
+.I fmh_entries
+will be set to the number of records that would have been returned had
+the array been large enough;
+no mapping information will be returned.
+
+.PP
+The
+.I fmh_entries
+field contains the number of elements in the
+.I fmh_recs
+array that contain useful information.
+
+.PP
+The
+.I fmh_reserved
+fields must be set to zero.
+
+.SS Keys
+.PP
+The two key records in
+.B fsmap_head.fmh_keys
+specify the lo

Re: [PATCH] ioctl_getfsmap.2: document the GETFSMAP ioctl

2017-05-17 Thread Darrick J. Wong
On Sun, May 14, 2017 at 06:56:10AM -0700, Andy Lutomirski wrote:
> On Sat, May 13, 2017 at 6:41 PM, Andreas Dilger <adil...@dilger.ca> wrote:
> > On May 10, 2017, at 11:10 PM, Eric Biggers <ebigge...@gmail.com> wrote:
> >>
> >> On Wed, May 10, 2017 at 01:14:37PM -0700, Darrick J. Wong wrote:
> >>> [cc btrfs, since afaict that's where most of the dedupe tool authors hang 
> >>> out]
> 
> >> Yes, PIDs have traditionally been global, but today we have PID 
> >> namespaces, and
> >> many other isolation features such as mount namespaces.  Nothing is 
> >> perfect, of
> >> course, and containers are a lot worse than VMs, but it seems weird to use 
> >> that
> >> as an excuse to knowingly make things worse...
> >>
> 
> Indeed.  Not only PID namespaces -- we have hidepid and we can simply
> unmount /proc.  "There are other info leaks" is a poor excuse.

Eh.  From the sounds of it I'm not all that impressed at the isolation
and leakproofness of any of these schemes.  Regardless, I will rephrase
the manpage to emphasize more strongly that filesystems are under no
obligation to share inode numbers, privileged callers or otherwise.

> >>>
> >>>>> Fortunately, the days of timesharing seem to be well behind us.  For
> >>>>> those people who think that containers are as secure as VM's (hah,
> >>>>> hah, hah), it might be that the best way to handle this is to have a mount
> >>>>> option that requires root access to this functionality.  For those
> >>>>> people who really care about this, they can disable access.
> >>>
> >>> Or use separate filesystems for each container so that exploitable bugs
> >>> that shut down the filesystem can't be used to kill the other
> >>> containers.  You could use a torrent of metadata-heavy operations
> >>> (fallocate a huge file, punch every block, truncate file, repeat) to DoS
> >>> the other containers.
> >>>
> >>>> What would be the reason for not putting this behind
> >>>> capable(CAP_SYS_ADMIN)?
> >>>>
> >>>> What possible legitimate function could this functionality serve to
> >>>> users who don't own your filesystem?
> >>>
> >>> As I've said before, it's to enable dedupe tools to decide, given a set
> >>> of files with shareable blocks, roughly how many other times each of
> >>> those shareable blocks are shared so that they can make better decisions
> >>> about which file keeps its shareable blocks, and which file gets
> >>> remapped.  Dedupe is not a privileged operation, nor are any of the
> >>> tools.
> >>>
> >>
> >> So why does the ioctl need to return all extent mappings for the entire
> >> filesystem, instead of just the share count of each block in the file that 
> >> the
> >> ioctl is called on?
> >
> > One possibility is that the ioctl() can return the mapping for all inodes
> > owned by the calling PID (or others if CAP_SYS_ADMIN, CAP_DAC_OVERRIDE,
> > or CAP_FOWNER is set), and return a "filesystem aggregate inode" (or more
> > than one if there is a reason to do so) with all the other allocated blocks
> > for inodes the user doesn't have permission to access?
> 
> Sounds like it could be reasonable.  But you don't want "owned by the
> calling PID" precisely -- you also need to check
> kgid_has_mapping(current_user_ns(), inode->i_gid), I think.

Not to mention that I don't want to go xfs_igetting every inode across
the entire filesystem... :)

--D

> --
> To unsubscribe from this list: send the line "unsubscribe linux-api" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [xfstests PATCH v2 2/3] ext4: allow ext4 to use $SCRATCH_LOGDEV

2017-05-15 Thread Darrick J. Wong
On Tue, May 09, 2017 at 12:12:44PM -0400, Jeff Layton wrote:
> The writeback error handling test requires that you put the journal on a
> separate device. This allows us to use dmerror to simulate data
> writeback failure, without affecting the journal.
> 
> xfs already has infrastructure for this (a'la $SCRATCH_LOGDEV), so wire
> up the ext4 code so that it can do the same thing when _scratch_mkfs is
> called.
> 
> Signed-off-by: Jeff Layton <jlay...@redhat.com>

Looks ok,
Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>

--D

> ---
>  common/rc | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/common/rc b/common/rc
> index 257b1903359d..8b815d9c8c33 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -675,6 +675,9 @@ _scratch_mkfs_ext4()
>   local tmp=`mktemp`
>   local mkfs_status
>  
> + [ "$USE_EXTERNAL" = yes -a ! -z "$SCRATCH_LOGDEV" ] && \
> + $mkfs_cmd -O journal_dev $SCRATCH_LOGDEV && \
> + mkfs_cmd="$mkfs_cmd -J device=$SCRATCH_LOGDEV"
>  
>   _scratch_do_mkfs "$mkfs_cmd" "$mkfs_filter" $* 2>$tmp.mkfserr 
> 1>$tmp.mkfsstd
>   mkfs_status=$?
> -- 
> 2.9.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe fstests" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] ioctl_getfsmap.2: document the GETFSMAP ioctl

2017-05-13 Thread Darrick J. Wong
On Sat, May 13, 2017 at 07:41:24PM -0600, Andreas Dilger wrote:
> On May 10, 2017, at 11:10 PM, Eric Biggers <ebigge...@gmail.com> wrote:
> > 
> > On Wed, May 10, 2017 at 01:14:37PM -0700, Darrick J. Wong wrote:
> >> [cc btrfs, since afaict that's where most of the dedupe tool authors hang 
> >> out]
> >> 
> >> On Wed, May 10, 2017 at 02:27:33PM -0500, Eric W. Biederman wrote:
> >>> Theodore Ts'o <ty...@mit.edu> writes:
> >>> 
> >>>> On Tue, May 09, 2017 at 02:17:46PM -0700, Eric Biggers wrote:
> >>>>> 1.) Privacy implications.  Say the filesystem is being shared between 
> >>>>> multiple
> >>>>>users, and one user unpacks foo.tar.gz into their home directory, 
> >>>>> which
> >>>>>they've set to mode 700 to hide from other users.  Because of this 
> >>>>> new
> >>>>>ioctl, all users will be able to see every (inode number, size in 
> >>>>> blocks)
> >>>>>pair that was added to the filesystem, as well as the exact layout 
> >>>>> of the
> >>>>>physical block allocations which might hint at how the files were 
> >>>>> created.
> >>>>>If there is a known "fingerprint" for the unpacked foo.tar.gz in this
> >>>>>regard, its presence on the filesystem will be revealed to all 
> >>>>> users.  And
> >>>>>if any filesystems happen to prefer allocating blocks near the 
> >>>>> containing
> >>>>>directory, the directory the files are in would likely be revealed 
> >>>>> too.
> >> 
> >> Frankly, why are container users even allowed to make unrestricted ioctl
> >> calls?  I thought we had a bunch of security infrastructure to constrain
> >> what userspace can do to a system, so why don't ioctls fall under these
> >> same protections?  If your containers are really that adversarial, you
> >> ought to be blacklisting as much as you can.
> >> 
> > 
> > Personally I don't find the presence of sandboxing features to be a very 
> > good
> > excuse for introducing random insecure ioctls.  Not everyone has everything
> > perfectly "sandboxed" all the time, for obvious reasons.  It's easy to 
> > forget
> > about the filesystem ioctls, too, since they can be executed on any regular
> > file, without having to open some device node in /dev.
> > 
> > (And this actually does happen; the SELinux policy in Android, for example,
> > still allows apps to call any ioctl on their data files, despite all the 
> > effort
> > that has gone into whitelisting other types of ioctls.  Which should be 
> > fixed,
> > of course, but it shows that this kind of mistake is very easy to make.)
> > 
> >>>> Unix/Linux has historically not been terribly concerned about trying
> >>>> to protect this kind of privacy between users.  So for example, in
> >>>> order to do this, you would have to call GETFSMAP continuously to track
> >>>> this sort of thing.  Someone who wanted to do this could probably get
> >>>> this information (and much, much more) by continuously running "ps" to
> >>>> see what processes are running.
> >>>> 
> >>>> (I will note, wryly, that in the bad old days, when dozens of users
> >>>> were sharing a one MIPS Vax/780, it was considered a *good* thing
> >>>> that social pressure could be applied when it was found that someone
> >>>> was running a CPU or memory hogger on a time sharing system.  The
> >>>> privacy right of someone running "xtrek" to be able to hide this from
> >>>> other users on the system was never considered important at all.  :-)
> >> 
> >> Not to mention someone running GETFSMAP in a loop will be pretty obvious
> >> both from the high kernel cpu usage and the huge number of metadata
> >> operations.
> > 
> > Well, only if that someone running GETFSMAP actually wants to watch things 
> > in
> > real-time (it's not necessary for all scenarios that have been mentioned), 
> > *and*
> > there is monitoring in place which actually detects it and can do something
> > about it.
> > 
> > Yes, PIDs have traditionally been global, but today we have PID namespaces, 
> > and
> > many other isolation features such as mount namespaces.  Nothing is 
> > p

Re: [PATCH] ioctl_getfsmap.2: document the GETFSMAP ioctl

2017-05-10 Thread Darrick J. Wong
[cc btrfs, since afaict that's where most of the dedupe tool authors hang out]

On Wed, May 10, 2017 at 02:27:33PM -0500, Eric W. Biederman wrote:
> Theodore Ts'o <ty...@mit.edu> writes:
> 
> > On Tue, May 09, 2017 at 02:17:46PM -0700, Eric Biggers wrote:
> >> 1.) Privacy implications.  Say the filesystem is being shared between 
> >> multiple
> >> users, and one user unpacks foo.tar.gz into their home directory, which
> >> they've set to mode 700 to hide from other users.  Because of this new
> >> ioctl, all users will be able to see every (inode number, size in 
> >> blocks)
> >> pair that was added to the filesystem, as well as the exact layout of 
> >> the
> >> physical block allocations which might hint at how the files were 
> >> created.
> >> If there is a known "fingerprint" for the unpacked foo.tar.gz in this
> >> regard, its presence on the filesystem will be revealed to all users.  
> >> And
> >> if any filesystems happen to prefer allocating blocks near the 
> >> containing
> >> directory, the directory the files are in would likely be revealed too.

Frankly, why are container users even allowed to make unrestricted ioctl
calls?  I thought we had a bunch of security infrastructure to constrain
what userspace can do to a system, so why don't ioctls fall under these
same protections?  If your containers are really that adversarial, you
ought to be blacklisting as much as you can.

> > Unix/Linux has historically not been terribly concerned about trying
> > to protect this kind of privacy between users.  So for example, in
> > order to do this, you would have to call GETFSMAP continuously to track
> > this sort of thing.  Someone who wanted to do this could probably get
> > this information (and much, much more) by continuously running "ps" to
> > see what processes are running.
> >
> > (I will note, wryly, that in the bad old days, when dozens of users
> > were sharing a one MIPS Vax/780, it was considered a *good* thing
> > that social pressure could be applied when it was found that someone
> > was running a CPU or memory hogger on a time sharing system.  The
> > privacy right of someone running "xtrek" to be able to hide this from
> > other users on the system was never considered important at all.  :-)

Not to mention someone running GETFSMAP in a loop will be pretty obvious
both from the high kernel cpu usage and the huge number of metadata
operations.

> > Fortunately, the days of timesharing seem to be well behind us.  For
> > those people who think that containers are as secure as VM's (hah,
> > hah, hah), it might be that the best way to handle this is to have a mount
> > option that requires root access to this functionality.  For those
> > people who really care about this, they can disable access.

Or use separate filesystems for each container so that exploitable bugs
that shut down the filesystem can't be used to kill the other
containers.  You could use a torrent of metadata-heavy operations
(fallocate a huge file, punch every block, truncate file, repeat) to DoS
the other containers.

> What would be the reason for not putting this behind
> capable(CAP_SYS_ADMIN)?
> 
> What possible legitimate function could this functionality serve to
> users who don't own your filesystem?

As I've said before, it's to enable dedupe tools to decide, given a set
of files with shareable blocks, roughly how many other times each of
those shareable blocks are shared so that they can make better decisions
about which file keeps its shareable blocks, and which file gets
remapped.  Dedupe is not a privileged operation, nor are any of the
tools.

> I have seen several people speak up how this is a concern I don't see
> anyone saying here is a legitimate use for a non-system administrator.

/I/ said that a few emails ago.

--D

> This doesn't seem like something where abuses of time-sharing systems
> can be observed.
> 
> Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] fstests: generic: Check if cycle mount and sleep can affect fiemap result

2017-04-07 Thread Darrick J. Wong
On Fri, Apr 07, 2017 at 01:02:58PM +0800, Eryu Guan wrote:
> On Thu, Apr 06, 2017 at 11:28:01AM -0500, Eric Sandeen wrote:
> > On 4/6/17 11:26 AM, Theodore Ts'o wrote:
> > > On Wed, Apr 05, 2017 at 10:35:26AM +0800, Eryu Guan wrote:
> > >>
> > >> Test fails with ext3/2 when driving with ext4 driver, fiemap changed
> > >> after umount/mount cycle, then changed back to original result after
> > >> sleeping some time. An ext4 bug? (cc'ed linux-ext4 list.)
> > > 
> > > I haven't had time to look at this, but I'm not sure this test is a
> > > reasonable one on the face of it.
> > > 
> > > A file system may choose to optimize a file's extent tree for whatever
> > > reason it wants, whenever it wants, including on an unmount --- and
> > > that would not be an invalid thing to do.  So to have an xfstests that
> > > causes a test failure if a file system were to, say, do some cleanup
> > > at mount or unmount time, or when the file is next opened, to merge
> > > adjacent extents together (and hence change what is returned by
> > > FIEMAP) might be strange, or even weird --- but is this any of user
> > > space's business?  Or anything we want to enforce as wrong wrong wrong
> > > by xfstests?
> 
> So I was asking for a review from ext4 side instead of queuing it for
> next xfstests update :)

In general FIEMAP can return pretty much whatever it wants, which
usually means that it won't report extents larger than the underlying
block mapping extents, though as we've seen it can split a single
on-disk extent into multiple FIEMAP records for the purpose of reporting
sharedness.

For ext3 I'm wondering if it's the case that the first time we FIEMAP an
indirect map file we see a possibly-merged version of whatever's in the
particular leaf node we land in; then that information gets cached &
merged with other records in the extent status tree, such that
subsequent FIEMAP calls see longer extents than the first time around.

> > I had the same question.  If the exact behavior isn't defined anywhere,
> > I don't know what we can be testing, TBH.
> 
> Agreed, I was about to ask for the expected behavior today if there was
> no new review comments on this patch.

I think the expected behavior is that any behavior is expected. :(

--D

> 
> Thanks for the comments and review!
> 
> Eryu
> --
> To unsubscribe from this list: send the line "unsubscribe fstests" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 7/8] nowait aio: xfs

2017-04-07 Thread Darrick J. Wong
On Fri, Apr 07, 2017 at 06:34:28AM -0500, Goldwyn Rodrigues wrote:
> 
> 
> On 04/06/2017 05:54 PM, Darrick J. Wong wrote:
> > On Mon, Apr 03, 2017 at 11:52:11PM -0700, Christoph Hellwig wrote:
> >>> + if (unaligned_io) {
> >>> + /* If we are going to wait for other DIO to finish, bail */
> >>> + if ((iocb->ki_flags & IOCB_NOWAIT) &&
> >>> +  atomic_read(&inode->i_dio_count))
> >>> + return -EAGAIN;
> >>>   inode_dio_wait(inode);
> >>
> >> This checks i_dio_count twice in the nowait case, I think it should be:
> >>
> >>if (iocb->ki_flags & IOCB_NOWAIT) {
> >>if (atomic_read(&inode->i_dio_count))
> >>return -EAGAIN;
> >>} else {
> >>inode_dio_wait(inode);
> >>}
> >>
> >>>   if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
> >>>   if (flags & IOMAP_DIRECT) {
> >>> + /* A reflinked inode will result in CoW alloc */
> >>> + if (flags & IOMAP_NOWAIT) {
> >>> + error = -EAGAIN;
> >>> + goto out_unlock;
> >>> + }
> >>
> >> This is a bit pessimistic - just because the inode has any shared
> >> extents we could still write into unshared ones.  For now I think this
> >> pessimistic check is fine, but the comment should be corrected.
> > 
> > Consider what happens in both _reflink_{allocate,reserve}_cow.  If there
> > is already an existing reservation in the CoW fork then we'll have to
> > CoW and therefore can't satisfy the NOWAIT flag.  If there isn't already
> > anything in the CoW fork, then we have to see if there are shared blocks
> > by calling _reflink_trim_around_shared.  That performs a refcountbt
> > lookup, which involves locking the AGF, so we also can't satisfy NOWAIT.
> > 
> > IOWs, I think this hunk has to move outside the IOMAP_DIRECT check to
> > cover both write-to-reflinked-file cases.
> > 
> 
> IOMAP_NOWAIT is set only with IOMAP_DIRECT since the nowait feature is
> for direct-IO only. This is checked early on, when we are checking for

Ah, ok.  Disregard what I said about moving it then.

--D

> user-passed flags, and if not, -EINVAL is returned.
> 
> 
> -- 
> Goldwyn
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 7/8] nowait aio: xfs

2017-04-06 Thread Darrick J. Wong
On Mon, Apr 03, 2017 at 11:52:11PM -0700, Christoph Hellwig wrote:
> > +   if (unaligned_io) {
> > +   /* If we are going to wait for other DIO to finish, bail */
> > +   if ((iocb->ki_flags & IOCB_NOWAIT) &&
> > +atomic_read(&inode->i_dio_count))
> > +   return -EAGAIN;
> > inode_dio_wait(inode);
> 
> This checks i_dio_count twice in the nowait case, I think it should be:
> 
>   if (iocb->ki_flags & IOCB_NOWAIT) {
>   if (atomic_read(&inode->i_dio_count))
>   return -EAGAIN;
>   } else {
>   inode_dio_wait(inode);
>   }
> 
> > if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
> > if (flags & IOMAP_DIRECT) {
> > +   /* A reflinked inode will result in CoW alloc */
> > +   if (flags & IOMAP_NOWAIT) {
> > +   error = -EAGAIN;
> > +   goto out_unlock;
> > +   }
> 
> This is a bit pessimistic - just because the inode has any shared
> extents we could still write into unshared ones.  For now I think this
> pessimistic check is fine, but the comment should be corrected.

Consider what happens in both _reflink_{allocate,reserve}_cow.  If there
is already an existing reservation in the CoW fork then we'll have to
CoW and therefore can't satisfy the NOWAIT flag.  If there isn't already
anything in the CoW fork, then we have to see if there are shared blocks
by calling _reflink_trim_around_shared.  That performs a refcountbt
lookup, which involves locking the AGF, so we also can't satisfy NOWAIT.

IOWs, I think this hunk has to move outside the IOMAP_DIRECT check to
cover both write-to-reflinked-file cases.

--D

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] fstests: generic: Check if cycle mount and sleep can affect fiemap result

2017-04-03 Thread Darrick J. Wong
On Mon, Apr 03, 2017 at 03:09:23PM +0800, Qu Wenruo wrote:
> As long as we don't modify the on-disk data, fiemap result should always
> be constant.
> 
> Operation like cycle mount and sleep should not affect fiemap result.
> While unfortunately, btrfs doesn't follow that behavior.
> 
> Btrfs fiemap sometimes return merged result, while after cycle mount, it
> returns split result. Furthermore after a snap, btrfs returns merged
> result again.
> 
> Signed-off-by: Qu Wenruo 
> ---
>  tests/generic/422 | 127 
> ++
>  tests/generic/422.out |   2 +
>  tests/generic/group   |   1 +
>  3 files changed, 130 insertions(+)
>  create mode 100755 tests/generic/422
>  create mode 100644 tests/generic/422.out
> 
> diff --git a/tests/generic/422 b/tests/generic/422
> new file mode 100755
> index 000..4ca4476
> --- /dev/null
> +++ b/tests/generic/422
> @@ -0,0 +1,127 @@
> +#! /bin/bash
> +# FS QA Test 422
> +#
> +# Test if a file system returns constant fiemap result after remount and
> +# fiemap.
> +# Unfortunately, btrfs doesn't follow this behavior.

Is this test a goalpost for btrfs bugfixes you're going to post?

> +#
> +#---
> +# Copyright (c) 2017 Fujitsu.  All Rights Reserved.
> +#
> +# This program is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU General Public License as
> +# published by the Free Software Foundation.
> +#
> +# This program is distributed in the hope that it would be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write the Free Software Foundation,
> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> +#---
> +#
> +
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +here=`pwd`
> +tmp=/tmp/$$
> +status=1 # failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +_cleanup()
> +{
> + cd /
> + rm -f $tmp.*
> +}
> +
> +# get standard environment, filters and checks
> +. ./common/rc
> +. ./common/filter
> +
> +# remove previous $seqres.full before test
> +rm -f $seqres.full
> +
> +# real QA test starts here
> +
> +# Modify as appropriate.
> +_supported_fs generic
> +_supported_os IRIX Linux

I don't think we need to support IRIX...

--D

> +_require_scratch
> +
> +block_size=$((64 * 1024))
> +block_count=32
> +dst=$SCRATCH_MNT/file
> +sleeptime=3
> +
> +# It's almost 100% for btrfs to trigger inconstant fiemap result 
> +# just in case
> +runtime=$((2 * $LOAD_FACTOR))
> +
> +# record fiemap as checkpoint, and output the hash of fiemap result
> +# to stdout
> +fiemap_checkpoint()
> +{
> + local number=$1
> + local message=$2
> +
> + echo "=== $message ===" >> $seqres.full
> + $XFS_IO_PROG -c "fiemap -v" $dst > ${tmp}.cp$number
> + cat ${tmp}.cp${number} >> $seqres.full
> +
> + md5sum ${tmp}.cp${number} | cut -d ' ' -f 1
> +}
> +
> +do_test()
> +{
> + local number=$1
> +
> + # Use 16 times of file size to ensure we have enough space
> + _scratch_mkfs_sized $((16 * $block_size * $block_count)) \
> + > /dev/null 2>&1
> + _scratch_mount 
> +
> + echo "== Loop $number ==" >> $seqres.full
> + touch $dst
> + # Xfsprogs 4.9.0 still has a bug that xfs_io "open" with O_SYNC command
> + # doesn't work well with "pwrite", although it gets fixed in v4.10.0,
> + # use dd here to avoid it won't hurt for non-xfs developers
> + dd if=/dev/zero of=$dst bs=$block_size count=$block_count oflag=dsync \
> + status=none 2>&1
> +
> + hash1=$(fiemap_checkpoint 1 "Fiemap just after dsync write")
> +
> + # Sleep should not modify fiemap result
> + sleep $sleeptime
> +
> + hash2=$(fiemap_checkpoint 2 "Fiemap after dsync write and sleep")
> +
> + # cycle mount should not modify fiemap result
> + _scratch_cycle_mount
> +
> + hash3=$(fiemap_checkpoint 3 "Fiemap after cycle mount")
> +
> + # Sleep should not modify fiemap result
> + sleep $sleeptime
> +
> + hash4=$(fiemap_checkpoint 4 "Fiemap after cycle mount and sleep")
> +
> + _scratch_unmount
> +
> + if [ $hash1 != $hash2 -o $hash2 != $hash3 -o $hash3 != $hash4 ]; then
> + echo "Inconstant fiemap result detected"
> + fi
> + echo >> $seqres.full
> +}
> +
> +for i in $(seq 1 $runtime); do
> + do_test $i
> +done
> +
> +echo "Silence is golden"
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/generic/422.out b/tests/generic/422.out
> new file mode 100644
> index 000..f70693f
> --- /dev/null
> +++ 

Re: [PATCH 3/4] reflink: test adjacency of reflinked blocks

2017-02-28 Thread Darrick J. Wong
On Tue, Feb 28, 2017 at 04:15:02PM +0800, Eryu Guan wrote:
> On Fri, Feb 24, 2017 at 05:12:57PM -0800, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.w...@oracle.com>
> > 
> > If we reflink a file with N blocks to another file one block at a time,
> > does the destination file end up with the same number of extents as the
> > source file?  In other words, does the filesystem succeed at combining
> > adjacent mappings into maximal extents?
> 
> I'm not sure if this is a standard behavior and applies to btrfs too?
> But btrfs is failing this test now:
> 
> +f1 (1) != f2 (32)
> +s1 (1) != s2 (32)
> 
> Fix test or btrfs? I'm taking it if btrfs is the one to be fixed :)

btrfs has that weird behavior where it doesn't merge the adjacent
extents at all (at least not according to FIEMAP) until you remount the
filesystem.  After the remount it's fine, but... WTF? :)

So yes, the test is working as designed.  btrfs needs fixing, or I guess
worst case we can _notrun it on btrfs.

Snark aside, it was intended originally to make sure that XFS is
properly merging the extent records together; then it occurred to me to
rewrite it with fiemap and make it one of the generic reflink tests so
that ocfs2 can get tested too.

--D

> 
> Thanks,
> Eryu
> 
> > 
> > Signed-off-by: Darrick J. Wong <djw...@djwong.org>
> > ---
> >  tests/generic/930 |  106 
> > +
> >  tests/generic/930.out |   11 +
> >  tests/generic/group   |1 
> >  3 files changed, 118 insertions(+)
> >  create mode 100755 tests/generic/930
> >  create mode 100644 tests/generic/930.out
> > 
> > 
> > diff --git a/tests/generic/930 b/tests/generic/930
> > new file mode 100755
> > index 000..15d8cbf
> > --- /dev/null
> > +++ b/tests/generic/930
> > @@ -0,0 +1,106 @@
> > +#! /bin/bash
> > +# FS QA Test No. 930
> > +#
> > +# Check that reflinking adjacent blocks in a file produces a single
> > +# block mapping extent.
> > +#
> > +#---
> > +# Copyright (c) 2017 Oracle, Inc.  All Rights Reserved.
> > +#
> > +# This program is free software; you can redistribute it and/or
> > +# modify it under the terms of the GNU General Public License as
> > +# published by the Free Software Foundation.
> > +#
> > +# This program is distributed in the hope that it would be useful,
> > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > +# GNU General Public License for more details.
> > +#
> > +# You should have received a copy of the GNU General Public License
> > +# along with this program; if not, write the Free Software Foundation,
> > +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> > +#---
> > +#
> > +
> > +seq=`basename $0`
> > +seqres=$RESULT_DIR/$seq
> > +echo "QA output created by $seq"
> > +
> > +here=`pwd`
> > +tmp=/tmp/$$
> > +status=1   # failure is the default!
> > +trap "_cleanup; exit \$status" 0 1 2 3 7 15
> > +
> > +_cleanup()
> > +{
> > +   cd /
> > +   rm -rf $tmp.*
> > +   wait
> > +}
> > +
> > +# get standard environment, filters and checks
> > +. ./common/rc
> > +. ./common/filter
> > +. ./common/reflink
> > +
> > +# real QA test starts here
> > +_supported_os Linux
> > +_supported_fs generic
> > +_require_scratch_reflink
> > +_require_fiemap
> > +
> > +echo "Format and mount"
> > +_scratch_mkfs > $seqres.full 2>&1
> > +_scratch_mount >> $seqres.full 2>&1
> > +
> > +testdir=$SCRATCH_MNT/test-$seq
> > +mkdir $testdir
> > +
> > +blocks=32
> > +blksz=65536
> > +sz=$((blocks * blksz))
> > +
> > +echo "Create the original files"
> > +$XFS_IO_PROG -f -c "falloc 0 $sz" $testdir/file1 >> $seqres.full
> > +_pwrite_byte 0x61 0 $sz $testdir/file1 >> $seqres.full
> > +seq 0 $blksz $((sz - blksz)) | while read offset; do
> > +   _reflink_range $testdir/file1 $offset $testdir/file2 $offset $blksz >> 
> > $seqres.full
> > +done
> > +
> > +echo "Compare files"
> > +md5sum $testdir/file1 | _filter_scratch
> > +md5sum $testdir/file2 | _filter_scratch
> > +
> > +echo "Check extent counts"
> > +f1=$(_count_extents $test

[PATCH] ioctl_getfsmap.2: document the GETFSMAP ioctl

2017-02-21 Thread Darrick J. Wong
Document the new GETFSMAP ioctl that returns the physical layout of a
(disk-based) filesystem.  This time around the fs-specific parts have
been moved to a separate section; I'll move them into separate
xfsprogs/e2fsprogs manpages when we get closer to landing the ioctl.

Signed-off-by: Darrick J. Wong <darrick.w...@oracle.com>
---
 man2/ioctl_getfsmap.2 |  359 +
 1 file changed, 359 insertions(+)
 create mode 100644 man2/ioctl_getfsmap.2

diff --git a/man2/ioctl_getfsmap.2 b/man2/ioctl_getfsmap.2
new file mode 100644
index 000..7121d61
--- /dev/null
+++ b/man2/ioctl_getfsmap.2
@@ -0,0 +1,359 @@
+.\" Copyright (c) 2017, Oracle.  All rights reserved.
+.\"
+.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
+.\" This is free documentation; you can redistribute it and/or
+.\" modify it under the terms of the GNU General Public License as
+.\" published by the Free Software Foundation; either version 2 of
+.\" the License, or (at your option) any later version.
+.\"
+.\" The GNU General Public License's references to "object code"
+.\" and "executables" are to be interpreted as the output of any
+.\" document formatting or typesetting system, including
+.\" intermediate and printed output.
+.\"
+.\" This manual is distributed in the hope that it will be useful,
+.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
+.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+.\" GNU General Public License for more details.
+.\"
+.\" You should have received a copy of the GNU General Public
+.\" License along with this manual; if not, see
+.\" <http://www.gnu.org/licenses/>.
+.\" %%%LICENSE_END
+.TH IOCTL-GETFSMAP 2 2017-02-10 "Linux" "Linux Programmer's Manual"
+.SH NAME
+ioctl_getfsmap \- retrieve the physical layout of the filesystem
+.SH SYNOPSIS
+.br
+.B #include <sys/ioctl.h>
+.br
+.B #include <linux/fs.h>
+.br
+.B #include <linux/fsmap.h>
+.sp
+.BI "int ioctl(int " fd ", GETFSMAP, struct fsmap_head * " arg );
+.SH DESCRIPTION
+This
+.BR ioctl (2)
+retrieves physical extent mappings for a filesystem.
+This information can be used to discover which files are mapped to a physical
+block, examine free space, or find known bad blocks, among other things.
+
+The sole argument to this ioctl should be a pointer to a single
+.BR "struct fsmap_head" ":"
+.in +4n
+.nf
+
+struct fsmap {
+   __u32   fmr_device; /* device id */
+   __u32   fmr_flags;  /* mapping flags */
+   __u64   fmr_physical;   /* device offset of segment */
+   __u64   fmr_owner;  /* owner id */
+   __u64   fmr_offset; /* file offset of segment */
+   __u64   fmr_length; /* length of segment */
+   __u64   fmr_reserved[3];/* must be zero */
+};
+
+struct fsmap_head {
+   __u32   fmh_iflags; /* control flags */
+   __u32   fmh_oflags; /* output flags */
+   __u32   fmh_count;  /* # of entries in array incl. input */
+   __u32   fmh_entries;/* # of entries filled in (output). */
+   __u64   fmh_reserved[6];/* must be zero */
+
+   struct fsmap fmh_keys[2];   /* low and high keys for the mapping search */
+   struct fsmap fmh_recs[];    /* returned records */
+};
+
+.fi
+.in
+The two
+.I fmh_keys
+array elements specify the lowest and highest reverse-mapping
+keys, respectively, for which userspace would like physical mapping
+information.
+A reverse mapping key consists of the tuple (device, block, owner, offset).
+The owner and offset fields are part of the key because some filesystems
+support sharing physical blocks between multiple files and
+therefore may return multiple mappings for a given physical block.
+.PP
+Filesystem mappings are copied into the
+.I fmh_recs
+array, which immediately follows the header data.
+.SS Fields of struct fsmap_head
+.PP
+The
+.I fmh_iflags
+field is a bitmask passed to the kernel to alter the output.
+There are no flags defined, so this value must be zero.
+
+.PP
+The
+.I fmh_oflags
+field is a bitmask of flags that concern all output mappings.
+If
+.B FMH_OF_DEV_T
+is set, then the
+.I fmr_device
+field represents a
+.B dev_t
+structure containing the major and minor numbers of the block device.
+
+.PP
+The
+.I fmh_count
+field contains the number of elements in the array being passed to the
+kernel.
+If this value is 0,
+.I fmh_entries
+will be set to the number of records that would have been returned had
+the array been large enough;
+no mapping information will be returned.
+
+.PP
+The
+.I fmh_entries
+field contains the number of elements in the
+.I fmh_recs
+array that contain useful information.
+
+.PP
+The
+.I fmh_reserved
+fields must be set to zero.
+
+.SS Keys
+.P

Re: [PATCH 4/6] xfs: use memalloc_nofs_{save,restore} instead of memalloc_noio*

2017-02-06 Thread Darrick J. Wong
On Mon, Feb 06, 2017 at 07:47:43PM +0100, Michal Hocko wrote:
> On Mon 06-02-17 10:32:37, Darrick J. Wong wrote:
> > On Mon, Feb 06, 2017 at 06:44:15PM +0100, Michal Hocko wrote:
> > > On Mon 06-02-17 07:39:23, Matthew Wilcox wrote:
> > > > On Mon, Feb 06, 2017 at 03:07:16PM +0100, Michal Hocko wrote:
> > > > > +++ b/fs/xfs/xfs_buf.c
> > > > > @@ -442,17 +442,17 @@ _xfs_buf_map_pages(
> > > > >   bp->b_addr = NULL;
> > > > >   } else {
> > > > >   int retried = 0;
> > > > > - unsigned noio_flag;
> > > > > + unsigned nofs_flag;
> > > > >  
> > > > >   /*
> > > > >* vm_map_ram() will allocate auxillary structures (e.g.
> > > > >* pagetables) with GFP_KERNEL, yet we are likely to be 
> > > > > under
> > > > >* GFP_NOFS context here. Hence we need to tell memory 
> > > > > reclaim
> > > > > -  * that we are in such a context via PF_MEMALLOC_NOIO 
> > > > > to prevent
> > > > > +  * that we are in such a context via PF_MEMALLOC_NOFS 
> > > > > to prevent
> > > > >* memory reclaim re-entering the filesystem here and
> > > > >* potentially deadlocking.
> > > > >*/
> > > > 
> > > > This comment feels out of date ... how about:
> > > 
> > > which part is out of date?
> > > 
> > > > 
> > > > /*
> > > >  * vm_map_ram will allocate auxiliary structures (eg 
> > > > page
> > > >  * tables) with GFP_KERNEL.  If that tries to reclaim 
> > > > memory
> > > >  * by calling back into this filesystem, we may 
> > > > deadlock.
> > > >  * Prevent that by setting the NOFS flag.
> > > >  */
> > > 
> > > dunno, the previous wording seems clear enough to me. Maybe little bit
> > > more chatty than yours but I am not sure this is worth changing.
> > 
> > I prefer to keep the "...yet we are likely to be under GFP_NOFS..."
> > wording of the old comment because it captures the uncertainty of
> > whether or not we actually are already under NOFS.  If someone actually
> > has audited this code well enough to know for sure then yes let's change
> > the comment, but I haven't gone that far.

Ugh, /me hands himself another cup of coffee...

Somehow I mixed up _xfs_buf_map_pages and kmem_zalloc_large in this
discussion.  Probably because they have similar code snippets with very
similar comments in two totally different parts of xfs.

The _xfs_buf_map_pages can be called inside or outside of
transaction context, so I think we still have to memalloc_nofs_save for
that to avoid the lockdep complaints and deadlocks referenced in the
commit that added all that (to _xfs_buf_map_pages) in the first place.
ae687e58b3 ("xfs: use NOIO contexts for vm_map_ram")

My comments about kmem_zalloc_large still stand, even though the part
of the patch you two were discussing was the _xfs_buf_map_pages.  I
probably should have clarified that I think both functions actually
/are/ doing the right thing wrt calling (or not calling)
memalloc_nofs_save().

> I believe we can drop the memalloc_nofs_save then as well because either
> we are called from a potentially dangerous context and thus we are in
> the nofs scope or we do not need the protection at all.

Uh, now that I've muddied up the waters, which part are you referring to?

--D

> -- 
> Michal Hocko
> SUSE Labs
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/6] xfs: use memalloc_nofs_{save,restore} instead of memalloc_noio*

2017-02-06 Thread Darrick J. Wong
On Mon, Feb 06, 2017 at 06:44:15PM +0100, Michal Hocko wrote:
> On Mon 06-02-17 07:39:23, Matthew Wilcox wrote:
> > On Mon, Feb 06, 2017 at 03:07:16PM +0100, Michal Hocko wrote:
> > > +++ b/fs/xfs/xfs_buf.c
> > > @@ -442,17 +442,17 @@ _xfs_buf_map_pages(
> > >   bp->b_addr = NULL;
> > >   } else {
> > >   int retried = 0;
> > > - unsigned noio_flag;
> > > + unsigned nofs_flag;
> > >  
> > >   /*
> > >* vm_map_ram() will allocate auxillary structures (e.g.
> > >* pagetables) with GFP_KERNEL, yet we are likely to be under
> > >* GFP_NOFS context here. Hence we need to tell memory reclaim
> > > -  * that we are in such a context via PF_MEMALLOC_NOIO to prevent
> > > +  * that we are in such a context via PF_MEMALLOC_NOFS to prevent
> > >* memory reclaim re-entering the filesystem here and
> > >* potentially deadlocking.
> > >*/
> > 
> > This comment feels out of date ... how about:
> 
> which part is out of date?
> 
> > 
> > /*
> >  * vm_map_ram will allocate auxiliary structures (eg page
> >  * tables) with GFP_KERNEL.  If that tries to reclaim memory
> >  * by calling back into this filesystem, we may deadlock.
> >  * Prevent that by setting the NOFS flag.
> >  */
> 
> dunno, the previous wording seems clear enough to me. Maybe little bit
> more chatty than yours but I am not sure this is worth changing.

I prefer to keep the "...yet we are likely to be under GFP_NOFS..."
wording of the old comment because it captures the uncertainty of
whether or not we actually are already under NOFS.  If someone actually
has audited this code well enough to know for sure then yes let's change
the comment, but I haven't gone that far.

The way the kmem_zalloc_large code is structured suggests to me that
callers don't have to be especially aware of the NOFS state -- they can
just call the function and it'll take care of making it work.

> > 
> > > - noio_flag = memalloc_noio_save();
> > > + nofs_flag = memalloc_nofs_save();
> > >   do {
> > >   bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
> > >   -1, PAGE_KERNEL);
> > 
> > Also, I think it shows that this is the wrong place in XFS to be calling
> > memalloc_nofs_save().  I'm not arguing against including this patch;
> > it's a step towards where we want to be.  I also don't know XFS well
> > enough to know where to set that flag ;-)  Presumably when we start a
> > transaction ... ?

None of the current kmem_zalloc_large callers actually have a
transaction, at least not at that point.

> Yes that is what I would like to achieve longterm. And the reason why I
> didn't want to mimic this pattern in kvmalloc as some have suggested.
> It just takes much more time to get there from the past experience and
> we should really start somewhere.

--D

> -- 
> Michal Hocko
> SUSE Labs
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH v2 4/4] vfs: wrap write f_ops with file_{start,end}_write()

2017-01-27 Thread Darrick J. Wong
[adding mfasheh & btrfs list to cc]

On Fri, Jan 27, 2017 at 06:20:12PM +0200, Amir Goldstein wrote:
> On Fri, Jan 27, 2017 at 1:50 PM, Amir Goldstein  wrote:
> > On Fri, Jan 27, 2017 at 1:09 PM, Miklos Szeredi  wrote:
> >> On Mon, Jan 23, 2017 at 8:43 PM, Amir Goldstein  wrote:
> >>> Before calling write f_ops, call file_start_write() instead
> >>> of sb_start_write().
> >>>
> >>> This ensures freeze protection for both overlay and upper fs
> >>> when file is open from an overlayfs mount.
> >>>
> >>> Replace {sb,file}_start_write() for {copy,clone}_file_range() and
> >>> for fallocate().
> >>>
> >>> For dedup_file_range() there is no need for mnt_want_write_file().
> >>> File is already open for write, so we already have mnt_want_write()
> >>> and we only need file_start_write().
> >>
> >> Being opened for write is not verified if capable(CAP_SYS_ADMIN).
> >> Ugly special case, don't ask me why it's done...
> >>
> >
> > Christoph, Darrick, is that by design?
> 
> Anyway, whether it makes sense or not, that's a legacy from
> BTRFS_IOC_FILE_EXTENT_SAME, we probably have to live with.
> 
> Michael, I reckon man page needs updating.
> 
> I'll remove this hunk from the patch.

I /think/ that behavior (CAP_SYS_ADMIN not requiring destfd to be open
for writes in order to dedupe) was intentional; it seems to date back to
the original ioctl in 2013.  My guess of the justification is that we're
not really writing to dest, so if the admin comes along with an O_RDONLY
destfd it's ok?

 Let's see if we get any bites from the btrfs developers. :)

--D
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Lsf-pc] [LSF/MM TOPIC] sharing pages between mappings

2017-01-11 Thread Darrick J. Wong
On Wed, Jan 11, 2017 at 12:51:43PM +0100, Jan Kara wrote:
> On Wed 11-01-17 11:29:28, Miklos Szeredi wrote:
> > I know there's work on this for xfs, but could this be done in generic mm
> > code?
> > 
> > What are the obstacles?  page->mapping and page->index are the obvious
> > ones.
> 
> Yes, these two are the main that come to my mind. Also you'd need to
> somehow share the mapping->i_mmap tree so that unmap_mapping_range() works.
> 
> > If that's too difficult is it maybe enough to share mappings between
> > files while they are completely identical and clone the mapping when
> > necessary?
> 
> Well, but how would the page->mapping->host indirection work? Even if you
> have identical contents of the mappings, you still need to be aware there
> are several inodes behind them and you need to pick the right one
> somehow...
> 
> > All COW filesystems would benefit, as well as layered ones: lots of
> > fuse fs, and in some cases overlayfs too.
> > 
> > Related:  what can DAX do in the presence of cloned block?
> 
> For DAX handling a block COW should be doable if that is what you are
> asking about. Handling of blocks that can be written to while they are
> shared will be rather difficult (you have problems with keeping dirty bits
> in the radix tree consistent if nothing else).

I'm also interested in this topic, though I haven't gotten any further
than a hand-wavy notion of handling cow by allocating new blocks, memcpy
the contents to the new blocks (how?), then update the mappings to point
to the new blocks (how?).  It looks a lot easier now with the iomap
stuff, but that's as far as I got. :)

(IOWs it basically took all the time since the last LSF to get reflink
polished enough to handle regular files reasonably well.)

--D

> 
>   Honza
> -- 
> Jan Kara 
> SUSE Labs, CR
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Converging userspace and kernel code

2017-01-09 Thread Darrick J. Wong
On Mon, Jan 09, 2017 at 04:38:22PM -0500, Jeff Mahoney wrote:
> On 1/9/17 4:34 PM, Omar Sandoval wrote:
> > On Mon, Jan 09, 2017 at 09:31:39AM -0600, Eric Sandeen wrote:
> >> On 1/8/17 8:11 PM, Qu Wenruo wrote:
> >>>
> >>>
> >>> At 01/08/2017 09:16 PM, Goldwyn Rodrigues wrote:
> 
>  1. Motivation
>  While fixing user space tools for btrfs-progs, I found a couple of bugs
>  which are already solved in kernel space but were not ported to user
>  space. User space is a little ignored when it comes to fixing bugs in
>  the core functionality. XFS developers have already performed this and
>  the userspace and kernel code walks in lockstep for libxfs.

Eh, I've wrangled the two other FSes mentioned, so I guess I can
reiterate for a while too. :P

Yes, it's very nice to be able to apply kernel patches of core libxfs
code onto xfsprogs and have it work.  It's /annoying/ to have to it sort
of work but with a lot of fuzz and somewhere midway through the apply
loop the whole thing crashes and burns due to some minor merge conflict.

(It's not so bad for libext2fs since at least it's a totally different
implementation.  But I'll get to that later.)

The core XFS algorithms (btrees, space management, inodes, directories,
attributes, on-disk formats, and some of the log stuff) live in
libxfs/xfs_*.[ch].   The stuff that sits between those algorithms and
the kernel all lives in fs/xfs/*.[ch], and the stuff between the
algorithms and the C library lives in libxfs/ and include/ in files that
don't start with "xfs_".  As I understand it, the dividing line is that
core algorithms are in libxfs, and everything else isn't.

> >>> Personally speaking, I'm not a fan of re-using kernel code in
> >>> btrfs-progs.
> >>
> >> But it already does re-use kernel code, it's just that the re-use is
> >> extremely stale, with unfixed bugs in both directions as a result
> >> (at least last time I looked.)
> >>
> >>> In fact, in btrfs-progs, we don't need a lot of kernel facilities,
> >>> like page/VFS/lock(btrfs-progs works in single thread under most
> >>> case).
> >>>
> >>> And that should make btrfs-progs easier to maintain.

The way I look at it is that there's core algorithms and on-disk format
stuff that can be the same wherever it is, and this code ought to be
kept in sync so that bugfixes and features end up the same in both.
The 'core algorithms' library can talk to the same interfaces in the
kernel and userspace, even though the implementations are different, or,
as Eric points out, even #define'd away.

> >> But as Goldwyn already pointed out, many bugs have gone un-fixed
> >> in userspace, in code which was forked long ago from kernelspace.

Ick.

> >> For things like locking it's trivial to define that away.
> >>
> >> xfsprogs does i.e. -
> >>
> >> /* miscellaneous kernel routines not in user space */
> >> #define down_read(a)        ((void) 0)
> >> #define up_read(a)          ((void) 0)
> >> #define spin_lock_init(a)   ((void) 0)
> >> #define spin_lock(a)        ((void) 0)
> >> #define spin_unlock(a)      ((void) 0)
> >> #define likely(x)           (x)
> >> #define unlikely(x)         (x)
> >> #define rcu_read_lock()     ((void) 0)
> >> #define rcu_read_unlock()   ((void) 0)
> >> #define WARN_ON_ONCE(expr)  ((void) 0)
> >>
> >>
> >>> Furthermore, there are cases while kernel is doing things wrong while
> >>> btrfs-progs does it right.
> >>
> >> All the more reason to sync it up, fixes should always be in both
> >> places, right?
> >>
> >> I had looked at this a few years ago, and started trying to sync things
> >> up, but got daunted and busy and never completed anything.  :(  I sent
> >> a few fixups back in April 2013 to get things /slightly/ closer.
> >>
> >> The libxfs sync in xfs has borne fruit; I'm of the opinion that similar
> >> work would help btrfs too, though it can be a long road.
> >>
> >> (e2fsprogs has gone the other way, and has a completely separate
> >> re-implementation in userspace; it works, I guess, but I have to say
> >> that I really like the code commonality in xfs.)

Having worked on ext* also I will say that as far as finding and
resolving ambiguities in the on-disk specification, having two
independent and maintained implementations of ext4 is wonderful.

Unfortunately, it's /really/ expensive to engineer both, and there are
plenty of small behavioral discrepancies between the two.  In your free
time, run xfstests against fuse2fs and regular ext4. ;)

> > Yup, I also think we should be going in the XFS direction. It's a big
> > maintenance burden to have to worry about the code in two places. (E.g.,
> > the biggest reason I haven't gotten around to implementing full free
> > space tree support in btrfs-progs is that it's such a pain in the ass to
> > port new kernel code to the outdated progs code.)

Yeah.  That sucked for the (very) few times I had to do that for
xfsprogs.

> Another advantage is that we will be able to at least 

Re: [PATCH 2/8] xfs: abstract PF_FSTRANS to PF_MEMALLOC_NOFS

2017-01-09 Thread Darrick J. Wong
On Fri, Jan 06, 2017 at 03:11:01PM +0100, Michal Hocko wrote:
> From: Michal Hocko <mho...@suse.com>
> 
> xfs has defined PF_FSTRANS to declare a scope GFP_NOFS semantic quite
> some time ago. We would like to make this concept more generic and use
> it for other filesystems as well. Let's start by giving the flag a
> more generic name PF_MEMALLOC_NOFS which is in line with an existing
> PF_MEMALLOC_NOIO already used for the same purpose for GFP_NOIO
> contexts. Replace all PF_FSTRANS usage from the xfs code in the first
> step before we introduce a full API for it as xfs uses the flag directly
> anyway.
> 
> This patch doesn't introduce any functional change.
> 
> Signed-off-by: Michal Hocko <mho...@suse.com>
> Reviewed-by: Brian Foster <bfos...@redhat.com>

Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>

> ---
>  fs/xfs/kmem.c |  4 ++--
>  fs/xfs/kmem.h |  2 +-
>  fs/xfs/libxfs/xfs_btree.c |  2 +-
>  fs/xfs/xfs_aops.c |  6 +++---
>  fs/xfs/xfs_trans.c| 12 ++--
>  include/linux/sched.h |  2 ++
>  6 files changed, 15 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
> index 339c696bbc01..a76a05dae96b 100644
> --- a/fs/xfs/kmem.c
> +++ b/fs/xfs/kmem.c
> @@ -80,13 +80,13 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
>* context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
>* the filesystem here and potentially deadlocking.
>*/
> - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
> + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS))
>   noio_flag = memalloc_noio_save();
>  
>   lflags = kmem_flags_convert(flags);
>   ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
>  
> - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
> + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS))
>   memalloc_noio_restore(noio_flag);
>  
>   return ptr;
> diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
> index 689f746224e7..d973dbfc2bfa 100644
> --- a/fs/xfs/kmem.h
> +++ b/fs/xfs/kmem.h
> @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
>   lflags = GFP_ATOMIC | __GFP_NOWARN;
>   } else {
>   lflags = GFP_KERNEL | __GFP_NOWARN;
> - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
> + if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS))
>   lflags &= ~__GFP_FS;
>   }
>  
> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> index 21e6a6ab6b9a..a2672ba4dc33 100644
> --- a/fs/xfs/libxfs/xfs_btree.c
> +++ b/fs/xfs/libxfs/xfs_btree.c
> @@ -2866,7 +2866,7 @@ xfs_btree_split_worker(
>   struct xfs_btree_split_args *args = container_of(work,
>   struct xfs_btree_split_args, work);
>   unsigned long   pflags;
> - unsigned long   new_pflags = PF_FSTRANS;
> + unsigned long   new_pflags = PF_MEMALLOC_NOFS;
>  
>   /*
>* we are in a transaction context here, but may also be doing work
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index ef382bfb402b..d4094bb55033 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc(
>* We hand off the transaction to the completion thread now, so
>* clear the flag here.
>*/
> - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
> + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
>   return 0;
>  }
>  
> @@ -252,7 +252,7 @@ xfs_setfilesize_ioend(
>* thus we need to mark ourselves as being in a transaction manually.
>* Similarly for freeze protection.
>*/
> - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
> + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
>   __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
>  
>   /* we abort the update if there was an IO error */
> @@ -1015,7 +1015,7 @@ xfs_do_writepage(
>* Given that we do not allow direct reclaim to call us, we should
>* never be called while in a filesystem transaction.
>*/
> - if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
> + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
>   goto redirty;
>  
>   /*
> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> index 70f42ea86dfb..f5969c8274fc 100644
> --- a/fs/xfs/xfs_trans.c
> +++ b/fs/xfs/xfs_trans.c

Re: [PATCH 4/8] xfs: use memalloc_nofs_{save,restore} instead of memalloc_noio*

2017-01-09 Thread Darrick J. Wong
On Fri, Jan 06, 2017 at 03:11:03PM +0100, Michal Hocko wrote:
> From: Michal Hocko <mho...@suse.com>
> 
> kmem_zalloc_large and _xfs_buf_map_pages use memalloc_noio_{save,restore}
> API to prevent from reclaim recursion into the fs because vmalloc can
> invoke unconditional GFP_KERNEL allocations and these functions might be
> called from the NOFS contexts. The memalloc_noio_save will enforce
> GFP_NOIO context which is even weaker than GFP_NOFS and that seems to be
> unnecessary. Let's use memalloc_nofs_{save,restore} instead as it should
> provide exactly what we need here - implicit GFP_NOFS context.
> 
> Changes since v1
> - s@memalloc_noio_restore@memalloc_nofs_restore@ in _xfs_buf_map_pages
>   as per Brian Foster
> 
> Signed-off-by: Michal Hocko <mho...@suse.com>

Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>

> ---
>  fs/xfs/kmem.c| 10 +-
>  fs/xfs/xfs_buf.c |  8 
>  2 files changed, 9 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
> index a76a05dae96b..d69ed5e76621 100644
> --- a/fs/xfs/kmem.c
> +++ b/fs/xfs/kmem.c
> @@ -65,7 +65,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
>  void *
>  kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
>  {
> - unsigned noio_flag = 0;
> + unsigned nofs_flag = 0;
>   void*ptr;
>   gfp_t   lflags;
>  
> @@ -80,14 +80,14 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
>* context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
>* the filesystem here and potentially deadlocking.
>*/
> - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS))
> - noio_flag = memalloc_noio_save();
> + if (flags & KM_NOFS)
> + nofs_flag = memalloc_nofs_save();
>  
>   lflags = kmem_flags_convert(flags);
>   ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
>  
> - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS))
> - memalloc_noio_restore(noio_flag);
> + if (flags & KM_NOFS)
> + memalloc_nofs_restore(nofs_flag);
>  
>   return ptr;
>  }
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 7f0a01f7b592..8cb8dd4cdfd8 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -441,17 +441,17 @@ _xfs_buf_map_pages(
>   bp->b_addr = NULL;
>   } else {
>   int retried = 0;
> - unsigned noio_flag;
> + unsigned nofs_flag;
>  
>   /*
>* vm_map_ram() will allocate auxillary structures (e.g.
>* pagetables) with GFP_KERNEL, yet we are likely to be under
>* GFP_NOFS context here. Hence we need to tell memory reclaim
> -  * that we are in such a context via PF_MEMALLOC_NOIO to prevent
> +  * that we are in such a context via PF_MEMALLOC_NOFS to prevent
>* memory reclaim re-entering the filesystem here and
>* potentially deadlocking.
>*/
> - noio_flag = memalloc_noio_save();
> + nofs_flag = memalloc_nofs_save();
>   do {
>   bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
>   -1, PAGE_KERNEL);
> @@ -459,7 +459,7 @@ _xfs_buf_map_pages(
>   break;
>   vm_unmap_aliases();
>   } while (retried++ <= 1);
> - memalloc_noio_restore(noio_flag);
> + memalloc_nofs_restore(nofs_flag);
>  
>   if (!bp->b_addr)
>   return -ENOMEM;
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] duperemove: test presence of dedupe ioctl

2017-01-05 Thread Darrick J. Wong
Since a zero-length dedupe operation is guaranteed to succeed, use that
to test whether or not this filesystem supports dedupe.

Signed-off-by: Darrick J. Wong <darrick.w...@oracle.com>
---
v2: declare variables on the stack instead of introducing fake types
---
 file_scan.c |   45 +++--
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/file_scan.c b/file_scan.c
index 617f166..2708bfe 100644
--- a/file_scan.c
+++ b/file_scan.c
@@ -45,11 +45,7 @@
 #include "file_scan.h"
 #include "dbfile.h"
 #include "util.h"
-
-/* This is not in linux/magic.h */
-#ifndef XFS_SB_MAGIC
-#define XFS_SB_MAGIC    0x58465342  /* 'XFSB' */
-#endif
+#include "btrfs-ioctl.h"
 
 static char path[PATH_MAX] = { 0, };
 static char *pathp = path;
@@ -189,6 +185,37 @@ static int walk_dir(const char *name)
return ret;
 }
 
+/*
+ * A zero-length dedupe between two files should always succeed,
+ * so we can use this to test the presence of dedupe functionality.
+ */
+static bool check_ioctl_works(int fd)
+{
+   struct {
+   struct btrfs_ioctl_same_args args;
+   struct btrfs_ioctl_same_extent_info info;
+   } sa = {0};
+   struct stat sb;
+   static int cached = -1;
+   int ret;
+
+   if (cached >= 0)
+   return cached != 0;
+
+   ret = fstat(fd, &sb);
+   if (ret)
+   return false;
+
+   sa.args.dest_count = 1;
+   sa.args.length = 0;
+   sa.info.fd = fd;
+   sa.info.logical_offset = 0;
+   errno = 0;
+   ret = btrfs_extent_same(fd, &sa.args);
+   cached = !ret && !errno && !sa.info.status;
+   return cached;
+}
+
 static int __add_file(const char *name, struct stat *st,
  struct filerec **ret_file)
 {
@@ -235,12 +262,10 @@ static int __add_file(const char *name, struct stat *st,
goto out;
}
 
-   if (run_dedupe &&
-   ((fs.f_type != BTRFS_SUPER_MAGIC &&
- fs.f_type != XFS_SB_MAGIC))) {
+   if (run_dedupe && !check_ioctl_works(fd)) {
close(fd);
-   fprintf(stderr, "\"%s\": Can only dedupe files on btrfs or xfs "
-   "(experimental)\n", name);
+   fprintf(stderr, "\"%s\": dedupe ioctl not supported on this "
+   "filesystem.\n", name);
return ENOSYS;
}
 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/9] xfs: introduce and use KM_NOLOCKDEP to silence reclaim lockdep false positives

2016-12-19 Thread Darrick J. Wong
On Tue, Dec 20, 2016 at 08:24:13AM +1100, Dave Chinner wrote:
> On Thu, Dec 15, 2016 at 03:07:08PM +0100, Michal Hocko wrote:
> > From: Michal Hocko 
> > 
> > Now that the page allocator offers __GFP_NOLOCKDEP let's introduce
> > KM_NOLOCKDEP alias for the xfs allocation APIs. While we are at it
> > also change KM_NOFS users introduced by b17cb364dbbb ("xfs: fix missing
> > KM_NOFS tags to keep lockdep happy") and use the new flag for them
> > instead. There is really no reason to make these allocations contexts
> > weaker just because of the lockdep which even might not be enabled
> > in most cases.
> > 
> > Signed-off-by: Michal Hocko 
> 
> I'd suggest that it might be better to drop this patch for now -
> it's not necessary for the context flag changeover but does
> introduce a risk of regressions if the conversion is wrong.

I was just about to write in that while I didn't see anything obviously
wrong with the NOFS removals, I also don't know for sure that we can't
end up recursively in those code paths (specifically the directory
traversal thing).

--D

> Hence I think this is better as a completely separate series
> which audits and changes all the unnecessary KM_NOFS allocations
> in one go. I've never liked whack-a-mole style changes like this -
> do it once, do it properly
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> da...@fromorbit.com
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] duperemove: test presence of dedupe ioctl

2016-12-15 Thread Darrick J. Wong
Since a zero-length dedupe operation is guaranteed to succeed, use that
to test whether or not this filesystem supports dedupe.

Signed-off-by: Darrick J. Wong <darrick.w...@oracle.com>
---
v2: Don't declare a new type; just declare the struct on the stack.
---
 file_scan.c |   45 +++--
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/file_scan.c b/file_scan.c
index 617f166..2708bfe 100644
--- a/file_scan.c
+++ b/file_scan.c
@@ -45,11 +45,7 @@
 #include "file_scan.h"
 #include "dbfile.h"
 #include "util.h"
-
-/* This is not in linux/magic.h */
-#ifndef XFS_SB_MAGIC
-#define XFS_SB_MAGIC 0x58465342  /* 'XFSB' */
-#endif
+#include "btrfs-ioctl.h"
 
 static char path[PATH_MAX] = { 0, };
 static char *pathp = path;
@@ -189,6 +185,37 @@ static int walk_dir(const char *name)
return ret;
 }
 
+/*
+ * A zero-length dedupe between two files should always succeed,
+ * so we can use this to test the presence of dedupe functionality.
+ */
+static bool check_ioctl_works(int fd)
+{
+   struct {
+   struct btrfs_ioctl_same_args args;
+   struct btrfs_ioctl_same_extent_info info;
+   } sa = {0};
+   struct stat sb;
+   static int cached = -1;
+   int ret;
+
+   if (cached >= 0)
+   return cached != 0;
+
+   ret = fstat(fd, &sb);
+   if (ret)
+   return false;
+
+   sa.args.dest_count = 1;
+   sa.args.length = 0;
+   sa.info.fd = fd;
+   sa.info.logical_offset = 0;
+   errno = 0;
+   ret = btrfs_extent_same(fd, &sa.args);
+   cached = !ret && !errno && !sa.info.status;
+   return cached;
+}
+
 static int __add_file(const char *name, struct stat *st,
  struct filerec **ret_file)
 {
@@ -235,12 +262,10 @@ static int __add_file(const char *name, struct stat *st,
goto out;
}
 
-   if (run_dedupe &&
-   ((fs.f_type != BTRFS_SUPER_MAGIC &&
- fs.f_type != XFS_SB_MAGIC))) {
+   if (run_dedupe && !check_ioctl_works(fd)) {
close(fd);
-   fprintf(stderr, "\"%s\": Can only dedupe files on btrfs or xfs "
-   "(experimental)\n", name);
+   fprintf(stderr, "\"%s\": dedupe ioctl not supported on this "
+   "filesystem.\n", name);
return ENOSYS;
}
 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] duperemove: test presence of dedupe ioctl

2016-12-15 Thread Darrick J. Wong
On Wed, Dec 14, 2016 at 11:26:07AM -0800, Christoph Hellwig wrote:
> On Wed, Dec 14, 2016 at 10:38:45AM -0800, Darrick J. Wong wrote:
> > > > +struct fake_btrfs_ioctl_same_args {
> > > > +   struct btrfs_ioctl_same_args args;
> > > > +   struct btrfs_ioctl_same_extent_info info;
> > > > +};
> > > 
> > > Why does this need a fake structure here?
> > 
> > In order to test the ioctl we have to fill out at least one
> > btrfs_ioctl_same_extent_info so that we get far enough into the fs-specific
> > dedupe_range handler that we've verified that the fs is capable of dedupe 
> > and
> > that the fs is willing to try to satisfy the request.
> 
> Oh, got it, it's just the fake that tripped me up.
> 
> > We could just malloc sizeof(_same_args) + sizeof(_same_extent_info)...
> 
> Either that, or more simply just don't give the structure a name
> by just declaring it locally on the stack:
> 
>   struct {
>   struct btrfs_ioctl_same_args args;
>   struct btrfs_ioctl_same_extent_info info;
>   } sa = { 0 };

Fair enough, no need to pollute the namespace.

--D

> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] duperemove: test presence of dedupe ioctl

2016-12-14 Thread Darrick J. Wong
On Wed, Dec 14, 2016 at 02:44:36AM -0800, Christoph Hellwig wrote:
> On Fri, Dec 09, 2016 at 09:56:45AM -0800, Darrick J. Wong wrote:
> > Since a zero-length dedupe operation is guaranteed to succeed, use that
> > to test whether or not this filesystem supports dedupe.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.w...@oracle.com>
> > ---
> >  file_scan.c |   47 +--
> >  1 file changed, 37 insertions(+), 10 deletions(-)
> > 
> > diff --git a/file_scan.c b/file_scan.c
> > index 617f166..a34453e 100644
> > --- a/file_scan.c
> > +++ b/file_scan.c
> > @@ -45,11 +45,7 @@
> >  #include "file_scan.h"
> >  #include "dbfile.h"
> >  #include "util.h"
> > -
> > -/* This is not in linux/magic.h */
> > -#ifndef XFS_SB_MAGIC
> > -#define XFS_SB_MAGIC 0x58465342  /* 'XFSB' */
> > -#endif
> > +#include "btrfs-ioctl.h"
> >  
> >  static char path[PATH_MAX] = { 0, };
> >  static char *pathp = path;
> > @@ -189,6 +185,39 @@ static int walk_dir(const char *name)
> > return ret;
> >  }
> >  
> > +struct fake_btrfs_ioctl_same_args {
> > +   struct btrfs_ioctl_same_args args;
> > +   struct btrfs_ioctl_same_extent_info info;
> > +};
> 
> Why does this need a fake structure here?

In order to test the ioctl we have to fill out at least one
btrfs_ioctl_same_extent_info so that we get far enough into the fs-specific
dedupe_range handler that we've verified that the fs is capable of dedupe and
that the fs is willing to try to satisfy the request.

We could just malloc sizeof(_same_args) + sizeof(_same_extent_info)...

--D

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: out-of-band dedup status?

2016-12-09 Thread Darrick J. Wong
[adding mark fasheh (duperemove maintainer) to cc]

On Fri, Dec 09, 2016 at 07:29:21AM -0500, Austin S. Hemmelgarn wrote:
> On 2016-12-08 21:54, Chris Murphy wrote:
> >On Thu, Dec 8, 2016 at 7:26 PM, Darrick J. Wong <darrick.w...@oracle.com> 
> >wrote:
> >>On Thu, Dec 08, 2016 at 05:45:40PM -0700, Chris Murphy wrote:
> >>>OK something's wrong.
> >>>
> >>>Kernel 4.8.12 and duperemove v0.11.beta4. Brand new file system
> >>>(mkfs.btrfs -dsingle -msingle, default mount options) and two
> >>>identical files separately copied.
> >>>
> >>>[chris@f25s]$ ls -li /mnt/test
> >>>total 2811904
> >>>260 -rw-r--r--. 1 root root 1439694848 Dec  8 17:26
> >>>Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso
> >>>259 -rw-r--r--. 1 root root 1439694848 Dec  8 17:26
> >>>Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2
> >>>
> >>>[chris@f25s]$ filefrag /mnt/test/*
> >>>/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso: 3 extents found
> >>>/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2: 2 extents found
> >>>
> >>>
> >>>[chris@f25s duperemove]$ sudo ./duperemove -dv /mnt/test/*
> >>>Using 128K blocks
> >>>Using hash: murmur3
> >>>Gathering file list...
> >>>Using 4 threads for file hashing phase
> >>>[1/2] (50.00%) csum: 
> >>>/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso
> >>>[2/2] (100.00%) csum: 
> >>>/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2
> >>>Total files:  2
> >>>Total hashes: 21968
> >>>Loading only duplicated hashes from hashfile.
> >>>Using 4 threads for dedupe phase
> >>>[0xba8400] (1/10947) Try to dedupe extents with id e47862ea
> >>>[0xba84a0] (3/10947) Try to dedupe extents with id ffed44f2
> >>>[0xba84f0] (2/10947) Try to dedupe extents with id ffeefcdd
> >>>[0xba8540] (4/10947) Try to dedupe extents with id ffe4cf64
> >>>[0xba8540] Add extent for file
> >>>"/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso" at offset
> >>>1182924800 (4)
> >>>[0xba8540] Add extent for file
> >>>"/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2" at offset
> >>>1182924800 (5)
> >>>[0xba8540] Dedupe 1 extents (id: ffe4cf64) with target: (1182924800,
> >>>131072), "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso"
> >>
> >>Ew, it's deduping these two 1.4GB files 128K at a time, which results in
> >>12000 ioctl calls.  Each of those 12000 calls has to lock the two
> >>inodes, read the file contents, remap the blocks, etc.  instead of
> >>finding the maximal identical range and making a single call for the
> >>whole range.
> >>
> >>That's probably why it's taking forever to dedupe.
> >
> >Yes but it looks like it's also heavily fragmenting the files as a
> >result as well.

I'm not sure why btrfs has that behavior... XFS doesn't do that, and
evidently there's a bug in ocfs2 such that it sometimes merges records
and sometimes does not.  Hmm, I'll have to take a second look at ocfs2.

> This kind of reinforces what I've been telling people recently, namely that
> while generic batch deduplication generally works, it's quite often better
> to do a custom tool that understands your data-set and knows how to handle
> it efficiently.
> 
> As an example, one of the cases where I use deduplication is on a set of
> directories that are disjoint sets of a larger tree.  So, the directories
> look something like this:
> + a
> | + file1
> | \ file2
> + b
> | + file3
> | \ file2
> \ c
>   + file1
>   \ file3
> 
> In this case, I know that if a/file1 and c/file1 have the same mtime and
> size, they're (supposed to be) copies of the same file.  Given this, the
> tool I use for this just checks for duplicate names with the same size and
> mtime, and then counts on the ioctl's check to verify that the files are
> actually identical (and throws a warning if they aren't), and does some
> special stuff to submit things such that any given file both has the fewest
> possible number of extents and all the extents are roughly the same size.
> On average, even with the fancy extent size calculation logic, this still
> takes less than a quarter of the time that duperemove took on the same
> data-set.

It sure would be nice if duperemove could group all the files that are
the same size and perform whole-file dedupe on the identical ones
instead of doing everything chunk by chunk, particularly since all three
filesystems can actually handle that case.

--D
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] duperemove: test presence of dedupe ioctl

2016-12-09 Thread Darrick J. Wong
Since a zero-length dedupe operation is guaranteed to succeed, use that
to test whether or not this filesystem supports dedupe.

Signed-off-by: Darrick J. Wong <darrick.w...@oracle.com>
---
 file_scan.c |   47 +--
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/file_scan.c b/file_scan.c
index 617f166..a34453e 100644
--- a/file_scan.c
+++ b/file_scan.c
@@ -45,11 +45,7 @@
 #include "file_scan.h"
 #include "dbfile.h"
 #include "util.h"
-
-/* This is not in linux/magic.h */
-#ifndef XFS_SB_MAGIC
-#define XFS_SB_MAGIC 0x58465342  /* 'XFSB' */
-#endif
+#include "btrfs-ioctl.h"
 
 static char path[PATH_MAX] = { 0, };
 static char *pathp = path;
@@ -189,6 +185,39 @@ static int walk_dir(const char *name)
return ret;
 }
 
+struct fake_btrfs_ioctl_same_args {
+   struct btrfs_ioctl_same_args args;
+   struct btrfs_ioctl_same_extent_info info;
+};
+
+/*
+ * A zero-length dedupe between two files should always succeed,
+ * so we can use this to test the presence of dedupe functionality.
+ */
+static bool check_ioctl_works(int fd)
+{
+   struct fake_btrfs_ioctl_same_args sa = {0};
+   struct stat sb;
+   static int cached = -1;
+   int ret;
+
+   if (cached >= 0)
+   return cached != 0;
+
+   ret = fstat(fd, &sb);
+   if (ret)
+   return false;
+
+   sa.args.dest_count = 1;
+   sa.args.length = 0;
+   sa.info.fd = fd;
+   sa.info.logical_offset = 0;
+   errno = 0;
+   ret = btrfs_extent_same(fd, &sa.args);
+   cached = !ret && !errno && !sa.info.status;
+   return cached != 0;
+}
+
 static int __add_file(const char *name, struct stat *st,
  struct filerec **ret_file)
 {
@@ -235,12 +264,10 @@ static int __add_file(const char *name, struct stat *st,
goto out;
}
 
-   if (run_dedupe &&
-   ((fs.f_type != BTRFS_SUPER_MAGIC &&
- fs.f_type != XFS_SB_MAGIC))) {
+   if (run_dedupe && !check_ioctl_works(fd)) {
close(fd);
-   fprintf(stderr, "\"%s\": Can only dedupe files on btrfs or xfs "
-   "(experimental)\n", name);
+   fprintf(stderr, "\"%s\": dedupe ioctl not supported on this "
+   "filesystem.\n", name);
return ENOSYS;
}
 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: out-of-band dedup status?

2016-12-08 Thread Darrick J. Wong
On Thu, Dec 08, 2016 at 05:45:40PM -0700, Chris Murphy wrote:
> OK something's wrong.
> 
> Kernel 4.8.12 and duperemove v0.11.beta4. Brand new file system
> (mkfs.btrfs -dsingle -msingle, default mount options) and two
> identical files separately copied.
> 
> [chris@f25s]$ ls -li /mnt/test
> total 2811904
> 260 -rw-r--r--. 1 root root 1439694848 Dec  8 17:26
> Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso
> 259 -rw-r--r--. 1 root root 1439694848 Dec  8 17:26
> Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2
> 
> [chris@f25s]$ filefrag /mnt/test/*
> /mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso: 3 extents found
> /mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2: 2 extents found
> 
> 
> [chris@f25s duperemove]$ sudo ./duperemove -dv /mnt/test/*
> Using 128K blocks
> Using hash: murmur3
> Gathering file list...
> Using 4 threads for file hashing phase
> [1/2] (50.00%) csum: /mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso
> [2/2] (100.00%) csum: 
> /mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2
> Total files:  2
> Total hashes: 21968
> Loading only duplicated hashes from hashfile.
> Using 4 threads for dedupe phase
> [0xba8400] (1/10947) Try to dedupe extents with id e47862ea
> [0xba84a0] (3/10947) Try to dedupe extents with id ffed44f2
> [0xba84f0] (2/10947) Try to dedupe extents with id ffeefcdd
> [0xba8540] (4/10947) Try to dedupe extents with id ffe4cf64
> [0xba8540] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso" at offset
> 1182924800 (4)
> [0xba8540] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2" at offset
> 1182924800 (5)
> [0xba8540] Dedupe 1 extents (id: ffe4cf64) with target: (1182924800,
> 131072), "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso"

Ew, it's deduping these two 1.4GB files 128K at a time, which results in
12000 ioctl calls.  Each of those 12000 calls has to lock the two
inodes, read the file contents, remap the blocks, etc.  instead of
finding the maximal identical range and making a single call for the
whole range.

That's probably why it's taking forever to dedupe.

--D

> [0xba8540] (4/10947) Try to dedupe extents with id ffe4cf64
> [0xba84a0] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso" at offset
> 543293440 (4)
> [0xba84a0] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2" at offset
> 543293440 (5)
> [0xba84a0] Dedupe 1 extents (id: ffed44f2) with target: (543293440,
> 131072), "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso"
> [0xba8540] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2" at offset
> 1182924800 (5)
> [0xba8540] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso" at offset
> 1182924800 (4)
> [0xba8540] Dedupe 1 extents (id: ffe4cf64) with target: (1182924800,
> 131072), "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2"
> [0xba84a0] (3/10947) Try to dedupe extents with id ffed44f2
> [0xba84a0] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2" at offset
> 543293440 (5)
> [0xba84a0] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso" at offset
> 543293440 (4)
> [0xba84a0] Dedupe 1 extents (id: ffed44f2) with target: (543293440,
> 131072), "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2"
> [0xba84f0] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso" at offset
> 101580800 (4)
> [0xba84f0] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2" at offset
> 101580800 (5)
> [0xba84f0] Dedupe 1 extents (id: ffeefcdd) with target: (101580800,
> 131072), "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso"
> [0xba84a0] (5/10947) Try to dedupe extents with id ffe24eaf
> [0xba84a0] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso" at offset
> 171835392 (4)
> [0xba84a0] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2" at offset
> 171835392 (5)
> [0xba84a0] Dedupe 1 extents (id: ffe24eaf) with target: (171835392,
> 131072), "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso"
> [0xba84f0] (2/10947) Try to dedupe extents with id ffeefcdd
> [0xba8540] (6/10947) Try to dedupe extents with id ffe116c8
> [0xba8400] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso" at offset
> 52035584 (4)
> [0xba8400] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2" at offset
> 52035584 (5)
> [0xba8400] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2" at offset
> 52166656 (5)
> [0xba8400] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2" at offset
> 60030976 (5)
> [0xba8400] Add extent for file
> "/mnt/test/Fedora-Workstation-Live-x86_64-25_Beta-1.1.iso2" at offset
> 

Re: [PATCH] generic/35[67]: disable swapfile tests on Btrfs

2016-12-06 Thread Darrick J. Wong
On Mon, Dec 05, 2016 at 05:01:28PM -0800, Omar Sandoval wrote:
> From: Omar Sandoval <osan...@fb.com>
> 
> Btrfs doesn't support swapfiles (yet?), so generic/356 fails
> erroneously, and generic/357 only passes by accident. Let's add a
> _require_scratch_swapfile helper and add it to these tests.

Hehe, good catch. :)
Reviewed-by: Darrick J. Wong <darrick.w...@oracle.com>

--D

> 
> Signed-off-by: Omar Sandoval <osan...@fb.com>
> ---
> I have some code enabling swapfiles for Btrfs [1], but there's some ABBA
> deadlock issues with i_rwsem and mmap_sem on swap-over-NFS that I
> haven't had time to sort out. In the meantime, let's just skip these
> tests.
> 
> 1: https://github.com/osandov/linux/tree/btrfs-swap
> 
>  common/rc | 22 ++
>  tests/generic/356 |  1 +
>  tests/generic/357 |  1 +
>  3 files changed, 24 insertions(+)
> 
> diff --git a/common/rc b/common/rc
> index 2719b23..d863e56 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -1790,6 +1790,28 @@ _require_odirect()
>   rm -f $testfile 2>&1 > /dev/null
>  }
>  
> +# Check that the filesystem supports swapfiles
> +_require_scratch_swapfile()
> +{
> + _require_scratch
> +
> + _scratch_mkfs >/dev/null
> + _scratch_mount
> +
> + # Minimum size for mkswap is 10 pages
> + local size=$(($(get_page_size) * 10))
> +
> + _pwrite_byte 0x61 0 "$size" "$SCRATCH_MNT/swap" >/dev/null 2>&1
> + mkswap "$SCRATCH_MNT/swap" >/dev/null 2>&1
> + if ! swapon "$SCRATCH_MNT/swap" >/dev/null 2>&1; then
> + _scratch_unmount
> + _notrun "swapfiles are not supported"
> + fi
> +
> + swapoff "$SCRATCH_MNT/swap" >/dev/null 2>&1
> + _scratch_unmount
> +}
> +
>  # Check that a fs has enough free space (in 1024b blocks)
>  #
>  _require_fs_space()
> diff --git a/tests/generic/356 b/tests/generic/356
> index 6bb90c0..51eeb65 100755
> --- a/tests/generic/356
> +++ b/tests/generic/356
> @@ -44,6 +44,7 @@ _cleanup()
>  
>  # real QA test starts here
>  _supported_os Linux
> +_require_scratch_swapfile
>  _require_scratch_reflink
>  _require_cp_reflink
>  
> diff --git a/tests/generic/357 b/tests/generic/357
> index 439b314..0dd0c10 100755
> --- a/tests/generic/357
> +++ b/tests/generic/357
> @@ -44,6 +44,7 @@ _cleanup()
>  
>  # real QA test starts here
>  _supported_os Linux
> +_require_scratch_swapfile
>  _require_scratch_reflink
>  _require_cp_reflink
>  
> -- 
> 2.10.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe fstests" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/2] Btrfs: make a source length of 0 imply EOF for dedupe

2016-11-28 Thread Darrick J. Wong
On Thu, Nov 24, 2016 at 11:20:39PM -0500, Zygo Blaxell wrote:
> On Wed, Nov 23, 2016 at 05:26:18PM -0800, Darrick J. Wong wrote:
> [...]
> > Keep in mind that the number of bytes deduped is returned to userspace
> > via file_dedupe_range.info[x].bytes_deduped, so a properly functioning
> > userspace program actually /can/ detect that its 128MB request got cut
> > down to only 16MB and re-issue the request with the offsets moved up by
> > 16MB.  The dedupe client in xfs_io (see dedupe_ioctl() in io/reflink.c)
> > implements this strategy.  duperemove (the only other user I know of)
> > also does this.
> > 
> > So it's really no big deal to increase the limit beyond 16MB, eliminate
> > it entirely, or even change it to cap the total request size while
> > dropping the per-item IO limit.
> > 
> > As I mentioned in my other reply, the only hesitation I have for not
> > killing XFS_MAX_DEDUPE_LEN is that I feel that 2GB is enough IO for a
> > single ioctl call.
> 
> Everything's relative.  btrfs has ioctls that will do hundreds of
> terabytes of IO and take months to run.  2GB of data is nothing.
> 
> Deduping entire 100TB files with a single ioctl call makes as much
> sense to me as reflink copying them with a single ioctl call.  The only
> reason I see to keep the limit is to work around something wrong with
> the implementation.

Ok.  I'll post patches removing the 16MB limitation for XFS and ocfs2 in 4.10
if nobody objects.

--D
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/2] Btrfs: make a source length of 0 imply EOF for dedupe

2016-11-23 Thread Darrick J. Wong
On Thu, Nov 24, 2016 at 10:53:24AM +1100, Dave Chinner wrote:
> On Wed, Nov 23, 2016 at 06:14:47PM -0500, Zygo Blaxell wrote:
> > On Thu, Nov 24, 2016 at 09:13:28AM +1100, Dave Chinner wrote:
> > > On Wed, Nov 23, 2016 at 08:55:59AM -0500, Zygo Blaxell wrote:
> > > > On Wed, Nov 23, 2016 at 03:26:32PM +1100, Dave Chinner wrote:
> > > > > On Tue, Nov 22, 2016 at 09:02:10PM -0500, Zygo Blaxell wrote:
> > > > > > On Thu, Nov 17, 2016 at 04:07:48PM -0800, Omar Sandoval wrote:
> > > > > > > 3. Both XFS and Btrfs cap each dedupe operation to 16MB, but the
> > > > > > >implicit EOF gets around this in the existing XFS 
> > > > > > > implementation. I
> > > > > > >copied this for the Btrfs implementation.
> > > > > > 
> > > > > > Somewhat tangential to this patch, but on the dedup topic:  Can we 
> > > > > > raise
> > > > > > or drop that 16MB limit?
> > > > > > 
> > > > > > The maximum btrfs extent length is 128MB.  Currently the btrfs dedup
> > > > > > behavior for a 128MB extent is to generate 8x16MB shared extent 
> > > > > > references
> > > > > > with different extent offsets to a single 128MB physical extent.
> > > > > > These references no longer look like the original 128MB extent to a
> > > > > > userspace dedup tool.  That raises the difficulty level 
> > > > > > substantially
> > > > > > for a userspace dedup tool when it tries to figure out which 
> > > > > > extents to
> > > > > > keep and which to discard or rewrite.
> > > > > 
> > > > > That, IMO, is a btrfs design/implementation problem, not a problem
> > > > > with the API. Applications are always going to end up doing things
> > > > > that aren't perfectly aligned to extent boundaries or sizes
> > > > > regardless of the size limit that is placed on the dedupe ranges.
> > > > 
> > > > Given that XFS doesn't have all the problems btrfs does, why does XFS
> > > > > have the same arbitrary size limit?  Especially since XFS demonstrably
> > > > doesn't need it?
> > > 
> > > > Creating a new-but-slightly-incompatible API just for XFS makes no
> > > sense - we have multiple filesystems that support this functionality
> > > and so they all should use the same APIs and present (as far as is
> > > possible) the same behaviour to userspace.
> > 
> > OK.  Let's just remove the limit on all the filesystems then.
> > XFS doesn't need it, and btrfs can be fixed.
> 
> Yet applications still have to support kernel versions where btrfs
> has a limit. IOWs, we can remove the limit for future improvement,
> but that doesn't mean userspace is free from having to know about
> the existing limit constraints.

Keep in mind that the number of bytes deduped is returned to userspace
via file_dedupe_range.info[x].bytes_deduped, so a properly functioning
userspace program actually /can/ detect that its 128MB request got cut
down to only 16MB and re-issue the request with the offsets moved up by
16MB.  The dedupe client in xfs_io (see dedupe_ioctl() in io/reflink.c)
implements this strategy.  duperemove (the only other user I know of)
also does this.

So it's really no big deal to increase the limit beyond 16MB, eliminate
it entirely, or even change it to cap the total request size while
dropping the per-item IO limit.

As I mentioned in my other reply, the only hesitation I have for not
killing XFS_MAX_DEDUPE_LEN is that I feel that 2GB is enough IO for a
single ioctl call.

(Dave: That said, if you want to kill it, I'm more than happy to do so
for XFS and ocfs2.)

--D

> That is, once a behaviour has been exposed to userspace through an
> API, we can't just change it and act like it was always that way -
> apps still have to support kernels that expose the old behaviour.
> i.e. the old behaviour is there forever, and this why designing
> userspace APIs is /hard/. It's also why it's better to use an
> existing, slightly less than ideal API than invent a new one that
> will simply have different problems exposed in future...
> 
> > > IOWs it's more important to use existing APIs than to invent a new
> > > one that does almost the same thing. This way userspace applications
> > > don't need to be changed to support new XFS functionality and we
> > > make life easier for everyone. 
> > 
> > Except removing the limit doesn't work that way.  An application that
> > didn't impose an undocumented limit on itself wouldn't break when moved
> > to a filesystem that imposed no such limit, i.e. if XFS had no limit,
> > an application that moved from btrfs to XFS would just work.
> 
> It goes /both ways/ though. Write an app on XFS that does not care
> about limits and it won't work on btrfs because it gets unexpected
> errors.
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> da...@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   3   4   5   >