vm_fault_t conversion, for real

2018-05-15 Thread Christoph Hellwig
Hi all,

this series tries to actually turn vm_fault_t into a type that can be
typechecked and checks the fallout instead of sprinkling random
annotations without context.

The first one fixes a real bug in orangefs, the second and third fix
mismatched existing vm_fault_t annotations on the same function, and the
fourth removes an unused export that was in the chain.  The remaining
patches up to the last one do some not quite trivial conversions, and the
last one does the trivial mass annotation and flips vm_fault_t to a
__bitwise unsigned int - the unsigned type means we also get plain compiler
type checking for the new ->fault signature even without sparse.
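(For anyone not following the sparse discussion, the idea is roughly the
sketch below; the real definition lands in the last patch.  A __bitwise
typedef gives sparse a distinct type to track, and the switch from int to
unsigned int already lets the plain C compiler reject handlers that still
use the old signature.)

typedef unsigned int __bitwise vm_fault_t;	/* as added by patch 14 */

/* old-style handler, returning a plain int that may carry an errno: */
int old_fault(struct vm_fault *vmf);

/* new signature expected for the vm_operations_struct ->fault member: */
vm_fault_t new_fault(struct vm_fault *vmf);

/*
 * Wiring old_fault into ->fault now triggers an incompatible-pointer-type
 * warning from the compiler alone (int vs unsigned int return), and sparse
 * additionally warns wherever an errno value flows into a vm_fault_t.
 */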

This has survived an x86 allyesconfig build and got a SUCCESS from the
buildbot, which I don't really trust - I'm pretty sure there are bits
and pieces hiding in other architectures that it hasn't caught.

The sparse annotations are manually verified for the core MM code and
a few other interesting bits (e.g. DAX and the x86 fault code).

The series is against linux-next as of 2018/05/15 to make sure any
annotations in subsystem trees are picked up.


[PATCH 02/14] fs: make the filemap_page_mkwrite prototype consistent

2018-05-15 Thread Christoph Hellwig
The !CONFIG_MMU version didn't agree with the rest of the kernel.

Signed-off-by: Christoph Hellwig 
---
 mm/filemap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 52517f28e6f4..cf21ced98eff 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2748,7 +2748,7 @@ int generic_file_readonly_mmap(struct file *file, struct 
vm_area_struct *vma)
return generic_file_mmap(file, vma);
 }
 #else
-int filemap_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
 {
return -ENOSYS;
 }
-- 
2.17.0



[PATCH 01/14] orangefs: don't return errno values from ->fault

2018-05-15 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 fs/orangefs/file.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 26358efbf794..b4a25cd4f3fa 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -528,18 +528,16 @@ static long orangefs_ioctl(struct file *file, unsigned 
int cmd, unsigned long ar
return ret;
 }
 
-static int orangefs_fault(struct vm_fault *vmf)
+static vm_fault_t orangefs_fault(struct vm_fault *vmf)
 {
struct file *file = vmf->vma->vm_file;
int rc;
-   rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
-   STATX_SIZE);
-   if (rc == -ESTALE)
-   rc = -EIO;
+
+   rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1, STATX_SIZE);
if (rc) {
gossip_err("%s: orangefs_inode_getattr failed, "
"rc:%d:.\n", __func__, rc);
-   return rc;
+   return VM_FAULT_SIGBUS;
}
return filemap_fault(vmf);
 }
-- 
2.17.0



[PATCH 03/14] dax: make the dax_iomap_fault prototype consistent

2018-05-15 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 include/linux/dax.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/dax.h b/include/linux/dax.h
index dc65ece825ee..a292bccdc274 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -183,7 +183,7 @@ void dax_flush(struct dax_device *dax_dev, void *addr, 
size_t size);
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops);
-int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
pfn_t *pfnp, int *errp, const struct iomap_ops *ops);
 vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
enum page_entry_size pe_size, pfn_t pfn);
-- 
2.17.0



[PATCH 04/14] mm: remove the unused device_private_entry_fault export

2018-05-15 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 kernel/memremap.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/memremap.c b/kernel/memremap.c
index db4e1a373e5f..59ee3b604b39 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -65,7 +65,6 @@ int device_private_entry_fault(struct vm_area_struct *vma,
 */
return page->pgmap->page_fault(vma, addr, page, flags, pmdp);
 }
-EXPORT_SYMBOL(device_private_entry_fault);
 #endif /* CONFIG_DEVICE_PRIVATE */
 
 static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff)
-- 
2.17.0



[PATCH 05/14] ceph: untangle ceph_filemap_fault

2018-05-15 Thread Christoph Hellwig
Streamline the code to have a somewhat natural flow, and separate the
errno values from the VM_FAULT_* values.
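The pattern is the same one used throughout the series: keep errnos in a
plain int and translate them to VM_FAULT_* codes only at the boundary.  A
minimal, hypothetical illustration (not the actual ceph code):

static vm_fault_t example_fault(struct vm_fault *vmf)
{
	vm_fault_t ret;		/* only ever holds VM_FAULT_* codes */
	int err;		/* only ever holds 0 or a negative errno */

	err = prepare_fault(vmf);	/* hypothetical helper returning -errno */
	if (err)
		return (err == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;

	ret = filemap_fault(vmf);	/* returns vm_fault_t after this series */
	return ret;
}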

Signed-off-by: Christoph Hellwig 
---
 fs/ceph/addr.c | 100 +
 1 file changed, 51 insertions(+), 49 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5f7ad3d0df2e..6e80894ca073 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1428,15 +1428,18 @@ static void ceph_restore_sigs(sigset_t *oldset)
 /*
  * vm ops
  */
-static int ceph_filemap_fault(struct vm_fault *vmf)
+static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
+   struct address_space *mapping = inode->i_mapping;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
-   struct page *pinned_page = NULL;
+   struct page *pinned_page = NULL, *page;
loff_t off = vmf->pgoff << PAGE_SHIFT;
-   int want, got, ret;
+   int want, got, err = 0;
+   vm_fault_t ret = 0;
+   bool did_fault = false;
sigset_t oldset;
 
	ceph_block_sigs(&oldset);
@@ -1449,9 +1452,9 @@ static int ceph_filemap_fault(struct vm_fault *vmf)
want = CEPH_CAP_FILE_CACHE;
 
got = 0;
-   ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
-   if (ret < 0)
-   goto out_restore;
+   err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+   if (err < 0)
+   goto out_errno;
 
dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
 inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
@@ -1462,8 +1465,8 @@ static int ceph_filemap_fault(struct vm_fault *vmf)
	ceph_add_rw_context(fi, &rw_ctx);
	ret = filemap_fault(vmf);
	ceph_del_rw_context(fi, &rw_ctx);
-   } else
-   ret = -EAGAIN;
+   did_fault = true;
+   }
 
dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
 inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got), ret);
@@ -1471,57 +1474,55 @@ static int ceph_filemap_fault(struct vm_fault *vmf)
put_page(pinned_page);
ceph_put_cap_refs(ci, got);
 
-   if (ret != -EAGAIN)
+   if (did_fault)
goto out_restore;
 
/* read inline data */
if (off >= PAGE_SIZE) {
/* does not support inline data > PAGE_SIZE */
ret = VM_FAULT_SIGBUS;
+   goto out_restore;
+   }
+
+   page = find_or_create_page(mapping, 0,
+   mapping_gfp_constraint(mapping, ~__GFP_FS));
+   if (!page) {
+   ret = VM_FAULT_OOM;
+   goto out_inline;
+   }
+
+   err = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, true);
+   if (err < 0 || off >= i_size_read(inode)) {
+   unlock_page(page);
+   put_page(page);
+   if (err < 0)
+   goto out_errno;
+   ret = VM_FAULT_SIGBUS;
} else {
-   int ret1;
-   struct address_space *mapping = inode->i_mapping;
-   struct page *page = find_or_create_page(mapping, 0,
-   mapping_gfp_constraint(mapping,
-   ~__GFP_FS));
-   if (!page) {
-   ret = VM_FAULT_OOM;
-   goto out_inline;
-   }
-   ret1 = __ceph_do_getattr(inode, page,
-CEPH_STAT_CAP_INLINE_DATA, true);
-   if (ret1 < 0 || off >= i_size_read(inode)) {
-   unlock_page(page);
-   put_page(page);
-   if (ret1 < 0)
-   ret = ret1;
-   else
-   ret = VM_FAULT_SIGBUS;
-   goto out_inline;
-   }
-   if (ret1 < PAGE_SIZE)
-   zero_user_segment(page, ret1, PAGE_SIZE);
+   if (err < PAGE_SIZE)
+   zero_user_segment(page, err, PAGE_SIZE);
else
flush_dcache_page(page);
SetPageUptodate(page);
vmf->page = page;
ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
-out_inline:
-   dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
-inode, off, (size_t)PAGE_SIZE, ret);
}
+
+out_inline:
+   dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
+inode, off, (size_t)PAGE_SIZE, ret);
 out_restore:
	ceph_restore_sigs(&oldset);
-   if (ret < 0)
-   ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
-
return ret;
+out_errno:
+   ret = (err == -ENOMEM) 

[PATCH 06/14] btrfs: separate errno from VM_FAULT_* values

2018-05-15 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 fs/btrfs/ctree.h |  2 +-
 fs/btrfs/inode.c | 19 ++-
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1485cd130e2b..02a0de73c1d1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3203,7 +3203,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long 
offset,
 size_t size, struct bio *bio,
 unsigned long bio_flags);
 void btrfs_set_range_writeback(void *private_data, u64 start, u64 end);
-int btrfs_page_mkwrite(struct vm_fault *vmf);
+vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ec9db248c499..f4f03f0f4556 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8824,7 +8824,7 @@ static void btrfs_invalidatepage(struct page *page, 
unsigned int offset,
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
  */
-int btrfs_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
 {
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -8836,7 +8836,8 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
char *kaddr;
unsigned long zero_start;
loff_t size;
-   int ret;
+   vm_fault_t ret;
+   int err;
int reserved = 0;
u64 reserved_space;
u64 page_start;
@@ -8858,14 +8859,14 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
 * end up waiting indefinitely to get a lock on the page currently
 * being processed by btrfs_page_mkwrite() function.
 */
-   ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
+   err = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
   reserved_space);
-   if (!ret) {
-   ret = file_update_time(vmf->vma->vm_file);
+   if (!err) {
+   err = file_update_time(vmf->vma->vm_file);
reserved = 1;
}
-   if (ret) {
-   if (ret == -ENOMEM)
+   if (err) {
+   if (err == -ENOMEM)
ret = VM_FAULT_OOM;
else /* -ENOSPC, -EIO, etc */
ret = VM_FAULT_SIGBUS;
@@ -8927,9 +8928,9 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
  0, 0, &cached_state);
 
-   ret = btrfs_set_extent_delalloc(inode, page_start, end, 0,
+   err = btrfs_set_extent_delalloc(inode, page_start, end, 0,
&cached_state, 0);
-   if (ret) {
+   if (err) {
unlock_extent_cached(io_tree, page_start, page_end,
 &cached_state);
ret = VM_FAULT_SIGBUS;
-- 
2.17.0



[PATCH 07/14] ext4: separate errno from VM_FAULT_* values

2018-05-15 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 fs/ext4/ext4.h  |  4 ++--
 fs/ext4/inode.c | 30 +++---
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fa52b7dd4542..48592d0edf3e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2463,8 +2463,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
 loff_t lstart, loff_t lend);
-extern int ext4_page_mkwrite(struct vm_fault *vmf);
-extern int ext4_filemap_fault(struct vm_fault *vmf);
+extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
+extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
 extern void ext4_da_update_reserve_space(struct inode *inode,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 95bc48f5c88b..fe49045a2832 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -6106,27 +6106,27 @@ static int ext4_bh_unmapped(handle_t *handle, struct 
buffer_head *bh)
return !buffer_mapped(bh);
 }
 
-int ext4_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
loff_t size;
unsigned long len;
-   int ret;
+   vm_fault_t ret;
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
handle_t *handle;
get_block_t *get_block;
-   int retries = 0;
+   int retries = 0, err;
 
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
 
	down_read(&EXT4_I(inode)->i_mmap_sem);
 
-   ret = ext4_convert_inline_data(inode);
-   if (ret)
+   err = ext4_convert_inline_data(inode);
+   if (err)
goto out_ret;
 
/* Delalloc case is easy... */
@@ -6134,9 +6134,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
!ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
-   ret = block_page_mkwrite(vma, vmf,
+   err = block_page_mkwrite(vma, vmf,
   ext4_da_get_block_prep);
-   } while (ret == -ENOSPC &&
+   } while (err == -ENOSPC &&
   ext4_should_retry_alloc(inode->i_sb, &retries));
goto out_ret;
}
@@ -6181,8 +6181,8 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
ret = VM_FAULT_SIGBUS;
goto out;
}
-   ret = block_page_mkwrite(vma, vmf, get_block);
-   if (!ret && ext4_should_journal_data(inode)) {
+   err = block_page_mkwrite(vma, vmf, get_block);
+   if (!err && ext4_should_journal_data(inode)) {
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
  PAGE_SIZE, NULL, do_journal_get_write_access)) {
unlock_page(page);
@@ -6193,24 +6193,24 @@ int ext4_page_mkwrite(struct vm_fault *vmf)
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
}
ext4_journal_stop(handle);
-   if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+   if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_alloc;
 out_ret:
-   ret = block_page_mkwrite_return(ret);
+   ret = block_page_mkwrite_return(err);
 out:
	up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
 }
 
-int ext4_filemap_fault(struct vm_fault *vmf)
+vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
 {
struct inode *inode = file_inode(vmf->vma->vm_file);
-   int err;
+   vm_fault_t ret;
 
	down_read(&EXT4_I(inode)->i_mmap_sem);
-   err = filemap_fault(vmf);
+   ret = filemap_fault(vmf);
	up_read(&EXT4_I(inode)->i_mmap_sem);
 
-   return err;
+   return ret;
 }
-- 
2.17.0



[PATCH 10/14] vgem: separate errno from VM_FAULT_* values

2018-05-15 Thread Christoph Hellwig
And streamline the code in vgem_fault with early returns so that it is
a little bit more readable.

Signed-off-by: Christoph Hellwig 
---
 drivers/gpu/drm/vgem/vgem_drv.c | 51 +++--
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index 2524ff116f00..a261e0aab83a 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -61,12 +61,13 @@ static void vgem_gem_free_object(struct drm_gem_object *obj)
kfree(vgem_obj);
 }
 
-static int vgem_gem_fault(struct vm_fault *vmf)
+static vm_fault_t vgem_gem_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
struct drm_vgem_gem_object *obj = vma->vm_private_data;
/* We don't use vmf->pgoff since that has the fake offset */
unsigned long vaddr = vmf->address;
+   struct page *page;
int ret;
loff_t num_pages;
pgoff_t page_offset;
@@ -85,35 +86,29 @@ static int vgem_gem_fault(struct vm_fault *vmf)
ret = 0;
}
	mutex_unlock(&obj->pages_lock);
-   if (ret) {
-   struct page *page;
-
-   page = shmem_read_mapping_page(
-   file_inode(obj->base.filp)->i_mapping,
-   page_offset);
-   if (!IS_ERR(page)) {
-   vmf->page = page;
-   ret = 0;
-   } else switch (PTR_ERR(page)) {
-   case -ENOSPC:
-   case -ENOMEM:
-   ret = VM_FAULT_OOM;
-   break;
-   case -EBUSY:
-   ret = VM_FAULT_RETRY;
-   break;
-   case -EFAULT:
-   case -EINVAL:
-   ret = VM_FAULT_SIGBUS;
-   break;
-   default:
-   WARN_ON(PTR_ERR(page));
-   ret = VM_FAULT_SIGBUS;
-   break;
-   }
+   if (!ret)
+   return 0;
+
+   page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping,
+   page_offset);
+   if (!IS_ERR(page)) {
+   vmf->page = page;
+   return 0;
+   }
 
+   switch (PTR_ERR(page)) {
+   case -ENOSPC:
+   case -ENOMEM:
+   return VM_FAULT_OOM;
+   case -EBUSY:
+   return VM_FAULT_RETRY;
+   case -EFAULT:
+   case -EINVAL:
+   return VM_FAULT_SIGBUS;
+   default:
+   WARN_ON(PTR_ERR(page));
+   return VM_FAULT_SIGBUS;
}
-   return ret;
 }
 
 static const struct vm_operations_struct vgem_gem_vm_ops = {
-- 
2.17.0



[PATCH 09/14] ubifs: separate errno from VM_FAULT_* values

2018-05-15 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 fs/ubifs/file.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 1acb2ff505e6..7c1a2e1c3de5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1513,7 +1513,7 @@ static int ubifs_releasepage(struct page *page, gfp_t 
unused_gfp_flags)
  * mmap()d file has taken write protection fault and is being made writable.
  * UBIFS must ensure page is budgeted for.
  */
-static int ubifs_vm_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
 {
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -1521,6 +1521,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf)
struct timespec now = current_time(inode);
struct ubifs_budget_req req = { .new_page = 1 };
int err, update_time;
+   vm_fault_t ret = 0;
 
dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
i_size_read(inode));
@@ -1601,8 +1602,8 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf)
unlock_page(page);
	ubifs_release_budget(c, &req);
if (err)
-   err = VM_FAULT_SIGBUS;
-   return err;
+   ret = VM_FAULT_SIGBUS;
+   return ret;
 }
 
 static const struct vm_operations_struct ubifs_file_vm_ops = {
-- 
2.17.0



[PATCH v5 1/3] btrfs: Add unprivileged ioctl which returns subvolume information

2018-05-15 Thread Tomohiro Misono
Add a new unprivileged ioctl BTRFS_IOC_GET_SUBVOL_INFO which returns
the information of the subvolume containing this inode
(i.e. the information in ROOT_ITEM and ROOT_BACKREF).
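For context, a hypothetical unprivileged userspace caller could look like
the sketch below; it relies only on the ioctl name and the args struct
added by this patch, and anything not visible in the diff should be read
as an assumption:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>	/* uapi header with this patch applied */

static int print_subvol_info(const char *path)
{
	struct btrfs_ioctl_get_subvol_info_args args;
	int fd, ret;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	ret = ioctl(fd, BTRFS_IOC_GET_SUBVOL_INFO, &args);
	if (!ret)
		printf("subvol id %llu, generation %llu\n",
		       (unsigned long long)args.id,
		       (unsigned long long)args.generation);
	close(fd);
	return ret;
}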

Signed-off-by: Tomohiro Misono 
---
 v4 -> v5
- Update error handling of btrfs_next_leaf() to cover all cases
- Return error if ROOT_BACKREF is not found (except top-level)

 fs/btrfs/ioctl.c   | 146 +
 include/uapi/linux/btrfs.h |  51 
 2 files changed, 197 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 48e2ddff32bd..c1c9ae9a937d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2242,6 +2242,150 @@ static noinline int btrfs_ioctl_ino_lookup(struct file 
*file,
return ret;
 }
 
+/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
+static noinline int btrfs_ioctl_get_subvol_info(struct file *file,
+  void __user *argp)
+{
+   struct btrfs_ioctl_get_subvol_info_args *subvol_info;
+   struct btrfs_root *root;
+   struct btrfs_path *path;
+   struct btrfs_key key;
+
+   struct btrfs_root_item root_item;
+   struct btrfs_root_ref *rref;
+   struct extent_buffer *l;
+   int slot;
+
+   unsigned long item_off;
+   unsigned long item_len;
+
+   struct inode *inode;
+   int ret;
+
+   path = btrfs_alloc_path();
+   if (!path)
+   return -ENOMEM;
+
+   subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
+   if (!subvol_info) {
+   btrfs_free_path(path);
+   return -ENOMEM;
+   }
+
+   inode = file_inode(file);
+   root = BTRFS_I(inode)->root->fs_info->tree_root;
+
+   key.objectid = BTRFS_I(inode)->root->root_key.objectid;
+   key.type = BTRFS_ROOT_ITEM_KEY;
+   key.offset = 0;
+
+   ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+   if (ret < 0) {
+   goto out;
+   } else if (ret > 0) {
+   u64 objectid = key.objectid;
+
+   if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+   ret = btrfs_next_leaf(root, path);
+   if (ret < 0) {
+   goto out;
+   } else if (ret > 0) {
+   ret = -ENOENT;
+   goto out;
+   }
+   }
+
+   /* If the subvolume is a snapshot, offset is not zero */
+   btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+   if (key.objectid != objectid ||
+   key.type != BTRFS_ROOT_ITEM_KEY) {
+   ret = -ENOENT;
+   goto out;
+   }
+   }
+
+   l = path->nodes[0];
+   slot = path->slots[0];
+   item_off = btrfs_item_ptr_offset(l, slot);
+   item_len = btrfs_item_size_nr(l, slot);
+   read_extent_buffer(l, &root_item, item_off, item_len);
+
+   subvol_info->id = key.objectid;
+
+   subvol_info->generation = btrfs_root_generation(&root_item);
+   subvol_info->flags = btrfs_root_flags(&root_item);
+
+   memcpy(subvol_info->uuid, root_item.uuid, BTRFS_UUID_SIZE);
+   memcpy(subvol_info->parent_uuid, root_item.parent_uuid,
+   BTRFS_UUID_SIZE);
+   memcpy(subvol_info->received_uuid, root_item.received_uuid,
+   BTRFS_UUID_SIZE);
+
+   subvol_info->ctransid = btrfs_root_ctransid(&root_item);
+   subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item.ctime);
+   subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item.ctime);
+
+   subvol_info->otransid = btrfs_root_otransid(&root_item);
+   subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item.otime);
+   subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item.otime);
+
+   subvol_info->stransid = btrfs_root_stransid(&root_item);
+   subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item.stime);
+   subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item.stime);
+
+   subvol_info->rtransid = btrfs_root_rtransid(&root_item);
+   subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item.rtime);
+   subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item.rtime);
+
+   btrfs_release_path(path);
+   if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
+   key.type = BTRFS_ROOT_BACKREF_KEY;
+   key.offset = 0;
+   ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+   if (ret < 0) {
+   goto out;
+   } else if (path->slots[0] >=
+   btrfs_header_nritems(path->nodes[0])) {
+   ret = btrfs_next_leaf(root, path);
+   if (ret < 0) {
+   goto out;
+   } else 

[PATCH v5 0/3] btrfs: Add three new unprivileged ioctls to allow normal users to call "sub list/show" etc.

2018-05-15 Thread Tomohiro Misono
[based on current misc-next]

changelog:
v4 -> v5
  - Update error handling of 1st/2nd patch. See each log for details
  - Fix misspelling
v3 -> v4
  - call btrfs_next_leaf() after btrfs_search_slot() when the slot
position exceeds the number of items
  - rebased to current misc-next
v2 -> v3
  - fix kbuild test bot warning
v1 -> v2
  - completely reimplement 1st/2nd ioctl to have user friendly api
  - various cleanup, remove unnecessary goto
===

This adds three new unprivileged ioctls:

1st patch:
  ioctl which returns subvolume information of ROOT_ITEM and ROOT_BACKREF
2nd patch:
  ioctl which returns subvolume information of ROOT_REF (without subvolume name)
3rd patch: 
  user version of ino_lookup ioctl which also performs permission check.

They will be used to implement the user version of "subvolume list/show" etc.
in user tools.
See each commit log for more details.

The corresponding btrfs-progs implementation can be found on the ML in the
thread titled:
  [PATCH 0/11] btrfs-progs: Rework of "subvolume list/show" and relax the root
  privileges of them

Tomohiro Misono (3):
  btrfs: Add unprivileged ioctl which returns subvolume information
  btrfs: Add unprivileged ioctl which returns subvolume's ROOT_REF
  btrfs: Add unprivileged version of ino_lookup ioctl

 fs/btrfs/ioctl.c   | 452 +
 include/uapi/linux/btrfs.h |  84 +
 2 files changed, 536 insertions(+)

-- 
2.14.3




[PATCH v5 2/3] btrfs: Add unprivileged ioctl which returns subvolume's ROOT_REF

2018-05-15 Thread Tomohiro Misono
Add unprivileged ioctl BTRFS_IOC_GET_SUBVOL_ROOTREF which
returns ROOT_REF information of the subvolume containing this inode
except the subvolume name (this is to prevent a potential name leak). The
subvolume name can be obtained by the user version of the ino_lookup ioctl
(BTRFS_IOC_INO_LOOKUP_USER), which also performs a permission check.

The minimum subvolid of the root refs to be searched is specified by
@min_id in struct btrfs_ioctl_get_subvol_rootref_args. After the search
ends, @min_id is set to the last searched root ref's subvolid + 1. Also,
if there are more root refs than BTRFS_MAX_ROOTREF_BUFFER_NUM, -EOVERFLOW
is returned. Therefore the caller can just call this ioctl again without
changing the argument to continue the search.
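A hypothetical userspace loop built on that continuation contract might
look like this (sketch only, assuming the uapi additions from this patch):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int list_rootrefs(int fd)
{
	struct btrfs_ioctl_get_subvol_rootref_args args;
	int i, ret;

	memset(&args, 0, sizeof(args));
	args.min_id = 0;	/* start from the smallest subvolid */

	for (;;) {
		ret = ioctl(fd, BTRFS_IOC_GET_SUBVOL_ROOTREF, &args);
		if (ret < 0 && errno != EOVERFLOW)
			return -1;

		for (i = 0; i < (int)args.num_items; i++)
			printf("subvolid %llu dirid %llu\n",
			       (unsigned long long)args.rootref[i].subvolid,
			       (unsigned long long)args.rootref[i].dirid);

		if (ret == 0)	/* no overflow: the search finished */
			return 0;
		/* EOVERFLOW: @min_id was already advanced by the kernel, retry */
	}
}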

Signed-off-by: Tomohiro Misono 
---
 v4 -> v5
- Update error handling of btrfs_next_leaf() to cover all cases
- Use btrfs_next_item() to reduce the call of btrfs_search_slot()

 fs/btrfs/ioctl.c   | 102 +
 include/uapi/linux/btrfs.h |  16 +++
 2 files changed, 118 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c1c9ae9a937d..db5de77540e1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2386,6 +2386,106 @@ static noinline int btrfs_ioctl_get_subvol_info(struct 
file *file,
return ret;
 }
 
+/*
+ * Return ROOT_REF information of the subvolume containing this inode
+ * except the subvolume name.
+ */
+static noinline int btrfs_ioctl_get_subvol_rootref(struct file *file,
+  void __user *argp)
+{
+   struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
+   struct btrfs_root_ref *rref;
+   struct btrfs_root *root;
+   struct btrfs_path *path;
+   struct btrfs_key key;
+
+   struct extent_buffer *l;
+   int slot;
+
+   struct inode *inode;
+   int ret;
+   u64 objectid;
+   u8 found;
+
+   path = btrfs_alloc_path();
+   if (!path)
+   return -ENOMEM;
+
+   rootrefs = memdup_user(argp, sizeof(*rootrefs));
+   if (!rootrefs) {
+   btrfs_free_path(path);
+   return -ENOMEM;
+   }
+
+   inode = file_inode(file);
+   root = BTRFS_I(inode)->root->fs_info->tree_root;
+   objectid = BTRFS_I(inode)->root->root_key.objectid;
+
+   key.objectid = objectid;
+   key.type = BTRFS_ROOT_REF_KEY;
+   key.offset = rootrefs->min_id;
+   found = 0;
+
+   ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+   if (ret < 0) {
+   goto out;
+   } else if (path->slots[0] >=
+   btrfs_header_nritems(path->nodes[0])) {
+   ret = btrfs_next_leaf(root, path);
+   if (ret < 0) {
+   goto out;
+   } else if (ret > 0) {
+   ret = -ENOENT;
+   goto out;
+   }
+   }
+   while (1) {
+   l = path->nodes[0];
+   slot = path->slots[0];
+
+   btrfs_item_key_to_cpu(l, &key, slot);
+   if (key.objectid != objectid ||
+   key.type != BTRFS_ROOT_REF_KEY) {
+   ret = 0;
+   goto out;
+   }
+
+   if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
+   ret = -EOVERFLOW;
+   goto out;
+   }
+
+   rref = btrfs_item_ptr(l, slot, struct btrfs_root_ref);
+   rootrefs->rootref[found].subvolid = key.offset;
+   rootrefs->rootref[found].dirid =
+ btrfs_root_ref_dirid(l, rref);
+   found++;
+
+   ret = btrfs_next_item(root, path);
+   if (ret < 0) {
+   goto out;
+   } else if (ret > 0) {
+   ret = -ENOENT;
+   goto out;
+   }
+   }
+
+out:
+   if (!ret || ret == -EOVERFLOW) {
+   rootrefs->num_items = found;
+   /* update min_id for next search */
+   if (found)
+   rootrefs->min_id =
+   rootrefs->rootref[found - 1].subvolid + 1;
+   if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
+   ret = -EFAULT;
+   }
+
+   btrfs_free_path(path);
+   kfree(rootrefs);
+   return ret;
+}
+
 static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 void __user *arg)
 {
@@ -5520,6 +5620,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_features(file, argp);
case BTRFS_IOC_GET_SUBVOL_INFO:
return btrfs_ioctl_get_subvol_info(file, argp);
+   case BTRFS_IOC_GET_SUBVOL_ROOTREF:
+   return btrfs_ioctl_get_subvol_rootref(file, argp);
}
 
return -ENOTTY;
diff --git 

[PATCH v5 3/3] btrfs: Add unprivileged version of ino_lookup ioctl

2018-05-15 Thread Tomohiro Misono
Add an unprivileged version of the ino_lookup ioctl, BTRFS_IOC_INO_LOOKUP_USER,
to allow normal users to call "btrfs subvolume list/show" etc. in
combination with BTRFS_IOC_GET_SUBVOL_INFO/BTRFS_IOC_GET_SUBVOL_ROOTREF.

This can be used like BTRFS_IOC_INO_LOOKUP, but the argument is
different. This is because it always searches the fs/file tree
corresponding to the fd with which this ioctl is called and also
returns the name of the bottom subvolume.

The main differences from the original ino_lookup ioctl are:
  1. Read + Exec permission will be checked using inode_permission()
 during path construction. -EACCES will be returned in case
 of failure.
  2. Path construction will be stopped at the inode number which
 corresponds to the fd with which this ioctl is called. If
 constructed path does not exist under fd's inode, -EACCES
 will be returned.
  3. The name of bottom subvolume is also searched and filled.

Note that the maximum path length is 256 (BTRFS_VOL_NAME_MAX+1) bytes shorter
than in the ino_lookup ioctl because of the space reserved for the subvolume's name.
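Putting the three ioctls together, a hypothetical userspace sketch could
resolve a child subvolume's relative path as below; dirid and path are
taken from this patch, the name field is described above, and any other
detail is an assumption:

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/*
 * @dirid would typically come from a rootref entry returned by
 * BTRFS_IOC_GET_SUBVOL_ROOTREF (patch 2/3).
 */
static int lookup_subvol_path(int fd, __u64 dirid)
{
	struct btrfs_ioctl_ino_lookup_user_args args;

	memset(&args, 0, sizeof(args));
	args.dirid = dirid;

	if (ioctl(fd, BTRFS_IOC_INO_LOOKUP_USER, &args) < 0)
		return -1;	/* e.g. EACCES when the permission checks fail */

	printf("path: %s, subvolume name: %s\n", args.path, args.name);
	return 0;
}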

Signed-off-by: Tomohiro Misono 
---
 fs/btrfs/ioctl.c   | 204 +
 include/uapi/linux/btrfs.h |  17 
 2 files changed, 221 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index db5de77540e1..5120e934f602 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2200,6 +2200,166 @@ static noinline int btrfs_search_path_in_tree(struct 
btrfs_fs_info *info,
return ret;
 }
 
+static noinline int btrfs_search_path_in_tree_user(struct inode *inode,
+   struct btrfs_ioctl_ino_lookup_user_args *args)
+{
+   struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+   struct super_block *sb = inode->i_sb;
+   struct btrfs_key upper_limit = BTRFS_I(inode)->location;
+   u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
+   u64 dirid = args->dirid;
+
+   unsigned long item_off;
+   unsigned long item_len;
+   struct btrfs_inode_ref *iref;
+   struct btrfs_root_ref *rref;
+   struct btrfs_root *root;
+   struct btrfs_path *path;
+   struct btrfs_key key, key2;
+   struct extent_buffer *l;
+   struct inode *temp_inode;
+   char *ptr;
+   int slot;
+   int len;
+   int total_len = 0;
+   int ret;
+
+   path = btrfs_alloc_path();
+   if (!path)
+   return -ENOMEM;
+
+   /*
+* If the bottom subvolume does not exist directly under upper_limit,
+* construct the path in bottomup way.
+*/
+   if (dirid != upper_limit.objectid) {
+   ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
+
+   key.objectid = treeid;
+   key.type = BTRFS_ROOT_ITEM_KEY;
+   key.offset = (u64)-1;
+   root = btrfs_read_fs_root_no_name(fs_info, &key);
+   if (IS_ERR(root)) {
+   ret = -ENOENT;
+   goto out;
+   }
+
+   key.objectid = dirid;
+   key.type = BTRFS_INODE_REF_KEY;
+   key.offset = (u64)-1;
+   while (1) {
+   ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+   if (ret < 0) {
+   goto out;
+   } else if (ret > 0) {
+   ret = btrfs_previous_item(root, path, dirid,
+ BTRFS_INODE_REF_KEY);
+   if (ret < 0) {
+   goto out;
+   } else if (ret > 0) {
+   ret = -ENOENT;
+   goto out;
+   }
+   }
+
+   l = path->nodes[0];
+   slot = path->slots[0];
+   btrfs_item_key_to_cpu(l, &key, slot);
+
+   iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
+   len = btrfs_inode_ref_name_len(l, iref);
+   ptr -= len + 1;
+   total_len += len + 1;
+   if (ptr < args->path) {
+   ret = -ENAMETOOLONG;
+   goto out;
+   }
+
+   *(ptr + len) = '/';
+   read_extent_buffer(l, ptr,
+   (unsigned long)(iref + 1), len);
+
+   /* Check the read+exec permission of this directory */
+   ret = btrfs_previous_item(root, path, dirid,
+ BTRFS_INODE_ITEM_KEY);
+   if (ret < 0) {
+   goto out;
+   } else if (ret > 0) {
+   ret = -ENOENT;
+  

[PATCH 11/14] ttm: separate errno from VM_FAULT_* values

2018-05-15 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 drivers/gpu/drm/ttm/ttm_bo_vm.c | 42 +
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 8eba95b3c737..255e7801f62c 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -43,10 +43,11 @@
 
 #define TTM_BO_VM_NUM_PREFAULT 16
 
-static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
+static vm_fault_t ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
struct vm_fault *vmf)
 {
-   int ret = 0;
+   vm_fault_t ret = 0;
+   int err = 0;
 
if (likely(!bo->moving))
goto out_unlock;
@@ -77,8 +78,8 @@ static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo,
/*
 * Ordinary wait.
 */
-   ret = dma_fence_wait(bo->moving, true);
-   if (unlikely(ret != 0)) {
+   err = dma_fence_wait(bo->moving, true);
+   if (unlikely(err != 0)) {
ret = (ret != -ERESTARTSYS) ? VM_FAULT_SIGBUS :
VM_FAULT_NOPAGE;
goto out_unlock;
@@ -104,7 +105,7 @@ static unsigned long ttm_bo_io_mem_pfn(struct 
ttm_buffer_object *bo,
+ page_offset;
 }
 
-static int ttm_bo_vm_fault(struct vm_fault *vmf)
+static vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
@@ -115,7 +116,8 @@ static int ttm_bo_vm_fault(struct vm_fault *vmf)
unsigned long pfn;
struct ttm_tt *ttm = NULL;
struct page *page;
-   int ret;
+   vm_fault_t ret;
+   int err;
int i;
unsigned long address = vmf->address;
struct ttm_mem_type_manager *man =
@@ -128,9 +130,9 @@ static int ttm_bo_vm_fault(struct vm_fault *vmf)
 * for reserve, and if it fails, retry the fault after waiting
 * for the buffer to become unreserved.
 */
-   ret = ttm_bo_reserve(bo, true, true, NULL);
-   if (unlikely(ret != 0)) {
-   if (ret != -EBUSY)
+   err = ttm_bo_reserve(bo, true, true, NULL);
+   if (unlikely(err != 0)) {
+   if (err != -EBUSY)
return VM_FAULT_NOPAGE;
 
if (vmf->flags & FAULT_FLAG_ALLOW_RETRY) {
@@ -162,8 +164,8 @@ static int ttm_bo_vm_fault(struct vm_fault *vmf)
}
 
if (bdev->driver->fault_reserve_notify) {
-   ret = bdev->driver->fault_reserve_notify(bo);
-   switch (ret) {
+   err = bdev->driver->fault_reserve_notify(bo);
+   switch (err) {
case 0:
break;
case -EBUSY:
@@ -191,13 +193,13 @@ static int ttm_bo_vm_fault(struct vm_fault *vmf)
goto out_unlock;
}
 
-   ret = ttm_mem_io_lock(man, true);
-   if (unlikely(ret != 0)) {
+   err = ttm_mem_io_lock(man, true);
+   if (unlikely(err != 0)) {
ret = VM_FAULT_NOPAGE;
goto out_unlock;
}
-   ret = ttm_mem_io_reserve_vm(bo);
-   if (unlikely(ret != 0)) {
+   err = ttm_mem_io_reserve_vm(bo);
+   if (unlikely(err != 0)) {
ret = VM_FAULT_SIGBUS;
goto out_io_unlock;
}
@@ -265,21 +267,21 @@ static int ttm_bo_vm_fault(struct vm_fault *vmf)
}
 
if (vma->vm_flags & VM_MIXEDMAP)
-   ret = vm_insert_mixed(&cvma, address,
+   err = vm_insert_mixed(&cvma, address,
__pfn_to_pfn_t(pfn, PFN_DEV));
else
-   ret = vm_insert_pfn(&cvma, address, pfn);
+   err = vm_insert_pfn(&cvma, address, pfn);
 
/*
 * Somebody beat us to this PTE or prefaulting to
 * an already populated PTE, or prefaulting error.
 */
 
-   if (unlikely((ret == -EBUSY) || (ret != 0 && i > 0)))
+   if (unlikely((err == -EBUSY) || (err != 0 && i > 0)))
break;
-   else if (unlikely(ret != 0)) {
+   else if (unlikely(err != 0)) {
ret =
-   (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+   (err == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
goto out_io_unlock;
}
 
-- 
2.17.0



[PATCH 12/14] lustre: separate errno from VM_FAULT_* values

2018-05-15 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 .../staging/lustre/lustre/llite/llite_mmap.c  | 37 +++
 .../lustre/lustre/llite/vvp_internal.h|  2 +-
 2 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c 
b/drivers/staging/lustre/lustre/llite/llite_mmap.c
index 214b07554e62..061d98871959 100644
--- a/drivers/staging/lustre/lustre/llite/llite_mmap.c
+++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c
@@ -231,23 +231,18 @@ static int ll_page_mkwrite0(struct vm_area_struct *vma, 
struct page *vmpage,
return result;
 }
 
-static inline int to_fault_error(int result)
+static inline vm_fault_t to_fault_error(int result)
 {
switch (result) {
case 0:
-   result = VM_FAULT_LOCKED;
-   break;
+   return VM_FAULT_LOCKED;
case -EFAULT:
-   result = VM_FAULT_NOPAGE;
-   break;
+   return VM_FAULT_NOPAGE;
case -ENOMEM:
-   result = VM_FAULT_OOM;
-   break;
+   return VM_FAULT_OOM;
default:
-   result = VM_FAULT_SIGBUS;
-   break;
+   return VM_FAULT_SIGBUS;
}
-   return result;
 }
 
 /**
@@ -261,7 +256,7 @@ static inline int to_fault_error(int result)
  * \retval VM_FAULT_ERROR on general error
  * \retval NOPAGE_OOM not have memory for allocate new page
  */
-static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
+static vm_fault_t ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
struct lu_env  *env;
struct cl_io*io;
@@ -269,7 +264,7 @@ static int ll_fault0(struct vm_area_struct *vma, struct 
vm_fault *vmf)
struct page  *vmpage;
unsigned long   ra_flags;
int   result = 0;
-   int   fault_ret = 0;
+   vm_fault_tfault_ret = 0;
u16 refcheck;
 
	env = cl_env_get(&refcheck);
@@ -323,7 +318,7 @@ static int ll_fault0(struct vm_area_struct *vma, struct 
vm_fault *vmf)
return fault_ret;
 }
 
-static int ll_fault(struct vm_fault *vmf)
+static vm_fault_t ll_fault(struct vm_fault *vmf)
 {
int count = 0;
bool printed = false;
@@ -364,7 +359,7 @@ static int ll_fault(struct vm_fault *vmf)
return result;
 }
 
-static int ll_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t ll_page_mkwrite(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
int count = 0;
@@ -390,22 +385,16 @@ static int ll_page_mkwrite(struct vm_fault *vmf)
switch (result) {
case 0:
LASSERT(PageLocked(vmf->page));
-   result = VM_FAULT_LOCKED;
-   break;
+   return VM_FAULT_LOCKED;
case -ENODATA:
case -EAGAIN:
case -EFAULT:
-   result = VM_FAULT_NOPAGE;
-   break;
+   return VM_FAULT_NOPAGE;
case -ENOMEM:
-   result = VM_FAULT_OOM;
-   break;
+   return VM_FAULT_OOM;
default:
-   result = VM_FAULT_SIGBUS;
-   break;
+   return VM_FAULT_SIGBUS;
}
-
-   return result;
 }
 
 /**
diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h 
b/drivers/staging/lustre/lustre/llite/vvp_internal.h
index 7d3abb43584a..c194966a3d82 100644
--- a/drivers/staging/lustre/lustre/llite/vvp_internal.h
+++ b/drivers/staging/lustre/lustre/llite/vvp_internal.h
@@ -83,7 +83,7 @@ struct vvp_io {
/**
 * fault API used bitflags for return code.
 */
-   unsigned intft_flags;
+   vm_fault_tft_flags;
/**
 * check that flags are from filemap_fault
 */
-- 
2.17.0



[PATCH 13/14] mm: move arch specific VM_FAULT_* flags to mm.h

2018-05-15 Thread Christoph Hellwig
Various architectures define their own internal flags.  Not sure a public
header like mm.h is a good place, but keeping them inside the arch code
with possible conflicts also seems like a bad idea.  Maybe we just need
to stop overloading the value instead.

Signed-off-by: Christoph Hellwig 
---
 arch/arm/mm/fault.c   | 3 ---
 arch/arm64/mm/fault.c | 3 ---
 arch/s390/mm/fault.c  | 6 --
 arch/unicore32/mm/fault.c | 3 ---
 include/linux/mm.h| 7 +++
 5 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 32034543f49c..b696eabccf60 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -201,9 +201,6 @@ void do_bad_area(unsigned long addr, unsigned int fsr, 
struct pt_regs *regs)
 }
 
 #ifdef CONFIG_MMU
-#define VM_FAULT_BADMAP0x01
-#define VM_FAULT_BADACCESS 0x02
-
 /*
  * Check that the permissions on the VMA allow for the fault which occurred.
  * If we encountered a write fault, we must have write permission, otherwise
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 91c53a7d2575..3d0b1f8eacce 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -318,9 +318,6 @@ static void do_bad_area(unsigned long addr, unsigned int 
esr, struct pt_regs *re
}
 }
 
-#define VM_FAULT_BADMAP0x01
-#define VM_FAULT_BADACCESS 0x02
-
 static int __do_page_fault(struct mm_struct *mm, unsigned long addr,
   unsigned int mm_flags, unsigned long vm_flags,
   struct task_struct *tsk)
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index e074480d3598..48c781ae25d0 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -44,12 +44,6 @@
 #define __SUBCODE_MASK 0x0600
 #define __PF_RES_FIELD 0x8000ULL
 
-#define VM_FAULT_BADCONTEXT0x01
-#define VM_FAULT_BADMAP0x02
-#define VM_FAULT_BADACCESS 0x04
-#define VM_FAULT_SIGNAL0x08
-#define VM_FAULT_PFAULT0x10
-
 enum fault_type {
KERNEL_FAULT,
USER_FAULT,
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index 381473412937..6c3c1a82925f 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -148,9 +148,6 @@ void do_bad_area(unsigned long addr, unsigned int fsr, 
struct pt_regs *regs)
__do_kernel_fault(mm, addr, fsr, regs);
 }
 
-#define VM_FAULT_BADMAP0x01
-#define VM_FAULT_BADACCESS 0x02
-
 /*
  * Check that the permissions on the VMA allow for the fault which occurred.
  * If we encountered a write fault, we must have write permission, otherwise
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 338b8a1afb02..64d09e3afc24 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1250,6 +1250,13 @@ static inline void clear_page_pfmemalloc(struct page 
*page)
 * and needs fsync() to complete (for
 * synchronous page faults in DAX) */
 
+/* Only for use in architecture specific page fault handling: */
+#define VM_FAULT_BADMAP0x01
+#define VM_FAULT_BADACCESS 0x02
+#define VM_FAULT_BADCONTEXT0x04
+#define VM_FAULT_SIGNAL0x08
+#define VM_FAULT_PFAULT0x10
+
 #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
 VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
 VM_FAULT_FALLBACK)
-- 
2.17.0



[PATCH 08/14] ocfs2: separate errno from VM_FAULT_* values

2018-05-15 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 fs/ocfs2/mmap.c | 36 +++-
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index fb9a20e3d608..e75c1fc5333e 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -44,11 +44,11 @@
 #include "ocfs2_trace.h"
 
 
-static int ocfs2_fault(struct vm_fault *vmf)
+static vm_fault_t ocfs2_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
sigset_t oldset;
-   int ret;
+   vm_fault_t ret;
 
	ocfs2_block_signals(&oldset);
ret = filemap_fault(vmf);
@@ -59,10 +59,10 @@ static int ocfs2_fault(struct vm_fault *vmf)
return ret;
 }
 
-static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
-   struct page *page)
+static vm_fault_t __ocfs2_page_mkwrite(struct file *file,
+   struct buffer_head *di_bh, struct page *page)
 {
-   int ret = VM_FAULT_NOPAGE;
+   vm_fault_t ret = VM_FAULT_NOPAGE;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
@@ -71,6 +71,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct 
buffer_head *di_bh,
struct page *locked_page = NULL;
void *fsdata;
loff_t size = i_size_read(inode);
+   int err;
 
last_index = (size - 1) >> PAGE_SHIFT;
 
@@ -105,12 +106,12 @@ static int __ocfs2_page_mkwrite(struct file *file, struct 
buffer_head *di_bh,
if (page->index == last_index)
len = ((size - 1) & ~PAGE_MASK) + 1;
 
-   ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
+   err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
   &locked_page, &fsdata, di_bh, page);
-   if (ret) {
-   if (ret != -ENOSPC)
-   mlog_errno(ret);
-   if (ret == -ENOMEM)
+   if (err) {
+   if (err != -ENOSPC)
+   mlog_errno(err);
+   if (err == -ENOMEM)
ret = VM_FAULT_OOM;
else
ret = VM_FAULT_SIGBUS;
@@ -121,20 +122,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct 
buffer_head *di_bh,
ret = VM_FAULT_NOPAGE;
goto out;
}
-   ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata);
-   BUG_ON(ret != len);
+   err = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata);
+   BUG_ON(err != len);
ret = VM_FAULT_LOCKED;
 out:
return ret;
 }
 
-static int ocfs2_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf)
 {
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct buffer_head *di_bh = NULL;
sigset_t oldset;
-   int ret;
+   vm_fault_t ret = 0;
+   int err;
 
sb_start_pagefault(inode->i_sb);
	ocfs2_block_signals(&oldset);
@@ -144,10 +146,10 @@ static int ocfs2_page_mkwrite(struct vm_fault *vmf)
 * node. Taking the data lock will also ensure that we don't
 * attempt page truncation as part of a downconvert.
 */
-   ret = ocfs2_inode_lock(inode, &di_bh, 1);
-   if (ret < 0) {
+   err = ocfs2_inode_lock(inode, &di_bh, 1);
+   if (err < 0) {
mlog_errno(ret);
-   if (ret == -ENOMEM)
+   if (err == -ENOMEM)
ret = VM_FAULT_OOM;
else
ret = VM_FAULT_SIGBUS;
-- 
2.17.0



[PATCH 14/14] mm: turn on vm_fault_t type checking

2018-05-15 Thread Christoph Hellwig
Switch vm_fault_t to be an unsigned int with a __bitwise annotation.
This both catches any old ->fault or ->page_mkwrite instance with plain
compiler type checking, and finds more intricate problems with
sparse.
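As an illustration of what sparse now catches (hypothetical handler,
warning text paraphrased):

static vm_fault_t broken_fault(struct vm_fault *vmf)
{
	if (!vmf->vma->vm_file)
		return -EFAULT;	/* sparse: incorrect type in return expression
				   (restricted vm_fault_t vs plain int) */
	return filemap_fault(vmf);	/* ok: filemap_fault returns vm_fault_t */
}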

Signed-off-by: Christoph Hellwig 
---
 arch/alpha/mm/fault.c |  2 +-
 arch/arc/mm/fault.c   |  3 +-
 arch/arm/mm/fault.c   |  5 +-
 arch/arm64/mm/fault.c |  7 +-
 arch/hexagon/mm/vm_fault.c|  2 +-
 arch/ia64/mm/fault.c  |  2 +-
 arch/m68k/mm/fault.c  |  2 +-
 arch/microblaze/mm/fault.c|  2 +-
 arch/mips/mm/fault.c  |  2 +-
 arch/nds32/mm/fault.c |  2 +-
 arch/nios2/mm/fault.c |  2 +-
 arch/openrisc/mm/fault.c  |  2 +-
 arch/parisc/mm/fault.c|  2 +-
 arch/powerpc/include/asm/copro.h  |  2 +-
 arch/powerpc/mm/copro_fault.c |  2 +-
 arch/powerpc/mm/fault.c   | 10 +--
 arch/powerpc/platforms/cell/spufs/fault.c |  2 +-
 arch/riscv/mm/fault.c |  3 +-
 arch/s390/kernel/vdso.c   |  2 +-
 arch/s390/mm/fault.c  |  2 +-
 arch/sh/mm/fault.c|  2 +-
 arch/sparc/mm/fault_32.c  |  4 +-
 arch/sparc/mm/fault_64.c  |  3 +-
 arch/um/kernel/trap.c |  2 +-
 arch/unicore32/mm/fault.c | 10 +--
 arch/x86/entry/vdso/vma.c |  4 +-
 arch/x86/mm/fault.c   | 11 +--
 arch/xtensa/mm/fault.c|  2 +-
 drivers/dax/device.c  | 21 +++---
 drivers/gpu/drm/drm_vm.c  | 10 +--
 drivers/gpu/drm/etnaviv/etnaviv_drv.h |  2 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem.c |  2 +-
 drivers/gpu/drm/exynos/exynos_drm_gem.c   |  2 +-
 drivers/gpu/drm/exynos/exynos_drm_gem.h   |  2 +-
 drivers/gpu/drm/gma500/framebuffer.c  |  6 +-
 drivers/gpu/drm/gma500/gem.c  |  2 +-
 drivers/gpu/drm/gma500/psb_drv.h  |  2 +-
 drivers/gpu/drm/i915/i915_drv.h   |  2 +-
 drivers/gpu/drm/i915/i915_gem.c   | 21 ++
 drivers/gpu/drm/msm/msm_drv.h |  2 +-
 drivers/gpu/drm/msm/msm_gem.c |  2 +-
 drivers/gpu/drm/qxl/qxl_ttm.c |  4 +-
 drivers/gpu/drm/radeon/radeon_ttm.c   |  2 +-
 drivers/gpu/drm/udl/udl_drv.h |  2 +-
 drivers/gpu/drm/udl/udl_gem.c |  2 +-
 drivers/gpu/drm/vc4/vc4_bo.c  |  2 +-
 drivers/gpu/drm/vc4/vc4_drv.h |  2 +-
 drivers/hwtracing/intel_th/msu.c  |  2 +-
 drivers/iommu/amd_iommu_v2.c  |  2 +-
 drivers/iommu/intel-svm.c |  3 +-
 drivers/misc/cxl/fault.c  |  2 +-
 drivers/misc/ocxl/context.c   |  6 +-
 drivers/misc/ocxl/link.c  |  2 +-
 drivers/misc/ocxl/sysfs.c |  2 +-
 drivers/scsi/cxlflash/superpipe.c |  4 +-
 drivers/staging/ncpfs/mmap.c  |  2 +-
 drivers/xen/privcmd.c |  2 +-
 fs/9p/vfs_file.c  |  2 +-
 fs/afs/internal.h |  2 +-
 fs/afs/write.c|  2 +-
 fs/f2fs/file.c| 10 +--
 fs/fuse/file.c|  2 +-
 fs/gfs2/file.c|  2 +-
 fs/iomap.c|  2 +-
 fs/nfs/file.c |  4 +-
 fs/nilfs2/file.c  |  2 +-
 fs/proc/vmcore.c  |  2 +-
 fs/userfaultfd.c  |  4 +-
 fs/xfs/xfs_file.c | 12 ++--
 include/linux/huge_mm.h   | 13 ++--
 include/linux/hugetlb.h   |  2 +-
 include/linux/iomap.h |  4 +-
 include/linux/mm.h| 67 +
 include/linux/mm_types.h  |  5 +-
 include/linux/oom.h   |  2 +-
 include/linux/swapops.h   |  4 +-
 include/linux/userfaultfd_k.h |  5 +-
 ipc/shm.c |  2 +-
 kernel/events/core.c  |  4 +-
 mm/gup.c  |  7 +-
 mm/hmm.c  |  2 +-
 mm/huge_memory.c  | 29 
 mm/hugetlb.c  | 25 +++
 mm/internal.h |  2 +-
 mm/khugepaged.c   |  3 +-
 mm/ksm.c  |  2 +-
 mm/memory.c   | 88 ---
 mm/mmap.c |  4 +-
 mm/shmem.c|  9 +--
 samples/vfio-mdev/mbochs.c|  4 +-
 virt/kvm/kvm_main.c   |  2 +-
 91 files 

Re: [PATCH v4 1/3] btrfs: Add unprivileged ioctl which returns subvolume information

2018-05-15 Thread Misono Tomohiro
On 2018/05/15 16:57, Gu, Jinxiang/顾 金香 wrote:
> Hi, adding a comment I missed.
> 
>> -Original Message-
>> From: Misono Tomohiro [mailto:misono.tomoh...@jp.fujitsu.com]
>> Sent: Tuesday, May 15, 2018 3:04 PM
>> To: Gu, Jinxiang/顾 金香 ; linux-btrfs@vger.kernel.org
>> Subject: Re: [PATCH v4 1/3] btrfs: Add unprivileged ioctl which returns 
>> subvolume information
>>
>> On 2018/05/15 15:31, Gu, Jinxiang/顾 金香 wrote:
>>> Hi,
>>>
 -Original Message-
 From: linux-btrfs-ow...@vger.kernel.org
 [mailto:linux-btrfs-ow...@vger.kernel.org] On Behalf Of Tomohiro
 Misono
 Sent: Friday, May 11, 2018 3:26 PM
 To: linux-btrfs@vger.kernel.org
 Subject: [PATCH v4 1/3] btrfs: Add unprivileged ioctl which returns
 subvolume information

 Add new unprivileged ioctl BTRFS_IOC_GET_SUBVOL_INFO which returns the 
 information of subvolume containing this inode.
 (i.e. returns the information in ROOT_ITEM and ROOT_BACKREF.)

 Signed-off-by: Tomohiro Misono 
 ---
  fs/btrfs/ioctl.c   | 129 
 +
  include/uapi/linux/btrfs.h |  51 ++
  2 files changed, 180 insertions(+)

 diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index
 48e2ddff32bd..64b23e22852f 100644
 --- a/fs/btrfs/ioctl.c
 +++ b/fs/btrfs/ioctl.c
 @@ -2242,6 +2242,133 @@ static noinline int btrfs_ioctl_ino_lookup(struct 
 file *file,
return ret;
  }

 +/* Get the subvolume information in BTRFS_ROOT_ITEM and
 +BTRFS_ROOT_BACKREF */ static noinline int 
 btrfs_ioctl_get_subvol_info(struct file *file,
 + void __user *argp)
 +{
 +  struct btrfs_ioctl_get_subvol_info_args *subvol_info;
 +  struct btrfs_root *root;
 +  struct btrfs_path *path;
 +  struct btrfs_key key;
 +
 +  struct btrfs_root_item root_item;
 +  struct btrfs_root_ref *rref;
 +  struct extent_buffer *l;
 +  int slot;
 +
 +  unsigned long item_off;
 +  unsigned long item_len;
 +
 +  struct inode *inode;
 +  int ret;
 +
 +  path = btrfs_alloc_path();
 +  if (!path)
 +  return -ENOMEM;
 +
 +  subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
 +  if (!subvol_info) {
 +  btrfs_free_path(path);
 +  return -ENOMEM;
 +  }
 +  inode = file_inode(file);
 +
 +  root = BTRFS_I(inode)->root->fs_info->tree_root;
 +  key.objectid = BTRFS_I(inode)->root->root_key.objectid;
 +  key.type = BTRFS_ROOT_ITEM_KEY;
 +  key.offset = 0;
 +  ret = btrfs_search_slot(NULL, root, , path, 0, 0);
 +  if (ret < 0) {
 +  goto out;
 +  } else if (ret > 0) {
 +  u64 objectid = key.objectid;
 +
 +  if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
 +  ret = btrfs_next_leaf(root, path);
 +  if (ret < 0)
 +  return ret;
>>> Should goto out; to free subvol_info and path.
>> Thanks, will update both.
>>
> 
> Since btrfs_next_leaf may return 1 when nritems of next leaf is 0,
> So, btrfs_item_key_to_cpu(path->nodes[0], , path->slots[0]); may goes 
> wrong.
> And I think it should add a judge before btrfs_item_key_to_cpu.

Ok, I will update to handle all cases.

[snip] +l = path->nodes[0];
 +  slot = path->slots[0];
 +  btrfs_item_key_to_cpu(l, , slot);
 +  if (key.objectid == subvol_info->id &&
 +  key.type == BTRFS_ROOT_BACKREF_KEY){
 +  subvol_info->parent_id = key.offset;
 +
 +  rref = btrfs_item_ptr(l, slot, struct btrfs_root_ref);
 +  subvol_info->dirid = btrfs_root_ref_dirid(l, rref);
 +
 +  item_off = btrfs_item_ptr_offset(l, slot)
 +  + sizeof(struct btrfs_root_ref);
 +  item_len = btrfs_item_size_nr(l, slot)
 +  - sizeof(struct btrfs_root_ref);
 +  read_extent_buffer(l, subvol_info->name, item_off, item_len);
 +  }
>>> If this if-check does not match (e.g. a corrupt filesystem without a backref),
>>> should it return -ENOENT, or is it OK without parent_id, dirid and name?
>>> I suggest adding an else branch.
>>
>> If the backref does not exist (except for the top-level subvolume), it means
>> filesystem corruption.
>> So, I'd like to return -EUCLEAN here.

On second thought, I notice that if this ioctl is called after the containing
subvolume is deleted, the entry may not exist. So, -ENOENT is fine.

Thanks,
Tomohiro Misono

>>
>> Thanks,
>> Tomohiro Misono
>>
>>>
 +
 +  if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
 +  ret = -EFAULT;
 +
 +out:
 +  kzfree(subvol_info);
 +  btrfs_free_path(path);
 +  return ret;
 +}
 +
  static noinline int 

[PATCH v2] btrfs: update uuid_mutex and device_list_mutex comments

2018-05-15 Thread Anand Jain
Make the uuid_mutex and device_list_mutex comments in line with
the changes.

Signed-off-by: Anand Jain 
---
v1->v2: Fix typo. fs_devs -> fs_uuids

 fs/btrfs/volumes.c | 21 +
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9dbc5b97fd94..81fb38884cac 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -147,16 +147,11 @@ static int __btrfs_map_block(struct btrfs_fs_info 
*fs_info,
  *
  * uuid_mutex (global lock)
  * 
- * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
+ * Protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
- * device) or requested by the device= mount option
+ * device) or requested by the device= mount option.
  *
- * the mutex can be very coarse and can cover long-running operations
- *
- * protects: updates to fs_devices counters like missing devices, rw devices,
- * seeding, structure cloning, openning/closing devices at mount/umount time
- *
- * global::fs_devs - add, remove, updates to the global list
+ * global::fs_uuids - add, remove, updates to the global list.
  *
  * does not protect: manipulation of the fs_devices::devices list!
  *
@@ -164,12 +159,14 @@ static int __btrfs_map_block(struct btrfs_fs_info 
*fs_info,
  *
  * fs_devices::device_list_mutex (per-fs, with RCU)
  * 
- * protects updates to fs_devices::devices, ie. adding and deleting
+ * Protects updates to fs_devices::devices, ie. adding and deleting , and its
+ * counters like missing devices, rw devices, seeding, structure cloning,
+ * openning/closing devices at mount/umount time.
  *
- * simple list traversal with read-only actions can be done with RCU protection
+ * Simple list traversal with read-only actions can be done with RCU 
protection.
  *
- * may be used to exclude some operations from running concurrently without any
- * modifications to the list (see write_all_supers)
+ * May be used to exclude some operations from running concurrently without any
+ * modifications to the list (see write_all_supers).
  *
  * balance_mutex
  * -
-- 
2.15.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: update uuid_mutex and device_list_mutex comments

2018-05-15 Thread Anand Jain



On 04/24/2018 11:48 PM, David Sterba wrote:

On Wed, Apr 18, 2018 at 05:56:31PM +0800, Anand Jain wrote:

@@ -155,29 +155,26 @@ static int __btrfs_map_block(struct btrfs_fs_info 
*fs_info,
   *
   * uuid_mutex (global lock)
   * 
- * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
+ * Protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
   * the SCAN_DEV ioctl registration or from mount either implicitly (the first
- * device) or requested by the device= mount option
- *
- * the mutex can be very coarse and can cover long-running operations
- *
- * protects: updates to fs_devices counters like missing devices, rw devices,
- * seeding, structure cloning, openning/closing devices at mount/umount time
+ * device) or requested by the device= mount option.
   *
   * global::fs_devs - add, remove, updates to the global list

   ^^^

My typo, this should be fs_uuids.


right. Corrected in v2.




   * fs_devices::device_list_mutex (per-fs, with RCU)
   * 
- * protects updates to fs_devices::devices, ie. adding and deleting
+ * Protects updates to fs_devices::devices, ie. adding and deleting, and its
+ * counters like missing devices, rw devices, seeding, structure cloning,
+ * openning/closing devices at mount/umount time.
   *
- * simple list traversal with read-only actions can be done with RCU protection
+ * Simple list traversal with read-only actions can be done with RCU 
protection.
   *
- * may be used to exclude some operations from running concurrently without any
- * modifications to the list (see write_all_supers)
+ * May be used to exclude some operations from running concurrently without any
+ * modifications to the list (see write_all_supers).


The uuid_mutex usage is a bit muddy, so far I think that most uses are
not necessary so this is in line with this patchset. In some cases we
might need to add the device_list_mutex once uuid mutex is gone.



The clear usage of the uuid_mutex is when a new fs_devices is added to
the global fs_uuids, to prevent concurrent access by device scan and
mount.


 Yes. I have part#2 of the uuid_mutex series which will clean up this part.
 I got diverted to something else before I could send it. Will send soon.
 Sorry for the delay.


Another one is the seed fs manipulation, that on the higher level
works on the linked fs_devices. And the last one is the device renames
that happen after a device appears under a different name.


 Sprout doesn't need uuid_mutex; it would need the device_list_mutex of
 the respective seed fs_devices. I am planning this in part#3.
 As of now it's OK to continue to use uuid_mutex.


So far I haven't noticed any problems during tests of for-next with this
patchset, so I guess we'd have to try harder to trigger the potential
races.



There are no device add/delete/replace/scan stress tests.


  stress tests - to exercise concurrency - right.


The
seeding is not very well covered by tests, so I'll keep the branch in
for-next, but more tests are welcome.


 Let me find time to add in part#3 as above.

Thanks, Anand



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: implement unlocked buffered write

2018-05-15 Thread robbieko
From: Robbie Ko 

This idea comes from direct IO. With this patch, buffered writes can be
done in parallel, which improves both performance and latency. But because
we cannot update i_size without i_mutex, the unlocked buffered write can
only be done for writes entirely in front of EOF.

We needn't worry about the race between buffered write and truncate,
because truncate has to wait until all the buffered writes end.

And we also needn't worry about the race between dio write and punch hole,
because we hold the extent lock to protect our operation.

I ran fio to test the performance of this feature.

== Hardware ==
CPU: Intel® Xeon® D-1531
SSD: Intel S3710 200G
Volume : RAID 5 , SSD * 6

== config file ==
[global]
group_reporting
time_based
thread=1
norandommap
ioengine=libaio
bs=4k
iodepth=32
size=16G
runtime=180
numjobs=8
rw=randwrite

[file1]
filename=/mnt/btrfs/nocow/testfile
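
For reference, a minimal sketch of how the job file above can be run; the
job file name here is only an assumed placeholder, with /mnt/btrfs mounted
on the volume described above:

    # save the [global]/[file1] sections above as unlocked-write.fio
    fio unlocked-write.fio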

== result (iops) ==
lock = 68470
unlocked = 94242

== result (clat) ==
lock
 lat (usec): min=184, max=1209.9K, avg=3738.35, stdev=20869.49
 clat percentiles (usec):
  |  1.00th=[  322],  5.00th=[  330], 10.00th=[  334], 20.00th=[  346],
  | 30.00th=[  370], 40.00th=[  386], 50.00th=[  406], 60.00th=[  446],
  | 70.00th=[  516], 80.00th=[  612], 90.00th=[  828], 95.00th=[10432],
  | 99.00th=[84480], 99.50th=[117248], 99.90th=[226304], 99.95th=[333824],
  | 99.99th=[692224]

unlocked
 lat (usec): min=10, max=218208, avg=2691.44, stdev=5380.82
 clat percentiles (usec):
  |  1.00th=[  302],  5.00th=[  390], 10.00th=[  442], 20.00th=[  502],
  | 30.00th=[  548], 40.00th=[  596], 50.00th=[  652], 60.00th=[  724],
  | 70.00th=[  916], 80.00th=[ 5024], 90.00th=[ 5664], 95.00th=[10048],
  | 99.00th=[29568], 99.50th=[39168], 99.90th=[54016], 99.95th=[59648],
  | 99.99th=[78336]

Signed-off-by: Robbie Ko 
---
 fs/btrfs/file.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 41ab907..8eac540 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1600,6 +1600,7 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
int ret = 0;
bool only_release_metadata = false;
bool force_page_uptodate = false;
+   bool relock = false;
 
nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
PAGE_SIZE / (sizeof(struct page *)));
@@ -1609,6 +1610,18 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
if (!pages)
return -ENOMEM;
 
+   inode_dio_begin(inode);
+
+   /*
+* If the write is beyond the EOF, we need update
+* the isize, but it is protected by i_mutex. So we can
+* not unlock the i_mutex at this case.
+*/
+   if (pos + iov_iter_count(i) <= i_size_read(inode)) {
+   inode_unlock(inode);
+   relock = true;
+   }
+
while (iov_iter_count(i) > 0) {
size_t offset = pos & (PAGE_SIZE - 1);
size_t sector_offset;
@@ -1808,6 +1821,10 @@ static noinline ssize_t __btrfs_buffered_write(struct 
file *file,
}
}
 
+   inode_dio_end(inode);
+   if (relock)
+   inode_lock(inode);
+
extent_changeset_free(data_reserved);
return num_written ? num_written : ret;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/3] btrfs: balance: add kernel log for end or paused

2018-05-15 Thread Anand Jain
Add a kernel log message when the balance ends, indicating whether it was
canceled, completed, or paused.
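
The three outcomes can be exercised with the usual balance commands; a rough
sketch (the mount point is a placeholder):

    btrfs balance start /mnt     # runs to completion -> "balance: ended with status: 0"
    btrfs balance pause /mnt     # issued from another shell -> "balance: paused"
    btrfs balance cancel /mnt    # issued from another shell -> "balance: canceled"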
---
v1->v2: Moved from 2/3 to 3/3

 fs/btrfs/volumes.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ce68c4f42f94..a4e243a29f5c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4053,6 +4053,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
ret = __btrfs_balance(fs_info);
 
mutex_lock(&fs_info->balance_mutex);
+   if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
+   btrfs_info(fs_info, "balance: paused");
+   else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req))
+   btrfs_info(fs_info, "balance: canceled");
+   else
+   btrfs_info(fs_info, "balance: ended with status: %d", ret);
+
clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
 
if (bargs) {
-- 
2.7.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/3] btrfs: balance: prefix kernel logs

2018-05-15 Thread Anand Jain
Kernel logs are very important for the forensic investigation of issues,
so in general we should make them easy to use. This patch adds a 'balance:'
prefix so that balance messages can be easily searched.

Signed-off-by: Anand Jain 
---
v1-v2: Change log update.
 fs/btrfs/volumes.c | 34 --
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9773bc143650..27da66c47ef2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3801,7 +3801,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
!(bctl->flags & BTRFS_BALANCE_METADATA) ||
memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
btrfs_err(fs_info,
- "with mixed groups data and metadata balance 
options must be the same");
+ "balance: mixed groups data and metadata 
options must be the same");
ret = -EINVAL;
goto out;
}
@@ -3823,23 +3823,26 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID6);
if (validate_convert_profile(&bctl->data, allowed)) {
+   int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
btrfs_err(fs_info,
- "unable to start balance with target data profile 
%llu",
- bctl->data.target);
+ "balance: invalid convert data profile %s",
+ get_raid_name(index));
ret = -EINVAL;
goto out;
}
if (validate_convert_profile(&bctl->meta, allowed)) {
+   int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
btrfs_err(fs_info,
- "unable to start balance with target metadata profile 
%llu",
- bctl->meta.target);
+ "balance: invalid convert metadata profile %s",
+ get_raid_name(index));
ret = -EINVAL;
goto out;
}
if (validate_convert_profile(&bctl->sys, allowed)) {
+   int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
btrfs_err(fs_info,
- "unable to start balance with target system profile 
%llu",
- bctl->sys.target);
+ "balance: invalid convert system profile %s",
+ get_raid_name(index));
ret = -EINVAL;
goto out;
}
@@ -3860,10 +3863,10 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
 !(bctl->meta.target & allowed))) {
if (bctl->flags & BTRFS_BALANCE_FORCE) {
btrfs_info(fs_info,
-  "force reducing metadata integrity");
+  "balance: force reducing metadata 
integrity");
} else {
btrfs_err(fs_info,
- "balance will reduce metadata 
integrity, use force if you want this");
+ "balance: reduces metadata integrity, 
use force if you want this");
ret = -EINVAL;
goto out;
}
@@ -3877,9 +3880,11 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
bctl->data.target : fs_info->avail_data_alloc_bits;
if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
+   int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
+   int data_index = btrfs_bg_flags_to_raid_index(data_target);
btrfs_warn(fs_info,
-  "metadata profile 0x%llx has lower redundancy than 
data profile 0x%llx",
-  meta_target, data_target);
+  "balance: metadata profile %s has lower redundancy 
than data profile %s",
+  get_raid_name(meta_index), 
get_raid_name(data_index));
}
 
ret = insert_balance_item(fs_info, bctl);
@@ -3939,7 +3944,7 @@ static int balance_kthread(void *data)
 
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl) {
-   btrfs_info(fs_info, "continuing balance");
+   btrfs_info(fs_info, "balance: resuming");
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
}
mutex_unlock(&fs_info->balance_mutex);
@@ -3959,7 +3964,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info 
*fs_info)
mutex_unlock(&fs_info->balance_mutex);
 
if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
- 

[PATCH v2 0/3] btrfs: balance: improve kernel logs

2018-05-15 Thread Anand Jain
Kernel logs are very important for the forensic investigations of the
issues, these patchs make balance logs easy to review.

Anand Jain (3):
  btrfs: balance: prefix kernel logs
  btrfs: balance: add args info during start and resume
  btrfs: balance: add kernel log for end or paused

 fs/btrfs/volumes.c | 185 -
 1 file changed, 169 insertions(+), 16 deletions(-)

-- 
2.7.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] btrfs: add balance args info during start and resume

2018-05-15 Thread Anand Jain


 All comments are addressed in v2 on the ML, except for the one below.


  clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
  if (bargs) {
@@ -3947,10 +4096,8 @@ static int balance_kthread(void *data)
  int ret = 0;
  mutex_lock(&fs_info->balance_mutex);
-    if (fs_info->balance_ctl) {
-    btrfs_info(fs_info, "balance: resuming");
+    if (fs_info->balance_ctl)
  ret = btrfs_balance(fs_info->balance_ctl, NULL);
-    }

Unrelated change.


  Why?


 I believe this change is related to this patch: as this patch has
 added the resume log in btrfs_balance(), we no longer need
 the resume log in balance_kthread().
 Or maybe I don't follow your reasoning; I am ignoring this comment for now.

 V2 is in the ML.

Thanks, Anand
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: test ENOSPC caused by many orphan items

2018-05-15 Thread Eryu Guan
On Tue, May 15, 2018 at 07:14:02PM -0700, Omar Sandoval wrote:
> On Wed, May 16, 2018 at 09:48:58AM +0800, Eryu Guan wrote:
> > On Wed, May 09, 2018 at 11:21:55PM -0700, Omar Sandoval wrote:
> > > From: Omar Sandoval 
> > > 
> > > Btrfs has a bug where we can prematurely ENOSPC if we have lots of
> > > orphaned files, i.e., deleted files which are still open. Add a test
> > > which repeatedly creates and deletes a file while keeping all of the
> > > file descriptors open. This should succeed but doesn't on Btrfs without
> > > the fix.
> > > 
> > > Signed-off-by: Omar Sandoval 
> > > ---
> > >  tests/generic/479 |  0
> > >  tests/generic/487 | 65 +++
> > >  tests/generic/487.out |  2 ++
> > >  tests/generic/group   |  1 +
> > >  4 files changed, 68 insertions(+)
> > >  mode change 100644 => 100755 tests/generic/479
> > >  create mode 100755 tests/generic/487
> > >  create mode 100644 tests/generic/487.out
> > > 
> > > diff --git a/tests/generic/479 b/tests/generic/479
> > > old mode 100644
> > > new mode 100755
> > > diff --git a/tests/generic/487 b/tests/generic/487
> > > new file mode 100755
> > > index ..66379cf0
> > > --- /dev/null
> > > +++ b/tests/generic/487
> > > @@ -0,0 +1,65 @@
> > > +#! /bin/bash
> > > +# FS QA Test 487
> > > +#
> > > +# Test having many file descriptors referring to deleted files open. 
> > > Regression
> > > +# test for patch "Btrfs: fix ENOSPC caused by orphan items reservations".
> > > +#
> > > +#---
> > > +# Copyright (c) 2018 Omar Sandoval.  All Rights Reserved.
> > > +#
> > > +# This program is free software; you can redistribute it and/or
> > > +# modify it under the terms of the GNU General Public License as
> > > +# published by the Free Software Foundation.
> > > +#
> > > +# This program is distributed in the hope that it would be useful,
> > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > > +# GNU General Public License for more details.
> > > +#
> > > +# You should have received a copy of the GNU General Public License
> > > +# along with this program; if not, write the Free Software Foundation,
> > > +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> > > +#---
> > > +#
> > > +
> > > +seq=`basename $0`
> > > +seqres=$RESULT_DIR/$seq
> > > +echo "QA output created by $seq"
> > > +
> > > +here=`pwd`
> > > +tmp=/tmp/$$
> > > +status=1 # failure is the default!
> > > +trap "_cleanup; exit \$status" 0 1 2 3 15
> > > +
> > > +_cleanup()
> > > +{
> > > + cd /
> > > + rm -f $tmp.*
> > > +}
> > > +
> > > +. ./common/rc
> > > +. ./common/filter
> > > +
> > > +rm -f $seqres.full
> > > +
> > > +_supported_fs generic
> > > +_supported_os Linux
> > > +_require_scratch
> > > +
> > > +_scratch_mkfs_sized $((1024 * 1024 * 1024)) >> $seqres.full 2>&1
> > > +_scratch_mount
> > > +
> > > +test_file="$SCRATCH_MNT/$seq"
> > > +
> > > +(
> > > +ulimit -n $((16 * 1024))
> > > +# ~1 files on a 1 GB filesystem should be no problem.
> > > +for ((i = 1000; i < 1; i++)); do
> > > + eval "exec $i<> \"$test_file\"" && rm "$test_file" || break
> > > +done
> > 
> > There's a helper command in src that does exactly this job, e.g.
> > 
> > $here/src/multi_open_unlink -f $SCRATCH_MNT/$seq -n 1 -s 0
> > 
> > which creates & unlinks 1 files and keeps them open for 0 second in
> > $SCRATCH_MNT using "$seq" as name prefix. This reduces the test run time
> > from 13s to 1s for me.
> > 
> > It's a straightforward change, I'll just update on commit, please let me
> > know if you have different thoughts.
> > 
> > Thanks,
> > Eryu
> 
> Great, as long as it still reproduces the bug without the fix applied,
> that's perfect. Thanks!

Yes, I've verified that it still reproduces the bug on btrfs and test
passes with the fix(es) applied.

Thanks,
Eryu
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: test ENOSPC caused by many orphan items

2018-05-15 Thread Omar Sandoval
On Wed, May 16, 2018 at 09:48:58AM +0800, Eryu Guan wrote:
> On Wed, May 09, 2018 at 11:21:55PM -0700, Omar Sandoval wrote:
> > From: Omar Sandoval 
> > 
> > Btrfs has a bug where we can prematurely ENOSPC if we have lots of
> > orphaned files, i.e., deleted files which are still open. Add a test
> > which repeatedly creates and deletes a file while keeping all of the
> > file descriptors open. This should succeed but doesn't on Btrfs without
> > the fix.
> > 
> > Signed-off-by: Omar Sandoval 
> > ---
> >  tests/generic/479 |  0
> >  tests/generic/487 | 65 +++
> >  tests/generic/487.out |  2 ++
> >  tests/generic/group   |  1 +
> >  4 files changed, 68 insertions(+)
> >  mode change 100644 => 100755 tests/generic/479
> >  create mode 100755 tests/generic/487
> >  create mode 100644 tests/generic/487.out
> > 
> > diff --git a/tests/generic/479 b/tests/generic/479
> > old mode 100644
> > new mode 100755
> > diff --git a/tests/generic/487 b/tests/generic/487
> > new file mode 100755
> > index ..66379cf0
> > --- /dev/null
> > +++ b/tests/generic/487
> > @@ -0,0 +1,65 @@
> > +#! /bin/bash
> > +# FS QA Test 487
> > +#
> > +# Test having many file descriptors referring to deleted files open. 
> > Regression
> > +# test for patch "Btrfs: fix ENOSPC caused by orphan items reservations".
> > +#
> > +#---
> > +# Copyright (c) 2018 Omar Sandoval.  All Rights Reserved.
> > +#
> > +# This program is free software; you can redistribute it and/or
> > +# modify it under the terms of the GNU General Public License as
> > +# published by the Free Software Foundation.
> > +#
> > +# This program is distributed in the hope that it would be useful,
> > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > +# GNU General Public License for more details.
> > +#
> > +# You should have received a copy of the GNU General Public License
> > +# along with this program; if not, write the Free Software Foundation,
> > +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> > +#---
> > +#
> > +
> > +seq=`basename $0`
> > +seqres=$RESULT_DIR/$seq
> > +echo "QA output created by $seq"
> > +
> > +here=`pwd`
> > +tmp=/tmp/$$
> > +status=1   # failure is the default!
> > +trap "_cleanup; exit \$status" 0 1 2 3 15
> > +
> > +_cleanup()
> > +{
> > +   cd /
> > +   rm -f $tmp.*
> > +}
> > +
> > +. ./common/rc
> > +. ./common/filter
> > +
> > +rm -f $seqres.full
> > +
> > +_supported_fs generic
> > +_supported_os Linux
> > +_require_scratch
> > +
> > +_scratch_mkfs_sized $((1024 * 1024 * 1024)) >> $seqres.full 2>&1
> > +_scratch_mount
> > +
> > +test_file="$SCRATCH_MNT/$seq"
> > +
> > +(
> > +ulimit -n $((16 * 1024))
> > +# ~1 files on a 1 GB filesystem should be no problem.
> > +for ((i = 1000; i < 1; i++)); do
> > +   eval "exec $i<> \"$test_file\"" && rm "$test_file" || break
> > +done
> 
> There's a helper command in src that does exactly this job, e.g.
> 
> $here/src/multi_open_unlink -f $SCRATCH_MNT/$seq -n 1 -s 0
> 
> which creates & unlinks 1 files and keeps them open for 0 second in
> $SCRATCH_MNT using "$seq" as name prefix. This reduces the test run time
> from 13s to 1s for me.
> 
> It's a straightforward change, I'll just update on commit, please let me
> know if you have different thoughts.
> 
> Thanks,
> Eryu

Great, as long as it still reproduces the bug without the fix applied,
that's perfect. Thanks!
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: test ENOSPC caused by many orphan items

2018-05-15 Thread Eryu Guan
On Wed, May 09, 2018 at 11:21:55PM -0700, Omar Sandoval wrote:
> From: Omar Sandoval 
> 
> Btrfs has a bug where we can prematurely ENOSPC if we have lots of
> orphaned files, i.e., deleted files which are still open. Add a test
> which repeatedly creates and deletes a file while keeping all of the
> file descriptors open. This should succeed but doesn't on Btrfs without
> the fix.
> 
> Signed-off-by: Omar Sandoval 
> ---
>  tests/generic/479 |  0
>  tests/generic/487 | 65 +++
>  tests/generic/487.out |  2 ++
>  tests/generic/group   |  1 +
>  4 files changed, 68 insertions(+)
>  mode change 100644 => 100755 tests/generic/479
>  create mode 100755 tests/generic/487
>  create mode 100644 tests/generic/487.out
> 
> diff --git a/tests/generic/479 b/tests/generic/479
> old mode 100644
> new mode 100755
> diff --git a/tests/generic/487 b/tests/generic/487
> new file mode 100755
> index ..66379cf0
> --- /dev/null
> +++ b/tests/generic/487
> @@ -0,0 +1,65 @@
> +#! /bin/bash
> +# FS QA Test 487
> +#
> +# Test having many file descriptors referring to deleted files open. 
> Regression
> +# test for patch "Btrfs: fix ENOSPC caused by orphan items reservations".
> +#
> +#---
> +# Copyright (c) 2018 Omar Sandoval.  All Rights Reserved.
> +#
> +# This program is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU General Public License as
> +# published by the Free Software Foundation.
> +#
> +# This program is distributed in the hope that it would be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write the Free Software Foundation,
> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> +#---
> +#
> +
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +here=`pwd`
> +tmp=/tmp/$$
> +status=1 # failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +_cleanup()
> +{
> + cd /
> + rm -f $tmp.*
> +}
> +
> +. ./common/rc
> +. ./common/filter
> +
> +rm -f $seqres.full
> +
> +_supported_fs generic
> +_supported_os Linux
> +_require_scratch
> +
> +_scratch_mkfs_sized $((1024 * 1024 * 1024)) >> $seqres.full 2>&1
> +_scratch_mount
> +
> +test_file="$SCRATCH_MNT/$seq"
> +
> +(
> +ulimit -n $((16 * 1024))
> +# ~1 files on a 1 GB filesystem should be no problem.
> +for ((i = 1000; i < 1; i++)); do
> + eval "exec $i<> \"$test_file\"" && rm "$test_file" || break
> +done

There's a helper command in src that does exactly this job, e.g.

$here/src/multi_open_unlink -f $SCRATCH_MNT/$seq -n 1 -s 0

which creates & unlinks 1 files and keeps them open for 0 second in
$SCRATCH_MNT using "$seq" as name prefix. This reduces the test run time
from 13s to 1s for me.

It's a straightforward change, I'll just update on commit, please let me
know if you have different thoughts.

Thanks,
Eryu

> +)
> +
> +echo "Silence is golden"
> +
> +status=0
> +exit
> diff --git a/tests/generic/487.out b/tests/generic/487.out
> new file mode 100644
> index ..5f31fd97
> --- /dev/null
> +++ b/tests/generic/487.out
> @@ -0,0 +1,2 @@
> +QA output created by 487
> +Silence is golden
> diff --git a/tests/generic/group b/tests/generic/group
> index 505383f7..93581257 100644
> --- a/tests/generic/group
> +++ b/tests/generic/group
> @@ -489,3 +489,4 @@
>  484 auto quick
>  485 auto quick insert
>  486 auto quick attr
> +487 auto quick
> -- 
> 2.17.0
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: add parent_transid parameter to veirfy_level_key

2018-05-15 Thread Qu Wenruo


On 2018年05月16日 01:37, Liu Bo wrote:
> @parent_transid could tell whether the eb's generation has been verified
> by the caller.
> 
> Signed-off-by: Liu Bo 

Looks pretty useful to debug the btrfs/124 bug you just fixed.

But a small nitpick inlined below:

> ---
>  fs/btrfs/disk-io.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 60caa68c3618..b5d55b0ec19b 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -416,7 +416,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info 
> *fs_info,
>  
>  static int verify_level_key(struct btrfs_fs_info *fs_info,
>   struct extent_buffer *eb, int level,
> - struct btrfs_key *first_key)
> + struct btrfs_key *first_key, u64 parent_transid)
>  {
>   int found_level;
>   struct btrfs_key found_key;
> @@ -454,10 +454,10 @@ static int verify_level_key(struct btrfs_fs_info 
> *fs_info,
>   if (ret) {
>   WARN_ON(1);
>   btrfs_err(fs_info,
> -"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) 
> has=(%llu, %u, %llu)",
> +"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) 
> has=(%llu, %u, %llu) parent_transid %llu",

It would look better if the parent transid were printed before the key output.

Despite that, everything looks good.

Thanks,
Qu

> eb->start, first_key->objectid, first_key->type,
> first_key->offset, found_key.objectid,
> -   found_key.type, found_key.offset);
> +   found_key.type, found_key.offset, parent_transid);
>   }
>  #endif
>   return ret;
> @@ -493,7 +493,7 @@ static int btree_read_extent_buffer_pages(struct 
> btrfs_fs_info *fs_info,
>  parent_transid, 0))
>   ret = -EIO;
>   else if (verify_level_key(fs_info, eb, level,
> -   first_key))
> +   first_key, parent_transid))
>   ret = -EUCLEAN;
>   else
>   break;
> 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH] Btrfs: fix the corruption by reading stale btree blocks

2018-05-15 Thread Qu Wenruo


On 2018年05月16日 01:37, Liu Bo wrote:
> If a btree block, aka. extent buffer, is not available in the extent
> buffer cache, it'll be read out from the disk instead, i.e.
> 
> btrfs_search_slot()
>   read_block_for_search()  # hold parent and its lock, go to read child
> btrfs_release_path()
> read_tree_block()  # read child
> 
> Unfortunately, the parent lock got released before reading child, so
> commit 5bdd3536cbbe ("Btrfs: Fix block generation verification race") had
> used 0 as parent transid to read the child block.  It forces
> read_tree_block() not to check if parent transid is different with the
> generation id of the child that it reads out from disk.
> 
> A simple PoC is included in btrfs/124,
> 
> 0. A two-disk raid1 btrfs,
> 
> 1. Right after mkfs.btrfs, block A is allocated to be device tree's root.
> 
> 2. Mount this filesystem and put it in use, after a while, device tree's
>root got COW but block A hasn't been allocated/overwritten yet.
> 
> 3. Umount it and reload the btrfs module to remove both disks from the
>global @fs_devices list.
> 
> 4. mount -odegraded dev1 and write some data, so now block A is allocated
>to be a leaf in checksum tree.  Note that only dev1 has the latest
>metadata of this filesystem.
> 
> 5. Umount it and mount it again normally (with both disks), since raid1
>can pick up one disk by the writer task's pid, if btrfs_search_slot()
>needs to read block A, dev2 which does NOT have the latest metadata
>might be read for block A, then we got a stale block A.
> 
> 6. As parent transid is not checked, block A is marked as uptodate and
>put into the extent buffer cache, so the future search won't bother
>to read disk again, which means it'll make changes on this stale
>one and make it dirty and flush it onto disk.
> 
> To avoid the problem, parent transid needs to be passed to
> read_tree_block().
> 
> In order to get a valid parent transid, we need to hold the parent's
> lock until finish reading child.

Thanks for the detailed explanation.

It explains the first_key check error reported, thanks a lot!

Reviewed-by: Qu Wenruo 

Thanks,
Qu

> 
> Signed-off-by: Liu Bo 
> ---
>  fs/btrfs/ctree.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 3fd44835b386..b3f6f300e492 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -2436,10 +2436,8 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path 
> *path, int level)
>   if (p->reada != READA_NONE)
>   reada_for_search(fs_info, p, level, slot, key->objectid);
>  
> - btrfs_release_path(p);
> -
>   ret = -EAGAIN;
> - tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1,
> + tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
> &first_key);
>   if (!IS_ERR(tmp)) {
>   /*
> @@ -2454,6 +2452,8 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path 
> *path, int level)
>   } else {
>   ret = PTR_ERR(tmp);
>   }
> +
> + btrfs_release_path(p);
>   return ret;
>  }
>  
> 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH V3] test online label ioctl

2018-05-15 Thread Dave Chinner
On Tue, May 15, 2018 at 10:22:37AM -0500, Eric Sandeen wrote:
> This tests the online label ioctl that btrfs has, which has been
> recently proposed for XFS.
> 
> To run, it requires an updated xfs_io with the label command and a
> filesystem that supports it
> 
> A slight change here to _require_xfs_io_command as well, so that tests
> which simply fail with "Inappropriate ioctl" can be caught in the
> common case.
> 
> Signed-off-by: Eric Sandeen 
> ---
> 
> (urgh send as proper new thread, sorry)
> 
> This passes on btrfs, _notruns on xfs/ext4 of yore, and passes
> on xfs w/ my online label patchset (as long as xfs_io has the new
> capability)
> 
> V2: Add a max label length helper
> Set the proper btrfs max label length o_O oops
> Filter trailing whitespace from blkid output
> 
> V3: lowercase local vars, simplify max label len function

Looks good now, but I wondered about one thing the test doesn't
cover: can you clear the label by setting it to a null string?
i.e you check max length bounds, but don't check empty string
behaviour...
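
For illustration, one way the test could probe that (hypothetical, and it
assumes the proposed label command takes -s to set a value, which may not
match the final syntax):

    # set a label, clear it via an empty string, then read it back
    $XFS_IO_PROG -c "label -s testlabel" $SCRATCH_MNT
    $XFS_IO_PROG -c "label -s \"\"" $SCRATCH_MNT
    $XFS_IO_PROG -c "label" $SCRATCH_MNT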

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: inode: Don't compress if NODATASUM or NODATACOW set

2018-05-15 Thread David Sterba
On Mon, May 14, 2018 at 11:46:09PM +0300, Timofey Titovets wrote:
> > > @@ -396,6 +396,14 @@ static inline int inode_need_compress(struct inode
> *inode, u64 start, u64 end)
> > >  {
> > >   struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> > >
> > > + /*
> > > +  * Btrfs doesn't support compression without csum or CoW.
> > > +  * This should have the highest priority.
> > > +  */
> > > + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
> > > + BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
> > > + return 0;
> 
> > This is also the wrong place to fix that, NODATASUM or NODATACOW inode
> > should never make it to compress_file_range (that calls
> > inode_need_compress).
> 
> 
> David, I've talked about that some time ago:
> https://www.spinics.net/lists/linux-btrfs/msg73137.html
> 
> NoCoW files can *easily* be compressed.

I missed your previous mail, the issue with compression and nocow/nosum
is there. Seems like some of the combinations are not properly handled
when it's mount option vs defrag.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/6] btrfs_search_slot cleanups

2018-05-15 Thread David Sterba
On Wed, May 16, 2018 at 01:52:02AM +0800, Liu Bo wrote:
> These're the cleanups I made for btrfs_search_slot() and its callees.

All the patches have very terse changelogs, and I have no idea why you
think the code changes are correct, so I have nothing to compare my review
against.

This touches code around locking, where it's never bad to be more
verbose and introduce a bit of context. The simple sentence is maybe
clear to you while you go through the code, but imagine reading the
changelog in a week, a month or a year. It still needs to make sense, or
give enough pointers to find the answer from the code.

Please update and resend, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: add parent_transid parameter to veirfy_level_key

2018-05-15 Thread David Sterba
On Wed, May 16, 2018 at 01:37:37AM +0800, Liu Bo wrote:
> @parent_transid could tell whether the eb's generation has been verified
> by the caller.

Can you please write why you are adding this change? E.g. is it an
enhancement, a debugging aid, or something like that?
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/6] Btrfs: use more straightforward extent_buffer_uptodate

2018-05-15 Thread Liu Bo
In read_block_for_search(), it's straightforward to use
extent_buffer_uptodate() instead since 0 is passed as parent transid to
btrfs_buffer_uptodate(), which means the check for parent transid is not
needed.

Signed-off-by: Liu Bo 
---
 fs/btrfs/ctree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9fa3d77c98d4..a96d308c51b8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2445,7 +2445,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path 
*path, int level)
 * and give up so that our caller doesn't loop forever
 * on our EAGAINs.
 */
-   if (!btrfs_buffer_uptodate(tmp, 0, 0))
+   if (!extent_buffer_uptodate(tmp))
ret = -EIO;
free_extent_buffer(tmp);
} else {
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/6] Btrfs: remove always true check in unlock_up

2018-05-15 Thread Liu Bo
@path->lock[i] is always true at this point.

Signed-off-by: Liu Bo 
---
 fs/btrfs/ctree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f7c3f581f647..16d28a4ec54f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2330,7 +2330,7 @@ static noinline void unlock_up(struct btrfs_path *path, 
int level,
no_skips = 1;
 
t = path->nodes[i];
-   if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
+   if (i >= lowest_unlock && i > skip_level) {
btrfs_tree_unlock_rw(t, path->locks[i]);
path->locks[i] = 0;
if (write_lock_level &&
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/6] Btrfs: remove unused check of skip_locking

2018-05-15 Thread Liu Bo
The check is superfluous since all callers who set search_commit_root
also have skip_locking set.

Signed-off-by: Liu Bo 
---
 fs/btrfs/ctree.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 399839df7a8f..cf34eca41d4e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2623,8 +2623,6 @@ static struct extent_buffer 
*btrfs_search_slot_get_root(struct btrfs_root *root,
level = btrfs_header_level(b);
if (p->need_commit_sem)
up_read(&fs_info->commit_root_sem);
-   if (!p->skip_locking)
-   btrfs_tree_read_lock(b);
 
goto out;
}
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/6] Btrfs: move get root of btrfs_search_slot to a helper

2018-05-15 Thread Liu Bo
It's good to have a helper instead of having all get-root details
open-coded.

The new helper locks (if necessary) and sets the root node of the path.

There is no functional change in this commit.

Signed-off-by: Liu Bo 
---
 fs/btrfs/ctree.c | 112 +--
 1 file changed, 67 insertions(+), 45 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a96d308c51b8..399839df7a8f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2598,6 +2598,72 @@ int btrfs_find_item(struct btrfs_root *fs_root, struct 
btrfs_path *path,
return 0;
 }
 
+static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root 
*root,
+   struct btrfs_path *p,
+   int write_lock_level)
+{
+   struct btrfs_fs_info *fs_info = root->fs_info;
+   struct extent_buffer *b;
+   int root_lock;
+   int level = 0;
+
+   /*
+* we try very hard to do read locks on the root
+*/
+   root_lock = BTRFS_READ_LOCK;
+
+   if (p->search_commit_root) {
+   /*
+* the commit roots are read only so we always do read locks
+*/
+   if (p->need_commit_sem)
+   down_read(&fs_info->commit_root_sem);
+   b = root->commit_root;
+   extent_buffer_get(b);
+   level = btrfs_header_level(b);
+   if (p->need_commit_sem)
+   up_read(&fs_info->commit_root_sem);
+   if (!p->skip_locking)
+   btrfs_tree_read_lock(b);
+
+   goto out;
+   }
+
+   if (p->skip_locking) {
+   b = btrfs_root_node(root);
+   level = btrfs_header_level(b);
+   goto out;
+   }
+
+   /*
+* we don't know the level of the root node until we actually
+* have it read locked
+*/
+   b = btrfs_read_lock_root_node(root);
+   level = btrfs_header_level(b);
+   if (level > write_lock_level)
+   goto out;
+
+   /*
+* whoops, must trade for write lock
+*/
+   btrfs_tree_read_unlock(b);
+   free_extent_buffer(b);
+   b = btrfs_lock_root_node(root);
+   root_lock = BTRFS_WRITE_LOCK;
+   /*
+* the level might have changed, check again
+*/
+   level = btrfs_header_level(b);
+
+out:
+   p->nodes[level] = b;
+   if (!p->skip_locking)
+   p->locks[level] = root_lock;
+   return b;
+}
+
+
 /*
  * btrfs_search_slot - look for a key in a tree and perform necessary
  * modifications to preserve tree invariants.
@@ -2634,7 +2700,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, 
struct btrfs_root *root,
int err;
int level;
int lowest_unlock = 1;
-   int root_lock;
/* everything at write_lock_level or lower must be write locked */
int write_lock_level = 0;
u8 lowest_level = 0;
@@ -2672,50 +2737,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, 
struct btrfs_root *root,
 
 again:
prev_cmp = -1;
-   /*
-* we try very hard to do read locks on the root
-*/
-   root_lock = BTRFS_READ_LOCK;
-   level = 0;
-   if (p->search_commit_root) {
-   /*
-* the commit roots are read only
-* so we always do read locks
-*/
-   if (p->need_commit_sem)
-   down_read(&fs_info->commit_root_sem);
-   b = root->commit_root;
-   extent_buffer_get(b);
-   level = btrfs_header_level(b);
-   if (p->need_commit_sem)
-   up_read(&fs_info->commit_root_sem);
-   if (!p->skip_locking)
-   btrfs_tree_read_lock(b);
-   } else {
-   if (p->skip_locking) {
-   b = btrfs_root_node(root);
-   level = btrfs_header_level(b);
-   } else {
-   /* we don't know the level of the root node
-* until we actually have it read locked
-*/
-   b = btrfs_read_lock_root_node(root);
-   level = btrfs_header_level(b);
-   if (level <= write_lock_level) {
-   /* whoops, must trade for write lock */
-   btrfs_tree_read_unlock(b);
-   free_extent_buffer(b);
-   b = btrfs_lock_root_node(root);
-   root_lock = BTRFS_WRITE_LOCK;
-
-   /* the level might have changed, check again */
-   level = btrfs_header_level(b);
-   }
-   }
-   }
-   p->nodes[level] = b;
- 

[PATCH 1/6] Btrfs: remove superfluous free_extent_buffer

2018-05-15 Thread Liu Bo
@tmp must be NULL at this point, free_extent_buffer is not needed at all.

Signed-off-by: Liu Bo 
---
 fs/btrfs/ctree.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b3f6f300e492..9fa3d77c98d4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2432,7 +2432,6 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path 
*path, int level)
btrfs_unlock_up_safe(p, level + 1);
btrfs_set_path_blocking(p);
 
-   free_extent_buffer(tmp);
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/6] btrfs_search_slot cleanups

2018-05-15 Thread Liu Bo
These're the cleanups I made for btrfs_search_slot() and its callees.

Liu Bo (6):
  Btrfs: remove superfluous free_extent_buffer
  Btrfs: use more straightforward extent_buffer_uptodate
  Btrfs: move get root of btrfs_search_slot to a helper
  Btrfs: remove unused check of skip_locking
  Btrfs: grab write lock directly if write_lock_level is the max level
  Btrfs: remove always true check in unlock_up

 fs/btrfs/ctree.c | 118 +--
 1 file changed, 70 insertions(+), 48 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/6] Btrfs: grab write lock directly if write_lock_level is the max level

2018-05-15 Thread Liu Bo
In case of (cow && (p->keep_locks || p->lowest_level)), write_lock_level
is the max level, and we should grab write lock of root node from the very
beginning.

Signed-off-by: Liu Bo 
---
 fs/btrfs/ctree.c | 29 -
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cf34eca41d4e..f7c3f581f647 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2633,20 +2633,23 @@ static struct extent_buffer 
*btrfs_search_slot_get_root(struct btrfs_root *root,
goto out;
}
 
-   /*
-* we don't know the level of the root node until we actually
-* have it read locked
-*/
-   b = btrfs_read_lock_root_node(root);
-   level = btrfs_header_level(b);
-   if (level > write_lock_level)
-   goto out;
+   if (write_lock_level < BTRFS_MAX_LEVEL) {
+   /*
+* we don't know the level of the root node until we actually
+* have it read locked
+*/
+   b = btrfs_read_lock_root_node(root);
+   level = btrfs_header_level(b);
+   if (level > write_lock_level)
+   goto out;
+
+   /*
+* whoops, must trade for write lock
+*/
+   btrfs_tree_read_unlock(b);
+   free_extent_buffer(b);
+   }
 
-   /*
-* whoops, must trade for write lock
-*/
-   btrfs_tree_read_unlock(b);
-   free_extent_buffer(b);
b = btrfs_lock_root_node(root);
root_lock = BTRFS_WRITE_LOCK;
/*
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: fix the corruption by reading stale btree blocks

2018-05-15 Thread Filipe Manana
On Tue, May 15, 2018 at 6:37 PM, Liu Bo  wrote:
> If a btree block, aka. extent buffer, is not available in the extent
> buffer cache, it'll be read out from the disk instead, i.e.
>
> btrfs_search_slot()
>   read_block_for_search()  # hold parent and its lock, go to read child
> btrfs_release_path()
> read_tree_block()  # read child
>
> Unfortunately, the parent lock got released before reading child, so
> commit 5bdd3536cbbe ("Btrfs: Fix block generation verification race") had
> used 0 as parent transid to read the child block.  It forces
> read_tree_block() not to check if parent transid is different with the
> generation id of the child that it reads out from disk.
>
> A simple PoC is included in btrfs/124,
>
> 0. A two-disk raid1 btrfs,
>
> 1. Right after mkfs.btrfs, block A is allocated to be device tree's root.
>
> 2. Mount this filesystem and put it in use, after a while, device tree's
>root got COW but block A hasn't been allocated/overwritten yet.
>
> 3. Umount it and reload the btrfs module to remove both disks from the
>global @fs_devices list.
>
> 4. mount -odegraded dev1 and write some data, so now block A is allocated
>to be a leaf in checksum tree.  Note that only dev1 has the latest
>metadata of this filesystem.
>
> 5. Umount it and mount it again normally (with both disks), since raid1
>can pick up one disk by the writer task's pid, if btrfs_search_slot()
>needs to read block A, dev2 which does NOT have the latest metadata
>might be read for block A, then we got a stale block A.
>
> 6. As parent transid is not checked, block A is marked as uptodate and
>put into the extent buffer cache, so the future search won't bother
>to read disk again, which means it'll make changes on this stale
>one and make it dirty and flush it onto disk.
>
> To avoid the problem, parent transid needs to be passed to
> read_tree_block().
>
> In order to get a valid parent transid, we need to hold the parent's
> lock until finish reading child.
>
> Signed-off-by: Liu Bo 
Reviewed-by: Filipe Manana 

Looks good, great finding and explanation.

> ---
>  fs/btrfs/ctree.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 3fd44835b386..b3f6f300e492 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -2436,10 +2436,8 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path 
> *path, int level)
> if (p->reada != READA_NONE)
> reada_for_search(fs_info, p, level, slot, key->objectid);
>
> -   btrfs_release_path(p);
> -
> ret = -EAGAIN;
> -   tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1,
> +   tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
>   &first_key);
> if (!IS_ERR(tmp)) {
> /*
> @@ -2454,6 +2452,8 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path 
> *path, int level)
> } else {
> ret = PTR_ERR(tmp);
> }
> +
> +   btrfs_release_path(p);
> return ret;
>  }
>
> --
> 1.8.3.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix the corruption by reading stale btree blocks

2018-05-15 Thread Liu Bo
If a btree block, aka. extent buffer, is not available in the extent
buffer cache, it'll be read out from the disk instead, i.e.

btrfs_search_slot()
  read_block_for_search()  # hold parent and its lock, go to read child
btrfs_release_path()
read_tree_block()  # read child

Unfortunately, the parent lock got released before reading child, so
commit 5bdd3536cbbe ("Btrfs: Fix block generation verification race") had
used 0 as parent transid to read the child block.  It forces
read_tree_block() not to check if parent transid is different with the
generation id of the child that it reads out from disk.

A simple PoC is included in btrfs/124,

0. A two-disk raid1 btrfs,

1. Right after mkfs.btrfs, block A is allocated to be device tree's root.

2. Mount this filesystem and put it in use, after a while, device tree's
   root got COW but block A hasn't been allocated/overwritten yet.

3. Umount it and reload the btrfs module to remove both disks from the
   global @fs_devices list.

4. mount -odegraded dev1 and write some data, so now block A is allocated
   to be a leaf in checksum tree.  Note that only dev1 has the latest
   metadata of this filesystem.

5. Umount it and mount it again normally (with both disks), since raid1
   can pick up one disk by the writer task's pid, if btrfs_search_slot()
   needs to read block A, dev2 which does NOT have the latest metadata
   might be read for block A, then we got a stale block A.

6. As parent transid is not checked, block A is marked as uptodate and
   put into the extent buffer cache, so the future search won't bother
   to read disk again, which means it'll make changes on this stale
   one and make it dirty and flush it onto disk.
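
A rough shell sketch of the steps above (device names, mount point and the
workload are placeholders; btrfs/124 automates the real reproducer):

    DEV1=/dev/sdb; DEV2=/dev/sdc; MNT=/mnt/test

    mkfs.btrfs -f -d raid1 -m raid1 $DEV1 $DEV2   # steps 0-1
    mount $DEV1 $MNT                              # step 2: use the fs so the
    fsstress -d $MNT/work -n 200                  #   device tree root gets COWed
    umount $MNT
    modprobe -r btrfs; modprobe btrfs             # step 3: forget both devices
    mount -o degraded $DEV1 $MNT                  # step 4: only dev1 sees new writes
    dd if=/dev/zero of=$MNT/f bs=1M count=64 conv=fsync
    umount $MNT
    btrfs device scan                             # register both devices again
    mount $DEV1 $MNT                              # steps 5-6: a read of block A may
                                                  #   now hit the stale copy on dev2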

To avoid the problem, parent transid needs to be passed to
read_tree_block().

In order to get a valid parent transid, we need to hold the parent's
lock until finish reading child.

Signed-off-by: Liu Bo 
---
 fs/btrfs/ctree.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3fd44835b386..b3f6f300e492 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2436,10 +2436,8 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path 
*path, int level)
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
 
-   btrfs_release_path(p);
-
ret = -EAGAIN;
-   tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1,
+   tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
  &first_key);
if (!IS_ERR(tmp)) {
/*
@@ -2454,6 +2452,8 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path 
*path, int level)
} else {
ret = PTR_ERR(tmp);
}
+
+   btrfs_release_path(p);
return ret;
 }
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: add parent_transid parameter to veirfy_level_key

2018-05-15 Thread Liu Bo
@parent_transid could tell whether the eb's generation has been verified
by the caller.

Signed-off-by: Liu Bo 
---
 fs/btrfs/disk-io.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 60caa68c3618..b5d55b0ec19b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -416,7 +416,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info 
*fs_info,
 
 static int verify_level_key(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int level,
-   struct btrfs_key *first_key)
+   struct btrfs_key *first_key, u64 parent_transid)
 {
int found_level;
struct btrfs_key found_key;
@@ -454,10 +454,10 @@ static int verify_level_key(struct btrfs_fs_info *fs_info,
if (ret) {
WARN_ON(1);
btrfs_err(fs_info,
-"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) 
has=(%llu, %u, %llu)",
+"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) 
has=(%llu, %u, %llu) parent_transid %llu",
  eb->start, first_key->objectid, first_key->type,
  first_key->offset, found_key.objectid,
- found_key.type, found_key.offset);
+ found_key.type, found_key.offset, parent_transid);
}
 #endif
return ret;
@@ -493,7 +493,7 @@ static int btree_read_extent_buffer_pages(struct 
btrfs_fs_info *fs_info,
   parent_transid, 0))
ret = -EIO;
else if (verify_level_key(fs_info, eb, level,
- first_key))
+ first_key, parent_transid))
ret = -EUCLEAN;
else
break;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/15] Review uuid_mutex usage

2018-05-15 Thread David Sterba
On Thu, Apr 12, 2018 at 10:29:23AM +0800, Anand Jain wrote:
> The uuid_mutex lock is not a per-fs lock but a global lock. The main aim of
> this patch-set is to critically review the usage of this lock and delete
> the unnecessary ones. By doing this we improve the concurrency of
> device operations across multiple btrfs filesystems in the system.
> 
> patch 1: Was sent before, I am including it here, as its about uuid_mutex.
> 
> patch 2-9: Are cleanup and or preparatory patches.
> 
> patch 10-14: Drops the uuid_mutex and makes sure there is enough lock,
> as discussed in the patch change log.
> 
> patch 15: A generic cleanup patch around functions in the same context.
> 
> These patches are on top of
>   https://github.com/kdave/btrfs-devel.git remove-volume-mutex
> And it will be a good idea to go along with the kill-volume-mutex patches.
> 
> This is tested with xfstests and there are no _new_ regressions. I am
> trying to understand the old regressions, and notice that they are
> inconsistent.
> 
> Anand Jain (15):
>   btrfs: optimize move uuid_mutex closer to the critical section
>   btrfs: rename struct btrfs_fs_devices::list
>   btrfs: cleanup __btrfs_open_devices() drop head pointer
>   btrfs: rename __btrfs_close_devices to close_fs_devices
>   btrfs: rename __btrfs_open_devices to open_fs_devices
>   btrfs: cleanup find_device() drop list_head pointer
>   btrfs: cleanup btrfs_rm_device() promote fs_devices pointer
>   btrfs: cleanup btrfs_rm_device() use cur_devices
>   btrfs: uuid_mutex in read_chunk_tree, add a comment
>   btrfs: drop uuid_mutex in btrfs_free_extra_devids()
>   btrfs: drop uuid_mutex in btrfs_open_devices()
>   btrfs: drop uuid_mutex in close_fs_devices()
>   btrfs: drop uuid_mutex in btrfs_dev_replace_finishing()
>   btrfs: drop uuid_mutex in btrfs_destroy_dev_replace_tgtdev()
>   btrfs: cleanup btrfs_destroy_dev_replace_tgtdev() localize
> btrfs_fs_devices

Patches 10 and 12 haven't been merged, the rest is now in misc-next.
Testing hasn't revealed any problems related to the uuid/device locks
but as said before we don't have stress tests.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 12/15] btrfs: drop uuid_mutex in close_fs_devices()

2018-05-15 Thread David Sterba
On Thu, Apr 12, 2018 at 10:29:35AM +0800, Anand Jain wrote:
> close_fs_devices() closes the devices of a given fsid, and it is limited
> to the devices of that fsid, so we don't have to hold the global
> uuid_mutex; instead we need the device_list_mutex, as the device state is
> being changed.
> 
> Signed-off-by: Anand Jain 
> ---
>  fs/btrfs/volumes.c | 9 -
>  1 file changed, 4 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index dfebf8f29916..4c29214e0c18 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -995,7 +995,6 @@ static void btrfs_prepare_close_one_device(struct 
> btrfs_device *device)
>   device->uuid);
>   BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
>  
> - /* Safe because we are under uuid_mutex */
>   if (device->name) {
>   name = rcu_string_strdup(device->name->str, GFP_NOFS);
>   BUG_ON(!name); /* -ENOMEM */
> @@ -1013,10 +1012,12 @@ static int close_fs_devices(struct btrfs_fs_devices 
> *fs_devices)
>  
>   INIT_LIST_HEAD(&pending_put);
>  
> - if (--fs_devices->opened > 0)
> + mutex_lock(&fs_devices->device_list_mutex);
> + if (--fs_devices->opened > 0) {
> + mutex_unlock(&fs_devices->device_list_mutex);
>   return 0;
> + }
>  
> - mutex_lock(&fs_devices->device_list_mutex);
>   list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
>   btrfs_prepare_close_one_device(device);
>   list_add(>dev_list, _put);
> @@ -1050,13 +1051,11 @@ int btrfs_close_devices(struct btrfs_fs_devices 
> *fs_devices)
>   struct btrfs_fs_devices *seed_devices = NULL;
>   int ret;
>  
> - mutex_lock(&uuid_mutex);
>   ret = close_fs_devices(fs_devices);
>   if (!fs_devices->opened) {
>   seed_devices = fs_devices->seed;
>   fs_devices->seed = NULL;

This still touches ->seed, and also reads ->opened, some locking is
required here or it has to be clearly explained why it's not.
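
One possible shape, as a sketch only (whether device_list_mutex is
sufficient to cover ->opened and the ->seed handover still has to be argued
in the changelog):

        mutex_lock(&fs_devices->device_list_mutex);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&fs_devices->device_list_mutex);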

>   }
> - mutex_unlock(&uuid_mutex);
>  
>   while (seed_devices) {
>   fs_devices = seed_devices;
> -- 
> 2.7.0
> 


Re: [PATCH 10/15] btrfs: drop uuid_mutex in btrfs_free_extra_devids()

2018-05-15 Thread David Sterba
On Thu, Apr 12, 2018 at 10:29:33AM +0800, Anand Jain wrote:
> btrfs_free_extra_devids() frees the orphan fsid::devid but its search is
> limited to btrfs_fs_devices::devices, so we don't need uuid_mutex.

From that it's not clear why there's no locking at all now:

> @@ -897,7 +897,6 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices 
> *fs_devices, int step)
>   struct btrfs_device *device, *next;
>   struct btrfs_device *latest_dev = NULL;
>  
> - mutex_lock(&uuid_mutex);

i.e. why is this not replaced by the device_list_mutex. Please resend this
patch and explain in the changelog.
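
If device_list_mutex is the intended replacement, the shape would simply be
(sketch only, not a request for this exact diff):

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                ...
        }
        fs_devices->latest_bdev = latest_dev->bdev;
        mutex_unlock(&fs_devices->device_list_mutex);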

>  again:
>   /* This is the initialized path, it is safe to release the devices. */
>   list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
> @@ -951,8 +950,6 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices 
> *fs_devices, int step)
>   }
>  
>   fs_devices->latest_bdev = latest_dev->bdev;
> -
> - mutex_unlock(&uuid_mutex);
>  }
>  
>  static void free_device_rcu(struct rcu_head *head)


[PATCH V3] test online label ioctl

2018-05-15 Thread Eric Sandeen

This tests the online label ioctl that btrfs has, which has been
recently proposed for XFS.

To run, it requires an updated xfs_io with the label command and a
filesystem that supports it.
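
For example (assuming an xfs_io built with label support), setting and
reading back a label looks like:

  # xfs_io -c "label mylabel" /mnt/test
  # xfs_io -c "label" /mnt/test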

A slight change here to _require_xfs_io_command as well, so that tests
which simply fail with "Inappropriate ioctl" can be caught in the
common case.

Signed-off-by: Eric Sandeen 
---

(urgh send as proper new thread, sorry)

This passes on btrfs, _notruns on xfs/ext4 of yore, and passes
on xfs w/ my online label patchset (as long as xfs_io has the new
capability)

V2: Add a max label length helper
Set the proper btrfs max label length o_O oops
Filter trailing whitespace from blkid output

V3: lowercase local vars, simplify max label len function



diff --git a/common/rc b/common/rc
index ffe53236..333cfb82 100644
--- a/common/rc
+++ b/common/rc
@@ -2144,6 +2144,9 @@ _require_xfs_io_command()
echo $testio | grep -q "Inappropriate ioctl" && \
_notrun "xfs_io $command support is missing"
;;
+   "label")
+   testio=`$XFS_IO_PROG -c "label" $TEST_DIR 2>&1`
+   ;;
"open")
# -c "open $f" is broken in xfs_io <= 4.8. Along with the fix,
# a new -C flag was introduced to execute one shot commands.
@@ -2182,7 +2185,7 @@ _require_xfs_io_command()
rm -f $testfile 2>&1 > /dev/null
echo $testio | grep -q "not found" && \
_notrun "xfs_io $command support is missing"
-   echo $testio | grep -q "Operation not supported" && \
+   echo $testio | grep -q "Operation not supported\|Inappropriate ioctl" 
&& \
_notrun "xfs_io $command failed (old kernel/wrong fs?)"
echo $testio | grep -q "Invalid" && \
_notrun "xfs_io $command failed (old kernel/wrong fs/bad args?)"
@@ -3788,6 +3791,29 @@ _require_scratch_feature()
esac
 }
 
+# The maximum filesystem label length, /not/ including terminating NULL

+_label_get_max()
+{
+   case $FSTYP in
+   xfs)
+   echo 12
+   ;;
+   btrfs)
+   echo 255
+   ;;
+   *)
+   _notrun "$FSTYP does not define maximum label length"
+   ;;
+   esac
+}
+
+# Helper to check above early in a script
+_require_label_get_max()
+{
+   # Just call _label_get_max which will notrun if appropriate
+   dummy=$(_label_get_max)
+}
+
 init_rc
 
 

diff --git a/tests/generic/479 b/tests/generic/479
old mode 100644
new mode 100755
diff --git a/tests/generic/488 b/tests/generic/488
new file mode 100755
index ..9521c5ba
--- /dev/null
+++ b/tests/generic/488
@@ -0,0 +1,90 @@
+#! /bin/bash
+# FS QA Test 488
+#
+# Test the online filesystem label set/get ioctls
+#
+#---
+# Copyright (c) 2018 Red Hat, Inc.  All Rights Reserved.
+# Author: Eric Sandeen 
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+
+_supported_fs generic
+_supported_os Linux
+_require_scratch
+_require_xfs_io_command "label"
+_require_label_get_max
+
+_scratch_mkfs > $seqres.full 2>&1
+_scratch_mount
+
+# Make sure we can set & clear the label
+$XFS_IO_PROG -c "label label.$seq" $SCRATCH_MNT
+$XFS_IO_PROG -c "label" $SCRATCH_MNT
+
+# And that userspace can see it now, while mounted
+# NB: some blkid has trailing whitespace, filter it out here
+blkid -s LABEL $SCRATCH_DEV | _filter_scratch | sed -e "s/ $//g"
+
+# And that the it is still there when it's unmounted
+_scratch_unmount
+blkid -s LABEL $SCRATCH_DEV | _filter_scratch | sed -e "s/ $//g"
+
+# And that it persists after a remount
+_scratch_mount
+$XFS_IO_PROG -c "label" $SCRATCH_MNT
+
+# And that a too-long label is rejected, beyond 

Re: [PATCH] btrfs: property: Set incompat flag of lzo/zstd compression

2018-05-15 Thread David Sterba
On Tue, May 15, 2018 at 04:51:26PM +0900, Misono Tomohiro wrote:
> Incompat flag of lzo/zstd compression should be set at:
>  1. mount time (-o compress/compress-force)
>  2. when defrag is done
>  3. when property is set
> 
> Currently 3. is missing and this commit adds this.

That was missed during the review, thanks for catching it.
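
For reference, the change essentially amounts to setting the bit from the
compression property handler, along these lines (sketch, not the exact hunk):

	if (type == BTRFS_COMPRESS_LZO)
		btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
	else if (type == BTRFS_COMPRESS_ZSTD)
		btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);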

> Signed-off-by: Tomohiro Misono 

Reviewed-by: David Sterba 

For 4.17-rc and for stable 4.14+.


Re: [PATCH 1/2] btrfs: sysfs: Use enum/define value intead of magic number

2018-05-15 Thread David Sterba
On Tue, May 15, 2018 at 04:31:47PM +0900, Misono Tomohiro wrote:
> Signed-off-by: Tomohiro Misono 

Reviewed-by: David Sterba 


Re: [PATCH 2/2] btrfs: sysfs: Add entry which shows rmdir(2) can work for subvolume

2018-05-15 Thread David Sterba
On Tue, May 15, 2018 at 04:33:12PM +0900, Misono Tomohiro wrote:
> Deletion of a subvolume by rmdir(2) has become allowed by the
> 'commit cd2decf640b1 ("btrfs: Allow rmdir(2) to delete an empty
> subvolume")'.
> 
> It is a kind of new feature and this commit adds a new sysfs entry
>   /sys/fs/btrfs/features/rmdir_subvol
> to indicate the feature.
> 
> Since the behavior is independent of feature bits of superblock,
> new type FEAT_KERNEL is added to struct btrfs_feature_set.
> Features of FEAT_KERNEL are supposed to be visible only in
> /sys/fs/btrfs/features and not in /sys/fs/btrfs/<UUID>/features.

As the rmdir_subvol is a static feature, depending only on the kernel
version, it's not needed to use the same infrastructure as the optional
features. It also makes it unnecessarily complicated to distinguish the
on-disk and kernel-only features, and it's not a per-filesystem feature.

It should be exported among btrfs_feature_attr_group and
btrfs_supported_feature_attrs, possibly adding a new type of helpers if
needed.
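
Something as simple as a static attribute would do, e.g. (untested sketch,
names here are only illustrative):

	static ssize_t rmdir_subvol_show(struct kobject *kobj,
					 struct kobj_attribute *a, char *buf)
	{
		return snprintf(buf, PAGE_SIZE, "1\n");
	}
	static struct kobj_attribute btrfs_attr_rmdir_subvol =
		__ATTR(rmdir_subvol, 0444, rmdir_subvol_show, NULL);

hooked into the attribute group behind /sys/fs/btrfs/features.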


Re: 4.15.6 crash: BUG at fs/btrfs/ctree.c:1862

2018-05-15 Thread Marc MERLIN
On Tue, May 15, 2018 at 09:36:11AM +0100, Filipe Manana wrote:
> We got a fix for this recently:  https://patchwork.kernel.org/patch/10396523/

Thanks very much for the notice, sorry that I missed it.

Marc
-- 
"A mouse is a device used to point at the xterm you want to type in" - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/   | PGP 7F55D5F27AAF9D08


Re: verify key failure

2018-05-15 Thread Liu Bo
On Tue, May 15, 2018 at 9:29 AM, Qu Wenruo  wrote:
>
>
> On 2018年05月14日 22:35, Liu Bo wrote:
>> Hi,
>>
>> I got another warning of verify_level_key by running btrfs/124 in a loop, 
>> I'm testing against 4.17-rc3.
>>
>> Not sure if it's false positive.
>>
>> [101414.336691] WARNING: CPU: 3 PID: 30194 at fs/btrfs/disk-io.c:455 
>> btree_read_extent_buffer_pages+0x183/0x220 [btrfs]
>> [101414.340372] Modules linked in: btrfs(O) xor zstd_decompress 
>> zstd_compress xxhash zlib_inflate lzo_compress lzo_decompress zlib_deflate 
>> raid6_pq dm_flakey [last unloaded: xor]
>> [101414.345713] CPU: 3 PID: 30194 Comm: btrfs Tainted: GW  O  
>> 4.17.0-rc3-liubo+ #35
>> [101414.348501] RIP: 0010:btree_read_extent_buffer_pages+0x183/0x220 [btrfs]
>> ...
>> [101414.369713] Call Trace:
>> [101414.370477]  read_tree_block+0x3d/0x60 [btrfs]
>> [101414.371946]  read_block_for_search.isra.11+0x19c/0x350 [btrfs]
>> [101414.373915]  btrfs_search_slot+0x4a0/0xa60 [btrfs]
>> [101414.375489]  ? trace_hardirqs_on_caller+0x12/0x1c0
>> [101414.377080]  ? btrfs_lookup_ordered_extent+0x8b/0xd0 [btrfs]
>> [101414.379007]  btrfs_lookup_csum+0x42/0x130 [btrfs]
>> [101414.380456]  __btrfs_lookup_bio_sums+0x2fb/0x6a0 [btrfs]
>> [101414.381554]  btrfs_submit_bio_hook+0xbb/0x180 [btrfs]
>> [101414.382598]  submit_one_bio+0x57/0x80 [btrfs]
>> [101414.383509]  submit_extent_page+0xd5/0x1f0 [btrfs]
>> [101414.384507]  __do_readpage+0x2a6/0x770 [btrfs]
>> [101414.385449]  ? btrfs_create_repair_bio+0x100/0x100 [btrfs]
>> [101414.386576]  ? btrfs_direct_IO+0x3a0/0x3a0 [btrfs]
>> [101414.387569]  ? btrfs_direct_IO+0x3a0/0x3a0 [btrfs]
>> [101414.388562]  __extent_readpages+0x2e2/0x330 [btrfs]
>> [101414.389584]  extent_readpages+0x10e/0x1a0 [btrfs]
>> [101414.390565]  __do_page_cache_readahead+0x283/0x340
>> [101414.391550]  ? ondemand_readahead+0x207/0x460
>> [101414.392451]  ondemand_readahead+0x207/0x460
>> [101414.393353]  relocate_file_extent_cluster+0x364/0x4c0 [btrfs]
>> [101414.394546]  relocate_block_group+0x5d4/0x6e0 [btrfs]
>> ...
>> [101414.432616] BTRFS error (device sdb): tree first key mismatch detected, 
>> bytenr=30523392 key expected=(18446744073709551606, 128, 1120665600) has=(1, 
>> 204, 22020096)
>
> The expected key is completely fine, while the found one obviously
> belongs to extent tree.
>
> Maybe that's the bug which I'm always chasing.
>

The following patch is already in 4.17-rc3,

btrfs: Fix wrong first_key parameter in replace_path

> Can you reproduce it again with btrfs_print_tree() added to provide more
> info?
>

Not sure if I'd have time working on this one, but I'll let you know
if I get it again.

My test box is nothing special, just a plain kvm VM.

thanks,
liubo


Re: verify key failure

2018-05-15 Thread Liu Bo
On Tue, May 15, 2018 at 12:10 AM, Chris Mason  wrote:
>
>
> On 14 May 2018, at 10:35, Liu Bo wrote:
>
>> Hi,
>>
>> I got another warning of verify_level_key by running btrfs/124 in a loop,
>> I'm testing against 4.17-rc3.
>>
>> Not sure if it's false positive.
>
>
> How long does this take to trigger?
>


btrfs/124 takes ~24s on my box, and it took 10 ~ 15 runs to hit.
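
The loop itself is nothing special, roughly:

  cd xfstests
  for i in $(seq 20); do ./check btrfs/124 || break; done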


thanks,
liubo

> -chris
>


Re: [PATCH 1/2] btrfs: inode: Don't compress if NODATASUM or NODATACOW set

2018-05-15 Thread Qu Wenruo


On 2018年05月15日 18:36, Nikolay Borisov wrote:
> 
> 
> On 15.05.2018 11:48, Qu Wenruo wrote:
> 
> 
>
>
> static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
> {
>         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> 
>         /* defrag ioctl */
>         if (BTRFS_I(inode)->defrag_compress)
>                 return 1;
>         /* bad compression ratios */
>         if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
>                 return 0;

 Not exactly.
 Force-compress should let us bypass the bad compression ratio check, so it
 should be at least before the ratio check.
> 
> Fair enough, what prompted me in suggesting this is that perhaps the
> check for BTRFS_INODE_NOCOMPRESS should be somewhere at the top of the
> function (alongside the newly added two checks for inode flags), no?
> INODE_NOCOMPRESS can be set via ioctl, not necessarily only due to bad
> compression ratio.

This is much trickier than expected.

The point here is, what's the correct behavior for compress-force.
Should it override manually set NOCOMPRESS?

Unfortunately I have no idea at all.
So I can only leave it as is for now.

Thanks,
Qu

> 

 Thanks,
 Qu

>         /* force compress */
>         if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
>                 return 1;
>         if (btrfs_test_opt(fs_info, COMPRESS) ||
>             BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
>             BTRFS_I(inode)->prop_compress)
>                 return btrfs_compress_heuristic(inode, start, end);
>         return 0;
> }
> 
>>


Re: [PATCH 1/2] btrfs: inode: Don't compress if NODATASUM or NODATACOW set

2018-05-15 Thread Nikolay Borisov


On 15.05.2018 11:48, Qu Wenruo wrote:




 static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
 {
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

         /* defrag ioctl */
         if (BTRFS_I(inode)->defrag_compress)
                 return 1;
         /* bad compression ratios */
         if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
                 return 0;
>>>
>>> Not exactly.
>>> Force-compress should let us bypass the bad compression ratio check, so it
>>> should be at least before the ratio check.

Fair enough, what prompted me in suggesting this is that perhaps the
check for BTRFS_INODE_NOCOMPRESS should be somewhere at the top of the
function (alongside the newly added two checks for inode flags), no?
INODE_NOCOMPRESS can be set via ioctl, not necessarily only due to bad
compression ratio.

>>>
>>> Thanks,
>>> Qu
>>>
         /* force compress */
         if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
                 return 1;
         if (btrfs_test_opt(fs_info, COMPRESS) ||
             BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
             BTRFS_I(inode)->prop_compress)
                 return btrfs_compress_heuristic(inode, start, end);
         return 0;
 }

> 


[PATCH] btrfs-progs: remove BTRFS_LIST_LAYOUT_RAW

2018-05-15 Thread Gu Jinxiang
Since
commit 9005b603d723 ("btrfs-progs: use libbtrfsutil for subvol show"),
BTRFS_LIST_LAYOUT_RAW has no usage.
So, remove it.

Signed-off-by: Gu Jinxiang 
---
 btrfs-list.c | 20 
 btrfs-list.h |  3 +--
 2 files changed, 1 insertion(+), 22 deletions(-)

diff --git a/btrfs-list.c b/btrfs-list.c
index e01c5899..16be6b2f 100644
--- a/btrfs-list.c
+++ b/btrfs-list.c
@@ -1374,23 +1374,6 @@ static void print_subvolume_column(struct root_info 
*subv,
}
 }
 
-static void print_one_subvol_info_raw(struct root_info *subv,
-   const char *raw_prefix)
-{
-   int i;
-
-   for (i = 0; i < BTRFS_LIST_ALL; i++) {
-   if (!btrfs_list_columns[i].need_print)
-   continue;
-
-   if (raw_prefix)
-   printf("%s",raw_prefix);
-
-   print_subvolume_column(subv, i);
-   }
-   printf("\n");
-}
-
 static void print_one_subvol_info_table(struct root_info *subv)
 {
int i;
@@ -1480,9 +1463,6 @@ static void print_all_subvol_info(struct root_lookup 
*sorted_tree,
case BTRFS_LIST_LAYOUT_TABLE:
print_one_subvol_info_table(entry);
break;
-   case BTRFS_LIST_LAYOUT_RAW:
-   print_one_subvol_info_raw(entry, raw_prefix);
-   break;
}
 next:
n = rb_next(n);
diff --git a/btrfs-list.h b/btrfs-list.h
index 6e5fc778..299e3122 100644
--- a/btrfs-list.h
+++ b/btrfs-list.h
@@ -33,8 +33,7 @@
 
 enum btrfs_list_layout {
BTRFS_LIST_LAYOUT_DEFAULT = 0,
-   BTRFS_LIST_LAYOUT_TABLE,
-   BTRFS_LIST_LAYOUT_RAW
+   BTRFS_LIST_LAYOUT_TABLE
 };
 
 /*
-- 
2.17.0





[PATCH v14.6 12/14] btrfs: dedupe: Add ioctl for inband deduplication

2018-05-15 Thread Lu Fengqi
From: Wang Xiaoguang 

Add ioctl interface for inband deduplication, which includes:
1) enable
2) disable
3) status

And a pseudo RO compat flag, to imply that btrfs now supports inband
dedup.
However we don't add any ondisk format change, it's just a pseudo RO
compat flag.

All these ioctl interfaces are stateless, which means callers don't need
to care about the previous dedupe state before calling them, only about
the final desired state.

For example, if the user wants to enable dedupe with a specified block
size and limit, they just fill the ioctl structure and call the enable
ioctl.  No need to check whether dedupe is already running.

These ioctls will handle things like re-configure or disable quite well.

Also, for invalid parameters, the enable ioctl will set the field of the
first invalid parameter encountered to (-1) to inform the caller, except
for limit_nr/limit_mem, where the value will be (0).
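
A minimal userspace caller then looks roughly like this (illustration only;
the limit value here is arbitrary):

	struct btrfs_ioctl_dedupe_args dargs;

	memset(&dargs, 0, sizeof(dargs));
	dargs.cmd = BTRFS_DEDUPE_CTL_ENABLE;
	dargs.backend = BTRFS_DEDUPE_BACKEND_INMEMORY;
	dargs.blocksize = 128 * 1024;
	dargs.limit_nr = 32768;
	if (ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs) < 0)
		/* on failure the first invalid field is set to -1 (0 for the limits) */
		perror("dedupe enable");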

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c  | 50 
 fs/btrfs/dedupe.h  | 17 +++---
 fs/btrfs/disk-io.c |  3 ++
 fs/btrfs/ioctl.c   | 67 ++
 fs/btrfs/sysfs.c   |  2 ++
 include/uapi/linux/btrfs.h | 12 ++-
 6 files changed, 145 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 1ad66db5ec75..d3e00318fce9 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -29,6 +29,35 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
GFP_NOFS);
 }
 
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs)
+{
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+
+   if (!fs_info->dedupe_enabled || !dedupe_info) {
+   dargs->status = 0;
+   dargs->blocksize = 0;
+   dargs->backend = 0;
+   dargs->hash_algo = 0;
+   dargs->limit_nr = 0;
+   dargs->current_nr = 0;
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+   return;
+   }
+   mutex_lock(&dedupe_info->lock);
+   dargs->status = 1;
+   dargs->blocksize = dedupe_info->blocksize;
+   dargs->backend = dedupe_info->backend;
+   dargs->hash_algo = dedupe_info->hash_algo;
+   dargs->limit_nr = dedupe_info->limit_nr;
+   dargs->limit_mem = dedupe_info->limit_nr *
+   (sizeof(struct inmem_hash) +
+btrfs_hash_sizes[dedupe_info->hash_algo]);
+   dargs->current_nr = dedupe_info->current_nr;
+   mutex_unlock(&dedupe_info->lock);
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+}
+
 static int init_dedupe_info(struct btrfs_dedupe_info **ret_info,
struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -408,6 +437,27 @@ static void unblock_all_writers(struct btrfs_fs_info 
*fs_info)
percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
 }
 
+int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+
+   fs_info->dedupe_enabled = 0;
+   /* same as disable */
+   smp_wmb();
+   dedupe_info = fs_info->dedupe_info;
+   fs_info->dedupe_info = NULL;
+
+   if (!dedupe_info)
+   return 0;
+
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   inmem_destroy(dedupe_info);
+
+   crypto_free_shash(dedupe_info->dedupe_driver);
+   kfree(dedupe_info);
+   return 0;
+}
+
 int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
 {
struct btrfs_dedupe_info *dedupe_info;
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index ebcbb89d79a0..85a87093ab04 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -96,6 +96,15 @@ static inline struct btrfs_dedupe_hash 
*btrfs_dedupe_alloc_hash(u16 algo)
 int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dedupe_args *dargs);
 
+
+/*
+ * Get inband dedupe info
+ * Since it needs to access different backends' hash size, which
+ * is not exported, we need such simple function.
+ */
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs);
+
 /*
  * Disable dedupe and invalidate all its dedupe data.
  * Called at dedupe disable time.
@@ -107,12 +116,10 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
 int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info);
 
 /*
- * Get current dedupe status.
- * Return 0 for success
- * No possible error yet
+ * Cleanup current btrfs_dedupe_info
+ * Called in umount time
  */
-void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
-struct btrfs_ioctl_dedupe_args *dargs);
+int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info);
 

[PATCH v14.6 14/14] btrfs: dedupe: Introduce new reconfigure ioctl

2018-05-15 Thread Lu Fengqi
From: Qu Wenruo 

Introduce new reconfigure ioctl and new FORCE flag for in-band dedupe
ioctls.

Now dedupe enable and reconfigure ioctl are stateful.


| Current state |   Ioctl    | Next state  |
|---------------|------------|-------------|
| Disabled      |  enable    | Enabled     |
| Enabled       |  enable    | Not allowed |
| Enabled       |  reconf    | Enabled     |
| Enabled       |  disable   | Disabled    |
| Disabled      |  disable   | Disabled    |
| Disabled      |  reconf    | Not allowed |

(While disable is always stateless.)

For those who prefer stateless ioctls (myself, for example), a new FORCE
flag is introduced.

In FORCE mode, enable/disable is completely stateless.

| Current state |   Ioctl    | Next state  |
|---------------|------------|-------------|
| Disabled      |  enable    | Enabled     |
| Enabled       |  enable    | Enabled     |
| Enabled       |  disable   | Disabled    |
| Disabled      |  disable   | Disabled    |


Also, the re-configure ioctl will only modify the specified fields; unlike
enable, where unspecified fields are reset to their default values,
re-configure leaves them untouched.

For example:
 # btrfs dedupe enable --block-size 64k /mnt
 # btrfs dedupe reconfigure --limit-hash 1m /mnt
Will lead to:
 dedupe blocksize: 64K
 dedupe hash limit nr: 1m

While for enable:
 # btrfs dedupe enable --force --block-size 64k /mnt
 # btrfs dedupe enable --force --limit-hash 1m /mnt
Will reset blocksize to default value:
 dedupe blocksize: 128K << reset
 dedupe hash limit nr: 1m

Suggested-by: David Sterba 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c  | 131 ++---
 fs/btrfs/dedupe.h  |  13 
 fs/btrfs/ioctl.c   |  13 
 include/uapi/linux/btrfs.h |  11 +++-
 4 files changed, 143 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index d3e00318fce9..82ba711201c5 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -29,6 +29,40 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
GFP_NOFS);
 }
 
+/*
+ * Copy from current dedupe info to fill dargs.
+ * For reconf case, only fill members which is uninitialized.
+ */
+static void get_dedupe_status(struct btrfs_dedupe_info *dedupe_info,
+ struct btrfs_ioctl_dedupe_args *dargs)
+{
+   int reconf = (dargs->cmd == BTRFS_DEDUPE_CTL_RECONF);
+
+   dargs->status = 1;
+
+   if (!reconf || (reconf && dargs->blocksize == (u64)-1))
+   dargs->blocksize = dedupe_info->blocksize;
+   if (!reconf || (reconf && dargs->backend == (u16)-1))
+   dargs->backend = dedupe_info->backend;
+   if (!reconf || (reconf && dargs->hash_algo == (u16)-1))
+   dargs->hash_algo = dedupe_info->hash_algo;
+
+   /*
+* For re-configure case, if not modifying limit,
+* their limit will be set to 0, unlike other fields
+*/
+   if (!reconf || !(dargs->limit_nr || dargs->limit_mem)) {
+   dargs->limit_nr = dedupe_info->limit_nr;
+   dargs->limit_mem = dedupe_info->limit_nr *
+   (sizeof(struct inmem_hash) +
+btrfs_hash_sizes[dedupe_info->hash_algo]);
+   }
+
+   /* current_nr doesn't make sense for the reconfig case */
+   if (!reconf)
+   dargs->current_nr = dedupe_info->current_nr;
+}
+
 void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
 struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -45,15 +79,7 @@ void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
return;
}
mutex_lock(&dedupe_info->lock);
-   dargs->status = 1;
-   dargs->blocksize = dedupe_info->blocksize;
-   dargs->backend = dedupe_info->backend;
-   dargs->hash_algo = dedupe_info->hash_algo;
-   dargs->limit_nr = dedupe_info->limit_nr;
-   dargs->limit_mem = dedupe_info->limit_nr *
-   (sizeof(struct inmem_hash) +
-btrfs_hash_sizes[dedupe_info->hash_algo]);
-   dargs->current_nr = dedupe_info->current_nr;
+   get_dedupe_status(dedupe_info, dargs);
mutex_unlock(&dedupe_info->lock);
memset(dargs->__unused, -1, sizeof(dargs->__unused));
 }
@@ -102,17 +128,50 @@ static int init_dedupe_info(struct btrfs_dedupe_info 
**ret_info,
 static int check_dedupe_parameter(struct btrfs_fs_info *fs_info,
  struct btrfs_ioctl_dedupe_args *dargs)
 {
-   u64 blocksize = dargs->blocksize;
-   u64 limit_nr = dargs->limit_nr;
-   u64 limit_mem = dargs->limit_mem;
-   u16 hash_algo = dargs->hash_algo;
-   u8 backend = dargs->backend;
+   struct btrfs_dedupe_info 

[PATCH v14.6 09/14] btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface

2018-05-15 Thread Lu Fengqi
From: Wang Xiaoguang 

Unlike in-memory or on-disk dedupe method, only SHA256 hash method is
supported yet, so implement btrfs_dedupe_calc_hash() interface using
SHA256.
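
A caller is expected to use it roughly like this (illustration only, not
part of this patch):

	struct btrfs_dedupe_hash *hash;

	hash = btrfs_dedupe_alloc_hash(BTRFS_DEDUPE_HASH_SHA256);
	if (!hash)
		return -ENOMEM;
	ret = btrfs_dedupe_calc_hash(fs_info, inode, start, hash);
	if (ret < 0)
		goto out;
	ret = btrfs_dedupe_search(fs_info, inode, start, hash);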

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 47 +++
 1 file changed, 47 insertions(+)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 033c78ceef6a..1ad66db5ec75 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -631,3 +631,50 @@ int btrfs_dedupe_search(struct btrfs_fs_info *fs_info,
}
return ret;
 }
+
+int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info,
+  struct inode *inode, u64 start,
+  struct btrfs_dedupe_hash *hash)
+{
+   int i;
+   int ret;
+   struct page *p;
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+   struct crypto_shash *tfm = dedupe_info->dedupe_driver;
+   u64 dedupe_bs;
+   u64 sectorsize = fs_info->sectorsize;
+
+   SHASH_DESC_ON_STACK(sdesc, tfm);
+
+   if (!fs_info->dedupe_enabled || !hash)
+   return 0;
+
+   if (WARN_ON(dedupe_info == NULL))
+   return -EINVAL;
+
+   WARN_ON(!IS_ALIGNED(start, sectorsize));
+
+   dedupe_bs = dedupe_info->blocksize;
+
+   sdesc->tfm = tfm;
+   sdesc->flags = 0;
+   ret = crypto_shash_init(sdesc);
+   if (ret)
+   return ret;
+   for (i = 0; sectorsize * i < dedupe_bs; i++) {
+   char *d;
+
+   p = find_get_page(inode->i_mapping,
+ (start >> PAGE_SHIFT) + i);
+   if (WARN_ON(!p))
+   return -ENOENT;
+   d = kmap(p);
+   ret = crypto_shash_update(sdesc, d, sectorsize);
+   kunmap(p);
+   put_page(p);
+   if (ret)
+   return ret;
+   }
+   ret = crypto_shash_final(sdesc, hash->hash);
+   return ret;
+}
-- 
2.17.0





[PATCH v10.2 4/5] btrfs-progs: dedupe: Add status subcommand

2018-05-15 Thread Lu Fengqi
From: Qu Wenruo 

Add status subcommand for dedupe command group.

Signed-off-by: Qu Wenruo 
---
 Documentation/btrfs-dedupe-inband.asciidoc |  3 +
 btrfs-completion   |  2 +-
 cmds-dedupe-ib.c   | 81 ++
 3 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
index de32eb97d9dd..df068c31ca3a 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -86,6 +86,9 @@ And compression has higher priority than in-band 
de-duplication, means if
 compression and de-duplication is enabled at the same time, only compression
 will work.
 
+*status* <path>::
+Show current in-band de-duplication status of a filesystem.
+
 BACKENDS
 
 Btrfs in-band de-duplication will support different storage backends, with
diff --git a/btrfs-completion b/btrfs-completion
index 2f113e01fb01..c8e67b459341 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -41,7 +41,7 @@ _btrfs()
commands_quota='enable disable rescan'
commands_qgroup='assign remove create destroy show limit'
commands_replace='start status cancel'
-   commands_dedupe='enable disable'
+   commands_dedupe='enable disable status'
 
if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then
COMPREPLY=( $( compgen -W '--help' -- "$cur" ) )
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
index 031766c1d91c..854cbda131a3 100644
--- a/cmds-dedupe-ib.c
+++ b/cmds-dedupe-ib.c
@@ -302,12 +302,93 @@ out:
return 0;
 }
 
+static const char * const cmd_dedupe_ib_status_usage[] = {
+   "btrfs dedupe status ",
+   "Show current in-band(write time) de-duplication status of a btrfs.",
+   NULL
+};
+
+static int cmd_dedupe_ib_status(int argc, char **argv)
+{
+   struct btrfs_ioctl_dedupe_args dargs;
+   DIR *dirstream;
+   char *path;
+   int fd;
+   int ret;
+   int print_limit = 1;
+
+   if (check_argc_exact(argc, 2))
+   usage(cmd_dedupe_ib_status_usage);
+
+   path = argv[1];
+   fd = open_file_or_dir(path, &dirstream);
+   if (fd < 0) {
+   error("failed to open file or directory: %s", path);
+   ret = 1;
+   goto out;
+   }
+   memset(&dargs, 0, sizeof(dargs));
+   dargs.cmd = BTRFS_DEDUPE_CTL_STATUS;
+
+   ret = ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs);
+   if (ret < 0) {
+   error("failed to get inband deduplication status: %s",
+ strerror(errno));
+   ret = 1;
+   goto out;
+   }
+   ret = 0;
+   if (dargs.status == 0) {
+   printf("Status: \t\t\tDisabled\n");
+   goto out;
+   }
+   printf("Status:\t\t\tEnabled\n");
+
+   if (dargs.hash_algo == BTRFS_DEDUPE_HASH_SHA256)
+   printf("Hash algorithm:\t\tSHA-256\n");
+   else
+   printf("Hash algorithm:\t\tUnrecognized(%x)\n",
+   dargs.hash_algo);
+
+   if (dargs.backend == BTRFS_DEDUPE_BACKEND_INMEMORY) {
+   printf("Backend:\t\tIn-memory\n");
+   print_limit = 1;
+   } else  {
+   printf("Backend:\t\tUnrecognized(%x)\n",
+   dargs.backend);
+   }
+
+   printf("Dedup Blocksize:\t%llu\n", dargs.blocksize);
+
+   if (print_limit) {
+   u64 cur_mem;
+
+   /* Limit nr may be 0 */
+   if (dargs.limit_nr)
+   cur_mem = dargs.current_nr * (dargs.limit_mem /
+   dargs.limit_nr);
+   else
+   cur_mem = 0;
+
+   printf("Number of hash: \t[%llu/%llu]\n", dargs.current_nr,
+   dargs.limit_nr);
+   printf("Memory usage: \t\t[%s/%s]\n",
+   pretty_size(cur_mem),
+   pretty_size(dargs.limit_mem));
+   }
+out:
+   close_file_or_dir(fd, dirstream);
+   return ret;
+}
+
 const struct cmd_group dedupe_ib_cmd_group = {
dedupe_ib_cmd_group_usage, dedupe_ib_cmd_group_info, {
{ "enable", cmd_dedupe_ib_enable, cmd_dedupe_ib_enable_usage,
  NULL, 0},
{ "disable", cmd_dedupe_ib_disable, cmd_dedupe_ib_disable_usage,
  NULL, 0},
+   { "status", cmd_dedupe_ib_status, cmd_dedupe_ib_status_usage,
+ NULL, 0},
NULL_CMD_STRUCT
}
 };
-- 
2.17.0





[PATCH v10.2 5/5] btrfs-progs: dedupe: introduce reconfigure subcommand

2018-05-15 Thread Lu Fengqi
From: Qu Wenruo 

Introduce reconfigure subcommand to co-operate with new kernel ioctl
modification.

Signed-off-by: Qu Wenruo 
---
 Documentation/btrfs-dedupe-inband.asciidoc |  7 ++
 cmds-dedupe-ib.c   | 75 +-
 2 files changed, 66 insertions(+), 16 deletions(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
index df068c31ca3a..5fc4bb0d5940 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -86,6 +86,13 @@ And compression has higher priority than in-band 
de-duplication, means if
 compression and de-duplication is enabled at the same time, only compression
 will work.
 
+*reconfigure* [options] <path>::
+Re-configure in-band de-duplication parameters of a filesystem.
++
+In-band de-duplication must be enabled first before re-configuration.
++
+[Options] are the same with 'btrfs dedupe-inband enable'.
+
 *status* ::
 Show current in-band de-duplication status of a filesystem.
 
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
index 854cbda131a3..925d5a8f756a 100644
--- a/cmds-dedupe-ib.c
+++ b/cmds-dedupe-ib.c
@@ -56,7 +56,6 @@ static const char * const cmd_dedupe_ib_enable_usage[] = {
NULL
 };
 
-
 #define report_fatal_parameter(dargs, old, member, type, err_val, fmt) \
 ({ \
if (dargs->member != old->member && \
@@ -88,6 +87,12 @@ static void report_parameter_error(struct 
btrfs_ioctl_dedupe_args *dargs,
}
report_option_parameter(dargs, old, flags, u8, -1, x);
}
+
+   if (dargs->status == 0 && old->cmd == BTRFS_DEDUPE_CTL_RECONF) {
+   error("must enable dedupe before reconfiguration");
+   return;
+   }
+
if (report_fatal_parameter(dargs, old, cmd, u16, -1, u) ||
report_fatal_parameter(dargs, old, blocksize, u64, -1, llu) ||
report_fatal_parameter(dargs, old, backend, u16, -1, u) ||
@@ -100,14 +105,17 @@ static void report_parameter_error(struct 
btrfs_ioctl_dedupe_args *dargs,
old->limit_nr, old->limit_mem);
 }
 
-static int cmd_dedupe_ib_enable(int argc, char **argv)
+static int enable_reconfig_dedupe(int argc, char **argv, int reconf)
 {
int ret;
int fd = -1;
char *path;
u64 blocksize = BTRFS_DEDUPE_BLOCKSIZE_DEFAULT;
+   int blocksize_set = 0;
u16 hash_algo = BTRFS_DEDUPE_HASH_SHA256;
+   int hash_algo_set = 0;
u16 backend = BTRFS_DEDUPE_BACKEND_INMEMORY;
+   int backend_set = 0;
u64 limit_nr = 0;
u64 limit_mem = 0;
u64 sys_mem = 0;
@@ -129,20 +137,22 @@ static int cmd_dedupe_ib_enable(int argc, char **argv)
{ NULL, 0, NULL, 0}
};
 
-   c = getopt_long(argc, argv, "s:b:a:l:m:", long_options, NULL);
+   c = getopt_long(argc, argv, "s:b:a:l:m:f", long_options, NULL);
if (c < 0)
break;
switch (c) {
case 's':
-   if (!strcasecmp("inmemory", optarg))
+   if (!strcasecmp("inmemory", optarg)) {
backend = BTRFS_DEDUPE_BACKEND_INMEMORY;
-   else {
+   backend_set = 1;
+   } else {
error("unsupported dedupe backend: %s", optarg);
exit(1);
}
break;
case 'b':
blocksize = parse_size(optarg);
+   blocksize_set = 1;
break;
case 'a':
if (strcmp("sha256", optarg)) {
@@ -226,26 +236,40 @@ static int cmd_dedupe_ib_enable(int argc, char **argv)
return 1;
}
memset(&dargs, -1, sizeof(dargs));
-   dargs.cmd = BTRFS_DEDUPE_CTL_ENABLE;
-   dargs.blocksize = blocksize;
-   dargs.hash_algo = hash_algo;
-   dargs.limit_nr = limit_nr;
-   dargs.limit_mem = limit_mem;
-   dargs.backend = backend;
-   if (force)
-   dargs.flags |= BTRFS_DEDUPE_FLAG_FORCE;
-   else
-   dargs.flags = 0;
+   if (reconf) {
+   dargs.cmd = BTRFS_DEDUPE_CTL_RECONF;
+   if (blocksize_set)
+   dargs.blocksize = blocksize;
+   if (hash_algo_set)
+   dargs.hash_algo = hash_algo;
+   if (backend_set)
+   dargs.backend = backend;
+   dargs.limit_nr = limit_nr;
+   dargs.limit_mem = limit_mem;
+   } else {
+   dargs.cmd = BTRFS_DEDUPE_CTL_ENABLE;
+   dargs.blocksize = blocksize;
+   

[PATCH v10.2 3/5] btrfs-progs: dedupe: Add disable support for inband dedupelication

2018-05-15 Thread Lu Fengqi
From: Qu Wenruo 

Add disable subcommand for dedupe command group.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc |  5 +++
 btrfs-completion   |  2 +-
 cmds-dedupe-ib.c   | 42 ++
 3 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
index 82f970a69953..de32eb97d9dd 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -22,6 +22,11 @@ use with caution.
 
 SUBCOMMAND
 --
+*disable* <path>::
+Disable in-band de-duplication for a filesystem.
++
+This will trash all stored dedupe hash.
++
 *enable* [options] ::
 Enable in-band de-duplication for a filesystem.
 +
diff --git a/btrfs-completion b/btrfs-completion
index 69e02ad11990..2f113e01fb01 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -41,7 +41,7 @@ _btrfs()
commands_quota='enable disable rescan'
commands_qgroup='assign remove create destroy show limit'
commands_replace='start status cancel'
-   commands_dedupe='enable'
+   commands_dedupe='enable disable'
 
if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then
COMPREPLY=( $( compgen -W '--help' -- "$cur" ) )
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
index cb62d0064167..031766c1d91c 100644
--- a/cmds-dedupe-ib.c
+++ b/cmds-dedupe-ib.c
@@ -262,10 +262,52 @@ out:
return ret;
 }
 
+static const char * const cmd_dedupe_ib_disable_usage[] = {
+   "btrfs dedupe disable ",
+   "Disable in-band(write time) de-duplication of a btrfs.",
+   NULL
+};
+
+static int cmd_dedupe_ib_disable(int argc, char **argv)
+{
+   struct btrfs_ioctl_dedupe_args dargs;
+   DIR *dirstream;
+   char *path;
+   int fd;
+   int ret;
+
+   if (check_argc_exact(argc, 2))
+   usage(cmd_dedupe_ib_disable_usage);
+
+   path = argv[1];
+   fd = open_file_or_dir(path, &dirstream);
+   if (fd < 0) {
+   error("failed to open file or directory: %s", path);
+   return 1;
+   }
+   memset(&dargs, 0, sizeof(dargs));
+   dargs.cmd = BTRFS_DEDUPE_CTL_DISABLE;
+
+   ret = ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs);
+   if (ret < 0) {
+   error("failed to disable inband deduplication: %s",
+ strerror(errno));
+   ret = 1;
+   goto out;
+   }
+   ret = 0;
+
+out:
+   close_file_or_dir(fd, dirstream);
+   return 0;
+}
+
 const struct cmd_group dedupe_ib_cmd_group = {
dedupe_ib_cmd_group_usage, dedupe_ib_cmd_group_info, {
{ "enable", cmd_dedupe_ib_enable, cmd_dedupe_ib_enable_usage,
  NULL, 0},
+   { "disable", cmd_dedupe_ib_disable, cmd_dedupe_ib_disable_usage,
+ NULL, 0},
NULL_CMD_STRUCT
}
 };
-- 
2.17.0





[PATCH v14.6 07/14] btrfs: delayed-ref: Add support for increasing data ref under spinlock

2018-05-15 Thread Lu Fengqi
From: Qu Wenruo 

For in-band dedupe, btrfs needs to increase the data ref with the
delayed_refs lock held, so add a new function
btrfs_add_delayed_data_ref_locked() to increase the extent ref with
delayed_refs already locked.

Signed-off-by: Qu Wenruo 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/delayed-ref.c | 37 +
 fs/btrfs/delayed-ref.h | 10 ++
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e1b0651686f7..58a64448d777 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -824,6 +824,31 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info 
*fs_info,
return -ENOMEM;
 }
 
+/*
+ * Do real delayed data ref insert.
+ * Caller must hold delayed_refs->lock and allocation memory
+ * for dref,head_ref and record.
+ */
+void btrfs_add_delayed_data_ref_locked(struct btrfs_fs_info *fs_info,
+   struct btrfs_trans_handle *trans,
+   struct btrfs_delayed_data_ref *dref,
+   struct btrfs_delayed_ref_head *head_ref,
+   struct btrfs_qgroup_extent_record *qrecord,
+   u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+   u64 owner, u64 offset, u64 reserved, int action,
+   int *qrecord_inserted_ret, int *old_ref_mod,
+   int *new_ref_mod)
+{
+   head_ref = add_delayed_ref_head(fs_info, trans, head_ref, qrecord,
+   bytenr, num_bytes, ref_root, reserved,
+   action, 1, 0, qrecord_inserted_ret,
+   old_ref_mod, new_ref_mod);
+
+   add_delayed_data_ref(fs_info, trans, head_ref, &dref->node, bytenr,
+num_bytes, parent, ref_root, owner, offset,
+action);
+}
+
 /*
  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
  */
@@ -870,14 +895,10 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info 
*fs_info,
 * insert both the head node and the new ref without dropping
 * the spin lock
 */
-   head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record,
-   bytenr, num_bytes, ref_root, reserved,
-   action, 1, 0, &qrecord_inserted,
-   old_ref_mod, new_ref_mod);
-
-   add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
-  num_bytes, parent, ref_root, owner, offset,
-  action);
+   btrfs_add_delayed_data_ref_locked(fs_info, trans, ref, head_ref, record,
+   bytenr, num_bytes, parent, ref_root, owner, offset,
+   reserved, action, &qrecord_inserted, old_ref_mod,
+   new_ref_mod);
spin_unlock(&delayed_refs->lock);
 
if (qrecord_inserted)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 7f00db50bd24..ad6c5dceea38 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -234,12 +234,22 @@ static inline void btrfs_put_delayed_ref_head(struct 
btrfs_delayed_ref_head *hea
kmem_cache_free(btrfs_delayed_ref_head_cachep, head);
 }
 
+struct btrfs_qgroup_extent_record;
 int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
   struct btrfs_trans_handle *trans,
   u64 bytenr, u64 num_bytes, u64 parent,
   u64 ref_root, int level, int action,
   struct btrfs_delayed_extent_op *extent_op,
   int *old_ref_mod, int *new_ref_mod);
+void btrfs_add_delayed_data_ref_locked(struct btrfs_fs_info *fs_info,
+   struct btrfs_trans_handle *trans,
+   struct btrfs_delayed_data_ref *dref,
+   struct btrfs_delayed_ref_head *head_ref,
+   struct btrfs_qgroup_extent_record *qrecord,
+   u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+   u64 owner, u64 offset, u64 reserved, int action,
+   int *qrecord_inserted_ret, int *old_ref_mod,
+   int *new_ref_mod);
 int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
   struct btrfs_trans_handle *trans,
   u64 bytenr, u64 num_bytes,
-- 
2.17.0





[PATCH v14.6 04/14] btrfs: dedupe: Introduce function to initialize dedupe info

2018-05-15 Thread Lu Fengqi
From: Wang Xiaoguang 

Add generic function to initialize dedupe info.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/Makefile  |   2 +-
 fs/btrfs/dedupe.c  | 173 +
 fs/btrfs/dedupe.h  |  13 ++-
 include/uapi/linux/btrfs.h |   4 +-
 4 files changed, 188 insertions(+), 4 deletions(-)
 create mode 100644 fs/btrfs/dedupe.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ca693dd554e9..78fdc87dba39 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o 
root-tree.o dir-item.o \
   export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
-  uuid-tree.o props.o free-space-tree.o tree-checker.o
+  uuid-tree.o props.o free-space-tree.o tree-checker.o dedupe.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
new file mode 100644
index ..39db05b14398
--- /dev/null
+++ b/fs/btrfs/dedupe.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2016 Fujitsu.  All rights reserved.
+ */
+
+#include "ctree.h"
+#include "dedupe.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "delayed-ref.h"
+
+struct inmem_hash {
+   struct rb_node hash_node;
+   struct rb_node bytenr_node;
+   struct list_head lru_list;
+
+   u64 bytenr;
+   u32 num_bytes;
+
+   u8 hash[];
+};
+
+static int init_dedupe_info(struct btrfs_dedupe_info **ret_info,
+   struct btrfs_ioctl_dedupe_args *dargs)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+
+   dedupe_info = kzalloc(sizeof(*dedupe_info), GFP_NOFS);
+   if (!dedupe_info)
+   return -ENOMEM;
+
+   dedupe_info->hash_algo = dargs->hash_algo;
+   dedupe_info->backend = dargs->backend;
+   dedupe_info->blocksize = dargs->blocksize;
+   dedupe_info->limit_nr = dargs->limit_nr;
+
+   /* only support SHA256 yet */
+   dedupe_info->dedupe_driver = crypto_alloc_shash("sha256", 0, 0);
+   if (IS_ERR(dedupe_info->dedupe_driver)) {
+   int ret;
+
+   ret = PTR_ERR(dedupe_info->dedupe_driver);
+   kfree(dedupe_info);
+   return ret;
+   }
+
+   dedupe_info->hash_root = RB_ROOT;
+   dedupe_info->bytenr_root = RB_ROOT;
+   dedupe_info->current_nr = 0;
+   INIT_LIST_HEAD(&dedupe_info->lru_list);
+   mutex_init(&dedupe_info->lock);
+
+   *ret_info = dedupe_info;
+   return 0;
+}
+
+/*
+ * Helper to check if parameters are valid.
+ * The first invalid field will be set to (-1), to info user which parameter
+ * is invalid.
+ * Except dargs->limit_nr or dargs->limit_mem, in that case, 0 will returned
+ * to info user, since user can specify any value to limit, except 0.
+ */
+static int check_dedupe_parameter(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dedupe_args *dargs)
+{
+   u64 blocksize = dargs->blocksize;
+   u64 limit_nr = dargs->limit_nr;
+   u64 limit_mem = dargs->limit_mem;
+   u16 hash_algo = dargs->hash_algo;
+   u8 backend = dargs->backend;
+
+   /*
+* Set all reserved fields to -1, allow user to detect
+* unsupported optional parameters.
+*/
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+   if (blocksize > BTRFS_DEDUPE_BLOCKSIZE_MAX ||
+   blocksize < BTRFS_DEDUPE_BLOCKSIZE_MIN ||
+   blocksize < fs_info->sectorsize ||
+   !is_power_of_2(blocksize) ||
+   blocksize < PAGE_SIZE) {
+   dargs->blocksize = (u64)-1;
+   return -EINVAL;
+   }
+   if (hash_algo >= ARRAY_SIZE(btrfs_hash_sizes)) {
+   dargs->hash_algo = (u16)-1;
+   return -EINVAL;
+   }
+   if (backend >= BTRFS_DEDUPE_BACKEND_COUNT) {
+   dargs->backend = (u8)-1;
+   return -EINVAL;
+   }
+
+   /* Backend specific check */
+   if (backend == BTRFS_DEDUPE_BACKEND_INMEMORY) {
+   /* only one limit is accepted for enable*/
+   if (dargs->limit_nr && dargs->limit_mem) {
+   dargs->limit_nr = 0;
+   dargs->limit_mem = 0;
+   return -EINVAL;
+   }
+
+   if (!limit_nr && !limit_mem)
+   dargs->limit_nr = BTRFS_DEDUPE_LIMIT_NR_DEFAULT;
+   else {
+   u64 tmp = (u64)-1;
+
+   if (limit_mem) {
+   

[PATCH v10.2 0/5] In-band de-duplication for btrfs-progs

2018-05-15 Thread Lu Fengqi
Patchset can be fetched from github:
https://github.com/littleroad/btrfs-progs.git dedupe_latest

Inband dedupe(in-memory backend only) ioctl support for btrfs-progs.

v7 changes:
   Update ctree.h to follow kernel structure change
   Update print-tree to follow kernel structure change
V8 changes:
   Move dedup props and on-disk backend support out of the patchset
   Change command group name to "dedupe-inband", to avoid confusion with
   possible out-of-band dedupe. Suggested by Mark.
   Rebase to latest devel branch.
V9 changes:
   Follow the kernel's ioctl change to support the FORCE flag, the new reconf
   ioctl, and more precise error reporting.
v10 changes:
   Rebase to v4.10.
   Add BUILD_ASSERT for btrfs_ioctl_dedupe_args
v10.1 changes:
   Rebase to v4.14.
v10.2 changes:
   Rebase to v4.16.1.

Qu Wenruo (5):
  btrfs-progs: Basic framework for dedupe-inband command group
  btrfs-progs: dedupe: Add enable command for dedupe command group
  btrfs-progs: dedupe: Add disable support for inband dedupelication
  btrfs-progs: dedupe: Add status subcommand
  btrfs-progs: dedupe: introduce reconfigure subcommand

 Documentation/Makefile.in  |   1 +
 Documentation/btrfs-dedupe-inband.asciidoc | 167 
 Documentation/btrfs.asciidoc   |   4 +
 Makefile   |   3 +-
 btrfs-completion   |   6 +-
 btrfs.c|   2 +
 cmds-dedupe-ib.c   | 442 +
 commands.h |   2 +
 dedupe-ib.h|  28 ++
 ioctl.h|  38 ++
 10 files changed, 691 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/btrfs-dedupe-inband.asciidoc
 create mode 100644 cmds-dedupe-ib.c
 create mode 100644 dedupe-ib.h

-- 
2.17.0





[PATCH v14.6 10/14] btrfs: ordered-extent: Add support for dedupe

2018-05-15 Thread Lu Fengqi
From: Wang Xiaoguang 

Add ordered-extent support for dedupe.

Note, current ordered-extent support only supports non-compressed source
extent.
Support for compressed source extent will be added later.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
---
 fs/btrfs/ordered-data.c | 46 +
 fs/btrfs/ordered-data.h | 13 
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 94a62fcd9125..5d63e4011722 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -13,6 +13,7 @@
 #include "extent_io.h"
 #include "disk-io.h"
 #include "compression.h"
+#include "dedupe.h"
 
 static struct kmem_cache *btrfs_ordered_extent_cache;
 
@@ -171,7 +172,8 @@ static inline struct rb_node *tree_search(struct 
btrfs_ordered_inode_tree *tree,
  */
 static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
  u64 start, u64 len, u64 disk_len,
- int type, int dio, int compress_type)
+ int type, int dio, int compress_type,
+ struct btrfs_dedupe_hash *hash)
 {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -192,6 +194,33 @@ static int __btrfs_add_ordered_extent(struct inode *inode, 
u64 file_offset,
entry->inode = igrab(inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
+   entry->hash = NULL;
+   /*
+* A hash hit means we have already incremented the extents delayed
+* ref.
+* We must handle this even if another process is trying to
+* turn off dedupe, otherwise we will leak a reference.
+*/
+   if (hash && (hash->bytenr || root->fs_info->dedupe_enabled)) {
+   struct btrfs_dedupe_info *dedupe_info;
+
+   dedupe_info = root->fs_info->dedupe_info;
+   if (WARN_ON(dedupe_info == NULL)) {
+   kmem_cache_free(btrfs_ordered_extent_cache,
+   entry);
+   return -EINVAL;
+   }
+   entry->hash = btrfs_dedupe_alloc_hash(dedupe_info->hash_algo);
+   if (!entry->hash) {
+   kmem_cache_free(btrfs_ordered_extent_cache, entry);
+   return -ENOMEM;
+   }
+   entry->hash->bytenr = hash->bytenr;
+   entry->hash->num_bytes = hash->num_bytes;
+   memcpy(entry->hash->hash, hash->hash,
+  btrfs_hash_sizes[dedupe_info->hash_algo]);
+   }
+
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, >flags);
 
@@ -246,15 +275,23 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 
file_offset,
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 0,
- BTRFS_COMPRESS_NONE);
+ BTRFS_COMPRESS_NONE, NULL);
 }
 
+int btrfs_add_ordered_extent_dedupe(struct inode *inode, u64 file_offset,
+  u64 start, u64 len, u64 disk_len, int type,
+  struct btrfs_dedupe_hash *hash)
+{
+   return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0,
+ BTRFS_COMPRESS_NONE, hash);
+}
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
 u64 start, u64 len, u64 disk_len, int type)
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 1,
- BTRFS_COMPRESS_NONE);
+ BTRFS_COMPRESS_NONE, NULL);
 }
 
 int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
@@ -263,7 +300,7 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, 
u64 file_offset,
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 0,
- compress_type);
+ compress_type, NULL);
 }
 
 /*
@@ -574,6 +611,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent 
*entry)
list_del(>list);
kfree(sum);
}
+   kfree(entry->hash);
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
 }
diff --git 

[PATCH v14.6 03/14] btrfs: dedupe: Introduce dedupe framework and its header

2018-05-15 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce the header for the btrfs in-band (write time) de-duplication
framework and the definitions it needs.

The new de-duplication framework is going to support 2 different dedupe
methods and 1 dedupe hash.
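
In the write path the interfaces below are meant to be used roughly as
follows (a sketch of the intended flow, not actual code from this patch):

	ret = btrfs_dedupe_calc_hash(fs_info, inode, start, hash);
	if (ret < 0)
		goto out;
	ret = btrfs_dedupe_search(fs_info, inode, start, hash);
	if (ret < 0)
		goto out;
	if (btrfs_dedupe_hash_hit(hash)) {
		/* reuse hash->bytenr/num_bytes, the extent ref is already increased */
	} else {
		/* write the data as usual, then btrfs_dedupe_add() the new hash */
	}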

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   |   7 ++
 fs/btrfs/dedupe.h  | 136 -
 fs/btrfs/disk-io.c |   1 +
 include/uapi/linux/btrfs.h |  34 ++
 4 files changed, 176 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f9285d91af1c..f894dfad61e1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1132,6 +1132,13 @@ struct btrfs_fs_info {
spinlock_t ref_verify_lock;
struct rb_root block_tree;
 #endif
+
+   /*
+* Inband de-duplication related structures
+*/
+   unsigned long dedupe_enabled:1;
+   struct btrfs_dedupe_info *dedupe_info;
+   struct mutex dedupe_ioctl_lock;
 };
 
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 90281a7a35a8..681cf4717396 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -6,7 +6,139 @@
 #ifndef BTRFS_DEDUPE_H
 #define BTRFS_DEDUPE_H
 
-/* later in-band dedupe will expand this struct */
-struct btrfs_dedupe_hash;
+#include 
+#include 
+#include 
 
+static const int btrfs_hash_sizes[] = { 32 };
+
+/*
+ * For caller outside of dedupe.c
+ *
+ * Different dedupe backends should have their own hash structure
+ */
+struct btrfs_dedupe_hash {
+   u64 bytenr;
+   u32 num_bytes;
+
+   /* last field is a variable length array of dedupe hash */
+   u8 hash[];
+};
+
+struct btrfs_dedupe_info {
+   /* dedupe blocksize */
+   u64 blocksize;
+   u16 backend;
+   u16 hash_algo;
+
+   struct crypto_shash *dedupe_driver;
+
+   /*
+* Use mutex to protect both backends
+* Even for in-memory backends, the rb-tree can be quite large,
+* so mutex is better for such use case.
+*/
+   struct mutex lock;
+
+   /* following members are only used in in-memory backend */
+   struct rb_root hash_root;
+   struct rb_root bytenr_root;
+   struct list_head lru_list;
+   u64 limit_nr;
+   u64 current_nr;
+};
+
+struct btrfs_trans_handle;
+
+static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash)
+{
+   return (hash && hash->bytenr);
+}
+
+int btrfs_dedupe_hash_size(u16 algo);
+struct btrfs_dedupe_hash *btrfs_dedupe_alloc_hash(u16 algo);
+
+/*
+ * Initial inband dedupe info
+ * Called at dedupe enable time.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (from unsupported param to tree creation error for some backends)
+ */
+int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
+   struct btrfs_ioctl_dedupe_args *dargs);
+
+/*
+ * Disable dedupe and invalidate all its dedupe data.
+ * Called at dedupe disable time.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info);
+
+/*
+ * Get current dedupe status.
+ * Return 0 for success
+ * No possible error yet
+ */
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs);
+
+/*
+ * Calculate hash for dedupe.
+ * Caller must ensure [start, start + dedupe_bs) has valid data.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (error from hash codes)
+ */
+int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info,
+  struct inode *inode, u64 start,
+  struct btrfs_dedupe_hash *hash);
+
+/*
+ * Search for duplicated extents by calculated hash
+ * Caller must call btrfs_dedupe_calc_hash() first to get the hash.
+ *
+ * @inode: the inode we are writing to
+ * @file_pos: offset inside the inode
+ * As we will increase extent ref immediately after a hash match,
+ * we need @file_pos and @inode in this case.
+ *
+ * Return > 0 for a hash match, and the extent ref will be
+ * *INCREASED*, and hash->bytenr/num_bytes will record the existing
+ * extent data.
+ * Return 0 for a hash miss. Nothing is done
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_search(struct btrfs_fs_info *fs_info,
+   struct inode *inode, u64 file_pos,
+   struct btrfs_dedupe_hash *hash);
+
+/*
+ * Add a dedupe hash into dedupe info
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_add(struct btrfs_trans_handle *trans,
+struct btrfs_fs_info *fs_info,
+struct btrfs_dedupe_hash *hash);
+
+/*
+ * Remove a dedupe hash 

[PATCH v10.2 2/5] btrfs-progs: dedupe: Add enable command for dedupe command group

2018-05-15 Thread Lu Fengqi
From: Qu Wenruo 

Add the enable subcommand for the dedupe command group.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc | 114 +-
 btrfs-completion   |   6 +-
 cmds-dedupe-ib.c   | 241 +
 ioctl.h|   2 +
 4 files changed, 361 insertions(+), 2 deletions(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
index 9ee2bc75db3a..82f970a69953 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -22,7 +22,119 @@ use with caution.
 
 SUBCOMMAND
 --
-Nothing yet
+*enable* [options] ::
+Enable in-band de-duplication for a filesystem.
++
+`Options`
++
+-f|--force
+Force the 'enable' command to be executed.
+Will skip the memory limit check and allow 'enable' to be executed even if
+in-band de-duplication is already enabled.
++
+NOTE: If dedupe is re-enabled with the '-f' option, any unspecified parameter
+will be reset to its default value.
+
+-s|--storage-backend 
+Specify the de-duplication hash storage backend.
+Only the 'inmemory' backend is supported so far.
+If not specified, the default is 'inmemory'.
++
+Refer to the *BACKENDS* section for more information.
+
+-b|--blocksize 
+Specify the dedupe block size.
+Supported values are powers of 2 from '16K' to '8M'.
+The default value is '128K'.
++
+Refer to the *DEDUPE BLOCK SIZE* section for more information.
+
+-a|--hash-algorithm 
+Specify hash algorithm.
+Only 'sha256' is supported so far.
+
+-l|--limit-hash 
+Specify maximum number of hashes stored in memory.
+Only works for 'inmemory' backend.
+Conflicts with '-m' option.
++
+Only positive values are valid.
+Default value is '32K'.
+
+-m|--limit-memory 
+Specify maximum memory used for hashes.
+Only works for 'inmemory' backend.
+Conflicts with '-l' option.
++
+Only values larger than or equal to '1024' are valid.
+No default value.
++
+NOTE: Memory limit will be rounded down to kernel internal hash size,
+so the memory limit shown in 'btrfs dedupe status' may be different
+from the .
+
+WARNING: Too large a value for '-l' or '-m' will easily trigger OOM.
+Please use with caution according to the system memory.
+
+NOTE: In-band de-duplication is not compatible with compression yet.
+Compression has higher priority than in-band de-duplication, meaning that if
+compression and de-duplication are enabled at the same time, only compression
+will work.
+
+BACKENDS
+
+Btrfs in-band de-duplication will support different storage backends, with
+different use cases and features.
+
+In-memory backend::
+This backend provides backward-compatibility, and more fine-tuning options.
+But the hash pool is non-persistent and may exhaust kernel memory if not set
+up properly.
++
+This backend can be used on old btrfs (without the '-O dedupe' mkfs option).
+When used on old btrfs, this backend needs to be enabled manually after mount.
++
+Designed for fast hash search speed, the in-memory backend will keep all dedupe
+hashes in memory. (Although overall performance is still much the same as the
+'ondisk' backend if all 'ondisk' hashes can be cached in memory.)
++
+It only keeps a limited number of hashes in memory to avoid exhausting memory.
+Hashes over the limit will be dropped following least-recently-used behavior.
+So this backend has a consistent overhead for a given limit but can\'t ensure
+that all duplicated blocks will be de-duplicated.
++
+After umount and mount, the in-memory backend needs to refill its hash pool.
+
+On-disk backend::
+This backend provides a persistent hash pool, with smarter memory management
+for the hash pool.
+But it\'s not backward-compatible, meaning it must be used with the '-O dedupe'
+mkfs option and older kernels can\'t mount it read-write.
++
+Designed for de-duplication rate, the hash pool is stored as a btrfs B+ tree
+on disk.
+This behavior may cause extra disk IO for hash search under high memory
+pressure.
++
+After umount and mount, the on-disk backend still has its hashes on disk, so
+there is no need to refill its dedupe hash pool.
+
+Currently, only the 'inmemory' backend is supported in btrfs-progs.
+
+DEDUPE BLOCK SIZE
+
+In-band de-duplication is done at the dedupe block size granularity.
+Any data smaller than the dedupe block size won\'t go through in-band
+de-duplication.
+
+The dedupe block size heavily affects the dedupe rate and fragmentation.
+
+A smaller block size will cause more fragments, but a higher dedupe rate.
+
+A larger block size will cause fewer fragments, but a lower dedupe rate.
+
+The in-band de-duplication rate is highly related to the workload pattern.
+So it\'s highly recommended to align the dedupe block size to the workload
+block size to make full use of de-duplication.
 
 EXIT STATUS
 ---
diff --git a/btrfs-completion b/btrfs-completion
index ae683f4ecf61..69e02ad11990 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -29,7 

[PATCH v10.2 1/5] btrfs-progs: Basic framework for dedupe-inband command group

2018-05-15 Thread Lu Fengqi
From: Qu Wenruo 

Add a basic ioctl header and command group framework for later use,
along with a basic man page.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/Makefile.in  |  1 +
 Documentation/btrfs-dedupe-inband.asciidoc | 40 ++
 Documentation/btrfs.asciidoc   |  4 +++
 Makefile   |  3 +-
 btrfs.c|  2 ++
 cmds-dedupe-ib.c   | 35 +++
 commands.h |  2 ++
 dedupe-ib.h| 28 +++
 ioctl.h| 36 +++
 9 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/btrfs-dedupe-inband.asciidoc
 create mode 100644 cmds-dedupe-ib.c
 create mode 100644 dedupe-ib.h

diff --git a/Documentation/Makefile.in b/Documentation/Makefile.in
index 184647c41940..402155fae001 100644
--- a/Documentation/Makefile.in
+++ b/Documentation/Makefile.in
@@ -28,6 +28,7 @@ MAN8_TXT += btrfs-qgroup.asciidoc
 MAN8_TXT += btrfs-replace.asciidoc
 MAN8_TXT += btrfs-restore.asciidoc
 MAN8_TXT += btrfs-property.asciidoc
+MAN8_TXT += btrfs-dedupe-inband.asciidoc
 
 # Category 5 manual page
 MAN5_TXT += btrfs-man5.asciidoc
diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
new file mode 100644
index ..9ee2bc75db3a
--- /dev/null
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -0,0 +1,40 @@
+btrfs-dedupe(8)
+===============
+
+NAME
+----
+btrfs-dedupe-inband - manage in-band (write time) de-duplication of a btrfs
+filesystem
+
+SYNOPSIS
+--------
+*btrfs dedupe-inband*  
+
+DESCRIPTION
+-----------
+*btrfs dedupe-inband* is used to enable/disable or show the current in-band
+de-duplication status of a btrfs filesystem.
+
+Kernel support for in-band de-duplication starts from 4.8.
+
+WARNING: In-band de-duplication is still an experimental feature of btrfs,
+use with caution.
+
+SUBCOMMAND
+----------
+Nothing yet
+
+EXIT STATUS
+-----------
+*btrfs dedupe-inband* returns a zero exit status if it succeeds. Non-zero is
+returned in case of failure.
+
+AVAILABILITY
+------------
+*btrfs* is part of btrfs-progs.
+Please refer to the btrfs wiki http://btrfs.wiki.kernel.org for
+further details.
+
+SEE ALSO
+--------
+`mkfs.btrfs`(8),
diff --git a/Documentation/btrfs.asciidoc b/Documentation/btrfs.asciidoc
index 7316ac094413..d37ae3571bd3 100644
--- a/Documentation/btrfs.asciidoc
+++ b/Documentation/btrfs.asciidoc
@@ -50,6 +50,10 @@ COMMANDS
Do off-line check on a btrfs filesystem. +
See `btrfs-check`(8) for details.
 
+*dedupe*::
+   Control btrfs in-band (write time) de-duplication. +
+   See `btrfs-dedupe`(8) for details.
+
 *device*::
Manage devices managed by btrfs, including add/delete/scan and so
on. +
diff --git a/Makefile b/Makefile
index cbd855336b2f..5dcd5c985293 100644
--- a/Makefile
+++ b/Makefile
@@ -123,7 +123,8 @@ cmds_objects = cmds-subvolume.o cmds-filesystem.o 
cmds-device.o cmds-scrub.o \
   cmds-restore.o cmds-rescue.o chunk-recover.o super-recover.o \
   cmds-property.o cmds-fi-usage.o cmds-inspect-dump-tree.o \
   cmds-inspect-dump-super.o cmds-inspect-tree-stats.o cmds-fi-du.o 
\
-  mkfs/common.o check/mode-common.o check/mode-lowmem.o
+  mkfs/common.o check/mode-common.o check/mode-lowmem.o \
+  cmds-dedupe-ib.o
 libbtrfs_objects = send-stream.o send-utils.o kernel-lib/rbtree.o btrfs-list.o 
\
   kernel-lib/crc32c.o messages.o \
   uuid-tree.o utils-lib.o rbtree-utils.o
diff --git a/btrfs.c b/btrfs.c
index 2d39f2ced3e8..2168f5a8bc7f 100644
--- a/btrfs.c
+++ b/btrfs.c
@@ -255,6 +255,8 @@ static const struct cmd_group btrfs_cmd_group = {
{ "quota", cmd_quota, NULL, _cmd_group, 0 },
{ "qgroup", cmd_qgroup, NULL, _cmd_group, 0 },
{ "replace", cmd_replace, NULL, _cmd_group, 0 },
+   { "dedupe-inband", cmd_dedupe_ib, NULL, _ib_cmd_group,
+   0 },
{ "help", cmd_help, cmd_help_usage, NULL, 0 },
{ "version", cmd_version, cmd_version_usage, NULL, 0 },
NULL_CMD_STRUCT
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
new file mode 100644
index ..73c923a797da
--- /dev/null
+++ b/cmds-dedupe-ib.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017 Fujitsu.  All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+
+#include "ctree.h"
+#include "ioctl.h"
+
+#include "commands.h"
+#include "utils.h"
+#include "kerncompat.h"
+#include "dedupe-ib.h"
+
+static const char * const dedupe_ib_cmd_group_usage[] = {
+   "btrfs dedupe-inband  [options] ",
+   NULL
+};
+
+static 

[PATCH v14.6 11/14] btrfs: dedupe: Inband in-memory only de-duplication implement

2018-05-15 Thread Lu Fengqi
From: Qu Wenruo 

Core implementation of in-band de-duplication.
It reuses the async_cow_start() facility to calculate the dedupe hash.
And uses the dedupe hash to do in-band de-duplication at the extent level.

The workflow is as below:
1) Run delalloc range for an inode
2) Calculate the hash for the delalloc range at the unit of dedupe_bs
3) For the hash match (duplicated) case, just increase the source extent ref
   and insert the file extent.
   For the hash mismatch case, go through the normal cow_file_range()
   fallback, and add the hash into the dedupe tree.
   Compression for the hash miss case is not supported yet.

The current implementation stores all dedupe hashes in an in-memory rb-tree,
with LRU behavior to control the limit.
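
Below is a rough, illustrative sketch of that per-block decision, using only
the interfaces declared in dedupe.h earlier in this series.  It is pseudocode
for the flow above, not the actual inode.c changes of this patch:

static int dedupe_one_block_sketch(struct btrfs_fs_info *fs_info,
				   struct inode *inode, u64 start,
				   struct btrfs_dedupe_hash *hash)
{
	int ret;

	/* 2) calculate the hash for one dedupe_bs sized block */
	ret = btrfs_dedupe_calc_hash(fs_info, inode, start, hash);
	if (ret < 0)
		return ret;

	/* 3) search for an existing extent with the same hash;
	 *    a hit also increases the ref on the existing extent
	 */
	ret = btrfs_dedupe_search(fs_info, inode, start, hash);
	if (ret < 0)
		return ret;

	if (btrfs_dedupe_hash_hit(hash)) {
		/* hash hit: reuse hash->bytenr/hash->num_bytes,
		 * no new data extent needs to be allocated
		 */
		return 0;
	}

	/* hash miss: the caller falls back to cow_file_range(), and the
	 * new extent's hash is added via btrfs_dedupe_add() later
	 */
	return 1;
}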

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   |   4 +-
 fs/btrfs/dedupe.h  |  18 +++
 fs/btrfs/extent-tree.c |  31 -
 fs/btrfs/extent_io.c   |   5 +-
 fs/btrfs/extent_io.h   |   1 +
 fs/btrfs/file.c|   3 +
 fs/btrfs/inode.c   | 305 ++---
 fs/btrfs/ioctl.c   |   1 +
 fs/btrfs/relocation.c  |  17 +++
 9 files changed, 329 insertions(+), 56 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f894dfad61e1..3af75b2c1251 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -107,9 +107,11 @@ static inline u32 count_max_extents(u64 size, u64 
max_extent_size)
 enum btrfs_metadata_reserve_type {
BTRFS_RESERVE_NORMAL,
BTRFS_RESERVE_COMPRESS,
+   BTRFS_RESERVE_DEDUPE,
 };
 
-u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type);
+u64 btrfs_max_extent_size(struct btrfs_inode *inode,
+ enum btrfs_metadata_reserve_type reserve_type);
 int inode_need_compress(struct inode *inode, u64 start, u64 end);
 
 struct btrfs_mapping_tree {
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index f19f6a8ff2ba..ebcbb89d79a0 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include "btrfs_inode.h"
 
 static const int btrfs_hash_sizes[] = { 32 };
 
@@ -50,6 +51,23 @@ struct btrfs_dedupe_info {
 
 struct btrfs_trans_handle;
 
+static inline u64 btrfs_dedupe_blocksize(struct btrfs_inode *inode)
+{
+   struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+   return fs_info->dedupe_info->blocksize;
+}
+
+static inline int inode_need_dedupe(struct inode *inode)
+{
+   struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+
+   if (!fs_info->dedupe_enabled)
+   return 0;
+
+   return 1;
+}
+
 static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash)
 {
return (hash && hash->bytenr);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c9cc925bb475..a234f651589d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -28,6 +28,7 @@
 #include "sysfs.h"
 #include "qgroup.h"
 #include "ref-verify.h"
+#include "dedupe.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
@@ -2626,6 +2627,17 @@ static int cleanup_ref_head(struct btrfs_trans_handle 
*trans,
btrfs_pin_extent(fs_info, head->bytenr,
 head->num_bytes, 1);
if (head->is_data) {
+   /*
+* If insert_reserved is given, it means
+* a new extent is reserved, then deleted
+* in one transaction, and inc/dec get merged to 0.
+*
+* In this case, we need to remove its dedupe
+* hash.
+*/
+   ret = btrfs_dedupe_del(trans, fs_info, head->bytenr);
+   if (ret < 0)
+   return ret;
ret = btrfs_del_csums(trans, fs_info, head->bytenr,
  head->num_bytes);
}
@@ -6077,15 +6089,17 @@ static void btrfs_calculate_inode_block_rsv_size(struct 
btrfs_fs_info *fs_info,
	spin_unlock(&block_rsv->lock);
 }
 
-u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type)
+u64 btrfs_max_extent_size(struct btrfs_inode *inode,
+ enum btrfs_metadata_reserve_type reserve_type)
 {
if (reserve_type == BTRFS_RESERVE_NORMAL)
return BTRFS_MAX_EXTENT_SIZE;
else if (reserve_type == BTRFS_RESERVE_COMPRESS)
return SZ_128K;
-
-   ASSERT(0);
-   return BTRFS_MAX_EXTENT_SIZE;
+   else if (reserve_type == BTRFS_RESERVE_DEDUPE)
+   return btrfs_dedupe_blocksize(inode);
+   else
+   return BTRFS_MAX_EXTENT_SIZE;
 }
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
@@ -6096,7 +6110,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode 
*inode, u64 num_bytes,
enum 

[PATCH v14.6 06/14] btrfs: dedupe: Introduce function to remove hash from in-memory tree

2018-05-15 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce the static function inmem_del() to remove a hash from the in-memory
dedupe tree.
And implement the btrfs_dedupe_del() and btrfs_dedupe_disable() interfaces.

Also for btrfs_dedupe_disable(), add new functions to wait for existing
writers and block incoming writers, to eliminate all possible races.

Cc: Mark Fasheh 
Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
---
 fs/btrfs/dedupe.c | 132 +++---
 1 file changed, 126 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index a4871f16df13..c279189df859 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -174,12 +174,6 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
return ret;
 }
 
-int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
-{
-   /* Place holder for bisect, will be implemented in later patches */
-   return 0;
-}
-
 static int inmem_insert_hash(struct rb_root *root,
 struct inmem_hash *hash, int hash_len)
 {
@@ -322,3 +316,129 @@ int btrfs_dedupe_add(struct btrfs_trans_handle *trans,
return inmem_add(dedupe_info, hash);
return -EINVAL;
 }
+
+static struct inmem_hash *
+inmem_search_bytenr(struct btrfs_dedupe_info *dedupe_info, u64 bytenr)
+{
+   struct rb_node **p = &dedupe_info->bytenr_root.rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, bytenr_node);
+
+   if (bytenr < entry->bytenr)
+   p = &(*p)->rb_left;
+   else if (bytenr > entry->bytenr)
+   p = &(*p)->rb_right;
+   else
+   return entry;
+   }
+
+   return NULL;
+}
+
+/* Delete a hash from in-memory dedupe tree */
+static int inmem_del(struct btrfs_dedupe_info *dedupe_info, u64 bytenr)
+{
+   struct inmem_hash *hash;
+
+   mutex_lock(&dedupe_info->lock);
+   hash = inmem_search_bytenr(dedupe_info, bytenr);
+   if (!hash) {
+   mutex_unlock(&dedupe_info->lock);
+   return 0;
+   }
+
+   __inmem_del(dedupe_info, hash);
+   mutex_unlock(&dedupe_info->lock);
+   return 0;
+}
+
+/* Remove a dedupe hash from dedupe tree */
+int btrfs_dedupe_del(struct btrfs_trans_handle *trans,
+struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+
+   if (!fs_info->dedupe_enabled)
+   return 0;
+
+   if (WARN_ON(dedupe_info == NULL))
+   return -EINVAL;
+
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   return inmem_del(dedupe_info, bytenr);
+   return -EINVAL;
+}
+
+static void inmem_destroy(struct btrfs_dedupe_info *dedupe_info)
+{
+   struct inmem_hash *entry, *tmp;
+
+   mutex_lock(&dedupe_info->lock);
+   list_for_each_entry_safe(entry, tmp, &dedupe_info->lru_list, lru_list)
+   __inmem_del(dedupe_info, entry);
+   mutex_unlock(&dedupe_info->lock);
+}
+
+/*
+ * Helper function to wait and block all incoming writers
+ *
+ * Use rw_sem introduced for freeze to wait/block writers.
+ * So during the block time, no new write will happen, so we can
+ * do something quite safe, especially helpful for dedupe disable,
+ * as it affects buffered writes.
+ */
+static void block_all_writers(struct btrfs_fs_info *fs_info)
+{
+   struct super_block *sb = fs_info->sb;
+
+   percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
+   down_write(&sb->s_umount);
+}
+
+static void unblock_all_writers(struct btrfs_fs_info *fs_info)
+{
+   struct super_block *sb = fs_info->sb;
+
+   up_write(&sb->s_umount);
+   percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
+}
+
+int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+   int ret;
+
+   dedupe_info = fs_info->dedupe_info;
+
+   if (!dedupe_info)
+   return 0;
+
+   /* Don't allow disable status change in RO mount */
+   if (fs_info->sb->s_flags & MS_RDONLY)
+   return -EROFS;
+
+   /*
+* Wait for all unfinished writers and block further writers.
+* Then sync the whole fs so all current write will go through
+* dedupe, and all later write won't go through dedupe.
+*/
+   block_all_writers(fs_info);
+   ret = sync_filesystem(fs_info->sb);
+   fs_info->dedupe_enabled = 0;
+   fs_info->dedupe_info = NULL;
+   unblock_all_writers(fs_info);
+   if (ret < 0)
+   return ret;
+
+   /* now we are OK to clean up everything */
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   inmem_destroy(dedupe_info);
+
+   

[PATCH v14.6 05/14] btrfs: dedupe: Introduce function to add hash into in-memory tree

2018-05-15 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce the static function inmem_add() to add a hash into the in-memory
tree. Now we can implement the btrfs_dedupe_add() interface.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
---
 fs/btrfs/dedupe.c | 151 ++
 1 file changed, 151 insertions(+)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 39db05b14398..a4871f16df13 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -20,6 +20,14 @@ struct inmem_hash {
u8 hash[];
 };
 
+static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
+{
+   if (WARN_ON(algo >= ARRAY_SIZE(btrfs_hash_sizes)))
+   return NULL;
+   return kzalloc(sizeof(struct inmem_hash) + btrfs_hash_sizes[algo],
+   GFP_NOFS);
+}
+
 static int init_dedupe_info(struct btrfs_dedupe_info **ret_info,
struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -171,3 +179,146 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
/* Place holder for bisect, will be implemented in later patches */
return 0;
 }
+
+static int inmem_insert_hash(struct rb_root *root,
+struct inmem_hash *hash, int hash_len)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, hash_node);
+   if (memcmp(hash->hash, entry->hash, hash_len) < 0)
+   p = &(*p)->rb_left;
+   else if (memcmp(hash->hash, entry->hash, hash_len) > 0)
+   p = &(*p)->rb_right;
+   else
+   return 1;
+   }
+   rb_link_node(&hash->hash_node, parent, p);
+   rb_insert_color(&hash->hash_node, root);
+   return 0;
+}
+
+static int inmem_insert_bytenr(struct rb_root *root,
+  struct inmem_hash *hash)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, bytenr_node);
+   if (hash->bytenr < entry->bytenr)
+   p = &(*p)->rb_left;
+   else if (hash->bytenr > entry->bytenr)
+   p = &(*p)->rb_right;
+   else
+   return 1;
+   }
+   rb_link_node(&hash->bytenr_node, parent, p);
+   rb_insert_color(&hash->bytenr_node, root);
+   return 0;
+}
+
+static void __inmem_del(struct btrfs_dedupe_info *dedupe_info,
+   struct inmem_hash *hash)
+{
+   list_del(&hash->lru_list);
+   rb_erase(&hash->hash_node, &dedupe_info->hash_root);
+   rb_erase(&hash->bytenr_node, &dedupe_info->bytenr_root);
+
+   if (!WARN_ON(dedupe_info->current_nr == 0))
+   dedupe_info->current_nr--;
+
+   kfree(hash);
+}
+
+/*
+ * Insert a hash into in-memory dedupe tree
+ * Will remove the least recently used hash if the limit is exceeded.
+ *
+ * If the hash matched an existing one, we won't insert it, to
+ * save memory.
+ */
+static int inmem_add(struct btrfs_dedupe_info *dedupe_info,
+struct btrfs_dedupe_hash *hash)
+{
+   int ret = 0;
+   u16 algo = dedupe_info->hash_algo;
+   struct inmem_hash *ihash;
+
+   ihash = inmem_alloc_hash(algo);
+
+   if (!ihash)
+   return -ENOMEM;
+
+   /* Copy the data out */
+   ihash->bytenr = hash->bytenr;
+   ihash->num_bytes = hash->num_bytes;
+   memcpy(ihash->hash, hash->hash, btrfs_hash_sizes[algo]);
+
+   mutex_lock(&dedupe_info->lock);
+
+   ret = inmem_insert_bytenr(&dedupe_info->bytenr_root, ihash);
+   if (ret > 0) {
+   kfree(ihash);
+   ret = 0;
+   goto out;
+   }
+
+   ret = inmem_insert_hash(&dedupe_info->hash_root, ihash,
+   btrfs_hash_sizes[algo]);
+   if (ret > 0) {
+   /*
+* We only keep one hash in the tree to save memory, so if
+* the hash conflicts, free the one we were about to insert.
+*/
+   rb_erase(&ihash->bytenr_node, &dedupe_info->bytenr_root);
+   kfree(ihash);
+   ret = 0;
+   goto out;
+   }
+
+   list_add(&ihash->lru_list, &dedupe_info->lru_list);
+   dedupe_info->current_nr++;
+
+   /* Remove the last dedupe hash if we exceed limit */
+   while (dedupe_info->current_nr > dedupe_info->limit_nr) {
+   struct inmem_hash *last;
+
+   last = list_entry(dedupe_info->lru_list.prev,
+ struct inmem_hash, lru_list);
+   __inmem_del(dedupe_info, last);
+   }
+out:
+   mutex_unlock(&dedupe_info->lock);
+   return 0;
+}
+
+int btrfs_dedupe_add(struct 

[PATCH v14.6 13/14] btrfs: relocation: Enhance error handling to avoid BUG_ON

2018-05-15 Thread Lu Fengqi
From: Qu Wenruo 

Since the introduction of the btrfs dedupe tree, it's possible for balance to
race with dedupe disabling.

When this happens, dedupe_enabled will make btrfs_get_fs_root() return
ERR_PTR(-ENOENT).
But due to a bug in the error handling branch, when this happens
backref_cache->nr_nodes is increased but the node is neither added to the
backref_cache nor is nr_nodes decreased, causing a BUG_ON() in
backref_cache_cleanup().

[ 2611.668810] [ cut here ]
[ 2611.669946] kernel BUG at
/home/sat/ktest/linux/fs/btrfs/relocation.c:243!
[ 2611.670572] invalid opcode:  [#1] SMP
[ 2611.686797] Call Trace:
[ 2611.687034]  []
btrfs_relocate_block_group+0x1b3/0x290 [btrfs]
[ 2611.687706]  []
btrfs_relocate_chunk.isra.40+0x47/0xd0 [btrfs]
[ 2611.688385]  [] btrfs_balance+0xb22/0x11e0 [btrfs]
[ 2611.688966]  [] btrfs_ioctl_balance+0x391/0x3a0
[btrfs]
[ 2611.689587]  [] btrfs_ioctl+0x1650/0x2290 [btrfs]
[ 2611.690145]  [] ? lru_cache_add+0x3a/0x80
[ 2611.690647]  [] ?
lru_cache_add_active_or_unevictable+0x4c/0xc0
[ 2611.691310]  [] ? handle_mm_fault+0xcd4/0x17f0
[ 2611.691842]  [] ? cp_new_stat+0x153/0x180
[ 2611.692342]  [] ? __vma_link_rb+0xfd/0x110
[ 2611.692842]  [] ? vma_link+0xb9/0xc0
[ 2611.693303]  [] do_vfs_ioctl+0xa1/0x5a0
[ 2611.693781]  [] ? __do_page_fault+0x1b4/0x400
[ 2611.694310]  [] SyS_ioctl+0x41/0x70
[ 2611.694758]  [] entry_SYSCALL_64_fastpath+0x12/0x71
[ 2611.695331] Code: ff 48 8b 45 bf 49 83 af a8 05 00 00 01 49 89 87 a0
05 00 00 e9 2e fd ff ff b8 f4 ff ff ff e9 e4 fb ff ff 0f 0b 0f 0b 0f 0b
0f 0b <0f> 0b 0f 0b 41 89 c6 e9 b8 fb ff ff e8 9e a6 e8 e0 4c 89 e7 44
[ 2611.697870] RIP  []
relocate_block_group+0x741/0x7a0 [btrfs]
[ 2611.698818]  RSP 

This patch will call remove_backref_node() in the error handling branch,
catch the returned -ENOENT in relocate_tree_blocks() and continue
balancing.

Reported-by: Satoru Takeuchi 
Signed-off-by: Qu Wenruo 
---
 fs/btrfs/relocation.c | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 1b1dbfe53bbe..fb9f486e088a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -885,6 +885,13 @@ struct backref_node *build_backref_tree(struct 
reloc_control *rc,
root = read_fs_root(rc->extent_root->fs_info, key.offset);
if (IS_ERR(root)) {
err = PTR_ERR(root);
+   /*
+* Don't forget to clean up the current node, as it may
+* not have been added to backref_cache but nr_nodes was
+* increased.
+* This will cause BUG_ON() in backref_cache_cleanup().
+*/
+   remove_backref_node(>backref_cache, cur);
goto out;
}
 
@@ -3058,14 +3065,21 @@ int relocate_tree_blocks(struct btrfs_trans_handle 
*trans,
}
 
rb_node = rb_first(blocks);
-   while (rb_node) {
+   for (rb_node = rb_first(blocks); rb_node; rb_node = rb_next(rb_node)) {
block = rb_entry(rb_node, struct tree_block, rb_node);
 
node = build_backref_tree(rc, >key,
  block->level, block->bytenr);
if (IS_ERR(node)) {
+   /*
+* The root (currently only the dedupe tree) of the tree block is
+* going to be freed and can't be reached.
+* Just skip it and continue balancing.
+*/
+   if (PTR_ERR(node) == -ENOENT)
+   continue;
err = PTR_ERR(node);
-   goto out;
+   break;
}
 
ret = relocate_tree_block(trans, rc, node, >key,
@@ -3073,11 +3087,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle 
*trans,
if (ret < 0) {
if (ret != -EAGAIN || rb_node == rb_first(blocks))
err = ret;
-   goto out;
+   break;
}
-   rb_node = rb_next(rb_node);
}
-out:
err = finish_pending_nodes(trans, rc, path, err);
 
 out_free_path:
-- 
2.17.0





[PATCH v14.6 08/14] btrfs: dedupe: Introduce function to search for an existing hash

2018-05-15 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce static function inmem_search() to handle the job for in-memory
hash tree.

The trick is, we must ensure the delayed ref head is not being run at
the time we search for the hash.

With inmem_search(), we can implement the btrfs_dedupe_search()
interface.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 189 ++
 1 file changed, 189 insertions(+)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index c279189df859..033c78ceef6a 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -8,6 +8,7 @@
 #include "btrfs_inode.h"
 #include "transaction.h"
 #include "delayed-ref.h"
+#include "qgroup.h"
 
 struct inmem_hash {
struct rb_node hash_node;
@@ -442,3 +443,191 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
kfree(dedupe_info);
return 0;
 }
+
+/*
+ * Caller must ensure the corresponding ref head is not being run.
+ */
+static struct inmem_hash *
+inmem_search_hash(struct btrfs_dedupe_info *dedupe_info, u8 *hash)
+{
+   struct rb_node **p = &dedupe_info->hash_root.rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+   u16 hash_algo = dedupe_info->hash_algo;
+   int hash_len = btrfs_hash_sizes[hash_algo];
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, hash_node);
+
+   if (memcmp(hash, entry->hash, hash_len) < 0) {
+   p = &(*p)->rb_left;
+   } else if (memcmp(hash, entry->hash, hash_len) > 0) {
+   p = &(*p)->rb_right;
+   } else {
+   /* Found, need to re-add it to LRU list head */
+   list_del(&entry->lru_list);
+   list_add(&entry->lru_list, &dedupe_info->lru_list);
+   return entry;
+   }
+   }
+   return NULL;
+}
+
+static int inmem_search(struct btrfs_dedupe_info *dedupe_info,
+   struct inode *inode, u64 file_pos,
+   struct btrfs_dedupe_hash *hash)
+{
+   int ret;
+   struct btrfs_root *root = BTRFS_I(inode)->root;
+   struct btrfs_trans_handle *trans;
+   struct btrfs_delayed_ref_root *delayed_refs;
+   struct btrfs_delayed_ref_head *head;
+   struct btrfs_delayed_ref_head *insert_head;
+   struct btrfs_delayed_data_ref *insert_dref;
+   struct btrfs_qgroup_extent_record *insert_qrecord = NULL;
+   struct inmem_hash *found_hash;
+   int free_insert = 1;
+   int qrecord_inserted = 0;
+   u64 bytenr;
+   u32 num_bytes;
+
+   insert_head = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+   if (!insert_head)
+   return -ENOMEM;
+   insert_head->extent_op = NULL;
+   insert_dref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
+   if (!insert_dref) {
+   kmem_cache_free(btrfs_delayed_ref_head_cachep, insert_head);
+   return -ENOMEM;
+   }
+   if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) &&
+   is_fstree(root->root_key.objectid)) {
+   insert_qrecord = kmalloc(sizeof(*insert_qrecord), GFP_NOFS);
+   if (!insert_qrecord) {
+   kmem_cache_free(btrfs_delayed_ref_head_cachep,
+   insert_head);
+   kmem_cache_free(btrfs_delayed_data_ref_cachep,
+   insert_dref);
+   return -ENOMEM;
+   }
+   }
+
+   trans = btrfs_join_transaction(root);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto free_mem;
+   }
+
+again:
+   mutex_lock(&dedupe_info->lock);
+   found_hash = inmem_search_hash(dedupe_info, hash->hash);
+   /* If we don't find a duplicated extent, just return. */
+   if (!found_hash) {
+   ret = 0;
+   goto out;
+   }
+   bytenr = found_hash->bytenr;
+   num_bytes = found_hash->num_bytes;
+
+   delayed_refs = &trans->transaction->delayed_refs;
+
+   spin_lock(&delayed_refs->lock);
+   head = btrfs_find_delayed_ref_head(&trans->transaction->delayed_refs,
+  bytenr);
+   if (!head) {
+   /*
+* We can safely insert a new delayed_ref as long as we
+* hold delayed_refs->lock.
+* Only need to use atomic inc_extent_ref()
+*/
+   btrfs_add_delayed_data_ref_locked(root->fs_info, trans,
+   insert_dref, insert_head, insert_qrecord,
+   bytenr, num_bytes, 0, root->root_key.objectid,
+   

[PATCH v14.6 01/14] btrfs: introduce type based delalloc metadata reserve

2018-05-15 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce a type-based metadata reserve parameter for the delalloc space
reservation/freeing functions.

The problem we are going to solve is that btrfs uses different max extent
sizes for different mount options.

For compression, the max extent size is 128K, while for non-compressed writes
it's 128M.
Furthermore, the extent split/merge hooks highly depend on that max extent
size.

This situation contributes to quite a lot of false ENOSPC errors.

So this patch introduces the facility to help solve these false ENOSPC
problems related to different max extent sizes.

Currently, only the normal 128M extent size is supported. More types will
follow soon.
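
As a quick, standalone illustration of why the max extent size matters for
the reservation math (userspace example only; the reserve type names are the
ones introduced by this series, and the helper mirrors count_max_extents()):

#include <stdio.h>
#include <stdint.h>

#define SZ_128K (128ULL * 1024)
#define SZ_128M (128ULL * 1024 * 1024)

/* same formula as count_max_extents(), with the max size as a parameter */
static uint64_t count_max_extents(uint64_t size, uint64_t max_extent_size)
{
	return (size + max_extent_size - 1) / max_extent_size;
}

int main(void)
{
	uint64_t write_size = 128ULL * 1024 * 1024;	/* a 128M buffered write */

	/* BTRFS_RESERVE_NORMAL: a single 128M extent covers the write */
	printf("normal:   %llu extents\n",
	       (unsigned long long)count_max_extents(write_size, SZ_128M));
	/* BTRFS_RESERVE_COMPRESS (added later): extents are capped at 128K */
	printf("compress: %llu extents\n",
	       (unsigned long long)count_max_extents(write_size, SZ_128K));
	return 0;
}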

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h |  43 ++---
 fs/btrfs/extent-tree.c   |  48 ---
 fs/btrfs/file.c  |  30 +
 fs/btrfs/free-space-cache.c  |   6 +-
 fs/btrfs/inode-map.c |   9 ++-
 fs/btrfs/inode.c | 115 +--
 fs/btrfs/ioctl.c |  23 +++
 fs/btrfs/ordered-data.c  |   6 +-
 fs/btrfs/ordered-data.h  |   3 +-
 fs/btrfs/relocation.c|  22 ---
 fs/btrfs/tests/inode-tests.c |  15 +++--
 11 files changed, 223 insertions(+), 97 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2771cc56a622..8f1c0db037b7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -92,11 +92,24 @@ static const int btrfs_csum_sizes[] = { 4 };
 /*
  * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
  */
-static inline u32 count_max_extents(u64 size)
+static inline u32 count_max_extents(u64 size, u64 max_extent_size)
 {
-   return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+   return div_u64(size + max_extent_size - 1, max_extent_size);
 }
 
+/*
+ * Type based metadata reserve type
+ * This affects how btrfs reserve metadata space for buffered write.
+ *
+ * This is caused by the different max extent size for normal COW
+ * and compression, and further in-band dedupe
+ */
+enum btrfs_metadata_reserve_type {
+   BTRFS_RESERVE_NORMAL,
+};
+
+u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type);
+
 struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
 };
@@ -2759,8 +2772,9 @@ int btrfs_check_data_free_space(struct inode *inode,
 void btrfs_free_reserved_data_space(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct inode *inode,
- struct extent_changeset *reserved,
- u64 start, u64 len, bool qgroup_free);
+   struct extent_changeset *reserved,
+   u64 start, u64 len, bool qgroup_free,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@ -2774,13 +2788,17 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root 
*root,
 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
  struct btrfs_block_rsv *rsv);
 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
-   bool qgroup_free);
+   bool qgroup_free,
+   enum btrfs_metadata_reserve_type reserve_type);
 
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
-bool qgroup_free);
+   bool qgroup_free,
+   enum btrfs_metadata_reserve_type reserve_type);
 int btrfs_delalloc_reserve_space(struct inode *inode,
-   struct extent_changeset **reserved, u64 start, u64 len);
+   struct extent_changeset **reserved, u64 start, u64 len,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
  unsigned short type);
@@ -3207,7 +3225,11 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info 
*fs_info, int delay_iput,
   int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
  unsigned int 

[PATCH v14.6 00/14] Btrfs In-band De-duplication

2018-05-15 Thread Lu Fengqi
This patchset can be fetched from github:
https://github.com/littleroad/linux.git dedupe_latest

This is just a normal rebase update.
Now the new base is v4.17-rc5

Normal test cases from the auto group expose no regression, and the ib-dedupe
group can pass without problem.

Changelog:
v2:
  Totally reworked to handle multiple backends
v3:
  Fix a stupid but deadly on-disk backend bug
  Add handle for multiple hash on same bytenr corner case to fix abort
  trans error
  Increase dedup rate by enhancing delayed ref handler for both backend.
  Move dedup_add() to run_delayed_ref() time, to fix abort trans error.
  Increase dedup block size up limit to 8M.
v4:
  Add dedup prop for disabling dedup for given files/dirs.
  Merge inmem_search() and ondisk_search() into generic_search() to save
  some code
  Fix another delayed_ref related bug.
  Use the same mutex for both inmem and ondisk backend.
  Move dedup_add() back to btrfs_finish_ordered_io() to increase dedup
  rate.
v5:
  Reuse compress routine for much simpler dedup function.
  Slightly improved performance due to above modification.
  Fix race between dedup enable/disable
  Fix for false ENOSPC report
v6:
  Further enable/disable race window fix.
  Minor format change according to checkpatch.
v7:
  Fix one concurrency bug with balance.
  Slightly modify return value from -EINVAL to -EOPNOTSUPP for
  btrfs_dedup_ioctl() to allow progs to distinguish unsupported commands
  and wrong parameter.
  Rebased to integration-4.6.
v8:
  Rename 'dedup' to 'dedupe'.
  Add support to allow dedupe and compression work at the same time.
  Fix several balance related bugs. Special thanks to Satoru Takeuchi,
  who exposed most of them.
  Small dedupe hit case performance improvement.
v9:
  Re-order the patchset to completely separate pure in-memory and any
  on-disk format change.
  Fold bug fixes into its original patch.
v10:
  Adding back missing bug fix patch.
  Reduce on-disk item size.
  Hide dedupe ioctl under CONFIG_BTRFS_DEBUG.
v11:
  Remove other backend and props support to focus on the framework and
  in-memory backend. Suggested by David.
  Better disable and buffered write race protection.
  Comprehensive fix to dedupe metadata ENOSPC problem.
v12:
  Stateful 'enable' ioctl and new 'reconf' ioctl
  New FORCE flag for enable ioctl to allow stateless ioctl
  Precise error report and extendable ioctl structure.
v12.1
  Rebase to David's for-next-20160704 branch
  Add co-ordinate patch for subpage and dedupe patchset.
v12.2
  Rebase to David's for-next-20160715 branch
  Add co-ordinate patch for other patchset.
v13
  Rebase to David's for-next-20160906 branch
  Fix a reserved space leak bug, which only frees quota reserved space
  but not space_info->byte_may_use.
v13.1
  Rebase to Chris' for-linux-4.9 branch
v14
  Use generic ENOSPC fix for both compression and dedupe.
v14.1
  Further split ENOSPC fix.
v14.2
  Rebase to v4.11-rc2.
  Co-operate with count_max_extent() to calculate num_extents.
  No longer rely on qgroup fixes.
v14.3
  Rebase to v4.12-rc1.
v14.4
  Rebase to kdave/for-4.13-part1.
v14.5
  Rebase to v4.15-rc3.
v14.6
  Rebase to v4.17-rc5.

Qu Wenruo (4):
  btrfs: delayed-ref: Add support for increasing data ref under spinlock
  btrfs: dedupe: Inband in-memory only de-duplication implement
  btrfs: relocation: Enhance error handling to avoid BUG_ON
  btrfs: dedupe: Introduce new reconfigure ioctl

Wang Xiaoguang (10):
  btrfs: introduce type based delalloc metadata reserve
  btrfs: Introduce COMPRESS reserve type to fix false enospc for
compression
  btrfs: dedupe: Introduce dedupe framework and its header
  btrfs: dedupe: Introduce function to initialize dedupe info
  btrfs: dedupe: Introduce function to add hash into in-memory tree
  btrfs: dedupe: Introduce function to remove hash from in-memory tree
  btrfs: dedupe: Introduce function to search for an existing hash
  btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface
  btrfs: ordered-extent: Add support for dedupe
  btrfs: dedupe: Add ioctl for inband deduplication

 fs/btrfs/Makefile|   2 +-
 fs/btrfs/ctree.h |  54 ++-
 fs/btrfs/dedupe.c| 813 +++
 fs/btrfs/dedupe.h| 183 +++-
 fs/btrfs/delayed-ref.c   |  37 +-
 fs/btrfs/delayed-ref.h   |  10 +
 fs/btrfs/disk-io.c   |   4 +
 fs/btrfs/extent-tree.c   |  69 ++-
 fs/btrfs/extent_io.c |   8 +-
 fs/btrfs/extent_io.h |   2 +
 fs/btrfs/file.c  |  36 +-
 fs/btrfs/free-space-cache.c  |   6 +-
 fs/btrfs/inode-map.c |   9 +-
 fs/btrfs/inode.c | 479 +
 fs/btrfs/ioctl.c | 106 -
 fs/btrfs/ordered-data.c  |  52 ++-
 fs/btrfs/ordered-data.h  |  16 +-
 fs/btrfs/relocation.c|  64 ++-
 fs/btrfs/sysfs.c |   2 +
 fs/btrfs/tests/inode-tests.c |  15 +-
 include/uapi/linux/btrfs.h   |  55 +++
 21 files changed, 1854 insertions(+), 168 deletions(-)
 

[PATCH v14.6 02/14] btrfs: Introduce COMPRESS reserve type to fix false enospc for compression

2018-05-15 Thread Lu Fengqi
From: Wang Xiaoguang 

When testing btrfs compression, we sometimes got an ENOSPC error even though
the fs still had much free space; xfstests generic/171, generic/172,
generic/173, generic/174 and generic/175 can reveal this bug in my test
environment when compression is enabled.

After some debugging work, we found that it's
btrfs_delalloc_reserve_metadata() which sometimes tries to reserve too
much metadata space, even for a very small data range.

In btrfs_delalloc_reserve_metadata(), the number of metadata bytes to
reserve is calculated from the difference between outstanding extents and
reserved extents.
But due to the badly designed drop_outstanding_extent() function, that
difference can become too big and cause problems.

The problem happens in the following flow with compression enabled.

1) Buffered write 128M data with 128K blocksize
   outstanding_extents = 1
   reserved_extents = 1024 (128M / 128K, one blocksize will get one
reserved_extent)

   Note: it's btrfs_merge_extent_hook() to merge outstanding extents.
 But reserved extents are still 1024.

2) Allocate extents for dirty range
   cow_file_range_async() split above large extent into small 128K
   extents.
   Let's assume 2 compressed extents have been split.

   So we have:
   outstanding_extents = 3
   reserved_extents = 1024

   range [0, 256K) has extents allocated

3) One ordered extent get finished
   btrfs_finish_ordered_io()
   |- btrfs_delalloc_release_metadata()
  |- drop_outstanding_extent()

   drop_outstanding_extent() will free *ALL* redundant reserved extents.
   So we have:
   outstanding_extents = 2 (One has finished)
   reserved_extents = 2

4) Continue allocating extents for dirty range
   cow_file_range_async() continue handling the remaining range.

   When the whole 128M range is done and assume no more ordered extents
   have finished.
   outstanding_extents = 1023 (One has finished in Step 3)
   reserved_extents = 2 (*ALL* freed in Step 3)

5) Another buffered write happens to the file
   btrfs_delalloc_reserve_metadata() will calculate metadata space.

   The calculation is:
   meta_to_reserve = (outstanding_extents - reserved_extents) * \
 nodesize * max_tree_level(8) * 2

   If nodesize is 16K, it's 1021 * 16K * 8 * 2, near 256M.
   If nodesize is 64K, it's about 1G.

   That's totally insane.

The fix is to introduce a new reserve type, COMPRESS, to inform the
outstanding extents calculation algorithm, so that it gets correct
outstanding_extents based on the extent size.

So in Step 1), outstanding_extents = 1024 reserved_extents = 1024
Step 2): outstanding_extents = 1024 reserved_extents = 1024
Step 3): outstanding_extents = 1023 reserved_extents = 1023

And in Step 5) we reserve the correct amount of metadata space.
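
For reference, the same arithmetic as a tiny standalone program (numbers and
formula are taken from the description above; this is not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t outstanding_extents = 1023;	/* after Step 4 */
	uint64_t reserved_extents = 2;		/* after Step 3 */
	uint64_t max_tree_level = 8;
	uint64_t nodesizes[] = { 16 * 1024, 64 * 1024 };

	for (int i = 0; i < 2; i++) {
		/* meta_to_reserve = (outstanding - reserved) * nodesize * 8 * 2 */
		uint64_t meta = (outstanding_extents - reserved_extents) *
				nodesizes[i] * max_tree_level * 2;
		printf("nodesize %lluK -> reserve ~%llu MiB\n",
		       (unsigned long long)(nodesizes[i] / 1024),
		       (unsigned long long)(meta >> 20));
	}
	return 0;
}

With a 16K nodesize this prints about 255 MiB, with 64K about 1021 MiB,
matching the "near 256M" and "about 1G" figures above.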

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   |  2 ++
 fs/btrfs/extent-tree.c |  2 ++
 fs/btrfs/extent_io.c   |  7 ++--
 fs/btrfs/extent_io.h   |  1 +
 fs/btrfs/file.c|  3 ++
 fs/btrfs/inode.c   | 81 +++---
 fs/btrfs/ioctl.c   |  2 ++
 fs/btrfs/relocation.c  |  3 ++
 8 files changed, 86 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8f1c0db037b7..f9285d91af1c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -106,9 +106,11 @@ static inline u32 count_max_extents(u64 size, u64 
max_extent_size)
  */
 enum btrfs_metadata_reserve_type {
BTRFS_RESERVE_NORMAL,
+   BTRFS_RESERVE_COMPRESS,
 };
 
 u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type);
+int inode_need_compress(struct inode *inode, u64 start, u64 end);
 
 struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 588211ee7ed7..c9cc925bb475 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6081,6 +6081,8 @@ u64 btrfs_max_extent_size(enum 
btrfs_metadata_reserve_type reserve_type)
 {
if (reserve_type == BTRFS_RESERVE_NORMAL)
return BTRFS_MAX_EXTENT_SIZE;
+   else if (reserve_type == BTRFS_RESERVE_COMPRESS)
+   return SZ_128K;
 
ASSERT(0);
return BTRFS_MAX_EXTENT_SIZE;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e99b329002cf..dfe6c8576569 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -599,7 +599,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 
start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
 
if (bits & EXTENT_DELALLOC)
-   bits |= EXTENT_NORESERVE;
+   bits |= EXTENT_NORESERVE | EXTENT_COMPRESS;
 
if (delete)
bits |= ~EXTENT_CTLBITS;
@@ -1492,6 +1492,7 @@ static noinline u64 find_delalloc_range(struct 
extent_io_tree *tree,
u64 cur_start = 

Re: [PATCH 1/2] btrfs: inode: Don't compress if NODATASUM or NODATACOW set

2018-05-15 Thread Qu Wenruo


On 2018-05-15 16:35, Nikolay Borisov wrote:
> 
> 
> On 15.05.2018 11:30, Qu Wenruo wrote:
>>
>>
>> On 2018-05-15 16:21, Nikolay Borisov wrote:
>>>
>>>
>>> On 15.05.2018 10:36, Qu Wenruo wrote:
 As btrfs(5) specified:

Note
If nodatacow or nodatasum are enabled, compression is disabled.

 If NODATASUM or NODATACOW set, we should not compress the extent.

 Normally NODATACOW is detected properly in run_delalloc_range() so
 compression won't happen for NODATACOW.

 However for NODATASUM we don't have any check, and it can cause
 compressed extent without csum pretty easily, just by:
 --
 mkfs.btrfs -f $dev
 mount $dev $mnt -o nodatasum
 touch $mnt/foobar
 mount -o remount,datasum,compress $mnt
 xfs_io -f -c "pwrite 0 128K" $mnt/foobar
 --

 And in fact, we have bug report about corrupted compressed extent
 without proper data checksum so even RAID1 can't recover the corruption.
 (https://bugzilla.kernel.org/show_bug.cgi?id=199707)

 Running compression without proper checksum could cause more damage when
 corruption happens, so there is no need to allow compression for
 NODATACSUM.

 Reported-by: James Harvey 
 Signed-off-by: Qu Wenruo 
 ---
  fs/btrfs/inode.c | 8 
  1 file changed, 8 insertions(+)

 diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
 index d241285a0d2a..dbef3f404559 100644
 --- a/fs/btrfs/inode.c
 +++ b/fs/btrfs/inode.c
 @@ -396,6 +396,14 @@ static inline int inode_need_compress(struct inode 
 *inode, u64 start, u64 end)
  {
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
  
 +  /*
 +   * Btrfs doesn't support compression without csum or CoW.
 +   * This should have the highest priority.
 +   */
 +  if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
 +  BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 +  return 0;
 +
>>>
>>> How is this not buggy, given that if inode_need_compress as called from 
>>> compress_file_range will return zero, meaning we jump to cont: label. 
>>> Then in the case of an inline extent we can execute : 
>>
>> In that case, you won't go into compress_file_range() at all.
>>
>> As the only caller of compress_file_range() is async_cow_start(), which
>> get queued in cow_file_range_async().
>>
>> And cow_file_range_async() traces back to run_delalloc_range().
>> Here we determine (basically) where some dirty range goes.
>>
>> The modification in inode_need_compress() mostly affects the decision in
>> run_delalloc_range(), so we won't go cow_file_range_async(), thus we
>> won't hit the problem you described.
> 
> So you have re-iterated what I've described further below. This means it
> should be possible to remove the invocation of inode_need_compress in
> compress_file_range and simplify the code there, no?

Yep, that's true.

> Perhaps
> will_compress can also be removed etc?  As it stands currently it's
> spaghetti code.

Nice idea to further clean this code up.

I'll update both patches after receiving enough feedback.
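
A very rough idea of what the simplified check could look like (illustrative
only, the name and the final placement are not decided yet):

/* Called only from run_delalloc_range(), so compress_file_range()
 * would no longer need to re-check inode_need_compress().
 */
static inline int inode_can_compress(struct btrfs_inode *inode)
{
	if (inode->flags & BTRFS_INODE_NODATACOW ||
	    inode->flags & BTRFS_INODE_NODATASUM)
		return 0;
	return 1;
}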

Thanks,
Qu

> 
>>>
>>> ret = cow_file_range_inline(inode, start, end,
>>>total_compressed,   
>>>compress_type, pages);   
>>>
>>> where compress_type would have been set at the beginning of the 
>>> function unconditionally to fs_info->compress_type. 
>>>
>>> For non-inline extents I guess we are ok, given that will_compress 
>>> will not be set. However, this code is rather messy and I'm not sure 
>>> it's well defined what's going to happen in this case with inline extents. 
>>>
>>> OTOH, I think there is something fundamentally wrong in calling 
>>> inode_need_compress in compress_file_range. I.e they work at different 
>>> abstractions. IMO compress_file_range should only be called if we know 
>>> we have to compress the range. 
>>>
>>> So looking around the code in run_delalloc_range (the only function 
>>> which calls cow_file_range_async) we already have : 
>>>
>>>  } else if (!inode_need_compress(inode, start, end)) {   
>>  ret = cow_file_range(inode, locked_page, start, end, end,
>>>   
>>>   page_started, nr_written, 1, NULL);   
>>>
>>> and in the else branch we have the cow_file_range_async. So the code 
>>> is sort of half-way there to actually decoupling compression checking from 
>>> performing the actual compression. 
>>>
>>>
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
>>>
>>> One more thing, in inode_need_compress shouldn't the inode specific
>>> checks come first something like :
>>>
>>>
>>> static inline int inode_need_compress(struct inode *inode, u64 start, u64 
>>> end)  
>>> { 

Re: [PATCH] btrfs: property: Set incompat flag of lzo/zstd compression

2018-05-15 Thread Su Yue


On 05/15/2018 04:35 PM, Duncan wrote:
> Su Yue posted on Tue, 15 May 2018 16:05:01 +0800 as excerpted:
> 
> 
>>
>> On 05/15/2018 03:51 PM, Misono Tomohiro wrote:
>>> Incompat flag of lzo/zstd compression should be set at:
>>>  1. mount time (-o compress/compress-force)
>>>  2. when defrag is done 3. when property is set
>>>
>>> Currently 3. is missing and this commit adds this.
>>>
>>>
>> If I don't misunderstand, compression property of an inode is only apply
>> for *the* inode, not the whole filesystem.
>> So the original logical should be okay.
> 
> But the inode is on the filesystem, and if it's compressed with lzo/zstd, 
> the incompat flag should be set to avoid mounting with an earlier kernel 
> that doesn't understand that compression and would therefore, if we're 
> lucky, simply fail to read the data compressed in that file/inode.  (If 
> we're unlucky it could blow up with kernel memory corruption like James 
> Harvey's current case of unexpected, corrupted compressed data in a nocow 
> file that being nocow, doesn't have csum validation to fail and abort the 
> decompression, and shouldn't be compressed at all.)
> 
> So better to set the incompat flag and refuse to mount at all on kernels 
> that don't have the required compression support.
> 

Got it.
As you concluded, it's indeed better to set the incompat flag.

Thanks,
Su






Re: [PATCH] btrfs: property: Set incompat flag of lzo/zstd compression

2018-05-15 Thread Duncan
Su Yue posted on Tue, 15 May 2018 16:05:01 +0800 as excerpted:


> 
> On 05/15/2018 03:51 PM, Misono Tomohiro wrote:
>> Incompat flag of lzo/zstd compression should be set at:
>>  1. mount time (-o compress/compress-force)
>>  2. when defrag is done 3. when property is set
>> 
>> Currently 3. is missing and this commit adds this.
>> 
>> 
> If I don't misunderstand, compression property of an inode is only apply
> for *the* inode, not the whole filesystem.
> So the original logical should be okay.

But the inode is on the filesystem, and if it's compressed with lzo/zstd, 
the incompat flag should be set to avoid mounting with an earlier kernel 
that doesn't understand that compression and would therefore, if we're 
lucky, simply fail to read the data compressed in that file/inode.  (If 
we're unlucky it could blow up with kernel memory corruption like James 
Harvey's current case of unexpected, corrupted compressed data in a nocow 
file that being nocow, doesn't have csum validation to fail and abort the 
decompression, and shouldn't be compressed at all.)

So better to set the incompat flag and refuse to mount at all on kernels 
that don't have the required compression support.
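
(For reference, the kind of change being discussed is roughly the following
sketch on the kernel side; this is a hypothetical helper, and the real patch
presumably hooks the compression property handler:

static void set_compress_incompat_sketch(struct btrfs_fs_info *fs_info,
					 int compress_type)
{
	switch (compress_type) {
	case BTRFS_COMPRESS_LZO:
		btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		break;
	case BTRFS_COMPRESS_ZSTD:
		btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		break;
	default:
		break;
	}
}

i.e. once the lzo/zstd compression property is applied, the matching incompat
bit gets set so older kernels refuse the mount.)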

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman



Re: 4.15.6 crash: BUG at fs/btrfs/ctree.c:1862

2018-05-15 Thread Filipe Manana
On Tue, May 15, 2018 at 12:10 AM, Marc MERLIN  wrote:
> static noinline struct extent_buffer *
> read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
>int slot)
> {
> int level = btrfs_header_level(parent);
> struct extent_buffer *eb;
>
> if (slot < 0 || slot >= btrfs_header_nritems(parent))
> return ERR_PTR(-ENOENT);
>
> BUG_ON(level == 0);
>
>
>
> BTRFS info (device dm-2): relocating block group 13404622290944 flags data
> BTRFS info (device dm-2): found 9959 extents
> BTRFS info (device dm-2): found 9959 extents
> BTRFS info (device dm-2): relocating block group 13403548549120 flags data
> [ cut here ]
> kernel BUG at fs/btrfs/ctree.c:1862!
> invalid opcode:  [#1] PREEMPT SMP PTI
> CPU: 5 PID: 8103 Comm: btrfs Tainted: G U   
> 4.15.6-amd64-preempt-sysrq-20171018 #3
> Hardware name: System manufacturer System Product Name/P8H67-M PRO, BIOS 3904 
> 04/27/2013
> RIP: 0010:read_node_slot+0x3c/0x9e
> RSP: 0018:becfaa0b7b58 EFLAGS: 00210246
> RAX: 00a0 RBX: 000c RCX: 0003
> RDX: 000c RSI: 9a60e9d9de78 RDI: 00052f6e
> RBP: 9a60e9d9de78 R08: 0001 R09: becfaa0b7bf6
> R10: 9a64988bd7e9 R11: 9a64988bd7c8 R12: e003d4bdb800
> R13: 9a64a481 R14:  R15: 
> FS:  7fba34c9c8c0() GS:9a64de34() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 5a8b9c9a CR3: 0001446c6004 CR4: 001606e0
> Call Trace:
>  tree_advance+0xb1/0x11e
>  btrfs_compare_trees+0x1c2/0x4d6
>  ? process_extent+0xdcf/0xdcf
>  btrfs_ioctl_send+0x81e/0xc70
>  ? __kmalloc_track_caller+0xfb/0x10f
>  _btrfs_ioctl_send+0xbc/0xe6
>  ? paravirt_sched_clock+0x5/0x8
>  ? set_task_rq+0x2f/0x80
>  ? task_rq_unlock+0x22/0x36
>  btrfs_ioctl+0x162f/0x1dc8
>  ? select_task_rq_fair+0xb65/0xb7a
>  ? update_load_avg+0x16d/0x442
>  ? list_add+0x15/0x2e
>  ? cfs_rq_throttled.isra.30+0x9/0x18
>  ? vfs_ioctl+0x1b/0x28
>  vfs_ioctl+0x1b/0x28
>  do_vfs_ioctl+0x4f4/0x53f
>  ? __audit_syscall_entry+0xbf/0xe3
>  SyS_ioctl+0x52/0x76
>  do_syscall_64+0x72/0x81
>  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> RIP: 0033:0x7fba34d835e7
> RSP: 002b:7ffc32cf4cb8 EFLAGS: 0202 ORIG_RAX: 0010
> RAX: ffda RBX: 523f RCX: 7fba34d835e7
> RDX: 7ffc32cf4d40 RSI: 40489426 RDI: 0004
> RBP: 0004 R08:  R09: 7fba34c9b700
> R10: 7fba34c9b9d0 R11: 0202 R12: 0003
> R13: 563a30b87020 R14: 0001 R15: 0001
> Code: f5 53 4c 8b a6 98 00 00 00 89 d3 4c 89 e7 e8 67 fd ff ff 85 db 78 63 4c 
> 89 e7 41 88 c6 e8 92 fb ff ff 39 d8 76 54 45 84 f6 75 02 <0f> 0b 89 de 48 89 
> ef e8 2e ff ff ff 89 de 49 89 c4 48 89 ef e8
> RIP: read_node_slot+0x3c/0x9e RSP: becfaa0b7b58
> ---[ end trace a24e7de6b77b5cb1 ]---
> Kernel panic - not syncing: Fatal exception
> Kernel Offset: 0x1900 from 0x8100 (relocation range: 
> 0x8000-0xbfff)

We got a fix for this recently:  https://patchwork.kernel.org/patch/10396523/


>
> --
> "A mouse is a device used to point at the xterm you want to type in" - A.S.R.
> Microsoft is to operating systems 
>    what McDonalds is to gourmet 
> cooking
> Home page: http://marc.merlins.org/   | PGP 
> 7F55D5F27AAF9D08



-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”


Re: [PATCH 1/2] btrfs: inode: Don't compress if NODATASUM or NODATACOW set

2018-05-15 Thread Nikolay Borisov


On 15.05.2018 11:30, Qu Wenruo wrote:
> 
> 
> On 2018年05月15日 16:21, Nikolay Borisov wrote:
>>
>>
>> On 15.05.2018 10:36, Qu Wenruo wrote:
>>> As btrfs(5) specifies:
>>>
>>> Note
>>> If nodatacow or nodatasum are enabled, compression is disabled.
>>>
>>> If NODATASUM or NODATACOW is set, we should not compress the extent.
>>>
>>> Normally NODATACOW is detected properly in run_delalloc_range() so
>>> compression won't happen for NODATACOW.
>>>
>>> However for NODATASUM we don't have any check, and it can create a
>>> compressed extent without csum pretty easily, just by:
>>> --
>>> mkfs.btrfs -f $dev
>>> mount $dev $mnt -o nodatasum
>>> touch $mnt/foobar
>>> mount -o remount,datasum,compress $mnt
>>> xfs_io -f -c "pwrite 0 128K" $mnt/foobar
>>> --
>>>
>>> And in fact, we have a bug report about a corrupted compressed extent
>>> without a proper data checksum, so even RAID1 can't recover the corruption.
>>> (https://bugzilla.kernel.org/show_bug.cgi?id=199707)
>>>
>>> Running compression without a proper checksum could cause more damage when
>>> corruption happens, so there is no need to allow compression for
>>> NODATASUM.
>>>
>>> Reported-by: James Harvey 
>>> Signed-off-by: Qu Wenruo 
>>> ---
>>>  fs/btrfs/inode.c | 8 
>>>  1 file changed, 8 insertions(+)
>>>
>>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>> index d241285a0d2a..dbef3f404559 100644
>>> --- a/fs/btrfs/inode.c
>>> +++ b/fs/btrfs/inode.c
>>> @@ -396,6 +396,14 @@ static inline int inode_need_compress(struct inode 
>>> *inode, u64 start, u64 end)
>>>  {
>>> struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>>>  
>>> +   /*
>>> +* Btrfs doesn't support compression without csum or CoW.
>>> +* This should have the highest priority.
>>> +*/
>>> +   if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
>>> +   BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
>>> +   return 0;
>>> +
>>
>> How is this not buggy, given that inode_need_compress, as called from
>> compress_file_range, will return zero, meaning we jump to the cont: label?
>> Then, in the case of an inline extent, we can execute:
> 
> In that case, you won't go into compress_file_range() at all.
> 
> As the only caller of compress_file_range() is async_cow_start(), which
> gets queued in cow_file_range_async().
> 
> And cow_file_range_async() traces back to run_delalloc_range().
> Here we determine (basically) where some dirty range goes.
> 
> The modification in inode_need_compress() mostly affects the decision in
> run_delalloc_range(), so we won't go into cow_file_range_async(), and thus
> we won't hit the problem you described.

So you have reiterated what I've described further below. This means it
should be possible to remove the invocation of inode_need_compress in
compress_file_range and simplify the code there, no? Perhaps
will_compress can also be removed, etc.? As it stands currently it's
spaghetti code.

>>
>> ret = cow_file_range_inline(inode, start, end,  
>>total_compressed,   
>>compress_type, pages);   
>>
>> where compress_type would have been set at the beginning of the 
>> function unconditionally to fs_info->compress_type. 
>>
>> For non-inline extents I guess we are ok, given that will_compress 
>> will not be set. However, this code is rather messy and I'm not sure 
>> it's well defined what's going to happen in this case with inline extents. 
>>
>> OTOH, I think there is something fundamentally wrong in calling 
>> inode_need_compress in compress_file_range. I.e they work at different 
>> abstractions. IMO compress_file_range should only be called if we know 
>> we have to compress the range. 
>>
>> So looking around the code in run_delalloc_range (the only function 
>> which calls cow_file_range_async) we already have : 
>>
>> 	} else if (!inode_need_compress(inode, start, end)) {
>> 		ret = cow_file_range(inode, locked_page, start, end, end,
>> 				     page_started, nr_written, 1, NULL);
>>
>> and in the else branch we have the cow_file_range_async. So the code 
>> is sort of half-way there to actually decoupling compression checking from 
>> performing the actual compression. 
>>
>>
>>> /* force compress */
>>> if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
>>> return 1;
>>
>> One more thing, in inode_need_compress shouldn't the inode specific
>> checks come first something like :
>>
>>
>> static inline int inode_need_compress(struct inode *inode, u64 start, u64 
>> end)  
>> {
>>
>> struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);   
>>
>>  
>>
>> /* defrag ioctl */   
>>
>>   

Re: [PATCH 2/2] btrfs: lzo: Avoid decompressing obviously corrupted data

2018-05-15 Thread Nikolay Borisov


On 15.05.2018 11:32, Qu Wenruo wrote:
> 
> 
> On 2018年05月15日 16:05, Nikolay Borisov wrote:
>>
>>
>> On 15.05.2018 10:36, Qu Wenruo wrote:
>>> Unlike zlib decompression, lzo decompression doesn't need any
>>> initialization, thus we can't detect early corruption from
>>> initialization.
>>>
>>> However for an lzo compressed extent, its first 4 bytes record the real
>>> unaligned compressed data size.
>>> We could use this as a clue: since any compressed extent should not
>>> exceed 128K, if we find a compressed data length larger than that, we
>>> are sure it's corrupted and there is no need to continue decompression.
>>>
>>> Normally, such a problem won't really bother anyone, as compression relies
>>> on data CoW and data csum, which means such corruption should normally be
>>> detected by the data csum before we go into decompression.
>>> However, due to a bug in the compression condition, it's possible to create
>>> compressed extents without csum.
>>>
>>> So we still need to do an extra check for lzo, just in case the compressed
>>> data is corrupted.
>>>
>>> Signed-off-by: Qu Wenruo 
>>> ---
>>> Please note that, even with the binary dump of the corrupted extent provided
>>> by the original reporter, James Harvey, I can only reproduce the "decompress
>>> failed" error message, but not the serious memory corruption that followed.
>>> So there must be something missing; maybe we need to double check both the
>>> btrfs lzo caller and the kernel lzo lib.
>>>
>>> But anyway, making btrfs lzo compression a little more robust is never a
>>> bad thing.
>>> ---
>>>  fs/btrfs/compression.h | 1 +
>>>  fs/btrfs/lzo.c | 4 
>>>  2 files changed, 5 insertions(+)
>>>
>>> diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
>>> index cc605f7b23fb..317703d9b073 100644
>>> --- a/fs/btrfs/compression.h
>>> +++ b/fs/btrfs/compression.h
>>> @@ -6,6 +6,7 @@
>>>  #ifndef BTRFS_COMPRESSION_H
>>>  #define BTRFS_COMPRESSION_H
>>>  
>>> +#include 
>>
>> Stray include otherwise:
> 
> Surprisingly that's really needed.
> 
> compression.h uses SZ_*, but it doesn't include that header itself.
> The other *.c files happen to get linux/sizes.h included first (via other
> headers), so there is no compiler error there.
> 
> However in this case, lzo.c only includes compression.h and no other
> header that pulls in sizes.h, so it would cause a compiler error.
> 
> The include is there to fix that.

That's a separate change with a separate changelog
> 
> Thanks,
> Qu
> 
>>
>> Reviewed-by: Nikolay Borisov 
>>
>>>  /*
>>>   * We want to make sure that amount of RAM required to uncompress an 
>>> extent is
>>>   * reasonable, so we limit the total size in ram of a compressed extent to
>>> diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
>>> index 0667ea07f766..7ae2c0925770 100644
>>> --- a/fs/btrfs/lzo.c
>>> +++ b/fs/btrfs/lzo.c
>>> @@ -271,6 +271,10 @@ static int lzo_decompress_bio(struct list_head *ws, 
>>> struct compressed_bio *cb)
>>>  
>>> data_in = kmap(pages_in[0]);
>>> tot_len = read_compress_length(data_in);
>>> +   if (tot_len > BTRFS_MAX_COMPRESSED) {
>>> +   ret = -EIO;
>>> +   goto done;
>>> +   }
>>>  
>>> tot_in = LZO_LEN;
>>> in_offset = LZO_LEN;
>>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] btrfs: lzo: Avoid decompressing obviously corrupted data

2018-05-15 Thread Qu Wenruo


On 2018年05月15日 16:05, Nikolay Borisov wrote:
> 
> 
> On 15.05.2018 10:36, Qu Wenruo wrote:
>> Unlike zlib decompression, lzo decompression doesn't need any
>> initialization, thus we can't detect early corruption from
>> initialization.
>>
>> However for an lzo compressed extent, its first 4 bytes record the real
>> unaligned compressed data size.
>> We could use this as a clue: since any compressed extent should not
>> exceed 128K, if we find a compressed data length larger than that, we
>> are sure it's corrupted and there is no need to continue decompression.
>>
>> Normally, such a problem won't really bother anyone, as compression relies
>> on data CoW and data csum, which means such corruption should normally be
>> detected by the data csum before we go into decompression.
>> However, due to a bug in the compression condition, it's possible to create
>> compressed extents without csum.
>>
>> So we still need to do an extra check for lzo, just in case the compressed
>> data is corrupted.
>>
>> Signed-off-by: Qu Wenruo 
>> ---
>> Please note that, even with the binary dump of the corrupted extent provided
>> by the original reporter, James Harvey, I can only reproduce the "decompress
>> failed" error message, but not the serious memory corruption that followed.
>> So there must be something missing; maybe we need to double check both the
>> btrfs lzo caller and the kernel lzo lib.
>>
>> But anyway, making btrfs lzo compression a little more robust is never a
>> bad thing.
>> ---
>>  fs/btrfs/compression.h | 1 +
>>  fs/btrfs/lzo.c | 4 
>>  2 files changed, 5 insertions(+)
>>
>> diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
>> index cc605f7b23fb..317703d9b073 100644
>> --- a/fs/btrfs/compression.h
>> +++ b/fs/btrfs/compression.h
>> @@ -6,6 +6,7 @@
>>  #ifndef BTRFS_COMPRESSION_H
>>  #define BTRFS_COMPRESSION_H
>>  
>> +#include 
> 
> Stray include otherwise:

Surprisingly that's really needed.

compression.h uses SZ_*, but it doesn't include that header itself.
The other *.c files happen to get linux/sizes.h included first (via other
headers), so there is no compiler error there.

However in this case, lzo.c only includes compression.h and no other
header that pulls in sizes.h, so it would cause a compiler error.

The include is there to fix that.
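
To make the dependency concrete, a minimal sketch (the exact macro value is
an assumption based on the current compression.h, which caps a compressed
extent at 128K):

/* compression.h relies on SZ_* from linux/sizes.h, roughly: */
#include <linux/sizes.h>

#define BTRFS_MAX_COMPRESSED	SZ_128K

/*
 * Without the explicit #include <linux/sizes.h>, a .c file that includes
 * only compression.h (as lzo.c does after this patch) fails to build,
 * because SZ_128K is undefined there.
 */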

Thanks,
Qu

> 
> Reviewed-by: Nikolay Borisov 
> 
>>  /*
>>   * We want to make sure that amount of RAM required to uncompress an extent 
>> is
>>   * reasonable, so we limit the total size in ram of a compressed extent to
>> diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
>> index 0667ea07f766..7ae2c0925770 100644
>> --- a/fs/btrfs/lzo.c
>> +++ b/fs/btrfs/lzo.c
>> @@ -271,6 +271,10 @@ static int lzo_decompress_bio(struct list_head *ws, 
>> struct compressed_bio *cb)
>>  
>>  data_in = kmap(pages_in[0]);
>>  tot_len = read_compress_length(data_in);
>> +if (tot_len > BTRFS_MAX_COMPRESSED) {
>> +ret = -EIO;
>> +goto done;
>> +}
>>  
>>  tot_in = LZO_LEN;
>>  in_offset = LZO_LEN;
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] btrfs: inode: Don't compress if NODATASUM or NODATACOW set

2018-05-15 Thread Qu Wenruo


On 2018年05月15日 16:21, Nikolay Borisov wrote:
> 
> 
> On 15.05.2018 10:36, Qu Wenruo wrote:
>> As btrfs(5) specifies:
>>
>>  Note
>>  If nodatacow or nodatasum are enabled, compression is disabled.
>>
>> If NODATASUM or NODATACOW is set, we should not compress the extent.
>>
>> Normally NODATACOW is detected properly in run_delalloc_range() so
>> compression won't happen for NODATACOW.
>>
>> However for NODATASUM we don't have any check, and it can create a
>> compressed extent without csum pretty easily, just by:
>> --
>> mkfs.btrfs -f $dev
>> mount $dev $mnt -o nodatasum
>> touch $mnt/foobar
>> mount -o remount,datasum,compress $mnt
>> xfs_io -f -c "pwrite 0 128K" $mnt/foobar
>> --
>>
>> And in fact, we have a bug report about a corrupted compressed extent
>> without a proper data checksum, so even RAID1 can't recover the corruption.
>> (https://bugzilla.kernel.org/show_bug.cgi?id=199707)
>>
>> Running compression without a proper checksum could cause more damage when
>> corruption happens, so there is no need to allow compression for
>> NODATASUM.
>>
>> Reported-by: James Harvey 
>> Signed-off-by: Qu Wenruo 
>> ---
>>  fs/btrfs/inode.c | 8 
>>  1 file changed, 8 insertions(+)
>>
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index d241285a0d2a..dbef3f404559 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -396,6 +396,14 @@ static inline int inode_need_compress(struct inode 
>> *inode, u64 start, u64 end)
>>  {
>>  struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>>  
>> +/*
>> + * Btrfs doesn't support compression without csum or CoW.
>> + * This should have the highest priority.
>> + */
>> +if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
>> +BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
>> +return 0;
>> +
> 
> How is this not buggy, given that inode_need_compress, as called from
> compress_file_range, will return zero, meaning we jump to the cont: label?
> Then, in the case of an inline extent, we can execute:

In that case, you won't go into compress_file_range() at all.

As the only caller of compress_file_range() is async_cow_start(), which
gets queued in cow_file_range_async().

And cow_file_range_async() traces back to run_delalloc_range().
Here we determine (basically) where some dirty range goes.

The modification in inode_need_compress() mostly affects the decision in
run_delalloc_range(), so we won't go into cow_file_range_async(), and thus
we won't hit the problem you described.
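
For orientation, the call path above condensed into a comment (paraphrased
from fs/btrfs/inode.c, not the literal upstream code):

/*
 * run_delalloc_range()
 *   |-- !inode_need_compress() -> cow_file_range()           (no compression)
 *   `--  inode_need_compress() -> cow_file_range_async()
 *                                   `-- queues async_cow_start()
 *                                         `-- compress_file_range()
 *
 * With the new NODATACOW/NODATASUM check returning 0, such inodes take the
 * cow_file_range() branch and never reach compress_file_range() at all.
 */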
> 
> ret = cow_file_range_inline(inode, start, end,  
>total_compressed,   
>compress_type, pages);   
> 
> where compress_type would have been set at the beginning of the 
> function unconditionally to fs_info->compress_type. 
> 
> For non-inline extents I guess we are ok, given that will_compress 
> will not be set. However, this code is rather messy and I'm not sure 
> it's well defined what's going to happen in this case with inline extents. 
> 
> OTOH, I think there is something fundamentally wrong in calling 
> inode_need_compress in compress_file_range. I.e they work at different 
> abstractions. IMO compress_file_range should only be called if we know 
> we have to compress the range. 
> 
> So looking around the code in run_delalloc_range (the only function 
> which calls cow_file_range_async) we already have : 
> 
> 	} else if (!inode_need_compress(inode, start, end)) {
> 		ret = cow_file_range(inode, locked_page, start, end, end,
> 				     page_started, nr_written, 1, NULL);
> 
> and in the else branch we have the cow_file_range_async. So the code 
> is sort of half-way there to actually decoupling compression checking from 
> performing the actual compression. 
> 
> 
>>  /* force compress */
>>  if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
>>  return 1;
> 
> One more thing, in inode_need_compress shouldn't the inode specific
> checks come first something like :
> 
> 
> static inline int inode_need_compress(struct inode *inode, u64 start, u64 
> end)  
> { 
>   
> struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>   
>   
>   
> /* defrag ioctl */
>   
> if (BTRFS_I(inode)->defrag_compress)  
>   
> return 1; 
>   
> /* bad compression ratios */  
>   
> if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)   
>   
> return 0; 
>   

Not 

Re: [PATCH] btrfs: property: Set incompat flag of lzo/zstd compression

2018-05-15 Thread Su Yue


On 05/15/2018 04:05 PM, Su Yue wrote:
> 
> 
> On 05/15/2018 03:51 PM, Misono Tomohiro wrote:
>> Incompat flag of lzo/zstd compression should be set at:
>>  1. mount time (-o compress/compress-force)
>>  2. when defrag is done
>>  3. when property is set
>>
>> Currently 3. is missing and this commit adds this.
>>
> 
> If I don't misunderstand, the compression property of an inode only

Embarrassed by my bad memory of btrfs_set_fs_incompat().
The patch is fine. Just ignore this thread.

> applies to *the* inode, not the whole filesystem.
> So the original logic should be okay.
> 
> Thanks,
> Su
> 
>> Signed-off-by: Tomohiro Misono 
>> ---
>>  fs/btrfs/props.c | 12 
>>  1 file changed, 8 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
>> index 53a8c95828e3..dc6140013ae8 100644
>> --- a/fs/btrfs/props.c
>> +++ b/fs/btrfs/props.c
>> @@ -380,6 +380,7 @@ static int prop_compression_apply(struct inode *inode,
>>const char *value,
>>size_t len)
>>  {
>> +struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>>  int type;
>>  
>>  if (len == 0) {
>> @@ -390,14 +391,17 @@ static int prop_compression_apply(struct inode *inode,
>>  return 0;
>>  }
>>  
>> -if (!strncmp("lzo", value, 3))
>> +if (!strncmp("lzo", value, 3)) {
>>  type = BTRFS_COMPRESS_LZO;
>> -else if (!strncmp("zlib", value, 4))
>> +btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
>> +} else if (!strncmp("zlib", value, 4)) {
>>  type = BTRFS_COMPRESS_ZLIB;
>> -else if (!strncmp("zstd", value, len))
>> +} else if (!strncmp("zstd", value, len)) {
>>  type = BTRFS_COMPRESS_ZSTD;
>> -else
>> +btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
>> +} else {
>>  return -EINVAL;
>> +}
>>  
>>  BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
>>  BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
>>
> 
> 






Re: [PATCH 1/2] btrfs: inode: Don't compress if NODATASUM or NODATACOW set

2018-05-15 Thread Nikolay Borisov


On 15.05.2018 10:36, Qu Wenruo wrote:
> As btrfs(5) specifies:
> 
>   Note
>   If nodatacow or nodatasum are enabled, compression is disabled.
> 
> If NODATASUM or NODATACOW is set, we should not compress the extent.
> 
> Normally NODATACOW is detected properly in run_delalloc_range() so
> compression won't happen for NODATACOW.
> 
> However for NODATASUM we don't have any check, and it can create a
> compressed extent without csum pretty easily, just by:
> --
> mkfs.btrfs -f $dev
> mount $dev $mnt -o nodatasum
> touch $mnt/foobar
> mount -o remount,datasum,compress $mnt
> xfs_io -f -c "pwrite 0 128K" $mnt/foobar
> --
> 
> And in fact, we have a bug report about a corrupted compressed extent
> without a proper data checksum, so even RAID1 can't recover the corruption.
> (https://bugzilla.kernel.org/show_bug.cgi?id=199707)
> 
> Running compression without a proper checksum could cause more damage when
> corruption happens, so there is no need to allow compression for
> NODATASUM.
> 
> Reported-by: James Harvey 
> Signed-off-by: Qu Wenruo 
> ---
>  fs/btrfs/inode.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index d241285a0d2a..dbef3f404559 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -396,6 +396,14 @@ static inline int inode_need_compress(struct inode 
> *inode, u64 start, u64 end)
>  {
>   struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>  
> + /*
> +  * Btrfs doesn't support compression without csum or CoW.
> +  * This should have the highest priority.
> +  */
> + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
> + BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
> + return 0;
> +

How is this not buggy, given that inode_need_compress, as called from
compress_file_range, will return zero, meaning we jump to the cont: label?
Then, in the case of an inline extent, we can execute:

ret = cow_file_range_inline(inode, start, end,  
   total_compressed,   
   compress_type, pages);   

where compress_type would have been set at the beginning of the 
function unconditionally to fs_info->compress_type. 

For non-inline extents I guess we are ok, given that will_compress 
will not be set. However, this code is rather messy and I'm not sure 
it's well defined what's going to happen in this case with inline extents. 

OTOH, I think there is something fundamentally wrong in calling 
inode_need_compress in compress_file_range. I.e they work at different 
abstractions. IMO compress_file_range should only be called if we know 
we have to compress the range. 

So looking around the code in run_delalloc_range (the only function 
which calls cow_file_range_async) we already have : 

 } else if (!inode_need_compress(inode, start, end)) {   
ret = cow_file_range(inode, locked_page, start, end, end,   
  page_started, nr_written, 1, NULL);   

and in the else branch we have the cow_file_range_async. So the code 
is sort of half-way there to actually decoupling compression checking from 
performing the actual compression. 


>   /* force compress */
>   if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
>   return 1;

One more thing, in inode_need_compress shouldn't the inode specific
checks come first something like : 


static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	/* defrag ioctl */
	if (BTRFS_I(inode)->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
	    BTRFS_I(inode)->prop_compress)
		return btrfs_compress_heuristic(inode, start, end);
	return 0;
}

> 
--
To unsubscribe from this list: 

Re: [PATCH] btrfs: property: Set incompat flag of lzo/zstd compression

2018-05-15 Thread Anand Jain



On 05/15/2018 03:51 PM, Misono Tomohiro wrote:

Incompat flag of lzo/zstd compression should be set at:
  1. mount time (-o compress/compress-force)
  2. when defrag is done
  3. when property is set

Currently 3. is missing and this commit adds this.

Signed-off-by: Tomohiro Misono 


Reviewed-by: Anand Jain 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Clarification needed about libbtrfs & libbtrfsutil

2018-05-15 Thread Dimitri John Ledkov
On 14 May 2018 at 21:22, Omar Sandoval  wrote:
> On Mon, May 14, 2018 at 09:40:19AM +0100, Dimitri John Ledkov wrote:
>> Are both of these meant to be public libraries, installed on the user
>> systems, and available in .so variant as well for 3rd party
>> development and public dynamic linking?
>>
>> Or are these private internal libraries, which are installed as public
>> runtime only, simply to share code between the utils, but otherwise
>> provide no abi stability and will forever remain libfoo.so.0?
>
> They're both meant to be public. In fact, libbtrfsutil is already 1.0.0.
>

Ack. Will ship them as such.

>> Or should these even be a noinst_ libraries (~= Libtool Convenience
>> Libraries), and are simply intermediate by-products?
>>
>> I'm asking because despite compiling shared & static variants of these
>> libraries, and "shared linked" and "static linked" variants of the
>> utils, it appears that all utilities are statically linking against
>> libbtrfs/libbtrfsutil. Thus no binaries or bindings dynamically link
>> against either libbtrfs or libbtrfsutil.
>>
>> Tweaking the makefile to use libs_shared variable instead of libs or
>> libs_static, results in slightly smaller binaries, dynamically linked
>> against libbtrfs/libbtrfsutil.
>>
>> But it is hard to tell if this is a bug/mistake, or an intentional feature.
>
> I'm not sure why we statically link libbtrfs into the tools, and I
> just copied that for libbtrfsutil.

OK. I guess I can prepare a patch to dynamically link
libbtrfs/libbtrfsutil and see how that will go through review.

-- 
Regards,

Dimitri.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] btrfs: sysfs: Add entry which shows rmdir(2) can work for subvolume

2018-05-15 Thread Misono Tomohiro
On 2018/05/15 17:03, Nikolay Borisov wrote:
> 
> 
> On 15.05.2018 10:30, Misono Tomohiro wrote:
>> [based on current misc-next]
>>
>> This adds new sysfs entry
>>   /sys/fs/btrfs/features/rmdir_subvol
>> to indicate that the kernel can delete a subvolume by rmdir(2),
>> which is allowed by: https://www.spinics.net/lists/linux-btrfs/msg76938.html
>>
>> The first patch is a cleanup and the second one is a main part.
> 
> Why do we need this - for enabling testing or something else?

Yes, I want to be able to skip the xfstests test if the feature does not exist.
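
For illustration, a minimal userspace sketch of how a test or tool could
probe the proposed feature file before relying on rmdir(2) of subvolumes
(the sysfs path comes from this patch set; the helper itself is
hypothetical, not part of the patches):

#include <stdbool.h>
#include <unistd.h>

static bool kernel_supports_rmdir_subvol(void)
{
	/* the file only exists on kernels that expose the new feature */
	return access("/sys/fs/btrfs/features/rmdir_subvol", F_OK) == 0;
}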

> 
>>
>> Tomohiro Misono (2):
>>   btrfs: sysfs: Use enum/define value intead of magic number
>>   btrfs: sysfs: Add entry which shows rmdir(2) can work for subvolume
>>
>>  fs/btrfs/ctree.h   |  6 ++
>>  fs/btrfs/sysfs.c   | 39 +--
>>  fs/btrfs/sysfs.h   |  5 -
>>  include/uapi/linux/btrfs.h |  2 ++
>>  4 files changed, 41 insertions(+), 11 deletions(-)
>>
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] btrfs: lzo: Avoid decompressing obviously corrupted data

2018-05-15 Thread Nikolay Borisov


On 15.05.2018 10:36, Qu Wenruo wrote:
> Unlike zlib decompression, lzo decompression doesn't need any
> initialization, thus we can't detect early corruption from
> initialization.
> 
> However for an lzo compressed extent, its first 4 bytes record the real
> unaligned compressed data size.
> We could use this as a clue: since any compressed extent should not
> exceed 128K, if we find a compressed data length larger than that, we
> are sure it's corrupted and there is no need to continue decompression.
> 
> Normally, such a problem won't really bother anyone, as compression relies
> on data CoW and data csum, which means such corruption should normally be
> detected by the data csum before we go into decompression.
> However, due to a bug in the compression condition, it's possible to create
> compressed extents without csum.
> 
> So we still need to do an extra check for lzo, just in case the compressed
> data is corrupted.
> 
> Signed-off-by: Qu Wenruo 
> ---
> Please note that, even with the binary dump of the corrupted extent provided
> by the original reporter, James Harvey, I can only reproduce the "decompress
> failed" error message, but not the serious memory corruption that followed.
> So there must be something missing; maybe we need to double check both the
> btrfs lzo caller and the kernel lzo lib.
> 
> But anyway, making btrfs lzo compression a little more robust is never a
> bad thing.
> ---
>  fs/btrfs/compression.h | 1 +
>  fs/btrfs/lzo.c | 4 
>  2 files changed, 5 insertions(+)
> 
> diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
> index cc605f7b23fb..317703d9b073 100644
> --- a/fs/btrfs/compression.h
> +++ b/fs/btrfs/compression.h
> @@ -6,6 +6,7 @@
>  #ifndef BTRFS_COMPRESSION_H
>  #define BTRFS_COMPRESSION_H
>  
> +#include 

Stray include otherwise:

Reviewed-by: Nikolay Borisov 

>  /*
>   * We want to make sure that amount of RAM required to uncompress an extent 
> is
>   * reasonable, so we limit the total size in ram of a compressed extent to
> diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
> index 0667ea07f766..7ae2c0925770 100644
> --- a/fs/btrfs/lzo.c
> +++ b/fs/btrfs/lzo.c
> @@ -271,6 +271,10 @@ static int lzo_decompress_bio(struct list_head *ws, 
> struct compressed_bio *cb)
>  
>   data_in = kmap(pages_in[0]);
>   tot_len = read_compress_length(data_in);
> + if (tot_len > BTRFS_MAX_COMPRESSED) {
> + ret = -EIO;
> + goto done;
> + }
>  
>   tot_in = LZO_LEN;
>   in_offset = LZO_LEN;
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] btrfs: sysfs: Add entry which shows rmdir(2) can work for subvolume

2018-05-15 Thread Nikolay Borisov


On 15.05.2018 10:30, Misono Tomohiro wrote:
> [based on current misc-next]
> 
> This adds new sysfs entry
>   /sys/fs/btrfs/features/rmdir_subvol
> to indicate that the kernel can delete a subvolume by rmdir(2),
> which is allowed by: https://www.spinics.net/lists/linux-btrfs/msg76938.html
> 
> The first patch is a cleanup and the second one is a main part.

Why do we need this - for enabling testing or something else?

> 
> Tomohiro Misono (2):
>   btrfs: sysfs: Use enum/define value intead of magic number
>   btrfs: sysfs: Add entry which shows rmdir(2) can work for subvolume
> 
>  fs/btrfs/ctree.h   |  6 ++
>  fs/btrfs/sysfs.c   | 39 +--
>  fs/btrfs/sysfs.h   |  5 -
>  include/uapi/linux/btrfs.h |  2 ++
>  4 files changed, 41 insertions(+), 11 deletions(-)
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: property: Set incompat flag of lzo/zstd compression

2018-05-15 Thread Su Yue


On 05/15/2018 03:51 PM, Misono Tomohiro wrote:
> Incompat flag of lzo/zstd compression should be set at:
>  1. mount time (-o compress/compress-force)
>  2. when defrag is done
>  3. when property is set
> 
> Currently 3. is missing and this commit adds this.
> 

If I don't misunderstand, the compression property of an inode only
applies to *the* inode, not the whole filesystem.
So the original logic should be okay.

Thanks,
Su

> Signed-off-by: Tomohiro Misono 
> ---
>  fs/btrfs/props.c | 12 
>  1 file changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
> index 53a8c95828e3..dc6140013ae8 100644
> --- a/fs/btrfs/props.c
> +++ b/fs/btrfs/props.c
> @@ -380,6 +380,7 @@ static int prop_compression_apply(struct inode *inode,
> const char *value,
> size_t len)
>  {
> + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>   int type;
>  
>   if (len == 0) {
> @@ -390,14 +391,17 @@ static int prop_compression_apply(struct inode *inode,
>   return 0;
>   }
>  
> - if (!strncmp("lzo", value, 3))
> + if (!strncmp("lzo", value, 3)) {
>   type = BTRFS_COMPRESS_LZO;
> - else if (!strncmp("zlib", value, 4))
> + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
> + } else if (!strncmp("zlib", value, 4)) {
>   type = BTRFS_COMPRESS_ZLIB;
> - else if (!strncmp("zstd", value, len))
> + } else if (!strncmp("zstd", value, len)) {
>   type = BTRFS_COMPRESS_ZSTD;
> - else
> + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
> + } else {
>   return -EINVAL;
> + }
>  
>   BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
>   BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
> 






RE: [PATCH v4 1/3] btrfs: Add unprivileged ioctl which returns subvolume information

2018-05-15 Thread Gu, Jinxiang
Hi, adding a comment I missed.

> -Original Message-
> From: Misono Tomohiro [mailto:misono.tomoh...@jp.fujitsu.com]
> Sent: Tuesday, May 15, 2018 3:04 PM
> To: Gu, Jinxiang/顾 金香 ; linux-btrfs@vger.kernel.org
> Subject: Re: [PATCH v4 1/3] btrfs: Add unprivileged ioctl which returns 
> subvolume information
> 
> On 2018/05/15 15:31, Gu, Jinxiang/顾 金香 wrote:
> > Hi,
> >
> >> -Original Message-
> >> From: linux-btrfs-ow...@vger.kernel.org
> >> [mailto:linux-btrfs-ow...@vger.kernel.org] On Behalf Of Tomohiro
> >> Misono
> >> Sent: Friday, May 11, 2018 3:26 PM
> >> To: linux-btrfs@vger.kernel.org
> >> Subject: [PATCH v4 1/3] btrfs: Add unprivileged ioctl which returns
> >> subvolume information
> >>
> >> Add new unprivileged ioctl BTRFS_IOC_GET_SUBVOL_INFO which returns the 
> >> information of subvolume containing this inode.
> >> (i.e. returns the information in ROOT_ITEM and ROOT_BACKREF.)
> >>
> >> Signed-off-by: Tomohiro Misono 
> >> ---
> >>  fs/btrfs/ioctl.c   | 129 
> >> +
> >>  include/uapi/linux/btrfs.h |  51 ++
> >>  2 files changed, 180 insertions(+)
> >>
> >> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> >> index 48e2ddff32bd..64b23e22852f 100644
> >> --- a/fs/btrfs/ioctl.c
> >> +++ b/fs/btrfs/ioctl.c
> >> @@ -2242,6 +2242,133 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
> >>return ret;
> >>  }
> >>
> >> +/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
> >> +static noinline int btrfs_ioctl_get_subvol_info(struct file *file,
> >> + void __user *argp)
> >> +{
> >> +  struct btrfs_ioctl_get_subvol_info_args *subvol_info;
> >> +  struct btrfs_root *root;
> >> +  struct btrfs_path *path;
> >> +  struct btrfs_key key;
> >> +
> >> +  struct btrfs_root_item root_item;
> >> +  struct btrfs_root_ref *rref;
> >> +  struct extent_buffer *l;
> >> +  int slot;
> >> +
> >> +  unsigned long item_off;
> >> +  unsigned long item_len;
> >> +
> >> +  struct inode *inode;
> >> +  int ret;
> >> +
> >> +  path = btrfs_alloc_path();
> >> +  if (!path)
> >> +  return -ENOMEM;
> >> +
> >> +  subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
> >> +  if (!subvol_info) {
> >> +  btrfs_free_path(path);
> >> +  return -ENOMEM;
> >> +  }
> >> +  inode = file_inode(file);
> >> +
> >> +  root = BTRFS_I(inode)->root->fs_info->tree_root;
> >> +  key.objectid = BTRFS_I(inode)->root->root_key.objectid;
> >> +  key.type = BTRFS_ROOT_ITEM_KEY;
> >> +  key.offset = 0;
> >> +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
> >> +  if (ret < 0) {
> >> +  goto out;
> >> +  } else if (ret > 0) {
> >> +  u64 objectid = key.objectid;
> >> +
> >> +  if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
> >> +  ret = btrfs_next_leaf(root, path);
> >> +  if (ret < 0)
> >> +  return ret;
> > Should goto out; to free subvol_info and path.
> Thanks, will update both.
> 

Since btrfs_next_leaf may return 1 when the nritems of the next leaf is 0,
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]) may go wrong.
I think a check should be added before btrfs_item_key_to_cpu.
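
Something along these lines, perhaps (only a sketch of the suggested check,
not the final code; it assumes the patch's existing out: label that frees
subvol_info and path):

	ret = btrfs_next_leaf(root, path);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		/* no further items: treat the subvolume as not found */
		ret = -ENOENT;
		goto out;
	}
	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);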

> >> +  }
> >> +
> >> +  /* If the subvolume is a snapshot, offset is not zero */
> >> +	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
> >> +  if (key.objectid != objectid ||
> >> +  key.type != BTRFS_ROOT_ITEM_KEY) {
> >> +  ret = -ENOENT;
> >> +  goto out;
> >> +  }
> >> +  }
> >> +
> >> +  l = path->nodes[0];
> >> +  slot = path->slots[0];
> >> +  item_off = btrfs_item_ptr_offset(l, slot);
> >> +  item_len = btrfs_item_size_nr(l, slot);
> >> +	read_extent_buffer(l, &root_item, item_off, item_len);
> >> +
> >> +	subvol_info->id = key.objectid;
> >> +
> >> +	subvol_info->generation = btrfs_root_generation(&root_item);
> >> +	subvol_info->flags = btrfs_root_flags(&root_item);
> >> +
> >> +	memcpy(subvol_info->uuid, root_item.uuid, BTRFS_UUID_SIZE);
> >> +	memcpy(subvol_info->parent_uuid, root_item.parent_uuid,
> >> +			BTRFS_UUID_SIZE);
> >> +	memcpy(subvol_info->received_uuid, root_item.received_uuid,
> >> +			BTRFS_UUID_SIZE);
> >> +
> >> +	subvol_info->ctransid = btrfs_root_ctransid(&root_item);
> >> +	subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item.ctime);
> >> +	subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item.ctime);
> >> +
> >> +	subvol_info->otransid = btrfs_root_otransid(&root_item);
> >> +	subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item.otime);
> >> +	subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item.otime);
> >> +
> >> +	subvol_info->stransid = btrfs_root_stransid(&root_item);
> >> +  

[PATCH] btrfs: property: Set incompat flag of lzo/zstd compression

2018-05-15 Thread Misono Tomohiro
Incompat flag of lzo/zstd compression should be set at:
 1. mount time (-o compress/compress-force)
 2. when defrag is done
 3. when property is set

Currently 3. is missing and this commit adds this.

Signed-off-by: Tomohiro Misono 
---
 fs/btrfs/props.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 53a8c95828e3..dc6140013ae8 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -380,6 +380,7 @@ static int prop_compression_apply(struct inode *inode,
  const char *value,
  size_t len)
 {
+   struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int type;
 
if (len == 0) {
@@ -390,14 +391,17 @@ static int prop_compression_apply(struct inode *inode,
return 0;
}
 
-   if (!strncmp("lzo", value, 3))
+   if (!strncmp("lzo", value, 3)) {
type = BTRFS_COMPRESS_LZO;
-   else if (!strncmp("zlib", value, 4))
+   btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+   } else if (!strncmp("zlib", value, 4)) {
type = BTRFS_COMPRESS_ZLIB;
-   else if (!strncmp("zstd", value, len))
+   } else if (!strncmp("zstd", value, len)) {
type = BTRFS_COMPRESS_ZSTD;
-   else
+   btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+   } else {
return -EINVAL;
+   }
 
BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
-- 
2.14.3


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v4 2/3] btrfs: Add unprivileged ioctl which returns subvolume's ROOT_REF

2018-05-15 Thread Gu, Jinxiang


> -Original Message-
> From: linux-btrfs-ow...@vger.kernel.org 
> [mailto:linux-btrfs-ow...@vger.kernel.org] On Behalf Of Tomohiro Misono
> Sent: Friday, May 11, 2018 3:26 PM
> To: linux-btrfs@vger.kernel.org
> Subject: [PATCH v4 2/3] btrfs: Add unprivileged ioctl which returns 
> subvolume's ROOT_REF
> 
> Add unprivileged ioctl BTRFS_IOC_GET_SUBVOL_ROOTREF which returns ROOT_REF 
> information of the subvolume containing this inode
> except the subvolume name (this is because to prevent potential name leak). 
> The subvolume name will be gained by user version of
> ino_lookup ioctl (BTRFS_IOC_INO_LOOKUP_USER) which also performs permission 
> check.
> 
> The min id of root ref's subvolume to be searched is specified by @min_id in 
> struct btrfs_ioctl_get_subvol_rootref_args. After the search
> ends, @min_id is set to the last searched root ref's subvolid + 1. Also, if 
> there are more root refs than
> BTRFS_MAX_ROOTREF_BUFFER_NUM, -EOVERFLOW is returned. Therefore the caller 
> can just call this ioctl again without changing the
> argument to continue search.
> 
> Signed-off-by: Tomohiro Misono 
> ---
>  fs/btrfs/ioctl.c   | 102 
> +
>  include/uapi/linux/btrfs.h |  16 +++
>  2 files changed, 118 insertions(+)
> 
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 64b23e22852f..7988d328aed5 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -2369,6 +2369,106 @@ static noinline int btrfs_ioctl_get_subvol_info(struct file *file,
>   return ret;
>  }
> 
> +/*
> + * Return ROOT_REF information of the subvolume contining this inode
s/contining/containing

> + * except the subvolume name.
> + */
> +static noinline int btrfs_ioctl_get_subvol_rootref(struct file *file,
> +void __user *argp)
> +{
> + struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
> + struct btrfs_root_ref *rref;
> + struct btrfs_root *root;
> + struct btrfs_path *path;
> + struct btrfs_key key;
> +
> + struct extent_buffer *l;
> + int slot;
> +
> + struct inode *inode;
> + int i, nritems;
> + int ret;
> + u64 objectid;
> + u8 found;
> +
> + path = btrfs_alloc_path();
> + if (!path)
> + return -ENOMEM;
> +
> + rootrefs = memdup_user(argp, sizeof(*rootrefs));
> + if (!rootrefs) {
> + btrfs_free_path(path);
> + return -ENOMEM;
> + }
> +
> + inode = file_inode(file);
> + root = BTRFS_I(inode)->root->fs_info->tree_root;
> + objectid = BTRFS_I(inode)->root->root_key.objectid;
> +
> + key.objectid = objectid;
> + key.type = BTRFS_ROOT_REF_KEY;
> + key.offset = rootrefs->min_id;
> + found = 0;
> + while (1) {
> +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
> + if (ret < 0) {
> + goto out;
> + } else if (path->slots[0] >=
> + btrfs_header_nritems(path->nodes[0])) {
> + ret = btrfs_next_leaf(root, path);
> + if (ret < 0)
> + return ret;
Should goto out; to do the cleanup work.
> + }
> +
> + l = path->nodes[0];
> + slot = path->slots[0];
> + nritems = btrfs_header_nritems(l);
> + if (nritems - slot == 0) {
> + ret = 0;
> + goto out;
> + }
> +
> + for (i = slot; i < nritems; i++) {
> +	btrfs_item_key_to_cpu(l, &key, i);
> + if (key.objectid != objectid ||
> + key.type != BTRFS_ROOT_REF_KEY) {
> + ret = 0;
> + goto out;
> + }
> +
> + if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
> + ret = -EOVERFLOW;
> + goto out;
> + }
> +
> + rref = btrfs_item_ptr(l, i, struct btrfs_root_ref);
> + rootrefs->rootref[found].subvolid = key.offset;
> + rootrefs->rootref[found].dirid =
> +   btrfs_root_ref_dirid(l, rref);
> + found++;
> + }
> +
> + btrfs_release_path(path);
> + key.offset++;
> + }
Suggest using btrfs_search_slot once and then btrfs_next_item, to reduce the
number of btrfs_search_slot calls.
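
Roughly like this, perhaps (only a sketch of the suggested iteration; error
handling and edge cases, e.g. the initial search landing past the last slot
of a leaf, are omitted):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	while (1) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY)
			break;

		/* ... copy this ROOT_REF entry into rootrefs->rootref[] ... */

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;	/* no more items in the tree */
	}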

> +
> +out:
> + if (!ret || ret == -EOVERFLOW) {
> + rootrefs->num_items = found;
> + /* update min_id for next search */
> + if (found)
> + rootrefs->min_id =
> + rootrefs->rootref[found - 1].subvolid + 1;
> + if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
> + ret = -EFAULT;
> + }
> +
> + 

Re: [PATCH v2 2/3] btrfs-progs: lowmem: check symlinks with append/immutable flags

2018-05-15 Thread Qu Wenruo


On 2018年05月15日 09:33, Su Yue wrote:
> Define a new error bit INODE_FLAGS_ERROR to represent invalid inode
> flags errors.
> 
> Symlinks should never have append/immutable flags set.
> While checking inodes, if a symlink with append/immutable flags is
> found, report and record the inode flags error.
> 
> This is for lowmem mode.
> 
> Signed-off-by: Su Yue 

Reviewed-by: Qu Wenruo 

Thanks,
Qu

> ---
>  check/mode-lowmem.c | 10 ++
>  check/mode-lowmem.h |  1 +
>  2 files changed, 11 insertions(+)
> 
> diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c
> index 9890180d1d3c..f598bc364de4 100644
> --- a/check/mode-lowmem.c
> +++ b/check/mode-lowmem.c
> @@ -2274,6 +2274,7 @@ static int check_inode_item(struct btrfs_root *root, 
> struct btrfs_path *path)
>   struct btrfs_key last_key;
>   u64 inode_id;
>   u32 mode;
> + u64 flags;
>   u64 nlink;
>   u64 nbytes;
>   u64 isize;
> @@ -2307,10 +2308,19 @@ static int check_inode_item(struct btrfs_root *root, 
> struct btrfs_path *path)
>   isize = btrfs_inode_size(node, ii);
>   nbytes = btrfs_inode_nbytes(node, ii);
>   mode = btrfs_inode_mode(node, ii);
> + flags = btrfs_inode_flags(node, ii);
>   dir = imode_to_type(mode) == BTRFS_FT_DIR;
>   nlink = btrfs_inode_nlink(node, ii);
>   nodatasum = btrfs_inode_flags(node, ii) & BTRFS_INODE_NODATASUM;
>  
> + if (mode & BTRFS_FT_SYMLINK &&
> + flags & (BTRFS_INODE_IMMUTABLE | BTRFS_INODE_APPEND)) {
> + err |= INODE_FLAGS_ERROR;
> + error(
> +"symlinks must never have immutable/append flags set, root %llu inode item 
> %llu flags %llu may be corrupted",
> +   root->objectid, inode_id, flags);
> + }
> +
>   while (1) {
>   btrfs_item_key_to_cpu(path->nodes[0], _key, 
> path->slots[0]);
>   ret = btrfs_next_item(root, path);
> diff --git a/check/mode-lowmem.h b/check/mode-lowmem.h
> index e7ba62e2413e..91f7b6b1db53 100644
> --- a/check/mode-lowmem.h
> +++ b/check/mode-lowmem.h
> @@ -44,6 +44,7 @@
>  #define DIR_COUNT_AGAIN (1<<20) /* DIR isize should be recalculated 
> */
>  #define BG_ACCOUNTING_ERROR (1<<21) /* Block group accounting error */
>  #define FATAL_ERROR (1<<22) /* Fatal bit for errno */
> +#define INODE_FLAGS_ERROR(1<<23) /* Invalid inode flags */
>  
>  /*
>   * Error bit for low memory mode check.
> 





  1   2   >