Re: Btrfs/SSD

2017-05-11 Thread Duncan
Austin S. Hemmelgarn posted on Mon, 17 Apr 2017 07:53:04 -0400 as
excerpted:

> * In my personal experience, Intel, Samsung, and Crucial appear to be
> the best name brands (in relative order of quality).  I have personally
> had bad experiences with SanDisk and Kingston SSD's, but I don't have
> anything beyond circumstantial evidence indicating that it was anything
> but bad luck on both counts.

FWIW, I'm in the market for SSDs ATM, and remembered this from a couple 
weeks ago so went back to find it.  Thanks. =:^)

(I'm currently still on quarter-TB generation ssds, plus spinning rust 
for the larger media partition and backups, and want to be rid of the 
spinning rust, so am looking at half-TB to TB, which seems to be the 
pricing sweet spot these days anyway.)

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH v3.1 5/6] btrfs: qgroup: Introduce extent changeset for qgroup reserve functions

2017-05-11 Thread Qu Wenruo
Introduce a new parameter, struct extent_changeset, for
btrfs_qgroup_reserve_data() and its callers.

Such extent_changeset was used in btrfs_qgroup_reserve_data() to record
which range it reserved in current reserve, so it can free it at error
path.

The reason we need to export it to callers is that, in the buffered write
error path, without knowing exactly which range we reserved in the current
allocation, we can free space which was not reserved by us.

This will lead to qgroup reserved space underflow.

Reviewed-by: Chandan Rajendra 
Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.h   |  6 --
 fs/btrfs/extent-tree.c | 23 +--
 fs/btrfs/extent_io.h   | 34 +
 fs/btrfs/file.c| 12 +---
 fs/btrfs/inode-map.c   |  4 +++-
 fs/btrfs/inode.c   | 18 ++
 fs/btrfs/ioctl.c   |  5 -
 fs/btrfs/qgroup.c  | 51 --
 fs/btrfs/qgroup.h  |  3 ++-
 fs/btrfs/relocation.c  |  4 +++-
 10 files changed, 119 insertions(+), 41 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1e82516fe2d8..52a0147cd612 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2704,8 +2704,9 @@ enum btrfs_flush_state {
COMMIT_TRANS=   6,
 };
 
-int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode,
+   struct extent_changeset **reserved, u64 start, u64 len);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
@@ -2723,7 +2724,8 @@ void btrfs_subvolume_release_metadata(struct 
btrfs_fs_info *fs_info,
  struct btrfs_block_rsv *rsv);
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+int btrfs_delalloc_reserve_space(struct inode *inode,
+   struct extent_changeset **reserved, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4f62696131a6..ef09cc37f25f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3364,6 +3364,7 @@ static int cache_save_setup(struct 
btrfs_block_group_cache *block_group,
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root = fs_info->tree_root;
struct inode *inode = NULL;
+   struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
int dcs = BTRFS_DC_ERROR;
u64 num_pages = 0;
@@ -3483,7 +3484,7 @@ static int cache_save_setup(struct 
btrfs_block_group_cache *block_group,
num_pages *= 16;
num_pages *= PAGE_SIZE;
 
-   ret = btrfs_check_data_free_space(inode, 0, num_pages);
+   ret = btrfs_check_data_free_space(inode, _reserved, 0, num_pages);
if (ret)
goto out_put;
 
@@ -3514,6 +3515,7 @@ static int cache_save_setup(struct 
btrfs_block_group_cache *block_group,
block_group->disk_cache_state = dcs;
spin_unlock(_group->lock);
 
+   extent_changeset_free(data_reserved);
return ret;
 }
 
@@ -4277,12 +4279,8 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode 
*inode, u64 bytes)
return ret;
 }
 
-/*
- * New check_data_free_space() with ability for precious data reservation
- * Will replace old btrfs_check_data_free_space(), but for patch split,
- * add a new function first and then replace it.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+int btrfs_check_data_free_space(struct inode *inode,
+   struct extent_changeset **reserved, u64 start, u64 len)
 {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
@@ -4297,9 +4295,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 
start, u64 len)
return ret;
 
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
-   ret = btrfs_qgroup_reserve_data(inode, start, len);
+   ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
if (ret < 0)
btrfs_free_reserved_data_space_noquota(inode, start, len);
+   else
+   ret = 0;
return ret;
 }
 
@@ -6123,6 +6123,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode 
*inode, u64 num_bytes)
  * @inode: 

[RFC PATCH v3.1 3/6] btrfs: qgroup: Return actually freed bytes for qgroup release or free data

2017-05-11 Thread Qu Wenruo
btrfs_qgroup_release/free_data() only returns 0 or minus error
number(ENOMEM is the only possible error).

This is normally good enough, but sometimes we need the accurate byte
number it freed/released.

Change it to return the number of bytes actually released/freed instead of 0
on success.
Also slightly modify the related extent_changeset structure: since in btrfs
one non-hole data extent won't be larger than 128M, "unsigned int"
is large enough for the use case.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/extent_io.h   | 2 +-
 fs/btrfs/qgroup.c  | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e390451c72e6..4f62696131a6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4298,7 +4298,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 
start, u64 len)
 
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, start, len);
-   if (ret)
+   if (ret < 0)
btrfs_free_reserved_data_space_noquota(inode, start, len);
return ret;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1eafa2f0ede3..cc1b08fa9fe7 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -205,7 +205,7 @@ struct extent_buffer {
  */
 struct extent_changeset {
/* How many bytes are set/cleared in this operation */
-   u64 bytes_changed;
+   unsigned int bytes_changed;
 
/* Changed ranges */
struct ulist range_changed;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9f01c25469f7..ad2e99491395 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2886,6 +2886,7 @@ static int __btrfs_qgroup_release_data(struct inode 
*inode, u64 start, u64 len,
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
BTRFS_I(inode)->root->objectid,
changeset.bytes_changed);
+   ret = changeset.bytes_changed;
 out:
ulist_release(_changed);
return ret;
-- 
2.12.2



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH v3.1 6/6] btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges

2017-05-11 Thread Qu Wenruo
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at error path:
(Page size 4K, function name without "btrfs_" prefix)

 Task A  | Task B
--
Buffered_write [0, 2K)   |
|- check_data_free_space()   |
|  |- qgroup_reserve_data()  |
| Range aligned to page  |
| range [0, 4K)  <<< |
| 4K bytes reserved  <<< |
|- copy pages to page cache  |
 | Buffered_write [2K, 4K)
 | |- check_data_free_space()
 | |  |- qgroup_reserve_data()
 | | Range aligned to page
 | | range [0, 4K)
 | | Already reserved by A <<<
 | | 0 bytes reserved  <<<
 | |- delalloc_reserve_metadata()
 | |  And it *FAILED* (Maybe EQUOTA)
 | |- free_reserved_data_space()
  |- qgroup_free_data()
 Range aligned to page range
 [0, 4K)
 Freeing 4K
(Special thanks to Chandan for the detailed report and analysis)

[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.

And at writeback time, the page dirtied by Task A will go through the
writeback routine, which will free 4K of reserved data space at file extent
insert time, causing the qgroup underflow.

[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.

Reported-by: Chandan Rajendra 
Signed-off-by: Qu Wenruo 
Reviewed-by: Chandan Rajendra 
Tested-by: Chandan Rajendra 
---
 fs/btrfs/ctree.h   |  6 +++--
 fs/btrfs/extent-tree.c | 12 +
 fs/btrfs/file.c| 29 +++-
 fs/btrfs/inode.c   | 29 ++--
 fs/btrfs/ioctl.c   |  4 +--
 fs/btrfs/qgroup.c  | 72 ++
 fs/btrfs/qgroup.h  |  3 ++-
 fs/btrfs/relocation.c  |  8 +++---
 8 files changed, 117 insertions(+), 46 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 52a0147cd612..75d2eced61b2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2707,7 +2707,10 @@ enum btrfs_flush_state {
 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
 int btrfs_check_data_free_space(struct inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
-void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+void btrfs_free_reserved_data_space(struct inode *inode,
+   struct extent_changeset *reserved, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode,
+   struct extent_changeset *reserved, u64 start, u64 len);
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2726,7 +2729,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode 
*inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
-void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
  unsigned short type);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ef09cc37f25f..eeeccc8a618e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4340,7 +4340,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode 
*inode, u64 start,
  * This one will handle the per-inode data rsv map for accurate reserved
  * space framework.
  */
-void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+void btrfs_free_reserved_data_space(struct inode *inode,
+   struct extent_changeset *reserved, u64 start, u64 len)
 {
struct btrfs_root *root = BTRFS_I(inode)->root;
 
@@ -4350,7 +4351,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, 
u64 start, u64 len)
start = round_down(start, root->fs_info->sectorsize);
 
btrfs_free_reserved_data_space_noquota(inode, start, len);
-  

[RFC PATCH v3.1 2/6] btrfs: qgroup: Cleanup btrfs_qgroup_prepare_account_extents function

2017-05-11 Thread Qu Wenruo
Quite a lot of qgroup corruption happens due to wrong timing of calling
btrfs_qgroup_prepare_account_extents().

Since the safest timing is calling it just before
btrfs_qgroup_account_extents(), there is no need to separate these two
functions.

Merging them will make code cleaner and less bug prone.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c  | 50 +-
 fs/btrfs/qgroup.h  |  2 --
 fs/btrfs/transaction.c | 10 --
 3 files changed, 17 insertions(+), 45 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 905fed1ee0dd..9f01c25469f7 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1403,38 +1403,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
return ret;
 }
 
-int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
-struct btrfs_fs_info *fs_info)
-{
-   struct btrfs_qgroup_extent_record *record;
-   struct btrfs_delayed_ref_root *delayed_refs;
-   struct rb_node *node;
-   u64 qgroup_to_skip;
-   int ret = 0;
-
-   delayed_refs = >transaction->delayed_refs;
-   qgroup_to_skip = delayed_refs->qgroup_to_skip;
-
-   /*
-* No need to do lock, since this function will only be called in
-* btrfs_commit_transaction().
-*/
-   node = rb_first(_refs->dirty_extent_root);
-   while (node) {
-   record = rb_entry(node, struct btrfs_qgroup_extent_record,
- node);
-   if (WARN_ON(!record->old_roots))
-   ret = btrfs_find_all_roots(NULL, fs_info,
-   record->bytenr, 0, >old_roots);
-   if (ret < 0)
-   break;
-   if (qgroup_to_skip)
-   ulist_del(record->old_roots, qgroup_to_skip, 0);
-   node = rb_next(node);
-   }
-   return ret;
-}
-
 int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record)
@@ -2051,6 +2019,19 @@ int btrfs_qgroup_account_extents(struct 
btrfs_trans_handle *trans,
 
if (!ret) {
/*
+* old roots should be searched when inserting qgroup
+* extent record
+*/
+   if (WARN_ON(!record->old_roots)) {
+   /* Search commit root to find old_roots */
+   ret = btrfs_find_all_roots(NULL, fs_info,
+   record->bytenr, 0,
+   >old_roots);
+   if (ret < 0)
+   goto cleanup;
+   }
+
+   /*
 * Use SEQ_LAST as time_seq to do special search, which
 * doesn't lock tree or delayed_refs and search current
 * root. It's safe inside commit_transaction().
@@ -2059,8 +2040,11 @@ int btrfs_qgroup_account_extents(struct 
btrfs_trans_handle *trans,
record->bytenr, SEQ_LAST, _roots);
if (ret < 0)
goto cleanup;
-   if (qgroup_to_skip)
+   if (qgroup_to_skip) {
ulist_del(new_roots, qgroup_to_skip, 0);
+   ulist_del(record->old_roots, qgroup_to_skip,
+ 0);
+   }
ret = btrfs_qgroup_account_extent(trans, fs_info,
record->bytenr, record->num_bytes,
record->old_roots, new_roots);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index fe04d3f295c6..38d14d4575c0 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -134,8 +134,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
 struct btrfs_delayed_extent_op;
-int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
-struct btrfs_fs_info *fs_info);
 /*
  * Inform qgroup to trace one dirty extent, its info is recorded in @record.
  * So qgroup can account it at transaction committing time.
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2168654c90a1..ee5b41d297d1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1374,9 +1374,6 @@ static int qgroup_account_snapshot(struct 
btrfs_trans_handle *trans,
ret = commit_fs_roots(trans, fs_info);
if 

[RFC PATCH v3.1 4/6] btrfs: qgroup: Fix qgroup reserved space underflow caused by buffered write and quota enable

2017-05-11 Thread Qu Wenruo
[BUG]
Under the following case, we can underflow qgroup reserved space.

Task A|Task B
---
 Quota disabled   |
 Buffered write   |
 |- btrfs_check_data_free_space() |
 |  *NO* qgroup space is reserved |
 |  since quota is *DISABLED* |
 |- All pages are copied to page  |
cache |
  | Enable quota
  | Quota scan finished
  |
  | Sync_fs
  | |- run_delalloc_range
  | |- Write pages
  | |- btrfs_finish_ordered_io
  ||- insert_reserved_file_extent
  |   |- btrfs_qgroup_release_data()
  |  Since no qgroup space is
 reserved in Task A, we
 underflow qgroup reserved
 space
This can be detected by fstest btrfs/104.

[CAUSE]
In insert_reserved_file_extent() we tell qgroup to release @ram_bytes
of qgroup reserved space in all cases.
And btrfs_qgroup_release_data() will check if qgroup is enabled.

However, in the above case, the buffered write happens before quota is
enabled, so we don't have reserved space for that range.

[FIX]
In insert_reserved_file_extent(), we inform qgroup to release only the
number of bytes actually released.
In the above case, since we don't have reserved space, we inform qgroup to
release 0 bytes, so the problem is fixed.

And thanks to the @reserved parameter introduced by qgroup rework, and
previous patch to return release bytes, the fix can be as small as less
than 10 lines.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/inode.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 17cbe9306faf..a1294d5baef5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2143,6 +2143,7 @@ static int insert_reserved_file_extent(struct 
btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
+   u64 qg_released;
int extent_inserted = 0;
int ret;
 
@@ -2198,13 +2199,17 @@ static int insert_reserved_file_extent(struct 
btrfs_trans_handle *trans,
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
-   ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
-   btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes, );
+
/*
 * Release the reserved range from inode dirty range map, as it is
 * already moved into delayed_ref_head
 */
-   btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+   ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+   if (ret < 0)
+   goto out;
+   qg_released = ret;
+   ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
+   btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, );
 out:
btrfs_free_path(path);
 
-- 
2.12.2



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH v3.1 1/6] btrfs: qgroup: Add quick exit for non-fs extents

2017-05-11 Thread Qu Wenruo
Modify btrfs_qgroup_account_extent() to make it exit quicker for
non-fs extents.

This will also reduce the noise in trace_btrfs_qgroup_account_extent
event.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c | 41 +++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3f75b5cbbfef..905fed1ee0dd 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1915,6 +1915,33 @@ static int qgroup_update_counters(struct btrfs_fs_info 
*fs_info,
return 0;
 }
 
+/*
+ * Helper to check if the @roots is a list of fs tree roots
+ * Return 0 for definitely not a fs/subvol tree roots ulist
+ * Return 1 for possible fs/subvol tree roots ulist(including empty)
+ */
+static int maybe_fs_roots(struct ulist *roots)
+{
+   struct ulist_node *unode;
+   struct ulist_iterator uiter;
+
+   /* Empty one, still possible for fs roots */
+   if (!roots || roots->nnodes == 0)
+   return 1;
+
+   ULIST_ITER_INIT();
+   unode = ulist_next(roots, );
+   if (!unode)
+   return 1;
+
+   /*
+* If it contains fs tree roots, then it must belongs to fs/subvol
+* trees.
+* If it contains non-fs tree, it won't be shared to fs/subvol trees.
+*/
+   return is_fstree(unode->val);
+}
+
 int
 btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
@@ -1931,10 +1958,20 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle 
*trans,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, _info->flags))
return 0;
 
-   if (new_roots)
+   if (new_roots) {
+   if (!maybe_fs_roots(new_roots))
+   goto out_free;
nr_new_roots = new_roots->nnodes;
-   if (old_roots)
+   }
+   if (old_roots) {
+   if (!maybe_fs_roots(old_roots))
+   goto out_free;
nr_old_roots = old_roots->nnodes;
+   }
+
+   /* Quick exit, either not fs tree roots, or won't affect any qgroup */
+   if (nr_old_roots == 0 && nr_new_roots == 0)
+   goto out_free;
 
BUG_ON(!fs_info->quota_root);
 
-- 
2.12.2



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH v3.1 0/6] Qgroup fixes, Non-stack version

2017-05-11 Thread Qu Wenruo
The remaining qgroup fixes patches, based on the Chris' for-linus-4.12
branch with commit 9bcaaea7418d09691f1ffab5c49aacafe3eef9d0 as base.

Can be fetched from github:
https://github.com/adam900710/linux/tree/qgroup_fixes_non_stack

Apart from the 5th patch, the patches are mostly unchanged.
Only minor conflicts are addressed in this update.

The 5th patch chooses a different method to reduce stack memory usage.

Instead of allocating extent_changeset structure on stack, this time only a
pointer of extent_changeset is allocated on stack.

And real extent_changeset is allocated inside btrfs_qgroup_reserve_data().
The impact to stack memory usage of quota disabled case is reduced to minimal.

While the error handler routine is not affected either.

v2:
  Add reviewed-by tag for 2nd patch
  Update the first patch to follow the new trace point standard
RFC v3:
  Use non-stack (dynamic allocation) for extent_changeset structure, in
  5th patch, to reduce impact for quota disabled cases.
  Rebase to latest for-linus-4.12 branch.
RFC v3.1:
  Update comment to include the newly introduced parameter
  Use init/release function to replace open coded ulist_init/release().


Qu Wenruo (6):
  btrfs: qgroup: Add quick exit for non-fs extents
  btrfs: qgroup: Cleanup btrfs_qgroup_prepare_account_extents function
  btrfs: qgroup: Return actually freed bytes for qgroup release or free
data
  btrfs: qgroup: Fix qgroup reserved space underflow caused by buffered
write and quota enable
  btrfs: qgroup: Introduce extent changeset for qgroup reserve functions
  btrfs: qgroup: Fix qgroup reserved space underflow by only freeing
reserved ranges

 fs/btrfs/ctree.h   |  12 ++-
 fs/btrfs/extent-tree.c |  37 +
 fs/btrfs/extent_io.h   |  36 -
 fs/btrfs/file.c|  41 ++
 fs/btrfs/inode-map.c   |   4 +-
 fs/btrfs/inode.c   |  58 -
 fs/btrfs/ioctl.c   |   9 ++-
 fs/btrfs/qgroup.c  | 215 -
 fs/btrfs/qgroup.h  |   8 +-
 fs/btrfs/relocation.c  |  12 +--
 fs/btrfs/transaction.c |  10 ---
 11 files changed, 303 insertions(+), 139 deletions(-)
-- 
2.12.2



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: errno=-28 No space left, with kernel backtrace (blocking bug)

2017-05-11 Thread alpha_one_x86
Hi.

I wish mean: I can't.

I now for the btrfs maturity. But it's my unique alternative.

I understand. For me this bug should be important because it block all
the system, since Linux 4.1+

It's exactly what I wish, pay to have a quick fix. I don't think I wish
too much, just fix this bug and put to upstream.

Thanks for your time to read me, and thanks for confirm this bug is not
forget. A least somebody have take time to read me, great thanks for this.

Cheers,


On 05/12/17 04:01, Duncan wrote:
> alpha_one_x86 posted on Thu, 11 May 2017 17:25:32 +0200 as excerpted:
>
>> Up plz, I can work with this bug.
>>
>>
>> On 05/11/17 01:39, alpha_one_x86 wrote:
>>> Hi, this bug is very blocking for me:
>>>
>>> https://bugzilla.kernel.org/show_bug.cgi?id=195257
>>>
>>> The server is backup server, I btrfs receive (with and without -p), and
>>> of course btrfs subvolume delete The volume is 70TB, then I use
>>> space_cache=v2
> Since you can work with it, do so.  We're not stopping you. =:^)
>
> Or did you mean /can't/?
>
> Keep in mind that while btrfs is considered stabilizing, on this list at 
> least it's not considered fully stable and mature.  If you want/need a 
> filesystem that's stable and mature, there's others out there that fill 
> that requirement.  We don't claim btrfs does.  Your system, your choice 
> of filesystem and with it, filesystem maturity.
>
> Meanwhile, btrfs devs have a lot of stuff on their plate, including bugs 
> they're already working on and further development, and (as with most 
> devs) aren't going to take kindly to demands that they work on *YOUR* bug 
> *RIGHT* *NOW*.  That, if anything, is about the fastest way I know of to 
> ensure that working on it is /deprioritized/, with stuff that would have 
> been put off to work on it, done first, instead.
>
> Unless of course you're paying the salary of that dev.  If you are, then 
> you get to call the shots, to some degree at least.  Good devs tend to 
> find other employment if you're too controlling, tho, and they can 
> because good devs are in enough demand they often pick their jobs from a 
> list of offers, and they tend to be motivated by more than money so if 
> you're too demanding you can't expect to simply outbid everyone else on 
> the list, either, no matter how much money you have.  And any dev skilled 
> enough to regularly get their work into the mainline kernel can be 
> considered a good dev, so...
>
> So I'd suggest that if it's high enough priority to you, you'll find a 
> kernel dev and sponsor them to work on it for you.  But be warned, if 
> they're not already a btrfs dev, it'll take them some time to come upto 
> speed.  Otherwise, you'll wait in line with everyone else... unless you 
> push too much, in which case your reports will as I said get 
> deprioritized, and if noone else reports them, your bugs may not get 
> handled until there's nothing else waiting... which could easily push 
> resolution past 2027... yes, a decade or more out.
>

-- 
alpha_one_x86/BRULE Herman 
Main developer of Supercopier/Ultracopier/CatchChallenger, Esourcing and server 
management
IT, OS, technologies, research & development, security and business department


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: errno=-28 No space left, with kernel backtrace (blocking bug)

2017-05-11 Thread Duncan
alpha_one_x86 posted on Thu, 11 May 2017 17:25:32 +0200 as excerpted:

> Up plz, I can work with this bug.
> 
> 
> On 05/11/17 01:39, alpha_one_x86 wrote:
>> Hi, this bug is very blocking for me:
>>
>> https://bugzilla.kernel.org/show_bug.cgi?id=195257
>>
>> The server is backup server, I btrfs receive (with and without -p), and
>> of course btrfs subvolume delete The volume is 70TB, then I use
>> space_cache=v2

Since you can work with it, do so.  We're not stopping you. =:^)

Or did you mean /can't/?

Keep in mind that while btrfs is considered stabilizing, on this list at 
least it's not considered fully stable and mature.  If you want/need a 
filesystem that's stable and mature, there's others out there that fill 
that requirement.  We don't claim btrfs does.  Your system, your choice 
of filesystem and with it, filesystem maturity.

Meanwhile, btrfs devs have a lot of stuff on their plate, including bugs 
they're already working on and further development, and (as with most 
devs) aren't going to take kindly to demands that they work on *YOUR* bug 
*RIGHT* *NOW*.  That, if anything, is about the fastest way I know of to 
ensure that working on it is /deprioritized/, with stuff that would have 
been put off to work on it, done first, instead.

Unless of course you're paying the salary of that dev.  If you are, then 
you get to call the shots, to some degree at least.  Good devs tend to 
find other employment if you're too controlling, tho, and they can 
because good devs are in enough demand they often pick their jobs from a 
list of offers, and they tend to be motivated by more than money so if 
you're too demanding you can't expect to simply outbid everyone else on 
the list, either, no matter how much money you have.  And any dev skilled 
enough to regularly get their work into the mainline kernel can be 
considered a good dev, so...

So I'd suggest that if it's high enough priority to you, you'll find a 
kernel dev and sponsor them to work on it for you.  But be warned, if 
they're not already a btrfs dev, it'll take them some time to come upto 
speed.  Otherwise, you'll wait in line with everyone else... unless you 
push too much, in which case your reports will as I said get 
deprioritized, and if noone else reports them, your bugs may not get 
handled until there's nothing else waiting... which could easily push 
resolution past 2027... yes, a decade or more out.

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Creating btrfs RAID on LUKS devs makes devices disappear

2017-05-11 Thread Ochi

Hello,

here is the journal.log (I hope). It's quite interesting. I rebooted the 
machine, performed a mkfs.btrfs on dm-{2,3,4} and dm-3 was missing 
afterwards (around timestamp 66.*). However, I then logged into the 
machine from another terminal (around timestamp 118.*) which triggered 
something to make the device appear again :O Indeed, dm-3 was once again 
there after logging in. Does systemd mix something up?


Hmm, I just did another mkfs once the devices were back, devices were 
missing, but they re-appeared a few seconds later, without logging into 
a terminal. After another mkfs, they were gone again and are now still 
gone after waiting a few minutes. It's really weird, I can't really tell 
what triggers this yet. Will test more tomorrow, let me know if you have 
any more ideas what to try.


Best regards
Sebastian
-- Logs begin at Sun 2017-03-26 20:36:24 CEST, end at Fri 2017-05-12 01:00:45 CEST. --
[0.00] nas kernel: Linux version 4.9.27-1-lts (builduser@andyrtr) (gcc version 6.3.1 20170306 (GCC) ) #1 SMP Mon May 8 13:37:42 CEST 2017
[0.00] nas kernel: Command line: BOOT_IMAGE=/default/vmlinuz-linux-lts root=UUID=4ac09b56-3e02-40c0-bf64-02a4cf9344fc rw rootflags=subvol=default ip=192.168.0.3:eth0:none cryptdevice=/dev/sda2:root:allow-discards
[0.00] nas kernel: x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers'
[0.00] nas kernel: x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
[0.00] nas kernel: x86/fpu: Supporting XSAVE feature 0x008: 'MPX bounds registers'
[0.00] nas kernel: x86/fpu: Supporting XSAVE feature 0x010: 'MPX CSR'
[0.00] nas kernel: x86/fpu: xstate_offset[3]:  576, xstate_sizes[3]:   64
[0.00] nas kernel: x86/fpu: xstate_offset[4]:  640, xstate_sizes[4]:   64
[0.00] nas kernel: x86/fpu: Enabled xstate features 0x1b, context size is 704 bytes, using 'compacted' format.
[0.00] nas kernel: x86/fpu: Using 'eager' FPU context switches.
[0.00] nas kernel: e820: BIOS-provided physical RAM map:
[0.00] nas kernel: BIOS-e820: [mem 0x-0x0009c3ff] usable
[0.00] nas kernel: BIOS-e820: [mem 0x0009c400-0x0009] reserved
[0.00] nas kernel: BIOS-e820: [mem 0x000e-0x000f] reserved
[0.00] nas kernel: BIOS-e820: [mem 0x0010-0x78770fff] usable
[0.00] nas kernel: BIOS-e820: [mem 0x78771000-0x78771fff] ACPI NVS
[0.00] nas kernel: BIOS-e820: [mem 0x78772000-0x78772fff] reserved
[0.00] nas kernel: BIOS-e820: [mem 0x78773000-0x7e137fff] usable
[0.00] nas kernel: BIOS-e820: [mem 0x7e138000-0x7e5bafff] reserved
[0.00] nas kernel: BIOS-e820: [mem 0x7e5bb000-0x7e667fff] usable
[0.00] nas kernel: BIOS-e820: [mem 0x7e668000-0x7ea06fff] ACPI NVS
[0.00] nas kernel: BIOS-e820: [mem 0x7ea07000-0x7effefff] reserved
[0.00] nas kernel: BIOS-e820: [mem 0x7efff000-0x7eff] usable
[0.00] nas kernel: BIOS-e820: [mem 0x7f00-0x8fff] reserved
[0.00] nas kernel: BIOS-e820: [mem 0xe000-0xefff] reserved
[0.00] nas kernel: BIOS-e820: [mem 0xfe00-0xfe010fff] reserved
[0.00] nas kernel: BIOS-e820: [mem 0xfec0-0xfec00fff] reserved
[0.00] nas kernel: BIOS-e820: [mem 0xfed0-0xfed00fff] reserved
[0.00] nas kernel: BIOS-e820: [mem 0xfee0-0xfee00fff] reserved
[0.00] nas kernel: BIOS-e820: [mem 0xff00-0x] reserved
[0.00] nas kernel: BIOS-e820: [mem 0x0001-0x00046dff] usable
[0.00] nas kernel: NX (Execute Disable) protection: active
[0.00] nas kernel: SMBIOS 3.0 present.
[0.00] nas kernel: DMI: To Be Filled By O.E.M. To Be Filled By O.E.M./C236 WSI, BIOS P2.10 04/18/2017
[0.00] nas kernel: e820: update [mem 0x-0x0fff] usable ==> reserved
[0.00] nas kernel: e820: remove [mem 0x000a-0x000f] usable
[0.00] nas kernel: e820: last_pfn = 0x46e000 max_arch_pfn = 0x4
[0.00] nas kernel: MTRR default type: write-back
[0.00] nas kernel: MTRR fixed ranges enabled:
[0.00] nas kernel:   0-9 write-back
[0.00] nas kernel:   A-B uncachable
[0.00] nas kernel:   C-F write-protect
[0.00] nas kernel: MTRR variable ranges enabled:
[0.00] nas kernel:   0 base 008000 mask 7F8000 uncachable
[0.00] nas kernel:   1 base 007F80 mask 7FFF80 uncachable
[0.00] nas kernel:   2 disabled
[0.00] nas kernel:   3 disabled
[0.00] nas kernel:   4 disabled
[0.00] nas kernel:   5 disabled
[0.00] 

Re: Creating btrfs RAID on LUKS devs makes devices disappear

2017-05-11 Thread Chris Murphy
journalctl -b -o short-monotonic > journal.log

And then attach the log; hopefully it's small enough to be accepted
by the list server (should be). If that's not revealing it might be
necessary to reboot with rd.udev.debug but start with the simple case
first and see if that reveals what's going on.



Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v1 00/30] fs: inode->i_version rework and optimization

2017-05-11 Thread NeilBrown
On Thu, May 11 2017, J. Bruce Fields wrote:

> On Wed, Apr 05, 2017 at 02:14:09PM -0400, J. Bruce Fields wrote:
>> On Wed, Apr 05, 2017 at 10:05:51AM +0200, Jan Kara wrote:
>> > 1) Keep i_version as is, make clients also check for i_ctime.
>> 
>> That would be a protocol revision, which we'd definitely rather avoid.
>> 
>> But can't we accomplish the same by using something like
>> 
>>  ctime * (some constant) + i_version
>> 
>> ?
>> 
>> >Pro: No on-disk format changes.
>> >Cons: After a crash, i_version can go backwards (but when file changes
>> >i_version, i_ctime pair should be still different) or not, data can be
>> >old or not.
>> 
>> This is probably good enough for NFS purposes: typically on an NFS
>> filesystem, results of a read in the face of a concurrent write open are
>> undefined.  And writers sync before close.
>> 
>> So after a crash with a dirty inode, we're in a situation where an NFS
>> client still needs to resend some writes, sync, and close.  I'm OK with
>> things being inconsistent during this window.
>> 
>> I do expect things to return to normal once that client has resent its
>> writes--hence the worry about actually reusing old values after boot
>> (such as if i_version regresses on boot and then increments back to the
>> same value after further writes).  Factoring in ctime fixes that.
>
> So for now I'm thinking of just doing something like the following.
>
> Only nfsd needs it for now, but it could be moved to a vfs helper for
> statx, or for individual filesystems that want to do something
> different.  (The NFSv4 client will want to use the server's change
> attribute instead, I think.  And other filesystems might want to try
> something more ambitious like Neil's proposal.)
>
> --b.
>
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index 12feac6ee2fd..9636c9a60aba 100644
> diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
> index f84fe6bf9aee..14f09f1ef605 100644
> --- a/fs/nfsd/nfsfh.h
> +++ b/fs/nfsd/nfsfh.h
> @@ -240,6 +240,16 @@ fh_clear_wcc(struct svc_fh *fhp)
>   fhp->fh_pre_saved = false;
>  }
>  
> +static inline u64 nfsd4_change_attribute(struct inode *inode)
> +{
> + u64 chattr;
> +
> + chattr = inode->i_ctime.tv_sec << 30;
> + chattr += inode->i_ctime.tv_nsec;
> + chattr += inode->i_version;
> + return chattr;

So if I chmod a file, all clients will need to flush the content from their 
cache?
Maybe they already do?  Maybe it is a boring corner case?

> +}
> +
>  /*
>   * Fill in the pre_op attr for the wcc data
>   */
> @@ -253,7 +263,7 @@ fill_pre_wcc(struct svc_fh *fhp)
>   fhp->fh_pre_mtime = inode->i_mtime;
>   fhp->fh_pre_ctime = inode->i_ctime;
>   fhp->fh_pre_size  = inode->i_size;
> - fhp->fh_pre_change = inode->i_version;
> + fhp->fh_pre_change = nfsd4_change_attribute(inode);
>   fhp->fh_pre_saved = true;
>   }
>  }
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -260,7 +260,7 @@ void fill_post_wcc(struct svc_fh *fhp)
>   printk("nfsd: inode locked twice during operation.\n");
>  
>   err = fh_getattr(fhp, &fhp->fh_post_attr);
> - fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
> + fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry));
>   if (err) {
>   fhp->fh_post_saved = false;
>   /* Grab the ctime anyway - set_change_info might use it */
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 26780d53a6f9..a09532d4a383 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -1973,7 +1973,7 @@ static __be32 *encode_change(__be32 *p, struct kstat 
> *stat, struct inode *inode,
>   *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
>   *p++ = 0;
>   } else if (IS_I_VERSION(inode)) {
> - p = xdr_encode_hyper(p, inode->i_version);
> + p = xdr_encode_hyper(p, nfsd4_change_attribute(inode));
>   } else {
>   *p++ = cpu_to_be32(stat->ctime.tv_sec);
>   *p++ = cpu_to_be32(stat->ctime.tv_nsec);

It is *really* confusing to find that fh_post_change is only set in nfs3
code, and only used in nfs4 code.
It is probably time to get a 'version' field in 'struct kstat'.
That would allow this code to get a little cleaner.

(to me, this exercise is just a reminder that the NFSv4 change attribute
is poorly designed ... so it just makes me grumpy).

NeilBrown


> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


signature.asc
Description: PGP signature


[PATCH v2 2/2] btrfs: Add quota_override knob into sysfs

2017-05-11 Thread Sargun Dhillon
This patch adds the read-write attribute quota_override into sysfs.
Any process which has cap_sys_resource can set this flag to on, and
once it is set to true, processes with cap_sys_resource can exceed
the quota.

Signed-off-by: Sargun Dhillon 
---
 fs/btrfs/sysfs.c | 41 +
 1 file changed, 41 insertions(+)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 1f157fb..c2d5f35 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -447,11 +447,52 @@ static ssize_t btrfs_clone_alignment_show(struct kobject 
*kobj,
 
 BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
 
+static ssize_t quota_override_show(struct kobject *kobj,
+  struct kobj_attribute *a, char *buf)
+{
+   struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+   int quota_override;
+
+   quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
+   return snprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+}
+
+static ssize_t quota_override_store(struct kobject *kobj,
+   struct kobj_attribute *a,
+   const char *buf, size_t len)
+{
+   struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+   unsigned long knob;
+   int err;
+
+   if (!fs_info)
+   return -EPERM;
+
+   if (!capable(CAP_SYS_RESOURCE))
+   return -EPERM;
+
+   err = kstrtoul(buf, 10, &knob);
+   if (err)
+   return err;
+   if (knob > 1)
+   return -EINVAL;
+
+   if (knob)
+   set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
+   else
+   clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
+
+   return len;
+}
+
+BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store);
+
 static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
BTRFS_ATTR_PTR(nodesize),
BTRFS_ATTR_PTR(sectorsize),
BTRFS_ATTR_PTR(clone_alignment),
+   BTRFS_ATTR_PTR(quota_override),
NULL,
 };
 
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/2] btrfs: add quota override flag to enable quota override for sys_resource

2017-05-11 Thread Sargun Dhillon
This patch introduces the quota override flag to btrfs_fs_info, and
a change to quota limit checking code to temporarily allow for quota
to be overridden for processes with cap_sys_resource.

It's useful for administrative programs, such as log rotation,
that may need to temporarily use more disk space in order to free up
a greater amount of overall disk space without yielding more disk
space to the rest of userland.

Eventually, we may want to add the idea of an operator-specific
quota, operator reserved space, or something else to allow for
administrative override, but this is perhaps the simplest
solution.

Signed-off-by: Sargun Dhillon 
---
 fs/btrfs/ctree.h  | 2 ++
 fs/btrfs/qgroup.c | 5 +
 2 files changed, 7 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 643c70d..e86cb7c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -716,6 +716,8 @@ struct btrfs_delayed_root;
 #define BTRFS_FS_BTREE_ERR 11
 #define BTRFS_FS_LOG1_ERR  12
 #define BTRFS_FS_LOG2_ERR  13
+#define BTRFS_FS_QUOTA_OVERRIDE 14
+
 /*
  * Indicate that a whole-filesystem exclusive operation is running
  * (device replace, resize, device add/delete, balance)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index deffbeb..458fec0 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2338,6 +2338,11 @@ static int qgroup_reserve(struct btrfs_root *root, u64 
num_bytes, bool enforce)
 
if (num_bytes == 0)
return 0;
+
+   if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
+   capable(CAP_SYS_RESOURCE))
+   enforce = false;
+
 retry:
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/2] btrfs: allow mechanism to override quota

2017-05-11 Thread Sargun Dhillon
This patchset makes it so that on a per-filesystem basis one can disable
quota enforcement for users with cap_sys_resource. This patchset can
likely later be extended to per-qgroup, or a per-volume basis. I'm
thinking of extending the sysfs interface to list the qgroups and
this same interface for the qgroups themselves.

Changes since v1:
  -Rather than a separate member of btrfs_fs_info, use the existing
   flags field

Sargun Dhillon (2):
  btrfs: add quota override flag to enable quota override for
sys_resource
  btrfs: Add quota_override knob into sysfs

 fs/btrfs/ctree.h  |  2 ++
 fs/btrfs/qgroup.c |  5 +
 fs/btrfs/sysfs.c  | 41 +
 3 files changed, 48 insertions(+)

-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Creating btrfs RAID on LUKS devs makes devices disappear

2017-05-11 Thread Ochi

I should have added some more technical info. Here you go:

Arch Linux with systemd 233
Kernel linux-lts 4.9.27
btrfs-progs 4.10.2

Example session:

root@nas> ls /dev/dm-*
/dev/dm-0  /dev/dm-1  /dev/dm-2  /dev/dm-3  /dev/dm-4
root@nas> ls -l /dev/mapper
total 0
lrwxrwxrwx 1 root root   7 May 11 22:30 backup -> ../dm-1
crw------- 1 root root 10, 236 May 11 22:30 control
lrwxrwxrwx 1 root root   7 May 11 22:30 root -> ../dm-0
lrwxrwxrwx 1 root root   7 May 11 22:30 storage0 -> ../dm-2
lrwxrwxrwx 1 root root   7 May 11 22:30 storage1 -> ../dm-4
lrwxrwxrwx 1 root root   7 May 11 22:30 storage2 -> ../dm-3
root@nas> mkfs.btrfs -f -d raid1 -m raid1 /dev/dm-2 /dev/dm-3 /dev/dm-4
btrfs-progs v4.10.2
See http://btrfs.wiki.kernel.org for more information.

Label:  (null)
UUID:   a32b3106-678f-448f-ade9-c48cd41a7dae
Node size:  16384
Sector size:4096
Filesystem size:10.92TiB
Block group profiles:
  Data: RAID1 1.00GiB
  Metadata: RAID1 1.00GiB
  System:   RAID1 8.00MiB
SSD detected:   no
Incompat features:  extref, skinny-metadata
Number of devices:  3
Devices:
   IDSIZE  PATH
1 3.64TiB  /dev/dm-2
2 3.64TiB  /dev/dm-3
3 3.64TiB  /dev/dm-4

root@nas> ls /dev/dm-*
/dev/dm-0  /dev/dm-1  /dev/dm-2  /dev/dm-4

Note that dm-3 is gone.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Creating btrfs RAID on LUKS devs makes devices disappear

2017-05-11 Thread Ochi

Hello,

while trying to initialize a btrfs RAID1 on my new NAS using LUKS 
crypt-devices for each of the btrfs RAID devices, I have seen "random" 
weirdness shortly after mkfs.


It seems to boil down to the problem that after mkfs.btrfs, some of the 
/dev/dm-* nodes (as well as the corresponding /dev/mapper/* symlinks) 
sometimes disappear. The RAID can be mounted at first but quickly shows 
symptoms such as missing devices, or being unable to mount the second time.


I have tried to run mkfs.btrfs -d raid1 -m raid1 using the /dev/dm-* and 
/dev/mapper/* devices, but with similar results.


My best guess is that the fact that one UUID is given to multiple 
separate devices confuses... something (udev or the like?), making nodes 
appear, disappear or being re-ordered while mkfs is in progress, or 
leading to unexpected things later at mount time.


Honestly, the idea of the same UUID being given to separate physical 
devices scared me already when I first saw it. Could that actually be 
the culprit here?


Best regards
Sebastian
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs list corruption and soft lockups while testing writeback error handling

2017-05-11 Thread Chris Mason

On 05/11/2017 03:52 PM, Jeff Layton wrote:

On Thu, 2017-05-11 at 07:13 -0400, Jeff Layton wrote:

I finally got my writeback error handling test to work on btrfs (thanks,
Chris!), by making the filesystem stripe the data and mirror the
metadata across two devices. The test passes now, but on one run, I got
the following list corruption warning and then a soft lockup (which is
probably fallout from the list corruption).

I ran the test several times before and since then without this failure,
so I don't have a clear reproducer. The kernel in this instance is
basically a v4.11 kernel with my pile of writeback error handling
patches on top:


https://git.samba.org/?p=jlayton/linux.git;a=shortlog;h=refs/heads/wberr

It may be that they are a contributing factor, but this smells more like
a bug down in btrfs. Let me know if you need other info:


[ btrfs inode logging ]


(cc'ing Liu Bo since we were discussing this earlier this week)

I can't reproduce this on stock v4.11, so I think this is a bug in my
series.

I think this is due to the differences in how errors are being reported
from filemap_fdatawait_range now causing some transactions to end up
being freed while they're still on the log_ctxs list. I'm working on
hunting down the problem now.

Sorry for the noise!



There's a list in the inode logging code that we consistently seem to 
find list debugging assertions with.  We've fixed up all the known 
issues, but I wouldn't be surprised if we've got a goto fail in there.


I'll take a look ;)

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs list corruption and soft lockups while testing writeback error handling

2017-05-11 Thread Jeff Layton
On Thu, 2017-05-11 at 07:13 -0400, Jeff Layton wrote:
> I finally got my writeback error handling test to work on btrfs (thanks,
> Chris!), by making the filesystem stripe the data and mirror the
> metadata across two devices. The test passes now, but on one run, I got
> the following list corruption warning and then a soft lockup (which is
> probably fallout from the list corruption).
> 
> I ran the test several times before and since then without this failure,
> so I don't have a clear reproducer. The kernel in this instance is
> basically a v4.11 kernel with my pile of writeback error handling
> patches on top:
> 
> https://git.samba.org/?p=jlayton/linux.git;a=shortlog;h=refs/heads/wberr
> 
> It may be that they are a contributing factor, but this smells more like
> a bug down in btrfs. Let me know if you need other info:
> 
> --8<---
> 
> [  438.341942] run fstests generic/999 at 2017-05-11 07:03:39
> [  439.453293] BTRFS: device fsid 08e9b22b-44a1-4954-a1b0-03c7c0537831 devid 
> 1 transid 3 /dev/vda8
> [  439.465918] BTRFS: device fsid 08e9b22b-44a1-4954-a1b0-03c7c0537831 devid 
> 2 transid 3 /dev/vda7
> [  439.603578] device-mapper: ioctl: device doesn't appear to be in the dev 
> hash table.
> [  439.762422] BTRFS info (device dm-4): disk space caching is enabled
> [  439.763808] BTRFS info (device dm-4): has skinny extents
> [  439.764979] BTRFS info (device dm-4): flagging fs with big metadata feature
> [  439.785879] BTRFS info (device dm-4): creating UUID tree
> [  439.974266] BTRFS info (device dm-4): disk space caching is enabled
> [  439.975783] BTRFS info (device dm-4): has skinny extents
> [  440.229263] Buffer I/O error on dev dm-4, logical block 2621424, async 
> page read
> [  440.239970] BTRFS error (device dm-4): bdev /dev/mapper/error-test errs: 
> wr 1, rd 0, flush 0, corrupt 0, gen 0
> [  440.242459] ------------[ cut here ]------------
> [  440.243276] WARNING: CPU: 0 PID: 5162 at lib/list_debug.c:28 
> __list_add_valid+0x69/0xa0
> [  440.244338] list_add corruption. prev->next should be next 
> (8dd531056b08), but was a93242807e90. (prev=a93242807e90).
> [  440.245939] Modules linked in: btrfs xor raid6_pq binfmt_misc 
> ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink 
> ebtable_broute bridge stp llc ebtable_nat ip6table_mangle ip6table_security 
> ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_raw 
> iptable_mangle iptable_security iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 
> nf_nat_ipv4 nf_nat nf_conntrack iptable_raw ebtable_filter ebtables 
> ip6table_filter ip6_tables snd_hda_codec_generic snd_hda_intel snd_hda_codec 
> snd_hda_core crct10dif_pclmul crc32_pclmul nfsd ghash_clmulni_intel ppdev 
> snd_hwdep snd_pcm acpi_cpufreq snd_timer tpm_tis snd parport_pc tpm_tis_core 
> parport pcspkr tpm i2c_piix4 auth_rpcgss soundcore floppy joydev qemu_fw_cfg 
> virtio_balloon nfs_acl lockd grace sunrpc xfs libcrc32c qxl drm_kms_helper 
> virtio_net
> [  440.254739]  virtio_blk virtio_console virtio_rng ttm drm crc32c_intel 
> virtio_pci virtio_ring ata_generic virtio serio_raw pata_acpi
> [  440.256352] CPU: 0 PID: 5162 Comm: fsync-err Not tainted 4.11.0+ #52
> [  440.257534] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> 1.9.3-1.fc25 04/01/2014
> [  440.258584] Call Trace:
> [  440.259096]  dump_stack+0x63/0x86
> [  440.259618]  __warn+0xcb/0xf0
> [  440.260116]  warn_slowpath_fmt+0x5a/0x80
> [  440.260798]  ? check_parent_dirs_for_sync+0x109/0x140 [btrfs]
> [  440.261755]  __list_add_valid+0x69/0xa0
> [  440.262442]  btrfs_log_inode_parent+0x25c/0x9f0 [btrfs]
> [  440.263323]  ? btrfs_releasepage+0x20/0x20 [btrfs]
> [  440.264059]  ? wait_current_trans+0x2e/0xf0 [btrfs]
> [  440.264792]  ? kmem_cache_alloc+0x195/0x1b0
> [  440.265455]  ? join_transaction+0x27/0x420 [btrfs]
> [  440.266175]  btrfs_log_dentry_safe+0x60/0x80 [btrfs]
> [  440.266965]  btrfs_sync_file+0x2b7/0x400 [btrfs]
> [  440.267655]  vfs_fsync_range+0x49/0xb0
> [  440.268266]  do_fsync+0x3d/0x70
> [  440.268806]  SyS_fsync+0x10/0x20
> [  440.269347]  entry_SYSCALL_64_fastpath+0x1a/0xa9
> [  440.270033] RIP: 0033:0x7f7983af1b70
> [  440.270607] RSP: 002b:7ffe13b3aa18 EFLAGS: 0246 ORIG_RAX: 
> 004a
> [  440.271661] RAX: ffda RBX: 0005 RCX: 
> 7f7983af1b70
> [  440.272623] RDX: 00010401 RSI: 023ef030 RDI: 
> 0004
> [  440.273696] RBP: 7ffe13b3cc77 R08:  R09: 
> 7f79845364c8
> [  440.274808] R10: 0008 R11: 0246 R12: 
> 023ef030
> [  440.275873] R13: 0005 R14:  R15: 
> 
> [  440.276995] ---[ end trace 878ee9789ed2d63b ]---
> [  440.278476] BTRFS error (device dm-4): bdev /dev/mapper/error-test errs: 
> wr 2, rd 0, flush 0, corrupt 0, gen 0
> [  440.282362] BTRFS error (device dm-4): bdev /dev/mapper/error-test 

Re: Backing up BTRFS metadata

2017-05-11 Thread Hugo Mills
On Fri, May 12, 2017 at 12:22:00AM +0500, Roman Mamedov wrote:
> On Thu, 11 May 2017 09:19:28 -0600
> Chris Murphy  wrote:
> 
> > On Thu, May 11, 2017 at 8:56 AM, Marat Khalili  wrote:
> > > Sorry if question sounds unorthodox, Is there some simple way to read (and
> > > backup) all BTRFS metadata from volume?
> > 
> > btrfs-image
> 
> Hm, I thought that's for debugging only, and that you can't actually restore
> metadata onto a data-containing FS and have anything mountable/readable as a
> result.

   Indeed. This has been tried before, and I don't think it came to
anything.

> Seems not to be the case, and in fact, could this be one of the "missing
> links" in the Fsck story, 
> 
>-w
>Walk all the trees manually and copy any blocks that are
>referenced. Use this option if your extent tree is corrupted to
>make sure that all of the metadata is captured.
> 
> This certainly does sound like something to try for some of those broken
> filesystems where Btrfsck refuses to do anything. Save image with this manual
> walking/reconstruction of the trees, then restore. Too bad I already nuked
> mine, so can't experiment with that.

   I suspect it's still only capturing metadata, rather than data.

   Hugo.

-- 
Hugo Mills | Would you like an ocelot with that non-sequitur?
hugo@... carfax.org.uk |
http://carfax.org.uk/  |
PGP: E2AB1DE4  |


signature.asc
Description: Digital signature


Re: Question on compression unit

2017-05-11 Thread Xiaochu Liu
Thanks Qu!

I wonder if there is any way we can easily configure the extent size
(maximum extent size, extent size for files to compress, etc.)? I was
trying to see if it helps reduce random read latency on compressed
files by using smaller extent...

On Wed, May 10, 2017 at 6:01 PM, Qu Wenruo  wrote:
>
>
> At 05/11/2017 04:11 AM, Xiaochu Liu wrote:
>>
>> Hi there,
>>
>> I'm trying to tune compression options for btrfs. Specifically, I want
>> to know the performance impact on the system under different
>> compression unit (block) sizes.
>
>
> Compression unit size is fixed in btrfs.
> It's sectorsize, determined at mkfs time, and only 4K (page size) is
> supported for x86 yet.
>
>>
>> I'm aware of '--nodesize' parameter which sets the block size of
>> metadata tree. Does that also set the block size in an extent? (from
>> my understanding, file data are mostly stored in extent unless small
>> enough to be inline-d in metadata leaf node?)
>
>
> nodesize only affects metadata, nothing to do with data size.
>
>>
>> Also from btrfs's wikipedia page:
>>
>> In compressed extents, individual blocks are not compressed
>> separately; rather, the compression stream spans the entire extent.
>>
>> Is that still true?
>
>
> Yes.
>
> For example if there is one continuous range represents 0~1M data of one
> file, and all this data is dirty (not written to disk).
>
> Then compression will happen when trying to write them to disk.
> And since the maximum uncompressed size for compressed extent is 128K
> (fixed), the 0~1M will be split into 8 extents (if compression ratio is
> acceptable).
>
> And then each 128K extent will be compressed then compressed data will be
> written to disk. (compressed extent still meet sectorsize alignment).
>
> So the wiki page is still right and we must read out the whole (compressed)
> extent to get its content.
>
> And since both uncompressed data and compressed extent must meet sectorsize
> alignment, data smaller than or equal to sectorsize won't go through
> compression since it will just waste CPU time and no space saving.
>
> Thanks,
> Qu
>
>>
>> Thanks,
>> Xiaochu
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Backing up BTRFS metadata

2017-05-11 Thread Roman Mamedov
On Thu, 11 May 2017 09:19:28 -0600
Chris Murphy  wrote:

> On Thu, May 11, 2017 at 8:56 AM, Marat Khalili  wrote:
> > Sorry if question sounds unorthodox, Is there some simple way to read (and
> > backup) all BTRFS metadata from volume?
> 
> btrfs-image

Hm, I thought that's for debugging only, and that you can't actually restore
metadata onto a data-containing FS and have anything mountable/readable as a
result.

Seems not to be the case, and in fact, could this be one of the "missing
links" in the Fsck story, 

   -w
   Walk all the trees manually and copy any blocks that are
   referenced. Use this option if your extent tree is corrupted to
   make sure that all of the metadata is captured.

This certainly does sound like something to try for some of those broken
filesystems where Btrfsck refuses to do anything. Save image with this manual
walking/reconstruction of the trees, then restore. Too bad I already nuked
mine, so can't experiment with that.

-- 
With respect,
Roman
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/10] fs: Introduce RWF_NOWAIT

2017-05-11 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

RWF_NOWAIT informs kernel to bail out if an AIO request will block
for reasons such as file allocations, or a writeback triggered,
or would block while allocating requests while performing
direct I/O.

RWF_NOWAIT is translated to IOCB_NOWAIT for iocb->ki_flags.

The check for -EOPNOTSUPP is placed in generic_file_write_iter(). This
is called by most filesystems, either through fsops.write_iter() or through
the function defined by write_iter(). If not, we perform the check defined
by .write_iter() which is called for direct IO specifically.

Filesystems xfs, btrfs and ext4 would be supported in the following patches.

Signed-off-by: Goldwyn Rodrigues 
Reviewed-by: Christoph Hellwig 
---
 fs/9p/vfs_file.c| 3 +++
 fs/aio.c| 6 ++
 fs/ceph/file.c  | 3 +++
 fs/cifs/file.c  | 3 +++
 fs/fuse/file.c  | 3 +++
 fs/nfs/direct.c | 3 +++
 fs/ocfs2/file.c | 3 +++
 include/linux/fs.h  | 5 -
 include/uapi/linux/fs.h | 1 +
 mm/filemap.c| 3 +++
 10 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3de3b4a89d89..403681db7723 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -411,6 +411,9 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter 
*from)
loff_t origin;
int err = 0;
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EOPNOTSUPP;
+
retval = generic_write_checks(iocb, from);
if (retval <= 0)
return retval;
diff --git a/fs/aio.c b/fs/aio.c
index 020fa0045e3c..34027b67e2f4 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1592,6 +1592,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb 
__user *user_iocb,
goto out_put_req;
}
 
+   if ((req->common.ki_flags & IOCB_NOWAIT) &&
+   !(req->common.ki_flags & IOCB_DIRECT)) {
+   ret = -EOPNOTSUPP;
+   goto out_put_req;
+   }
+
ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
if (unlikely(ret)) {
pr_debug("EFAULT: aio_key\n");
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 26cc95421cca..af28419b1731 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1267,6 +1267,9 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct 
iov_iter *from)
int err, want, got;
loff_t pos;
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EOPNOTSUPP;
+
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 21d404535739..f8858a06e119 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2638,6 +2638,9 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct 
iov_iter *from)
 * write request.
 */
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EOPNOTSUPP;
+
rc = generic_write_checks(iocb, from);
if (rc <= 0)
return rc;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ec238fb5a584..72786e798319 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1425,6 +1425,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
ssize_t res;
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EOPNOTSUPP;
+
if (is_bad_inode(inode))
return -EIO;
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index c1b5fed7c863..dcea0caa5cb5 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -996,6 +996,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct 
iov_iter *iter)
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
file, iov_iter_count(iter), (long long) iocb->ki_pos);
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EOPNOTSUPP;
+
result = generic_write_checks(iocb, iter);
if (result <= 0)
return result;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index bfeb647459d9..e7f8ba890305 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2235,6 +2235,9 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
if (count == 0)
return 0;
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EOPNOTSUPP;
+
direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
 
inode_lock(inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2e6fc6a23f91..7e39b510b7a4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -270,6 +270,7 @@ struct writeback_control;
 #define IOCB_DSYNC (1 << 4)
 #define IOCB_SYNC  (1 << 5)
 #define IOCB_WRITE (1 << 6)
+#define IOCB_NOWAIT (1 << 7)
 
 struct kiocb {
struct file *ki_filp;
@@ -3053,7 +3054,7 @@ static inline int iocb_flags(struct file *file)
 
 static inline int 

[PATCH 05/10] fs: return if direct write will trigger writeback

2017-05-11 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

Find out if the write will trigger a wait due to writeback. If yes,
return -EAGAIN.

Return -EINVAL for buffered AIO: there are multiple causes of
delay such as page locks, dirty throttling logic, page loading
from disk etc. which cannot be taken care of.

Signed-off-by: Goldwyn Rodrigues 
Reviewed-by: Christoph Hellwig 
---
 mm/filemap.c | 17 ++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index ca3031f505f2..fd7d175b3dee 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2673,6 +2673,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, 
struct iov_iter *from)
 
pos = iocb->ki_pos;
 
+   if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+   return -EINVAL;
+
if (limit != RLIM_INFINITY) {
if (iocb->ki_pos >= limit) {
send_sig(SIGXFSZ, current, 0);
@@ -2742,9 +2745,17 @@ generic_file_direct_write(struct kiocb *iocb, struct 
iov_iter *from)
write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_SHIFT;
 
-   written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 
1);
-   if (written)
-   goto out;
+   if (iocb->ki_flags & IOCB_NOWAIT) {
+   /* If there are pages to writeback, return */
+   if (filemap_range_has_page(inode->i_mapping, pos,
+  pos + iov_iter_count(from)))
+   return -EAGAIN;
+   } else {
+   written = filemap_write_and_wait_range(mapping, pos,
+   pos + write_len - 1);
+   if (written)
+   goto out;
+   }
 
/*
 * After a write we want buffered reads to be sure to go to disk to get
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06/10] fs: Introduce IOMAP_NOWAIT

2017-05-11 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

IOCB_NOWAIT translates to IOMAP_NOWAIT for iomaps.
This is used by XFS in the XFS patch.

Signed-off-by: Goldwyn Rodrigues 
Reviewed-by: Christoph Hellwig 
---
 fs/iomap.c| 2 ++
 include/linux/iomap.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/fs/iomap.c b/fs/iomap.c
index 141c3cd55a8b..d1c81753d411 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -885,6 +885,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
} else {
dio->flags |= IOMAP_DIO_WRITE;
flags |= IOMAP_WRITE;
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   flags |= IOMAP_NOWAIT;
}
 
if (mapping->nrpages) {
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 7291810067eb..53f6af89c625 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -51,6 +51,7 @@ struct iomap {
 #define IOMAP_REPORT   (1 << 2) /* report extent status, e.g. FIEMAP */
 #define IOMAP_FAULT(1 << 3) /* mapping for page fault */
 #define IOMAP_DIRECT   (1 << 4) /* direct I/O */
+#define IOMAP_NOWAIT   (1 << 5) /* Don't wait for writeback */
 
 struct iomap_ops {
/*
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/10] xfs: nowait aio support

2017-05-11 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

If IOCB_NOWAIT is set, bail if the i_rwsem is not lockable
immediately.

If IOMAP_NOWAIT is set, return EAGAIN in xfs_file_iomap_begin
if it needs allocation either due to file extension, writing to a hole,
or COW or waiting for other DIOs to finish.

Signed-off-by: Goldwyn Rodrigues 
Reviewed-by: Christoph Hellwig 
---
 fs/xfs/xfs_file.c  | 19 ++-
 fs/xfs/xfs_iomap.c | 17 +
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 35703a801372..b307940e7d56 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -541,8 +541,11 @@ xfs_file_dio_aio_write(
iolock = XFS_IOLOCK_SHARED;
}
 
-   xfs_ilock(ip, iolock);
-
+   if (!xfs_ilock_nowait(ip, iolock)) {
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EAGAIN;
+   xfs_ilock(ip, iolock);
+   }
ret = xfs_file_aio_write_checks(iocb, from, );
if (ret)
goto out;
@@ -553,9 +556,15 @@ xfs_file_dio_aio_write(
 * otherwise demote the lock if we had to take the exclusive lock
 * for other reasons in xfs_file_aio_write_checks.
 */
-   if (unaligned_io)
-   inode_dio_wait(inode);
-   else if (iolock == XFS_IOLOCK_EXCL) {
+   if (unaligned_io) {
+   /* If we are going to wait for other DIO to finish, bail */
+   if (iocb->ki_flags & IOCB_NOWAIT) {
+   if (atomic_read(>i_dio_count))
+   return -EAGAIN;
+   } else {
+   inode_dio_wait(inode);
+   }
+   } else if (iolock == XFS_IOLOCK_EXCL) {
xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 288ee5b840d7..9baa65eeae9e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1015,6 +1015,15 @@ xfs_file_iomap_begin(
 
if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
if (flags & IOMAP_DIRECT) {
+   /*
+* A reflinked inode will result in CoW alloc.
+* FIXME: It could still overwrite on unshared extents
+* and not need allocation.
+*/
+   if (flags & IOMAP_NOWAIT) {
+   error = -EAGAIN;
+   goto out_unlock;
+   }
/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, , ,
);
@@ -1032,6 +1041,14 @@ xfs_file_iomap_begin(
 
if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, , nimaps)) {
/*
+* If nowait is set bail since we are going to make
+* allocations.
+*/
+   if (flags & IOMAP_NOWAIT) {
+   error = -EAGAIN;
+   goto out_unlock;
+   }
+   /*
 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
 * pages to keep the chunks of work done where somewhat 
symmetric
 * with the work writeback does. This is a completely arbitrary
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/10] ext4: nowait aio support

2017-05-11 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

Return EAGAIN for direct I/O if any of the following is true:
  + i_rwsem is not lockable
  + Writing beyond end of file (will trigger allocation)
  + Blocks are not allocated at the write location

Signed-off-by: Goldwyn Rodrigues 
Reviewed-by: Jan Kara 
---
 fs/ext4/file.c | 20 
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index cefa9835f275..2efdc6d4d3e8 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -216,7 +216,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter 
*from)
return ext4_dax_write_iter(iocb, from);
 #endif
 
-   inode_lock(inode);
+   if (iocb->ki_flags & IOCB_NOWAIT) {
+   if (!inode_trylock(inode))
+   return -EAGAIN;
+   } else {
+   inode_lock(inode);
+   }
+
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
@@ -235,9 +241,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter 
*from)
 
iocb->private = 
/* Check whether we do a DIO overwrite or not */
-   if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio &&
-   ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)))
-   overwrite = 1;
+   if (o_direct && !unaligned_aio) {
+   if (ext4_overwrite_io(inode, iocb->ki_pos, 
iov_iter_count(from))) {
+   if (ext4_should_dioread_nolock(inode))
+   overwrite = 1;
+   } else if (iocb->ki_flags & IOCB_NOWAIT) {
+   ret = -EAGAIN;
+   goto out;
+   }
+   }
 
ret = __generic_file_write_iter(iocb, from);
inode_unlock(inode);
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 07/10] fs: return on congested block device

2017-05-11 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

A new bio operation flag REQ_NOWAIT is introduced to identify bio's
originating from iocb with IOCB_NOWAIT. This flag indicates
to return immediately if a request cannot be made instead
of retrying.

Stacked devices such as md (the ones with make_request_fn hooks)
currently are not supported because it may block for housekeeping.
For example, an md can have a part of the device suspended.
For this reason, only request based devices are supported.
In the future, this feature will be expanded to stacked devices
by teaching them how to handle the REQ_NOWAIT flags.

Signed-off-by: Goldwyn Rodrigues 
Reviewed-by: Christoph Hellwig 
---
 block/blk-core.c  | 24 ++--
 block/blk-mq-sched.c  |  3 +++
 block/blk-mq.c|  4 
 fs/direct-io.c| 10 --
 include/linux/bio.h   |  6 ++
 include/linux/blk_types.h |  2 ++
 6 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d772c221cc17..effe934b806b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1232,6 +1232,11 @@ static struct request *get_request(struct request_queue 
*q, unsigned int op,
if (!IS_ERR(rq))
return rq;
 
+   if (op & REQ_NOWAIT) {
+   blk_put_rl(rl);
+   return ERR_PTR(-EAGAIN);
+   }
+
if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) 
{
blk_put_rl(rl);
return rq;
@@ -1870,6 +1875,17 @@ generic_make_request_checks(struct bio *bio)
goto end_io;
}
 
+   /*
+* For a REQ_NOWAIT based request, return -EOPNOTSUPP
+* if queue does not have QUEUE_FLAG_NOWAIT_SUPPORT set
+* and if it is not a request based queue.
+*/
+
+   if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) {
+   err = -EOPNOTSUPP;
+   goto end_io;
+   }
+
part = bio->bi_bdev->bd_part;
if (should_fail_request(part, bio->bi_iter.bi_size) ||
should_fail_request(_to_disk(part)->part0,
@@ -2021,7 +2037,7 @@ blk_qc_t generic_make_request(struct bio *bio)
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
-   if (likely(blk_queue_enter(q, false) == 0)) {
+   if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
struct bio_list lower, same;
 
/* Create a fresh bio_list for all subordinate requests 
*/
@@ -2046,7 +2062,11 @@ blk_qc_t generic_make_request(struct bio *bio)
bio_list_merge(_list_on_stack[0], );
bio_list_merge(_list_on_stack[0], 
_list_on_stack[1]);
} else {
-   bio_io_error(bio);
+   if (unlikely(!blk_queue_dying(q) &&
+   (bio->bi_opf & REQ_NOWAIT)))
+   bio_wouldblock_error(bio);
+   else
+   bio_io_error(bio);
}
bio = bio_list_pop(_list_on_stack[0]);
} while (bio);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c974a1bbf4cb..019d881d62b7 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -119,6 +119,9 @@ struct request *blk_mq_sched_get_request(struct 
request_queue *q,
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
 
+   if (op & REQ_NOWAIT)
+   data->flags |= BLK_MQ_REQ_NOWAIT;
+
if (e) {
data->flags |= BLK_MQ_REQ_INTERNAL;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c7836a1ded97..d7613ae6a269 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1538,6 +1538,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue 
*q, struct bio *bio)
rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, );
if (unlikely(!rq)) {
__wbt_done(q->rq_wb, wb_acct);
+   if (bio->bi_opf & REQ_NOWAIT)
+   bio_wouldblock_error(bio);
return BLK_QC_T_NONE;
}
 
@@ -1662,6 +1664,8 @@ static blk_qc_t blk_sq_make_request(struct request_queue 
*q, struct bio *bio)
rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, );
if (unlikely(!rq)) {
__wbt_done(q->rq_wb, wb_acct);
+   if (bio->bi_opf & REQ_NOWAIT)
+   bio_wouldblock_error(bio);
return BLK_QC_T_NONE;
}
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index a04ebea77de8..139ebd5ae1c7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -480,8 +480,12 @@ static int dio_bio_complete(struct dio *dio, struct bio 
*bio)
unsigned i;
int err;
 
-   if (bio->bi_error)
-   dio->io_error = -EIO;
+   if (bio->bi_error) {
+

[PATCH 02/10] fs: Introduce filemap_range_has_page()

2017-05-11 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

filemap_range_has_page() returns true if the file's mapping has
a page within the range mentioned. This function will be used
to check if a write() call will cause a writeback of previous
writes.

Signed-off-by: Goldwyn Rodrigues 
Reviewed-by: Christoph Hellwig 
---
 include/linux/fs.h |  2 ++
 mm/filemap.c   | 33 +
 2 files changed, 35 insertions(+)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 869c9a6fe58d..2e6fc6a23f91 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2513,6 +2513,8 @@ extern int filemap_fdatawait(struct address_space *);
 extern void filemap_fdatawait_keep_errors(struct address_space *);
 extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
   loff_t lend);
+extern int filemap_range_has_page(struct address_space *, loff_t lstart,
+ loff_t lend);
 extern int filemap_write_and_wait(struct address_space *mapping);
 extern int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
diff --git a/mm/filemap.c b/mm/filemap.c
index 1694623a6289..fae5a361befb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -376,6 +376,39 @@ int filemap_flush(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_flush);
 
+/**
+ * filemap_range_has_page - check if a page exists in range.
+ * @mapping:   address space structure to wait for
+ * @start_byte:offset in bytes where the range starts
+ * @end_byte:  offset in bytes where the range ends (inclusive)
+ *
+ * Find at least one page in the range supplied, usually used to check if
+ * direct writing in this range will trigger a writeback.
+ */
+int filemap_range_has_page(struct address_space *mapping,
+  loff_t start_byte, loff_t end_byte)
+{
+   pgoff_t index = start_byte >> PAGE_SHIFT;
+   pgoff_t end = end_byte >> PAGE_SHIFT;
+   struct pagevec pvec;
+   int ret;
+
+   if (end_byte < start_byte)
+   return 0;
+
+   if (mapping->nrpages == 0)
+   return 0;
+
+   pagevec_init(, 0);
+   ret = pagevec_lookup(, mapping, index, 1);
+   if (!ret)
+   return 0;
+   ret = (pvec.pages[0]->index <= end);
+   pagevec_release();
+   return ret;
+}
+EXPORT_SYMBOL(filemap_range_has_page);
+
 static int __filemap_fdatawait_range(struct address_space *mapping,
 loff_t start_byte, loff_t end_byte)
 {
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/10] btrfs: nowait aio support

2017-05-11 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

Return EAGAIN if any of the following checks fail
 + i_rwsem is not lockable
 + NODATACOW or PREALLOC is not set
 + Cannot nocow at the desired location
 + Writing beyond end of file which is not allocated

Signed-off-by: Goldwyn Rodrigues 
Acked-by: David Sterba 
---
 fs/btrfs/file.c  | 25 -
 fs/btrfs/inode.c |  3 +++
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 520cb7230b2d..a870e5dd2b4d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1823,12 +1823,29 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
ssize_t num_written = 0;
bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
ssize_t err;
-   loff_t pos;
-   size_t count;
+   loff_t pos = iocb->ki_pos;
+   size_t count = iov_iter_count(from);
loff_t oldsize;
int clean_page = 0;
 
-   inode_lock(inode);
+   if ((iocb->ki_flags & IOCB_NOWAIT) &&
+   (iocb->ki_flags & IOCB_DIRECT)) {
+   /* Don't sleep on inode rwsem */
+   if (!inode_trylock(inode))
+   return -EAGAIN;
+   /*
+* We will allocate space in case nodatacow is not set,
+* so bail
+*/
+   if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC)) ||
+   check_can_nocow(BTRFS_I(inode), pos, ) <= 0) {
+   inode_unlock(inode);
+   return -EAGAIN;
+   }
+   } else
+   inode_lock(inode);
+
err = generic_write_checks(iocb, from);
if (err <= 0) {
inode_unlock(inode);
@@ -1862,8 +1879,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 */
update_time_for_write(inode);
 
-   pos = iocb->ki_pos;
-   count = iov_iter_count(from);
start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
if (start_pos > oldsize) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5e71f1ea3391..47d3fcd86979 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8625,6 +8625,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct 
iov_iter *iter)
dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
+   } else if (iocb->ki_flags & IOCB_NOWAIT) {
+   ret = -EAGAIN;
+   goto out;
}
ret = btrfs_delalloc_reserve_space(inode, offset, count);
if (ret)
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/10] fs: Use RWF_* flags for AIO operations

2017-05-11 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

aio_rw_flags is introduced in struct iocb (using aio_reserved1) which will
carry the RWF_* flags. We cannot use aio_flags because they are not
checked for validity which may break existing applications.

Note, the only place RWF_HIPRI comes into effect is dio_await_one().
In all other locations, the aio code returns -EIOCBQUEUED before the
checks for RWF_HIPRI.

Signed-off-by: Goldwyn Rodrigues 
Reviewed-by: Christoph Hellwig 
---
 fs/aio.c | 8 +++-
 include/uapi/linux/aio_abi.h | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index f52d925ee259..020fa0045e3c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1541,7 +1541,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb 
__user *user_iocb,
ssize_t ret;
 
/* enforce forwards compatibility on users */
-   if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
+   if (unlikely(iocb->aio_reserved2)) {
pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
@@ -1586,6 +1586,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb 
__user *user_iocb,
req->common.ki_flags |= IOCB_EVENTFD;
}
 
+   ret = kiocb_set_rw_flags(>common, iocb->aio_rw_flags);
+   if (unlikely(ret)) {
+   pr_debug("EINVAL: aio_rw_flags\n");
+   goto out_put_req;
+   }
+
ret = put_user(KIOCB_KEY, _iocb->aio_key);
if (unlikely(ret)) {
pr_debug("EFAULT: aio_key\n");
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index bb2554f7fbd1..a2d4a8ac94ca 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -79,7 +79,7 @@ struct io_event {
 struct iocb {
/* these are internal to the kernel/libc. */
__u64   aio_data;   /* data to be returned in event's data */
-   __u32   PADDED(aio_key, aio_reserved1);
+   __u32   PADDED(aio_key, aio_rw_flags);
/* the kernel sets aio_key to the req # */
 
/* common fields */
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/10 v8] No wait AIO

2017-05-11 Thread Goldwyn Rodrigues
Formerly known as non-blocking AIO.

This series adds nonblocking feature to asynchronous I/O writes.
io_submit() can be delayed because of a number of reasons:
 - Block allocation for files
 - Data writebacks for direct I/O
 - Sleeping because of waiting to acquire i_rwsem
 - Congested block device

The goal of the patch series is to return -EAGAIN/-EWOULDBLOCK if
any of these conditions are met. This way userspace can push most
of the write()s to the kernel to the best of its ability to complete
and if it returns -EAGAIN, can defer it to another thread.

In order to enable this, IOCB_RW_FLAG_NOWAIT is introduced in
uapi/linux/aio_abi.h. If set for aio_rw_flags, it translates to
IOCB_NOWAIT for struct iocb, REQ_NOWAIT for bio.bi_opf and IOMAP_NOWAIT for
iomap. aio_rw_flags is a new flag replacing aio_reserved1. We could
not use aio_flags because it is not currently checked for invalidity
in the kernel.

This feature is provided for direct I/O of asynchronous I/O only. I have
tested it against xfs, ext4, and btrfs while I intend to add more filesystems.
The nowait feature is for request based devices. In the future, I intend to
add support to stacked devices such as md.

Applications will have to check supportability
by sending an async direct write; any error other than -EAGAIN
would mean it is not supported.

First two patches are prep patches into nowait I/O.

Changes since v1:
 + changed name from _NONBLOCKING to *_NOWAIT
 + filemap_range_has_page call moved to closer to (just before) calling 
filemap_write_and_wait_range().
 + BIO_NOWAIT limited to get_request()
 + XFS fixes 
- included reflink 
- use of xfs_ilock_nowait() instead of a XFS_IOLOCK_NONBLOCKING flag
- Translate the flag through IOMAP_NOWAIT (iomap) to check for
  block allocation for the file.
 + ext4 coding style

Changes since v2:
 + Using aio_reserved1 as aio_rw_flags instead of aio_flags
 + blk-mq support
 + xfs uptodate with kernel and reflink changes

 Changes since v3:
  + Added FS_NOWAIT, which is set if the filesystem supports NOWAIT feature.
  + Checks in generic_make_request() to make sure BIO_NOWAIT comes in
for async direct writes only.
  + Added QUEUE_FLAG_NOWAIT, which is set if the device supports BIO_NOWAIT.
This is added (rather not set) to block devices such as dm/md currently.

 Changes since v4:
  + Ported AIO code to use RWF_* flags. Check for RWF_* flags in
generic_file_write_iter().
  + Changed IOCB_RW_FLAGS_NOWAIT to RWF_NOWAIT.

 Changes since v5:
  + BIO_NOWAIT to REQ_NOWAIT
  + Common helper for RWF flags.

 Changes since v6:
  + REQ_NOWAIT will be ignored for request based devices since they
cannot block. So, removed QUEUE_FLAG_NOWAIT since it is not
required in the current implementation. It will be resurrected
when we program for stacked devices.
  + changed kiocb_rw_flags() to kiocb_set_rw_flags() in order to accommodate
errors. Moved checks in the function.

 Changes since v7:
  + split patches into prep so the main patches are smaller and easier
to understand
  + All patches are reviewed or acked!

-- 
Goldwyn


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/10] fs: Separate out kiocb flags setup based on RWF_* flags

2017-05-11 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

Signed-off-by: Goldwyn Rodrigues 
Reviewed-by: Christoph Hellwig 
---
 fs/read_write.c| 12 +++-
 include/linux/fs.h | 14 ++
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index c4f88afbc67f..362f91cd8d66 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -678,16 +678,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, 
struct iov_iter *iter,
struct kiocb kiocb;
ssize_t ret;
 
-   if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
-   return -EOPNOTSUPP;
-
init_sync_kiocb(, filp);
-   if (flags & RWF_HIPRI)
-   kiocb.ki_flags |= IOCB_HIPRI;
-   if (flags & RWF_DSYNC)
-   kiocb.ki_flags |= IOCB_DSYNC;
-   if (flags & RWF_SYNC)
-   kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
+   ret = kiocb_set_rw_flags(, flags);
+   if (ret)
+   return ret;
kiocb.ki_pos = *ppos;
 
if (type == READ)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7251f7bb45e8..869c9a6fe58d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3049,6 +3049,20 @@ static inline int iocb_flags(struct file *file)
return res;
 }
 
+static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
+{
+   if (unlikely(flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC)))
+   return -EOPNOTSUPP;
+
+   if (flags & RWF_HIPRI)
+   ki->ki_flags |= IOCB_HIPRI;
+   if (flags & RWF_DSYNC)
+   ki->ki_flags |= IOCB_DSYNC;
+   if (flags & RWF_SYNC)
+   ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
+   return 0;
+}
+
 static inline ino_t parent_ino(struct dentry *dentry)
 {
ino_t res;
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v1 00/30] fs: inode->i_version rework and optimization

2017-05-11 Thread J. Bruce Fields
On Wed, Apr 05, 2017 at 02:14:09PM -0400, J. Bruce Fields wrote:
> On Wed, Apr 05, 2017 at 10:05:51AM +0200, Jan Kara wrote:
> > 1) Keep i_version as is, make clients also check for i_ctime.
> 
> That would be a protocol revision, which we'd definitely rather avoid.
> 
> But can't we accomplish the same by using something like
> 
>   ctime * (some constant) + i_version
> 
> ?
> 
> >Pro: No on-disk format changes.
> >Cons: After a crash, i_version can go backwards (but when file changes
> >i_version, i_ctime pair should be still different) or not, data can be
> >old or not.
> 
> This is probably good enough for NFS purposes: typically on an NFS
> filesystem, results of a read in the face of a concurrent write open are
> undefined.  And writers sync before close.
> 
> So after a crash with a dirty inode, we're in a situation where an NFS
> client still needs to resend some writes, sync, and close.  I'm OK with
> things being inconsistent during this window.
> 
> I do expect things to return to normal once that client has resent its
> writes--hence the worry about actually reusing old values after boot
> (such as if i_version regresses on boot and then increments back to the
> same value after further writes).  Factoring in ctime fixes that.

So for now I'm thinking of just doing something like the following.

Only nfsd needs it for now, but it could be moved to a vfs helper for
statx, or for individual filesystems that want to do something
different.  (The NFSv4 client will want to use the server's change
attribute instead, I think.  And other filesystems might want to try
something more ambitious like Neil's proposal.)

--b.

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 12feac6ee2fd..9636c9a60aba 100644
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index f84fe6bf9aee..14f09f1ef605 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -240,6 +240,16 @@ fh_clear_wcc(struct svc_fh *fhp)
fhp->fh_pre_saved = false;
 }
 
+static inline u64 nfsd4_change_attribute(struct inode *inode)
+{
+   u64 chattr;
+
+   chattr = inode->i_ctime.tv_sec << 30;
+   chattr += inode->i_ctime.tv_nsec;
+   chattr += inode->i_version;
+   return chattr;
+}
+
 /*
  * Fill in the pre_op attr for the wcc data
  */
@@ -253,7 +263,7 @@ fill_pre_wcc(struct svc_fh *fhp)
fhp->fh_pre_mtime = inode->i_mtime;
fhp->fh_pre_ctime = inode->i_ctime;
fhp->fh_pre_size  = inode->i_size;
-   fhp->fh_pre_change = inode->i_version;
+   fhp->fh_pre_change = nfsd4_change_attribute(inode);
fhp->fh_pre_saved = true;
}
 }
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -260,7 +260,7 @@ void fill_post_wcc(struct svc_fh *fhp)
printk("nfsd: inode locked twice during operation.\n");
 
err = fh_getattr(fhp, >fh_post_attr);
-   fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
+   fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry));
if (err) {
fhp->fh_post_saved = false;
/* Grab the ctime anyway - set_change_info might use it */
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 26780d53a6f9..a09532d4a383 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1973,7 +1973,7 @@ static __be32 *encode_change(__be32 *p, struct kstat 
*stat, struct inode *inode,
*p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
*p++ = 0;
} else if (IS_I_VERSION(inode)) {
-   p = xdr_encode_hyper(p, inode->i_version);
+   p = xdr_encode_hyper(p, nfsd4_change_attribute(inode));
} else {
*p++ = cpu_to_be32(stat->ctime.tv_sec);
*p++ = cpu_to_be32(stat->ctime.tv_nsec);
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3] btrfs: relocation: Enhance kernel error output for relocation

2017-05-11 Thread David Sterba
On Wed, May 10, 2017 at 11:39:40AM +0800, Qu Wenruo wrote:
> 
> 
> At 05/10/2017 01:29 AM, David Sterba wrote:
> > On Wed, Feb 15, 2017 at 09:39:05AM +0800, Qu Wenruo wrote:
> >> When balance(relocation) fails, btrfs-progs will report like:
> >>
> >> ERROR: error during balancing '/mnt/scratch': Input/output error
> >> There may be more info in syslog - try dmesg | tail
> >>
> >> However kernel can't provide much useful info in many cases to locate the
> >> problem.
> >>
> >> This patch will add error messages in relocation to help user and
> >> developer to locate the problem.
> > 
> > I think it's too verbose for a user, and not really helpful what to do
> > after such error message appears in the log. The errors translate name
> > of the last function that failed, so the user would need to be familiar
> > with the inner workings of the balance to make sense of it.
> 
> Yes, normal user may never need such verbose output.
> 
> But it will help developers or support guys to wipe out some really easy 
> cases.
> 
> > 
> > The messages may make sense to a developer, but then it's not necessary
> > to print them as btrfs_err, but btrfs_debug.
> 
> I also considered btrfs_debug, but the problem is btrfs_debug() depend 
> on either CONFIG_DYNAMIC_DEBUG or DEBUG.
> 
> So when problem happens in real world, we're too late to ensure such output.

Then I think we need some way that is not as noisy as btrfs_err but also
compiled-in by default unlike the dynamic debug.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/8] nowait aio: return on congested block device

2017-05-11 Thread Goldwyn Rodrigues


On 05/11/2017 02:44 AM, Christoph Hellwig wrote:
> Looks fine,
> 
> Reviewed-by: Christoph Hellwig 
> 
> Although lifting the make_request limit is something a lot of users
> would appreciate in the near future..
> 

Yes, I understand. That will be on my todo list next on priority.

-- 
Goldwyn
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Qgroup reserved space like in ZFS?

2017-05-11 Thread Robert Mader
Hello everyone,

I just wanted to ask a short question as I couldn't find a clear answer
anywhere on the net, yet:

Is it currently possible to reserve space for a BTRFS subvolume?



Following example: I have two subvolumes, one for root and one for home

I'd like to reserve 2GB of space for my root volume so even if my home
volume fills up, it can never starve my root subvolume.
But I don't want to set a fixed limit on my home partition, I want both
subvolumes to be able to grow and shrink dynamically.

I use that functionality a lot on ZFS-based systems like FreeNAS. In the
web-interface of FreeNAS one can easily configure things like that.


If I get it right, qgroups actually already create things like a global
reserve. Therefore the functionality seems to be there. Is there a way to
use it, just like the limit functionality? Or is it planned?

Thanks in advance and regards,
Robert



signature.asc
Description: OpenPGP digital signature


Re: errno=-28 No space left, with kernel backtrace (blocking bug)

2017-05-11 Thread alpha_one_x86
Up plz, I can work with this bug.


On 05/11/17 01:39, alpha_one_x86 wrote:
> Hi, this bug is very blocking for me:
>
> https://bugzilla.kernel.org/show_bug.cgi?id=195257
>
> The server is backup server, I btrfs receive (with and without -p), and
> of course btrfs subvolume delete
> The volume is 70TB, then I use space_cache=v2
>
> Cheers,
>
>

-- 
alpha_one_x86/BRULE Herman 
Main developer of Supercopier/Ultracopier/CatchChallenger, Esourcing and server 
management
IT, OS, technologies, research & development, security and business department

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Backing up BTRFS metadata

2017-05-11 Thread Marat Khalili


On 11/05/17 18:19, Chris Murphy wrote:

btrfs-image

Looks great, thank you!

--

With Best Regards,
Marat Khalili
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Backing up BTRFS metadata

2017-05-11 Thread Chris Murphy
On Thu, May 11, 2017 at 8:56 AM, Marat Khalili  wrote:
> Sorry if question sounds unorthodox, Is there some simple way to read (and
> backup) all BTRFS metadata from volume?

btrfs-image



-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Backing up BTRFS metadata

2017-05-11 Thread Marat Khalili
Sorry if question sounds unorthodox, Is there some simple way to read 
(and backup) all BTRFS metadata from volume?


Motivation of course is possibility to quickly recover from catastrophic 
filesystem failures on a logical level. Some small amount of actual data 
that this metadata references may be overwritten between backup and 
restore moments, but due to checksumming it can easily be caught (and 
either individually restored from backup or discarded).


--

With Best Regards,
Marat Khalili
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/6] Btrfs: use bio_clone_bioset_partial to simplify DIO submit

2017-05-11 Thread David Sterba
On Mon, Apr 17, 2017 at 06:16:23PM -0700, Liu Bo wrote:
> Currently when mapping bio to limit bio to a single stripe length, we
> split bio by adding page to bio one by one, but later we don't modify
> the vector of bio at all, thus we can use bio_clone_fast to use the
> original bio vector directly.
> 
> Signed-off-by: Liu Bo 
> ---
>  fs/btrfs/extent_io.c |  15 +++
>  fs/btrfs/extent_io.h |   1 +
>  fs/btrfs/inode.c | 122 
> +++
>  3 files changed, 62 insertions(+), 76 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 0d4aea4..1b7156c 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -2726,6 +2726,21 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, 
> unsigned int nr_iovecs)
>   return bio;
>  }
>  
> +struct bio *btrfs_bio_clone_partial(struct bio *orig, gfp_t gfp_mask, int 
> offset, int size)
> +{
> + struct bio *bio;
> +
> + bio = bio_clone_fast(orig, gfp_mask, btrfs_bioset);
> + if (bio) {

Please switch that to

bio = ...;
if (!bio)
return NULL;

(the rest)

return bio;

> + struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
> + btrfs_bio->csum = NULL;
> + btrfs_bio->csum_allocated = NULL;
> + btrfs_bio->end_io = NULL;
> +
> + bio_trim(bio, (offset >> 9), (size >> 9));

Hm, so bio_trim also uses ints for the parameters, let's stick to that.

> + }
> + return bio;
> +}
>  
>  static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
>  unsigned long bio_flags)
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 3e4fad4..3b2bc88 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -460,6 +460,7 @@ btrfs_bio_alloc(struct block_device *bdev, u64 
> first_sector, int nr_vecs,
>   gfp_t gfp_flags);
>  struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs);
>  struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
> +struct bio *btrfs_bio_clone_partial(struct bio *orig, gfp_t gfp_mask, int 
> offset, int size);

line over 80 chars

>  
>  struct btrfs_fs_info;
>  struct btrfs_inode;
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index a18510b..6215720 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -8230,16 +8230,6 @@ static void btrfs_end_dio_bio(struct bio *bio)
>   bio_put(bio);
>  }
>  
> -static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
> -u64 first_sector, gfp_t gfp_flags)
> -{
> - struct bio *bio;
> - bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
> - if (bio)
> - bio_associate_current(bio);
> - return bio;
> -}
> -
>  static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode,
>struct btrfs_dio_private *dip,
>struct bio *bio,
> @@ -8329,24 +8319,22 @@ static int btrfs_submit_direct_hook(struct 
> btrfs_dio_private *dip,
>   struct btrfs_root *root = BTRFS_I(inode)->root;
>   struct bio *bio;
>   struct bio *orig_bio = dip->orig_bio;
> - struct bio_vec *bvec;
>   u64 start_sector = orig_bio->bi_iter.bi_sector;
>   u64 file_offset = dip->logical_offset;
> - u64 submit_len = 0;
>   u64 map_length;
> - u32 blocksize = fs_info->sectorsize;
>   int async_submit = 0;
> - int nr_sectors;
> + int submit_len;
> + int clone_offset = 0;
> + int clone_len;
>   int ret;
> - int i, j;
>  
> - map_length = orig_bio->bi_iter.bi_size;
> + submit_len = map_length = orig_bio->bi_iter.bi_size;

Please do 2 separate initialization statements.

>   ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
> _length, NULL, 0);
>   if (ret)
>   return -EIO;
>  
> - if (map_length >= orig_bio->bi_iter.bi_size) {
> + if (map_length >= submit_len) {
>   bio = orig_bio;
>   dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
>   goto submit;
> @@ -8358,70 +8346,52 @@ static int btrfs_submit_direct_hook(struct 
> btrfs_dio_private *dip,
>   else
>   async_submit = 1;
>  
> - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
> - if (!bio)
> - return -ENOMEM;
> -
> - bio->bi_opf = orig_bio->bi_opf;
> - bio->bi_private = dip;
> - bio->bi_end_io = btrfs_end_dio_bio;
> - btrfs_io_bio(bio)->logical = file_offset;
> + /* bio split */
>   atomic_inc(>pending_bios);
> + while (submit_len > 0) {
> + /* map_length < submit_len, it's a int */
> + clone_len = min(submit_len, (int)map_length);

The types are mixed, map_length is u64 and cannot be easily switched to

Re: runtime btrfsck

2017-05-11 Thread Duncan
Roman Mamedov posted on Wed, 10 May 2017 13:52:55 +0500 as excerpted:

> So even with a minor corruption (something wonky in just ONE block of a
> multi-terabyte FS) the answer is way too often "nuke the entire thing
> and restore from backups".

Just another case where my "keep it small enough to be maintainable" 
policy triggers.  If that double-digit-TB fs is instead broken along 
functional/logical lines into say a dozen 1 TB each fs and that single 
block is corrupted, it can only be corrupted in one of them, so 11 of the 
dozen will be fine, and nuking to restore from backups just the single 1 
TB filesystem of a dozen, instead of the single 12-TB fs, isn't such a 
big deal -- it remains realistically maintainable.

Of course if at your scale 12 TB... or 12000 TB... is considered 
maintainable, great, but then we'd be unlikely to be having this 
discussion...

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


btrfs list corruption and soft lockups while testing writeback error handling

2017-05-11 Thread Jeff Layton
I finally got my writeback error handling test to work on btrfs (thanks,
Chris!), by making the filesystem stripe the data and mirror the
metadata across two devices. The test passes now, but on one run, I got
the following list corruption warning and then a soft lockup (which is
probably fallout from the list corruption).

I ran the test several times before and since then without this failure,
so I don't have a clear reproducer. The kernel in this instance is
basically a v4.11 kernel with my pile of writeback error handling
patches on top:

https://git.samba.org/?p=jlayton/linux.git;a=shortlog;h=refs/heads/wberr

It may be that they are a contributing factor, but this smells more like
a bug down in btrfs. Let me know if you need other info:

--8<---

[  438.341942] run fstests generic/999 at 2017-05-11 07:03:39
[  439.453293] BTRFS: device fsid 08e9b22b-44a1-4954-a1b0-03c7c0537831 devid 1 
transid 3 /dev/vda8
[  439.465918] BTRFS: device fsid 08e9b22b-44a1-4954-a1b0-03c7c0537831 devid 2 
transid 3 /dev/vda7
[  439.603578] device-mapper: ioctl: device doesn't appear to be in the dev 
hash table.
[  439.762422] BTRFS info (device dm-4): disk space caching is enabled
[  439.763808] BTRFS info (device dm-4): has skinny extents
[  439.764979] BTRFS info (device dm-4): flagging fs with big metadata feature
[  439.785879] BTRFS info (device dm-4): creating UUID tree
[  439.974266] BTRFS info (device dm-4): disk space caching is enabled
[  439.975783] BTRFS info (device dm-4): has skinny extents
[  440.229263] Buffer I/O error on dev dm-4, logical block 2621424, async page 
read
[  440.239970] BTRFS error (device dm-4): bdev /dev/mapper/error-test errs: wr 
1, rd 0, flush 0, corrupt 0, gen 0
[  440.242459] [ cut here ]
[  440.243276] WARNING: CPU: 0 PID: 5162 at lib/list_debug.c:28 
__list_add_valid+0x69/0xa0
[  440.244338] list_add corruption. prev->next should be next 
(8dd531056b08), but was a93242807e90. (prev=a93242807e90).
[  440.245939] Modules linked in: btrfs xor raid6_pq binfmt_misc ip6t_rpfilter 
ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_broute bridge 
stp llc ebtable_nat ip6table_mangle ip6table_security ip6table_nat 
nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_raw iptable_mangle 
iptable_security iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 
nf_nat nf_conntrack iptable_raw ebtable_filter ebtables ip6table_filter 
ip6_tables snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core 
crct10dif_pclmul crc32_pclmul nfsd ghash_clmulni_intel ppdev snd_hwdep snd_pcm 
acpi_cpufreq snd_timer tpm_tis snd parport_pc tpm_tis_core parport pcspkr tpm 
i2c_piix4 auth_rpcgss soundcore floppy joydev qemu_fw_cfg virtio_balloon 
nfs_acl lockd grace sunrpc xfs libcrc32c qxl drm_kms_helper virtio_net
[  440.254739]  virtio_blk virtio_console virtio_rng ttm drm crc32c_intel 
virtio_pci virtio_ring ata_generic virtio serio_raw pata_acpi
[  440.256352] CPU: 0 PID: 5162 Comm: fsync-err Not tainted 4.11.0+ #52
[  440.257534] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.9.3-1.fc25 04/01/2014
[  440.258584] Call Trace:
[  440.259096]  dump_stack+0x63/0x86
[  440.259618]  __warn+0xcb/0xf0
[  440.260116]  warn_slowpath_fmt+0x5a/0x80
[  440.260798]  ? check_parent_dirs_for_sync+0x109/0x140 [btrfs]
[  440.261755]  __list_add_valid+0x69/0xa0
[  440.262442]  btrfs_log_inode_parent+0x25c/0x9f0 [btrfs]
[  440.263323]  ? btrfs_releasepage+0x20/0x20 [btrfs]
[  440.264059]  ? wait_current_trans+0x2e/0xf0 [btrfs]
[  440.264792]  ? kmem_cache_alloc+0x195/0x1b0
[  440.265455]  ? join_transaction+0x27/0x420 [btrfs]
[  440.266175]  btrfs_log_dentry_safe+0x60/0x80 [btrfs]
[  440.266965]  btrfs_sync_file+0x2b7/0x400 [btrfs]
[  440.267655]  vfs_fsync_range+0x49/0xb0
[  440.268266]  do_fsync+0x3d/0x70
[  440.268806]  SyS_fsync+0x10/0x20
[  440.269347]  entry_SYSCALL_64_fastpath+0x1a/0xa9
[  440.270033] RIP: 0033:0x7f7983af1b70
[  440.270607] RSP: 002b:7ffe13b3aa18 EFLAGS: 0246 ORIG_RAX: 
004a
[  440.271661] RAX: ffda RBX: 0005 RCX: 7f7983af1b70
[  440.272623] RDX: 00010401 RSI: 023ef030 RDI: 0004
[  440.273696] RBP: 7ffe13b3cc77 R08:  R09: 7f79845364c8
[  440.274808] R10: 0008 R11: 0246 R12: 023ef030
[  440.275873] R13: 0005 R14:  R15: 
[  440.276995] ---[ end trace 878ee9789ed2d63b ]---
[  440.278476] BTRFS error (device dm-4): bdev /dev/mapper/error-test errs: wr 
2, rd 0, flush 0, corrupt 0, gen 0
[  440.282362] BTRFS error (device dm-4): bdev /dev/mapper/error-test errs: wr 
3, rd 0, flush 0, corrupt 0, gen 0
[  440.300180] BTRFS warning (device dm-4): lost page write due to IO error on 
/dev/mapper/error-test
[  440.301502] BTRFS error (device dm-4): bdev /dev/mapper/error-test errs: wr 
4, rd 0, flush 

parent transid verify failed

2017-05-11 Thread Massimo B.
Hello,

this is some btrfs-on-luks, USB hdd as blockdevice.
I can't mount my btrfs anymore, getting continuously the same syslog error:

- Last output repeated twice -
May 11 07:58:25 [kernel] BTRFS error (device dm-3): failed to read block groups:
-5
May 11 07:58:25 [kernel] BTRFS error (device dm-3): open_ctree failed
May 11 07:58:31 [kernel] BTRFS info (device dm-3): use zlib compression
May 11 07:58:31 [kernel] BTRFS info (device dm-3): enabling auto defrag
May 11 07:58:31 [kernel] BTRFS info (device dm-3): disk space caching is enabled
May 11 07:58:31 [kernel] BTRFS info (device dm-3): has skinny extents
May 11 07:58:33 [kernel] BTRFS error (device dm-3): parent transid verify failed
on 541635395584 wanted 10388 found 10385

This is the last part of btrfs check --repair (I know, highly experimental, but
I didn't get an alternative solution on #btrfs) :

rent transid verify failed on 541577035776 wanted 10388 found 10384
parent transid verify failed on 541577035776 wanted 10388 found 10384
parent transid verify failed on 541577035776 wanted 10388 found 10384
parent transid verify failed on 541577035776 wanted 10388 found 10384
parent transid verify failed on 541577035776 wanted 10388 found 10384
Chunk[256, 228, 429526089728]: length(1073741824), offset(429526089728), type(1)
is not found in block group
Chunk[256, 228, 430599831552]: length(1073741824), offset(430599831552), type(1)
is not found in block group
Chunk[256, 228, 431673573376]: length(1073741824), offset(431673573376), type(1)
is not found in block group
Chunk[256, 228, 434894798848]: length(1073741824), offset(434894798848), type(1)
is not found in block group
Chunk[256, 228, 435968540672]: length(1073741824), offset(435968540672), type(1)
is not found in block group
Chunk[256, 228, 437042282496]: length(1073741824), offset(437042282496), type(1)
is not found in block group
Chunk[256, 228, 438116024320]: length(1073741824), offset(438116024320), type(1)
is not found in block group
ref mismatch on [429497528320 40960] extent item 0, found 1
Backref 429497528320 parent 858210304 owner 0 offset 0 num_refs 0 not found in
extent tree
Incorrect local backref count on 429497528320 parent 858210304 owner 0 offset 0
found 1 wanted 0 back 0x37aaefc0
backpointer mismatch on [429497528320 40960]
parent transid verify failed on 541635395584 wanted 10388 found 10385
Ignoring transid failure
Failed to find [541635395584, 168, 16384]
btrfs unable to find ref byte nr 541635395584 parent 0 root 2  owner 1 offset 0
failed to repair damaged filesystem, aborting

How did that happen?
Yesterday I sent a big snapshot from local drive to a slower USB drive via
btrbk. That was  already finished. However the USB drive was completely filled
up to 99% and doing some IO apparently. Then I was not able to shutdown the
machine. Shutdown was really slow, finally umounts were accomplished, services
stopped, system shutdown almost finished, but no shutdown. I did a Sysreq- E I U
S R B, no reboot. Sysreq-O did not even shut off. So as last consequence I
disconnected power supply.

The broken btrfs is actually only a snapshot receiver as backup. I would prefer
to get it repaired. Seeing that btrfs is sensitive about filling up to 99%
usage, I'm worried about my production btrfs.

This is Gentoo-Linux, 4.10.14-ck, btrfs-progs-4.10.2.

Best regards,
Massimo
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v3 5/6] btrfs: qgroup: Introduce extent changeset for qgroup reserve functions

2017-05-11 Thread Qu Wenruo



At 05/11/2017 01:59 AM, Goldwyn Rodrigues wrote:



On 05/09/2017 09:36 PM, Qu Wenruo wrote:

Introduce a new parameter, struct extent_changeset for
btrfs_qgroup_reserve_data() and its callers.

Such extent_changeset was used in btrfs_qgroup_reserve_data() to record
which range it reserved in current reserve, so it can free it at error
path.

The reason we need to export it to callers is, at buffered write error
path, without knowing exactly which range we reserved in current
allocation, we can free space which is not reserved by us.

This will lead to qgroup reserved space underflow.

Reviewed-by: Chandan Rajendra 
Signed-off-by: Qu Wenruo 
---
  fs/btrfs/ctree.h   |  6 --
  fs/btrfs/extent-tree.c | 16 +++-
  fs/btrfs/extent_io.h   | 34 ++
  fs/btrfs/file.c| 12 +---
  fs/btrfs/inode-map.c   |  4 +++-
  fs/btrfs/inode.c   | 18 ++
  fs/btrfs/ioctl.c   |  5 -
  fs/btrfs/qgroup.c  | 41 +
  fs/btrfs/qgroup.h  |  3 ++-
  fs/btrfs/relocation.c  |  4 +++-
  10 files changed, 113 insertions(+), 30 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1e82516fe2d8..52a0147cd612 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2704,8 +2704,9 @@ enum btrfs_flush_state {
COMMIT_TRANS=   6,
  };
  
-int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);

  int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode,
+   struct extent_changeset **reserved, u64 start, u64 len);
  void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
  void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
@@ -2723,7 +2724,8 @@ void btrfs_subvolume_release_metadata(struct 
btrfs_fs_info *fs_info,
  struct btrfs_block_rsv *rsv);
  int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
  void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 
num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+int btrfs_delalloc_reserve_space(struct inode *inode,
+   struct extent_changeset **reserved, u64 start, u64 len);
  void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
  void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4f62696131a6..782e0f5feb69 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3364,6 +3364,7 @@ static int cache_save_setup(struct 
btrfs_block_group_cache *block_group,
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root = fs_info->tree_root;
struct inode *inode = NULL;
+   struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
int dcs = BTRFS_DC_ERROR;
u64 num_pages = 0;
@@ -3483,7 +3484,7 @@ static int cache_save_setup(struct 
btrfs_block_group_cache *block_group,
num_pages *= 16;
num_pages *= PAGE_SIZE;
  
-	ret = btrfs_check_data_free_space(inode, 0, num_pages);

+   ret = btrfs_check_data_free_space(inode, _reserved, 0, num_pages);
if (ret)
goto out_put;
  
@@ -3514,6 +3515,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,

block_group->disk_cache_state = dcs;
spin_unlock(_group->lock);
  
+	extent_changeset_free(data_reserved);

return ret;
  }
  
@@ -4282,7 +4284,8 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)

   * Will replace old btrfs_check_data_free_space(), but for patch split,
   * add a new function first and then replace it.
   */
-int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+int btrfs_check_data_free_space(struct inode *inode,
+   struct extent_changeset **reserved, u64 start, u64 len)
  {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
@@ -4297,9 +4300,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 
start, u64 len)
return ret;
  
  	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */

-   ret = btrfs_qgroup_reserve_data(inode, start, len);
+   ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
if (ret < 0)
btrfs_free_reserved_data_space_noquota(inode, start, len);
+   else
+   ret = 0;
return ret;
  }
  
@@ -6140,11 +6145,12 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)

   * Return 

Re: [PATCH 7/8] nowait aio: xfs

2017-05-11 Thread Christoph Hellwig
Looks fine,

Reviewed-by: Christoph Hellwig 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/8] nowait aio: return on congested block device

2017-05-11 Thread Christoph Hellwig
Looks fine,

Reviewed-by: Christoph Hellwig 

Although lifting the make_request limit is something a lot of users
would appreciate in the near future..
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/8] nowait aio: Introduce RWF_NOWAIT

2017-05-11 Thread Christoph Hellwig
On Tue, May 09, 2017 at 07:22:13AM -0500, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues 
> 
> This flag informs kernel to bail out if an AIO request will block
> for reasons such as file allocations, or a writeback triggered,
> or would block while allocating requests while performing
> direct I/O.
> 
> Unfortunately, aio_flags is not checked for validity, which would
> break existing applications which have it set to anything besides zero
> or IOCB_FLAG_RESFD. So, we are using aio_reserved1 and renaming it
> to aio_rw_flags.
> 
> RWF_NOWAIT is translated to IOCB_NOWAIT for iocb->ki_flags.
> 
> The check for -EOPNOTSUPP is placed in generic_file_write_iter(). This
> is called by most filesystems, either through fsops.write_iter() or through
> the function defined by write_iter(). If not, we perform the check defined
> by .write_iter() which is called for direct IO specifically.
> 
> Filesystems xfs, btrfs and ext4 would be supported in the following patches.
> 
> Signed-off-by: Goldwyn Rodrigues 
> ---
>  fs/9p/vfs_file.c| 3 +++
>  fs/aio.c| 6 ++
>  fs/ceph/file.c  | 3 +++
>  fs/cifs/file.c  | 3 +++
>  fs/fuse/file.c  | 3 +++
>  fs/nfs/direct.c | 3 +++
>  fs/ocfs2/file.c | 3 +++
>  include/linux/fs.h  | 5 -
>  include/uapi/linux/fs.h | 1 +
>  mm/filemap.c| 3 +++
>  10 files changed, 32 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
> index 3de3b4a89d89..403681db7723 100644
> --- a/fs/9p/vfs_file.c
> +++ b/fs/9p/vfs_file.c
> @@ -411,6 +411,9 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter 
> *from)
>   loff_t origin;
>   int err = 0;
>  
> + if (iocb->ki_flags & IOCB_NOWAIT)
> + return -EOPNOTSUPP;
> +
>   retval = generic_write_checks(iocb, from);
>   if (retval <= 0)
>   return retval;
> diff --git a/fs/aio.c b/fs/aio.c
> index 020fa0045e3c..ea9f8581d902 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1592,6 +1592,12 @@ static int io_submit_one(struct kioctx *ctx, struct 
> iocb __user *user_iocb,
>   goto out_put_req;
>   }
>  
> + if ((req->common.ki_flags & IOCB_NOWAIT) &&
> + !(req->common.ki_flags & IOCB_DIRECT)) {

Weird indentation.  Either align after the opening if brace:

	if ((req->common.ki_flags & IOCB_NOWAIT) &&
	    !(req->common.ki_flags & IOCB_DIRECT)) {

or using two tabs:

	if ((req->common.ki_flags & IOCB_NOWAIT) &&
			!(req->common.ki_flags & IOCB_DIRECT)) {

if the first version looks confusing, but never using the same
indentation level as the following code.

Except for that the patch looks fine to me:

Reviewed-by: Christoph Hellwig 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/8] nowait aio: return if direct write will trigger writeback

2017-05-11 Thread Christoph Hellwig
It might make sense to move filemap_range_has_page into a separate
prep patch.

Otherwise this looks fine:

Reviewed-by: Christoph Hellwig 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/8] Use RWF_* flags for AIO operations

2017-05-11 Thread Christoph Hellwig
Please add subsystem prefixes to your subject lines, e.g.

fs:

for all the generic fs ones,

xfs:

for XFS,

block:

for block layer changes, etc.

>  
> - if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
> - return -EOPNOTSUPP;
> -
>   init_sync_kiocb(, filp);
> - if (flags & RWF_HIPRI)
> - kiocb.ki_flags |= IOCB_HIPRI;
> - if (flags & RWF_DSYNC)
> - kiocb.ki_flags |= IOCB_DSYNC;
> - if (flags & RWF_SYNC)
> - kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
> + ret = kiocb_set_rw_flags(, flags);
> + if (ret)
> + return ret;

And please split factoring out kiocb_set_rw_flags into a separate
prep patch.

Otherwise these changes look fine:

Reviewed-by: Christoph Hellwig 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: Convert fs_info->free_chunk_space to atomic64_t

2017-05-11 Thread Nikolay Borisov
The ->free_chunk_space variable is used to track the unallocated space and
access to it is protected by a spinlock, which is not used for anything else.
Make the code a bit more self-explanatory by switching the variable to an atomic64_t
type and kill the spinlock.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/ctree.h   |  3 +--
 fs/btrfs/disk-io.c |  3 +--
 fs/btrfs/extent-tree.c |  4 +---
 fs/btrfs/volumes.c | 26 +++---
 4 files changed, 10 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3e21211e99c3..2202dfdc7888 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -729,8 +729,7 @@ struct btrfs_fs_info {
struct rb_root block_group_cache_tree;
 
/* keep track of unallocated space */
-   spinlock_t free_chunk_lock;
-   u64 free_chunk_space;
+   atomic64_t free_chunk_space;
 
struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 061c1d1f774f..2ef80d562a54 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2626,7 +2626,6 @@ int open_ctree(struct super_block *sb,
spin_lock_init(_info->fs_roots_radix_lock);
spin_lock_init(_info->delayed_iput_lock);
spin_lock_init(_info->defrag_inodes_lock);
-   spin_lock_init(_info->free_chunk_lock);
spin_lock_init(_info->tree_mod_seq_lock);
spin_lock_init(_info->super_lock);
spin_lock_init(_info->qgroup_op_lock);
@@ -2667,7 +2666,7 @@ int open_ctree(struct super_block *sb,
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
-   fs_info->free_chunk_space = 0;
+   atomic64_set(_info->free_chunk_space, 0);
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3ab1f88af038..f913c25b9a54 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4626,9 +4626,7 @@ static int can_overcommit(struct btrfs_root *root,
 
used += space_info->bytes_may_use;
 
-   spin_lock(_info->free_chunk_lock);
-   avail = fs_info->free_chunk_space;
-   spin_unlock(_info->free_chunk_lock);
+   avail = atomic64_read(_info->free_chunk_space);
 
/*
 * If we have dup, raid1 or raid10 then only half of the free
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ab8a66d852f9..923a3591265c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2413,9 +2413,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, 
const char *device_path
fs_info->fs_devices->total_devices++;
fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
-   spin_lock(_info->free_chunk_lock);
-   fs_info->free_chunk_space += device->total_bytes;
-   spin_unlock(_info->free_chunk_lock);
+   atomic64_add(device->total_bytes, _info->free_chunk_space);
 
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
fs_info->fs_devices->rotating = 1;
@@ -2850,9 +2848,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
mutex_lock(_info->chunk_mutex);
btrfs_device_set_bytes_used(device,
device->bytes_used - dev_extent_len);
-   spin_lock(_info->free_chunk_lock);
-   fs_info->free_chunk_space += dev_extent_len;
-   spin_unlock(_info->free_chunk_lock);
+   atomic64_add(dev_extent_len, 
_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
mutex_unlock(_info->chunk_mutex);
}
@@ -4379,9 +4375,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 
new_size)
btrfs_device_set_total_bytes(device, new_size);
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
-   spin_lock(_info->free_chunk_lock);
-   fs_info->free_chunk_space -= diff;
-   spin_unlock(_info->free_chunk_lock);
+   atomic64_sub(diff, _info->free_chunk_space);
}
mutex_unlock(_info->chunk_mutex);
 
@@ -4505,9 +4499,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 
new_size)
btrfs_device_set_total_bytes(device, old_size);
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
-   spin_lock(_info->free_chunk_lock);
-   fs_info->free_chunk_space += diff;
-   spin_unlock(_info->free_chunk_lock);
+   atomic64_add(diff, _info->free_chunk_space);
mutex_unlock(_info->chunk_mutex);
}
return ret;
@@