Re: Btrfs/SSD
Austin S. Hemmelgarn posted on Mon, 17 Apr 2017 07:53:04 -0400 as excerpted: > * In my personal experience, Intel, Samsung, and Crucial appear to be > the best name brands (in relative order of quality). I have personally > had bad experiences with SanDisk and Kingston SSD's, but I don't have > anything beyond circumstantial evidence indicating that it was anything > but bad luck on both counts. FWIW, I'm in the market for SSDs ATM, and remembered this from a couple weeks ago so went back to find it. Thanks. =:^) (I'm currently still on quarter-TB generation ssds, plus spinning rust for the larger media partition and backups, and want to be rid of the spinning rust, so am looking at half-TB to TB, which seems to be the pricing sweet spot these days anyway.) -- Duncan - List replies preferred. No HTML msgs. "Every nonfree program has a lord, a master -- and if you use the program, he is your master." Richard Stallman -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3.1 5/6] btrfs: qgroup: Introduce extent changeset for qgroup reserve functions
Introduce a new parameter, struct extent_changeset for btrfs_qgroup_reserved_data() and its callers. Such extent_changeset was used in btrfs_qgroup_reserve_data() to record which range it reserved in current reserve, so it can free it at error path. The reason we need to export it to callers is, at buffered write error path, without knowing what exactly which range we reserved in current allocation, we can free space which is not reserved by us. This will lead to qgroup reserved space underflow. Reviewed-by: Chandan RajendraSigned-off-by: Qu Wenruo --- fs/btrfs/ctree.h | 6 -- fs/btrfs/extent-tree.c | 23 +-- fs/btrfs/extent_io.h | 34 + fs/btrfs/file.c| 12 +--- fs/btrfs/inode-map.c | 4 +++- fs/btrfs/inode.c | 18 ++ fs/btrfs/ioctl.c | 5 - fs/btrfs/qgroup.c | 51 -- fs/btrfs/qgroup.h | 3 ++- fs/btrfs/relocation.c | 4 +++- 10 files changed, 119 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1e82516fe2d8..52a0147cd612 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2704,8 +2704,9 @@ enum btrfs_flush_state { COMMIT_TRANS= 6, }; -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); @@ -2723,7 +2724,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void 
btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4f62696131a6..ef09cc37f25f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3364,6 +3364,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; u64 num_pages = 0; @@ -3483,7 +3484,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, num_pages *= 16; num_pages *= PAGE_SIZE; - ret = btrfs_check_data_free_space(inode, 0, num_pages); + ret = btrfs_check_data_free_space(inode, _reserved, 0, num_pages); if (ret) goto out_put; @@ -3514,6 +3515,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, block_group->disk_cache_state = dcs; spin_unlock(_group->lock); + extent_changeset_free(data_reserved); return ret; } @@ -4277,12 +4279,8 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) return ret; } -/* - * New check_data_free_space() with ability for precious data reservation - * Will replace old btrfs_check_data_free_space(), but for patch split, - * add a new function first and then replace it. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret; @@ -4297,9 +4295,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. 
*/ - ret = btrfs_qgroup_reserve_data(inode, start, len); + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; return ret; } @@ -6123,6 +6123,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * @inode:
[RFC PATCH v3.1 3/6] btrfs: qgroup: Return actually freed bytes for qgroup release or free data
btrfs_qgroup_release/free_data() only returns 0 or minus error number(ENOMEM is the only possible error). This is normally good enough, but sometimes we need the accurate byte number it freed/released. Change it to return actually released/freed bytenr number instead of 0 for success. And slightly modify related extent_changeset structure, since in btrfs one none-hole data extent won't be larger than 128M, so "unsigned int" is large enough for the use case. Signed-off-by: Qu Wenruo--- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/qgroup.c | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e390451c72e6..4f62696131a6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4298,7 +4298,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, start, len); - if (ret) + if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, start, len); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1eafa2f0ede3..cc1b08fa9fe7 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -205,7 +205,7 @@ struct extent_buffer { */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - u64 bytes_changed; + unsigned int bytes_changed; /* Changed ranges */ struct ulist range_changed; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9f01c25469f7..ad2e99491395 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2886,6 +2886,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, BTRFS_I(inode)->root->objectid, changeset.bytes_changed); + ret = changeset.bytes_changed; out: ulist_release(_changed); return ret; -- 2.12.2 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the 
body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3.1 6/6] btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG] For the following case, btrfs can underflow qgroup reserved space at error path: (Page size 4K, function name without "btrfs_" prefix) Task A | Task B -- Buffered_write [0, 2K) | |- check_data_free_space() | | |- qgroup_reserve_data() | | Range aligned to page | | range [0, 4K) <<< | | 4K bytes reserved <<< | |- copy pages to page cache | | Buffered_write [2K, 4K) | |- check_data_free_space() | | |- qgroup_reserve_data() | | Range aligned to page | | range [0, 4K) | | Already reserved by A <<< | | 0 bytes reserved <<< | |- delalloc_reserve_metadata() | | And it *FAILED* (Maybe EQUOTA) | |- free_reserved_data_space() |- qgroup_free_data() Range aligned to page range [0, 4K) Freeing 4K (Special thanks to Chandan for the detailed report and analysis) [CAUSE] Above, Task B is freeing reserved data range [0, 4K), which is actually reserved by Task A. And at writeback time, the page dirtied by Task A will go through the writeback routine, which will free 4K of reserved data space at file extent insert time, causing the qgroup underflow. [FIX] For btrfs_qgroup_free_data(), add a @reserved parameter to only free data ranges reserved by the previous btrfs_qgroup_reserve_data(). So in the above case, Task B will try to free 0 bytes, so no underflow. 
Reported-by: Chandan RajendraSigned-off-by: Qu Wenruo Reviewed-by: Chandan Rajendra Tested-by: Chandan Rajendra --- fs/btrfs/ctree.h | 6 +++-- fs/btrfs/extent-tree.c | 12 + fs/btrfs/file.c| 29 +++- fs/btrfs/inode.c | 29 ++-- fs/btrfs/ioctl.c | 4 +-- fs/btrfs/qgroup.c | 72 ++ fs/btrfs/qgroup.h | 3 ++- fs/btrfs/relocation.c | 8 +++--- 8 files changed, 117 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 52a0147cd612..75d2eced61b2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2707,7 +2707,10 @@ enum btrfs_flush_state { int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -2726,7 +2729,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ef09cc37f25f..eeeccc8a618e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4340,7 +4340,8 @@ void btrfs_free_reserved_data_space_noquota(struct 
inode *inode, u64 start, * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4350,7 +4351,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) start = round_down(start, root->fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(inode, start, len); -
[RFC PATCH v3.1 2/6] btrfs: qgroup: Cleanup btrfs_qgroup_prepare_account_extents function
Quite a lot of qgroup corruption happens due to wrong timing of calling btrfs_qgroup_prepare_account_extents(). Since the safest timing is calling it just before btrfs_qgroup_account_extents(), there is no need to separate these 2 function. Merging them will make code cleaner and less bug prone. Signed-off-by: Qu Wenruo--- fs/btrfs/qgroup.c | 50 +- fs/btrfs/qgroup.h | 2 -- fs/btrfs/transaction.c | 10 -- 3 files changed, 17 insertions(+), 45 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 905fed1ee0dd..9f01c25469f7 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1403,38 +1403,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, return ret; } -int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, -struct btrfs_fs_info *fs_info) -{ - struct btrfs_qgroup_extent_record *record; - struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; - u64 qgroup_to_skip; - int ret = 0; - - delayed_refs = >transaction->delayed_refs; - qgroup_to_skip = delayed_refs->qgroup_to_skip; - - /* -* No need to do lock, since this function will only be called in -* btrfs_commit_transaction(). 
-*/ - node = rb_first(_refs->dirty_extent_root); - while (node) { - record = rb_entry(node, struct btrfs_qgroup_extent_record, - node); - if (WARN_ON(!record->old_roots)) - ret = btrfs_find_all_roots(NULL, fs_info, - record->bytenr, 0, >old_roots); - if (ret < 0) - break; - if (qgroup_to_skip) - ulist_del(record->old_roots, qgroup_to_skip, 0); - node = rb_next(node); - } - return ret; -} - int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) @@ -2051,6 +2019,19 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, if (!ret) { /* +* old roots should be searched when inserting qgroup +* extent record +*/ + if (WARN_ON(!record->old_roots)) { + /* Search commit root to find old_roots */ + ret = btrfs_find_all_roots(NULL, fs_info, + record->bytenr, 0, + >old_roots); + if (ret < 0) + goto cleanup; + } + + /* * Use SEQ_LAST as time_seq to do special search, which * doesn't lock tree or delayed_refs and search current * root. It's safe inside commit_transaction(). 
@@ -2059,8 +2040,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, record->bytenr, SEQ_LAST, _roots); if (ret < 0) goto cleanup; - if (qgroup_to_skip) + if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); + ulist_del(record->old_roots, qgroup_to_skip, + 0); + } ret = btrfs_qgroup_account_extent(trans, fs_info, record->bytenr, record->num_bytes, record->old_roots, new_roots); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index fe04d3f295c6..38d14d4575c0 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -134,8 +134,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, -struct btrfs_fs_info *fs_info); /* * Inform qgroup to trace one dirty extent, its info is recorded in @record. * So qgroup can account it at transaction committing time. diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2168654c90a1..ee5b41d297d1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1374,9 +1374,6 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = commit_fs_roots(trans, fs_info); if
[RFC PATCH v3.1 4/6] btrfs: qgroup: Fix qgroup reserved space underflow caused by buffered write and quota enable
[BUG] Under the following case, we can underflow qgroup reserved space. Task A|Task B --- Quota disabled | Buffered write | |- btrfs_check_data_free_space() | | *NO* qgroup space is reserved | | since quota is *DISABLED* | |- All pages are copied to page | cache | | Enable quota | Quota scan finished | | Sync_fs | |- run_delalloc_range | |- Write pages | |- btrfs_finish_ordered_io ||- insert_reserved_file_extent | |- btrfs_qgroup_release_data() | Since no qgroup space is reserved in Task A, we underflow qgroup reserved space This can be detected by fstest btrfs/104. [CAUSE] In insert_reserved_file_extent() we inform qgroup to release the @ram_bytes size of qgroup reserved_space in all cases. And btrfs_qgroup_release_data() will check if qgroup is enabled. However, in the above case, the buffered write happens before quota is enabled, so we don't have reserved space for that range. [FIX] In insert_reserved_file_extent(), we inform qgroup to release the actual number of bytes it released. In the above case, since we don't have reserved space, we inform qgroup to release 0 bytes, so the problem can be fixed. And thanks to the @reserved parameter introduced by the qgroup rework, and the previous patch to return released bytes, the fix can be smaller than 10 lines. 
Signed-off-by: Qu Wenruo--- fs/btrfs/inode.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 17cbe9306faf..a1294d5baef5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2143,6 +2143,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; + u64 qg_released; int extent_inserted = 0; int ret; @@ -2198,13 +2199,17 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes, ); + /* * Release the reserved range from inode dirty range map, as it is * already moved into delayed_ref_head */ - btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes); + if (ret < 0) + goto out; + qg_released = ret; + ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, ); out: btrfs_free_path(path); -- 2.12.2 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3.1 1/6] btrfs: qgroup: Add quick exit for non-fs extents
For btrfs_qgroup_account_extent(), make it exit quicker for non-fs extents. This will also reduce the noise in trace_btrfs_qgroup_account_extent event. Signed-off-by: Qu Wenruo --- fs/btrfs/qgroup.c | 41 +++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3f75b5cbbfef..905fed1ee0dd 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1915,6 +1915,33 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, return 0; } +/* + * Helper to check if the @roots is a list of fs tree roots + * Return 0 for definitely not a fs/subvol tree roots ulist + * Return 1 for possible fs/subvol tree roots ulist(including empty) + */ +static int maybe_fs_roots(struct ulist *roots) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + + /* Empty one, still possible for fs roots */ + if (!roots || roots->nnodes == 0) + return 1; + + ULIST_ITER_INIT(&uiter); + unode = ulist_next(roots, &uiter); + if (!unode) + return 1; + + /* +* If it contains fs tree roots, then it must belongs to fs/subvol +* trees. +* If it contains non-fs tree, it won't be shared to fs/subvol trees. 
+*/ + return is_fstree(unode->val); +} + int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -1931,10 +1958,20 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, _info->flags)) return 0; - if (new_roots) + if (new_roots) { + if (!maybe_fs_roots(new_roots)) + goto out_free; nr_new_roots = new_roots->nnodes; - if (old_roots) + } + if (old_roots) { + if (!maybe_fs_roots(old_roots)) + goto out_free; nr_old_roots = old_roots->nnodes; + } + + /* Quick exit, either not fs tree roots, or won't affect any qgroup */ + if (nr_old_roots == 0 && nr_new_roots == 0) + goto out_free; BUG_ON(!fs_info->quota_root); -- 2.12.2 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v3.1 0/6] Qgroup fixes, Non-stack version
The remaining qgroup fix patches, based on Chris' for-linus-4.12 branch with commit 9bcaaea7418d09691f1ffab5c49aacafe3eef9d0 as base. Can be fetched from github: https://github.com/adam900710/linux/tree/qgroup_fixes_non_stack Apart from the 5th patch, the patches are mostly unchanged. Only minor conflicts are addressed in this update. The 5th patch chooses a different method to reduce stack memory usage. Instead of allocating the extent_changeset structure on stack, this time only a pointer to extent_changeset is allocated on stack, and the real extent_changeset is allocated inside btrfs_qgroup_reserve_data(). The impact on stack memory usage in the quota disabled case is reduced to a minimum, and the error handling routine is not affected either. v2: Add reviewed-by tag for 2nd patch Update the first patch to follow the new trace point standard RFC v3: Use non-stack (dynamic allocation) for extent_changeset structure, in 5th patch, to reduce impact for quota disabled cases. Rebase to latest for-linus-4.12 branch. RFC v3.1: Update comment to include the newly introduced parameter Use init/release function to replace open coded ulist_init/release(). 
Qu Wenruo (6): btrfs: qgroup: Add quick exit for non-fs extents btrfs: qgroup: Cleanup btrfs_qgroup_prepare_account_extents function btrfs: qgroup: Return actually freed bytes for qgroup release or free data btrfs: qgroup: Fix qgroup reserved space underflow caused by buffered write and quota enable btrfs: qgroup: Introduce extent changeset for qgroup reserve functions btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges fs/btrfs/ctree.h | 12 ++- fs/btrfs/extent-tree.c | 37 + fs/btrfs/extent_io.h | 36 - fs/btrfs/file.c| 41 ++ fs/btrfs/inode-map.c | 4 +- fs/btrfs/inode.c | 58 - fs/btrfs/ioctl.c | 9 ++- fs/btrfs/qgroup.c | 215 - fs/btrfs/qgroup.h | 8 +- fs/btrfs/relocation.c | 12 +-- fs/btrfs/transaction.c | 10 --- 11 files changed, 303 insertions(+), 139 deletions(-) -- 2.12.2 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: errno=-28 No space left, with kernel backtrace (blocking bug)
Hi. I wish mean: I can't. I now for the btrfs maturity. But it's my unique alternative. I understand. For me this bug should be important because it block all the system, since Linux 4.1+ It's exactly what I wish, pay to have a quick fix. I don't think I wish too much, just fix this bug and put to upstream. Thanks for your time to read me, and thanks for confirm this bug is not forget. A least somebody have take time to read me, great thanks for this. Cheers, On 05/12/17 04:01, Duncan wrote: > alpha_one_x86 posted on Thu, 11 May 2017 17:25:32 +0200 as excerpted: > >> Up plz, I can work with this bug. >> >> >> On 05/11/17 01:39, alpha_one_x86 wrote: >>> Hi, this bug is very blocking for me: >>> >>> https://bugzilla.kernel.org/show_bug.cgi?id=195257 >>> >>> The server is backup server, I btrfs receive (with and without -p), and >>> of course btrfs subvolume delete The volume is 70TB, then I use >>> space_cache=v2 > Since you can work with it, do so. We're not stopping you. =:^) > > Or did you mean /can't/? > > Keep in mind that while btrfs is considered stabilizing, on this list at > least it's not considered fully stable and mature. If you want/need a > filesystem that's stable and mature, there's others out there that fill > that requirement. We don't claim btrfs does. Your system, your choice > of filesystem and with it, filesystem maturity. > > Meanwhile, btrfs devs have a lot of stuff on their plate, including bugs > they're already working on and further development, and (as with most > devs) aren't going to take kindly to demands that they work on *YOUR* bug > *RIGHT* *NOW*. That, if anything, is about the fastest way I know of to > ensure that working on it is /deprioritized/, with stuff that would have > been put off to work on it, done first, instead. > > Unless of course you're paying the salary of that dev. If you are, then > you get to call the shots, to some degree at least. 
Good devs tend to > find other employment if you're too controlling, tho, and they can > because good devs are in enough demand they often pick their jobs from a > list of offers, and they tend to be motivated by more than money so if > you're too demanding you can't expect to simply outbid everyone else on > the list, either, no matter how much money you have. And any dev skilled > enough to regularly get their work into the mainline kernel can be > considered a good dev, so... > > So I'd suggest that if it's high enough priority to you, you'll find a > kernel dev and sponsor them to work on it for you. But be warned, if > they're not already a btrfs dev, it'll take them some time to come upto > speed. Otherwise, you'll wait in line with everyone else... unless you > push too much, in which case your reports will as I said get > deprioritized, and if noone else reports them, your bugs may not get > handled until there's nothing else waiting... which could easily push > resolution past 2027... yes, a decade or more out. > -- alpha_one_x86/BRULE HermanMain developer of Supercopier/Ultracopier/CatchChallenger, Esourcing and server management IT, OS, technologies, research & development, security and business department -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: errno=-28 No space left, with kernel backtrace (blocking bug)
alpha_one_x86 posted on Thu, 11 May 2017 17:25:32 +0200 as excerpted: > Up plz, I can work with this bug. > > > On 05/11/17 01:39, alpha_one_x86 wrote: >> Hi, this bug is very blocking for me: >> >> https://bugzilla.kernel.org/show_bug.cgi?id=195257 >> >> The server is backup server, I btrfs receive (with and without -p), and >> of course btrfs subvolume delete The volume is 70TB, then I use >> space_cache=v2 Since you can work with it, do so. We're not stopping you. =:^) Or did you mean /can't/? Keep in mind that while btrfs is considered stabilizing, on this list at least it's not considered fully stable and mature. If you want/need a filesystem that's stable and mature, there's others out there that fill that requirement. We don't claim btrfs does. Your system, your choice of filesystem and with it, filesystem maturity. Meanwhile, btrfs devs have a lot of stuff on their plate, including bugs they're already working on and further development, and (as with most devs) aren't going to take kindly to demands that they work on *YOUR* bug *RIGHT* *NOW*. That, if anything, is about the fastest way I know of to ensure that working on it is /deprioritized/, with stuff that would have been put off to work on it, done first, instead. Unless of course you're paying the salary of that dev. If you are, then you get to call the shots, to some degree at least. Good devs tend to find other employment if you're too controlling, tho, and they can because good devs are in enough demand they often pick their jobs from a list of offers, and they tend to be motivated by more than money so if you're too demanding you can't expect to simply outbid everyone else on the list, either, no matter how much money you have. And any dev skilled enough to regularly get their work into the mainline kernel can be considered a good dev, so... So I'd suggest that if it's high enough priority to you, you'll find a kernel dev and sponsor them to work on it for you. 
But be warned, if they're not already a btrfs dev, it'll take them some time to come upto speed. Otherwise, you'll wait in line with everyone else... unless you push too much, in which case your reports will as I said get deprioritized, and if noone else reports them, your bugs may not get handled until there's nothing else waiting... which could easily push resolution past 2027... yes, a decade or more out. -- Duncan - List replies preferred. No HTML msgs. "Every nonfree program has a lord, a master -- and if you use the program, he is your master." Richard Stallman -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Creating btrfs RAID on LUKS devs makes devices disappear
Hello, here is the journal.log (I hope). It's quite interesting. I rebooted the machine, performed a mkfs.btrfs on dm-{2,3,4} and dm-3 was missing afterwards (around timestamp 66.*). However, I then logged into the machine from another terminal (around timestamp 118.*) which triggered something to make the device appear again :O Indeed, dm-3 was once again there after logging in. Does systemd mix something up? Hmm, I just did another mkfs once the devices where back, devices were missing, but they re-appeared a few seconds later, without logging into a terminal. After another mkfs, they were gone again and are now still gone after waiting a few minutes. It's really weird, I can't really tell what triggers this yet. Will test more tomorrow, let me know if you have any more ideas what to try. Best regards Sebastian -- Logs begin at Sun 2017-03-26 20:36:24 CEST, end at Fri 2017-05-12 01:00:45 CEST. -- [0.00] nas kernel: Linux version 4.9.27-1-lts (builduser@andyrtr) (gcc version 6.3.1 20170306 (GCC) ) #1 SMP Mon May 8 13:37:42 CEST 2017 [0.00] nas kernel: Command line: BOOT_IMAGE=/default/vmlinuz-linux-lts root=UUID=4ac09b56-3e02-40c0-bf64-02a4cf9344fc rw rootflags=subvol=default ip=192.168.0.3:eth0:none cryptdevice=/dev/sda2:root:allow-discards [0.00] nas kernel: x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers' [0.00] nas kernel: x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers' [0.00] nas kernel: x86/fpu: Supporting XSAVE feature 0x008: 'MPX bounds registers' [0.00] nas kernel: x86/fpu: Supporting XSAVE feature 0x010: 'MPX CSR' [0.00] nas kernel: x86/fpu: xstate_offset[3]: 576, xstate_sizes[3]: 64 [0.00] nas kernel: x86/fpu: xstate_offset[4]: 640, xstate_sizes[4]: 64 [0.00] nas kernel: x86/fpu: Enabled xstate features 0x1b, context size is 704 bytes, using 'compacted' format. [0.00] nas kernel: x86/fpu: Using 'eager' FPU context switches. 
[0.00] nas kernel: e820: BIOS-provided physical RAM map: [0.00] nas kernel: BIOS-e820: [mem 0x-0x0009c3ff] usable [0.00] nas kernel: BIOS-e820: [mem 0x0009c400-0x0009] reserved [0.00] nas kernel: BIOS-e820: [mem 0x000e-0x000f] reserved [0.00] nas kernel: BIOS-e820: [mem 0x0010-0x78770fff] usable [0.00] nas kernel: BIOS-e820: [mem 0x78771000-0x78771fff] ACPI NVS [0.00] nas kernel: BIOS-e820: [mem 0x78772000-0x78772fff] reserved [0.00] nas kernel: BIOS-e820: [mem 0x78773000-0x7e137fff] usable [0.00] nas kernel: BIOS-e820: [mem 0x7e138000-0x7e5bafff] reserved [0.00] nas kernel: BIOS-e820: [mem 0x7e5bb000-0x7e667fff] usable [0.00] nas kernel: BIOS-e820: [mem 0x7e668000-0x7ea06fff] ACPI NVS [0.00] nas kernel: BIOS-e820: [mem 0x7ea07000-0x7effefff] reserved [0.00] nas kernel: BIOS-e820: [mem 0x7efff000-0x7eff] usable [0.00] nas kernel: BIOS-e820: [mem 0x7f00-0x8fff] reserved [0.00] nas kernel: BIOS-e820: [mem 0xe000-0xefff] reserved [0.00] nas kernel: BIOS-e820: [mem 0xfe00-0xfe010fff] reserved [0.00] nas kernel: BIOS-e820: [mem 0xfec0-0xfec00fff] reserved [0.00] nas kernel: BIOS-e820: [mem 0xfed0-0xfed00fff] reserved [0.00] nas kernel: BIOS-e820: [mem 0xfee0-0xfee00fff] reserved [0.00] nas kernel: BIOS-e820: [mem 0xff00-0x] reserved [0.00] nas kernel: BIOS-e820: [mem 0x0001-0x00046dff] usable [0.00] nas kernel: NX (Execute Disable) protection: active [0.00] nas kernel: SMBIOS 3.0 present. [0.00] nas kernel: DMI: To Be Filled By O.E.M. 
To Be Filled By O.E.M./C236 WSI, BIOS P2.10 04/18/2017 [0.00] nas kernel: e820: update [mem 0x-0x0fff] usable ==> reserved [0.00] nas kernel: e820: remove [mem 0x000a-0x000f] usable [0.00] nas kernel: e820: last_pfn = 0x46e000 max_arch_pfn = 0x4 [0.00] nas kernel: MTRR default type: write-back [0.00] nas kernel: MTRR fixed ranges enabled: [0.00] nas kernel: 0-9 write-back [0.00] nas kernel: A-B uncachable [0.00] nas kernel: C-F write-protect [0.00] nas kernel: MTRR variable ranges enabled: [0.00] nas kernel: 0 base 008000 mask 7F8000 uncachable [0.00] nas kernel: 1 base 007F80 mask 7FFF80 uncachable [0.00] nas kernel: 2 disabled [0.00] nas kernel: 3 disabled [0.00] nas kernel: 4 disabled [0.00] nas kernel: 5 disabled [0.00]
Re: Creating btrfs RAID on LUKS devs makes devices disappear
journalctl -b -o short-monotonic > journal.log And then attached the log, hopefully it's small enough to be accepted by the list server (should be). If that's not revealing it might be necessary to reboot with rd.udev.debug but start with the simple case first and see if that reveals what's going on. Chris Murphy -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH v1 00/30] fs: inode->i_version rework and optimization
On Thu, May 11 2017, J. Bruce Fields wrote: > On Wed, Apr 05, 2017 at 02:14:09PM -0400, J. Bruce Fields wrote: >> On Wed, Apr 05, 2017 at 10:05:51AM +0200, Jan Kara wrote: >> > 1) Keep i_version as is, make clients also check for i_ctime. >> >> That would be a protocol revision, which we'd definitely rather avoid. >> >> But can't we accomplish the same by using something like >> >> ctime * (some constant) + i_version >> >> ? >> >> >Pro: No on-disk format changes. >> >Cons: After a crash, i_version can go backwards (but when file changes >> >i_version, i_ctime pair should be still different) or not, data can be >> >old or not. >> >> This is probably good enough for NFS purposes: typically on an NFS >> filesystem, results of a read in the face of a concurrent write open are >> undefined. And writers sync before close. >> >> So after a crash with a dirty inode, we're in a situation where an NFS >> client still needs to resend some writes, sync, and close. I'm OK with >> things being inconsistent during this window. >> >> I do expect things to return to normal once that client's has resent its >> writes--hence the worry about actually resuing old values after boot >> (such as if i_version regresses on boot and then increments back to the >> same value after further writes). Factoring in ctime fixes that. > > So for now I'm thinking of just doing something like the following. > > Only nfsd needs it for now, but it could be moved to a vfs helper for > statx, or for individual filesystems that want to do something > different. (The NFSv4 client will want to use the server's change > attribute instead, I think. And other filesystems might want to try > something more ambitious like Neil's proposal.) > > --b. 
> > diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c > index 12feac6ee2fd..9636c9a60aba 100644 > diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h > index f84fe6bf9aee..14f09f1ef605 100644 > --- a/fs/nfsd/nfsfh.h > +++ b/fs/nfsd/nfsfh.h > @@ -240,6 +240,16 @@ fh_clear_wcc(struct svc_fh *fhp) > fhp->fh_pre_saved = false; > } > > +static inline u64 nfsd4_change_attribute(struct inode *inode) > +{ > + u64 chattr; > + > + chattr = inode->i_ctime.tv_sec << 30; > + chattr += inode->i_ctime.tv_nsec; > + chattr += inode->i_version; > + return chattr; So if I chmod a file, all clients will need to flush the content from their cache? Maybe they already do? Maybe it is a boring corner case? > +} > + > /* > * Fill in the pre_op attr for the wcc data > */ > @@ -253,7 +263,7 @@ fill_pre_wcc(struct svc_fh *fhp) > fhp->fh_pre_mtime = inode->i_mtime; > fhp->fh_pre_ctime = inode->i_ctime; > fhp->fh_pre_size = inode->i_size; > - fhp->fh_pre_change = inode->i_version; > + fhp->fh_pre_change = nfsd4_change_attribute(inode); > fhp->fh_pre_saved = true; > } > } > --- a/fs/nfsd/nfs3xdr.c > +++ b/fs/nfsd/nfs3xdr.c > @@ -260,7 +260,7 @@ void fill_post_wcc(struct svc_fh *fhp) > printk("nfsd: inode locked twice during operation.\n"); > > err = fh_getattr(fhp, >fh_post_attr); > - fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version; > + fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry)); > if (err) { > fhp->fh_post_saved = false; > /* Grab the ctime anyway - set_change_info might use it */ > diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c > index 26780d53a6f9..a09532d4a383 100644 > --- a/fs/nfsd/nfs4xdr.c > +++ b/fs/nfsd/nfs4xdr.c > @@ -1973,7 +1973,7 @@ static __be32 *encode_change(__be32 *p, struct kstat > *stat, struct inode *inode, > *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time)); > *p++ = 0; > } else if (IS_I_VERSION(inode)) { > - p = xdr_encode_hyper(p, inode->i_version); > + p = xdr_encode_hyper(p, nfsd4_change_attribute(inode)); > } else { > *p++ 
= cpu_to_be32(stat->ctime.tv_sec); > *p++ = cpu_to_be32(stat->ctime.tv_nsec); It is *really* confusing to find that fh_post_change is only set in nfs3 code, and only used in nfs4 code. It is probably time to get a 'version' field in 'struct kstat'. That would allow this code to get a little cleaner. (to me, this exercise is just a reminder that the NFSv4 change attribute is poorly designed ... so it just makes me grumpy). NeilBrown > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html signature.asc Description: PGP signature
[PATCH v2 2/2] btrfs: Add quota_override knob into sysfs
This patch adds the read-write attribute quota_override into sysfs. Any process which has cap_sys_resource can set this flag to on, and once it is set to true, processes with cap_sys_resource can exceed the quota. Signed-off-by: Sargun Dhillon --- fs/btrfs/sysfs.c | 41 + 1 file changed, 41 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 1f157fb..c2d5f35 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -447,11 +447,52 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); +static ssize_t quota_override_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + int quota_override; + + quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + return snprintf(buf, PAGE_SIZE, "%d\n", quota_override); +} + +static ssize_t quota_override_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long knob; + int err; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + err = kstrtoul(buf, 10, &knob); + if (err) + return err; + if (knob > 1) + return -EINVAL; + + if (knob) + set_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + else + clear_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); + + return len; +} + +BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), BTRFS_ATTR_PTR(nodesize), BTRFS_ATTR_PTR(sectorsize), BTRFS_ATTR_PTR(clone_alignment), + BTRFS_ATTR_PTR(quota_override), NULL, }; -- 2.9.3 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 1/2] btrfs: add quota override flag to enable quota override for sys_resource
This patch introduces the quota override flag to btrfs_fs_info, and a change to quota limit checking code to temporarily allow for quota to be overridden for processes with cap_sys_resource. It's useful for administrative programs, such as log rotation, that may need to temporarily use more disk space in order to free up a greater amount of overall disk space without yielding more disk space to the rest of userland. Eventually, we may want to add the idea of an operator-specific quota, operator reserved space, or something else to allow for administrative override, but this is perhaps the simplest solution. Signed-off-by: Sargun Dhillon --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/qgroup.c | 5 + 2 files changed, 7 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 643c70d..e86cb7c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -716,6 +716,8 @@ struct btrfs_delayed_root; #define BTRFS_FS_BTREE_ERR 11 #define BTRFS_FS_LOG1_ERR 12 #define BTRFS_FS_LOG2_ERR 13 +#define BTRFS_FS_QUOTA_OVERRIDE 14 + /* * Indicate that a whole-filesystem exclusive operation is running * (device replace, resize, device add/delete, balance) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index deffbeb..458fec0 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2338,6 +2338,11 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) if (num_bytes == 0) return 0; + + if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && + capable(CAP_SYS_RESOURCE)) + enforce = false; + retry: spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; -- 2.9.3 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 0/2] btrfs: allow mechanism to override quota
This patchset makes it so that on a per-filesystem basis one can disable quota enforcement for users with cap_sys_resource. This patchset can likely later be extended to a per-qgroup or per-volume basis. I'm thinking of extending the sysfs interface to list the qgroups and this same interface for the qgroups themselves. Changes since v1: -Rather than a separate member of btrfs_fs_info, use the existing flags field Sargun Dhillon (2): btrfs: add quota override flag to enable quota override for sys_resource btrfs: Add quota_override knob into sysfs fs/btrfs/ctree.h | 2 ++ fs/btrfs/qgroup.c | 5 + fs/btrfs/sysfs.c | 41 + 3 files changed, 48 insertions(+) -- 2.9.3 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Creating btrfs RAID on LUKS devs makes devices disappear
I should have added some more technical info. Here you go: Arch Linux with systemd 233 Kernel linux-lts 4.9.27 btrfs-progs 4.10.2 Example session: root@nas> ls /dev/dm-* /dev/dm-0 /dev/dm-1 /dev/dm-2 /dev/dm-3 /dev/dm-4 root@nas> ls -l /dev/mapper total 0 lrwxrwxrwx 1 root root 7 May 11 22:30 backup -> ../dm-1 crw--- 1 root root 10, 236 May 11 22:30 control lrwxrwxrwx 1 root root 7 May 11 22:30 root -> ../dm-0 lrwxrwxrwx 1 root root 7 May 11 22:30 storage0 -> ../dm-2 lrwxrwxrwx 1 root root 7 May 11 22:30 storage1 -> ../dm-4 lrwxrwxrwx 1 root root 7 May 11 22:30 storage2 -> ../dm-3 root@nas> mkfs.btrfs -f -d raid1 -m raid1 /dev/dm-2 /dev/dm-3 /dev/dm-4 btrfs-progs v4.10.2 See http://btrfs.wiki.kernel.org for more information. Label: (null) UUID: a32b3106-678f-448f-ade9-c48cd41a7dae Node size: 16384 Sector size:4096 Filesystem size:10.92TiB Block group profiles: Data: RAID1 1.00GiB Metadata: RAID1 1.00GiB System: RAID1 8.00MiB SSD detected: no Incompat features: extref, skinny-metadata Number of devices: 3 Devices: IDSIZE PATH 1 3.64TiB /dev/dm-2 2 3.64TiB /dev/dm-3 3 3.64TiB /dev/dm-4 root@nas> ls /dev/dm-* /dev/dm-0 /dev/dm-1 /dev/dm-2 /dev/dm-4 Note that dm-3 is gone. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Creating btrfs RAID on LUKS devs makes devices disappear
Hello, while trying to initialize a btrfs RAID1 on my new NAS using LUKS crypt-devices for each of the btrfs RAID devices, I have seen "random" weirdness shortly after mkfs. It seems to boil down to the problem that after mkfs.btrfs, some of the /dev/dm-* nodes (as well as the corresponding /dev/mapper/* symlinks) sometimes disappear. The RAID can be mounted at first but quickly shows symptoms such as missing devices, or being unable to mount the second time. I have tried to run mkfs.btrfs -d raid1 -m raid1 using the /dev/dm-* and /dev/mapper/* devices, but with similar results. My best guess is that the fact that one UUID is given to multiple separate devices confuses... something (udev or the like?), making nodes appear, disappear or be re-ordered while mkfs is in progress, or leading to unexpected things later at mount time. Honestly, the idea of the same UUID being given to separate physical devices scared me already when I first saw it. Could that actually be the culprit here? Best regards Sebastian -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: btrfs list corruption and soft lockups while testing writeback error handling
On 05/11/2017 03:52 PM, Jeff Layton wrote: On Thu, 2017-05-11 at 07:13 -0400, Jeff Layton wrote: I finally got my writeback error handling test to work on btrfs (thanks, Chris!), by making the filesystem stripe the data and mirror the metadata across two devices. The test passes now, but on one run, I got the following list corruption warning and then a soft lockup (which is probably fallout from the list corruption). I ran the test several times before and since then without this failure, so I don't have a clear reproducer. The kernel in this instance is basically a v4.11 kernel with my pile of writeback error handling patches on top: https://urldefense.proofpoint.com/v2/url?u=https-3A__git.samba.org_-3Fp-3Djlayton_linux.git-3Ba-3Dshortlog-3Bh-3Drefs_heads_wberr=DwICaQ=5VD0RTtNlTh3ycd41b3MUw=9QPtTAxcitoznaWRKKHoEQ=BXXwaUFQNFNaGGFYHEVlvNBwkrXiIoH7K5iOdR_PvxM=xE6pIXeQ1rlaxAV8aTYBSiI06pb3WZoiRJW8Vo1L3NQ= It may be that they are a contributing factor, but this smells more like a bug down in btrfs. Let me know if you need other info: [ btrfs inode logging ] (cc'ing Liu Bo since we were discussing this earlier this week) I can't reproduce this on stock v4.11, so I think this is a bug in my series. I think this is due to the differences in how errors are being reported from filemap_fdatawait_range now causing some transactions to end up being freed while they're still on the log_ctxs list. I'm working on hunting down the problem now. Sorry for the noise! There's a list in the inode logging code that we consistently seem to find list debugging assertions with. We've fixed up all the known issues, but I wouldn't be surprised if we've got a goto fail in there. I'll take a look ;) -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: btrfs list corruption and soft lockups while testing writeback error handling
On Thu, 2017-05-11 at 07:13 -0400, Jeff Layton wrote: > I finally got my writeback error handling test to work on btrfs (thanks, > Chris!), by making the filesystem stripe the data and mirror the > metadata across two devices. The test passes now, but on one run, I got > the following list corruption warning and then a soft lockup (which is > probably fallout from the list corruption). > > I ran the test several times before and since then without this failure, > so I don't have a clear reproducer. The kernel in this instance is > basically a v4.11 kernel with my pile of writeback error handling > patches on top: > > https://git.samba.org/?p=jlayton/linux.git;a=shortlog;h=refs/heads/wberr > > It may be that they are a contributing factor, but this smells more like > a bug down in btrfs. Let me know if you need other info: > > --8<--- > > [ 438.341942] run fstests generic/999 at 2017-05-11 07:03:39 > [ 439.453293] BTRFS: device fsid 08e9b22b-44a1-4954-a1b0-03c7c0537831 devid > 1 transid 3 /dev/vda8 > [ 439.465918] BTRFS: device fsid 08e9b22b-44a1-4954-a1b0-03c7c0537831 devid > 2 transid 3 /dev/vda7 > [ 439.603578] device-mapper: ioctl: device doesn't appear to be in the dev > hash table. > [ 439.762422] BTRFS info (device dm-4): disk space caching is enabled > [ 439.763808] BTRFS info (device dm-4): has skinny extents > [ 439.764979] BTRFS info (device dm-4): flagging fs with big metadata feature > [ 439.785879] BTRFS info (device dm-4): creating UUID tree > [ 439.974266] BTRFS info (device dm-4): disk space caching is enabled > [ 439.975783] BTRFS info (device dm-4): has skinny extents > [ 440.229263] Buffer I/O error on dev dm-4, logical block 2621424, async > page read > [ 440.239970] BTRFS error (device dm-4): bdev /dev/mapper/error-test errs: > wr 1, rd 0, flush 0, corrupt 0, gen 0 > [ 440.242459] [ cut here ] > [ 440.243276] WARNING: CPU: 0 PID: 5162 at lib/list_debug.c:28 > __list_add_valid+0x69/0xa0 > [ 440.244338] list_add corruption. 
prev->next should be next > (8dd531056b08), but was a93242807e90. (prev=a93242807e90). > [ 440.245939] Modules linked in: btrfs xor raid6_pq binfmt_misc > ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink > ebtable_broute bridge stp llc ebtable_nat ip6table_mangle ip6table_security > ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_raw > iptable_mangle iptable_security iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 > nf_nat_ipv4 nf_nat nf_conntrack iptable_raw ebtable_filter ebtables > ip6table_filter ip6_tables snd_hda_codec_generic snd_hda_intel snd_hda_codec > snd_hda_core crct10dif_pclmul crc32_pclmul nfsd ghash_clmulni_intel ppdev > snd_hwdep snd_pcm acpi_cpufreq snd_timer tpm_tis snd parport_pc tpm_tis_core > parport pcspkr tpm i2c_piix4 auth_rpcgss soundcore floppy joydev qemu_fw_cfg > virtio_balloon nfs_acl lockd grace sunrpc xfs libcrc32c qxl drm_kms_helper > virtio_net > [ 440.254739] virtio_blk virtio_console virtio_rng ttm drm crc32c_intel > virtio_pci virtio_ring ata_generic virtio serio_raw pata_acpi > [ 440.256352] CPU: 0 PID: 5162 Comm: fsync-err Not tainted 4.11.0+ #52 > [ 440.257534] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS > 1.9.3-1.fc25 04/01/2014 > [ 440.258584] Call Trace: > [ 440.259096] dump_stack+0x63/0x86 > [ 440.259618] __warn+0xcb/0xf0 > [ 440.260116] warn_slowpath_fmt+0x5a/0x80 > [ 440.260798] ? check_parent_dirs_for_sync+0x109/0x140 [btrfs] > [ 440.261755] __list_add_valid+0x69/0xa0 > [ 440.262442] btrfs_log_inode_parent+0x25c/0x9f0 [btrfs] > [ 440.263323] ? btrfs_releasepage+0x20/0x20 [btrfs] > [ 440.264059] ? wait_current_trans+0x2e/0xf0 [btrfs] > [ 440.264792] ? kmem_cache_alloc+0x195/0x1b0 > [ 440.265455] ? 
join_transaction+0x27/0x420 [btrfs] > [ 440.266175] btrfs_log_dentry_safe+0x60/0x80 [btrfs] > [ 440.266965] btrfs_sync_file+0x2b7/0x400 [btrfs] > [ 440.267655] vfs_fsync_range+0x49/0xb0 > [ 440.268266] do_fsync+0x3d/0x70 > [ 440.268806] SyS_fsync+0x10/0x20 > [ 440.269347] entry_SYSCALL_64_fastpath+0x1a/0xa9 > [ 440.270033] RIP: 0033:0x7f7983af1b70 > [ 440.270607] RSP: 002b:7ffe13b3aa18 EFLAGS: 0246 ORIG_RAX: > 004a > [ 440.271661] RAX: ffda RBX: 0005 RCX: > 7f7983af1b70 > [ 440.272623] RDX: 00010401 RSI: 023ef030 RDI: > 0004 > [ 440.273696] RBP: 7ffe13b3cc77 R08: R09: > 7f79845364c8 > [ 440.274808] R10: 0008 R11: 0246 R12: > 023ef030 > [ 440.275873] R13: 0005 R14: R15: > > [ 440.276995] ---[ end trace 878ee9789ed2d63b ]--- > [ 440.278476] BTRFS error (device dm-4): bdev /dev/mapper/error-test errs: > wr 2, rd 0, flush 0, corrupt 0, gen 0 > [ 440.282362] BTRFS error (device dm-4): bdev /dev/mapper/error-test
Re: Backing up BTRFS metadata
On Fri, May 12, 2017 at 12:22:00AM +0500, Roman Mamedov wrote: > On Thu, 11 May 2017 09:19:28 -0600 > Chris Murphy wrote: > > > On Thu, May 11, 2017 at 8:56 AM, Marat Khalili wrote: > > > Sorry if question sounds unorthodox, Is there some simple way to read (and > > > backup) all BTRFS metadata from volume? > > > > btrfs-image > > Hm, I thought that's for debugging only, and that you can't actually restore > metadata onto a data-containing FS and have anything mountable/readable as a > result. Indeed. This has been tried before, and I don't think it came to anything. > Seems not to be the case, and in fact, could this be one of the "missing > links" in the Fsck story, > >-w >Walk all the trees manually and copy any blocks that are >referenced. Use this option if your extent tree is corrupted to >make sure that all of the metadata is captured. > > This certainly does sound like something to try for some of those broken > filesystems where Btrfsck refuses to do anything. Save image with this manual > walking/reconstruction of the trees, then restore. Too bad I already nuked > mine, so can't experiment with that. I suspect it's still only capturing metadata, rather than data. Hugo. -- Hugo Mills | Would you like an ocelot with that non-sequitur? hugo@... carfax.org.uk | http://carfax.org.uk/ | PGP: E2AB1DE4 | signature.asc Description: Digital signature
Re: Question on compression unit
Thanks Qu! I wonder if there is any way we can easily configure the extent size (maximum extent size, extent size for files to compress, etc.)? I was trying to see if it helps reduce random read latency on compressed files by using smaller extents... On Wed, May 10, 2017 at 6:01 PM, Qu Wenruo wrote: > > > At 05/11/2017 04:11 AM, Xiaochu Liu wrote: >> >> Hi there, >> >> I'm trying to tune compression options for btrfs. Specifically, I want >> to know the performance impact on the system under different >> compression unit (block) sizes. > > > Compression unit size is fixed in btrfs. > It's sectorsize, determined at mkfs time, and only 4K (page size) is > supported for x86 yet. > >> >> I'm aware of '--nodesize' parameter which sets the block size of >> metadata tree. Does that also set the block size in an extent? (from >> my understanding, file data are mostly stored in extent unless small >> enough to be inline-d in metadata leaf node?) > > > nodesize only affects metadata, nothing to do with data size. > >> >> Also from btrfs's wikipedia page: >> >> In compressed extents, individual blocks are not compressed >> separately; rather, the compression stream spans the entire extent. >> >> Is that still true? > > > Yes. > > For example if there is one continuous range represents 0~1M data of one > file, and all this data is dirty (not written to disk). > > Then compress will happen when trying to writing them to disk. > And since the maximum uncompressed size for compressed extent is 128K > (fixed), the 0~1M will be split into 8 extents (if compression ratio is > acceptable). > > And then each 128K extent will be compressed then compressed data will be > written to disk. (compressed extent still meet sectorsize alignment). > > So the wiki page is still right and we must read out the whole (compressed) > extent to get its content. 
> > And since both uncompressed data and compressed extent must meet sectorsize > alignment, data smaller than or equal to sectorsize won't go through > compression since it will just waste CPU time and no space saving. > > Thanks, > Qu > >> >> Thanks, >> Xiaochu >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in >> the body of a message to majord...@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html >> >> > > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Backing up BTRFS metadata
On Thu, 11 May 2017 09:19:28 -0600 Chris Murphy wrote: > On Thu, May 11, 2017 at 8:56 AM, Marat Khalili wrote: > > Sorry if question sounds unorthodox, Is there some simple way to read (and > > backup) all BTRFS metadata from volume? > > btrfs-image Hm, I thought that's for debugging only, and that you can't actually restore metadata onto a data-containing FS and have anything mountable/readable as a result. Seems not to be the case, and in fact, could this be one of the "missing links" in the Fsck story, -w Walk all the trees manually and copy any blocks that are referenced. Use this option if your extent tree is corrupted to make sure that all of the metadata is captured. This certainly does sound like something to try for some of those broken filesystems where Btrfsck refuses to do anything. Save image with this manual walking/reconstruction of the trees, then restore. Too bad I already nuked mine, so can't experiment with that. -- With respect, Roman -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/10] fs: Introduce RWF_NOWAIT
From: Goldwyn RodriguesRWF_NOWAIT informs kernel to bail out if an AIO request will block for reasons such as file allocations, or a writeback triggered, or would block while allocating requests while performing direct I/O. RWF_NOWAIT is translated to IOCB_NOWAIT for iocb->ki_flags. The check for -EOPNOTSUPP is placed in generic_file_write_iter(). This is called by most filesystems, either through fsops.write_iter() or through the function defined by write_iter(). If not, we perform the check defined by .write_iter() which is called for direct IO specifically. Filesystems xfs, btrfs and ext4 would be supported in the following patches. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Christoph Hellwig --- fs/9p/vfs_file.c| 3 +++ fs/aio.c| 6 ++ fs/ceph/file.c | 3 +++ fs/cifs/file.c | 3 +++ fs/fuse/file.c | 3 +++ fs/nfs/direct.c | 3 +++ fs/ocfs2/file.c | 3 +++ include/linux/fs.h | 5 - include/uapi/linux/fs.h | 1 + mm/filemap.c| 3 +++ 10 files changed, 32 insertions(+), 1 deletion(-) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3de3b4a89d89..403681db7723 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -411,6 +411,9 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t origin; int err = 0; + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + retval = generic_write_checks(iocb, from); if (retval <= 0) return retval; diff --git a/fs/aio.c b/fs/aio.c index 020fa0045e3c..34027b67e2f4 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1592,6 +1592,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; } + if ((req->common.ki_flags & IOCB_NOWAIT) && + !(req->common.ki_flags & IOCB_DIRECT)) { + ret = -EOPNOTSUPP; + goto out_put_req; + } + ret = put_user(KIOCB_KEY, _iocb->aio_key); if (unlikely(ret)) { pr_debug("EFAULT: aio_key\n"); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 26cc95421cca..af28419b1731 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1267,6 +1267,9 @@ static ssize_t 
ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) int err, want, got; loff_t pos; + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 21d404535739..f8858a06e119 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2638,6 +2638,9 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) * write request. */ + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + rc = generic_write_checks(iocb, from); if (rc <= 0) return rc; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ec238fb5a584..72786e798319 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1425,6 +1425,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); ssize_t res; + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + if (is_bad_inode(inode)) return -EIO; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index c1b5fed7c863..dcea0caa5cb5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -996,6 +996,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + result = generic_write_checks(iocb, iter); if (result <= 0) return result; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index bfeb647459d9..e7f8ba890305 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2235,6 +2235,9 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, if (count == 0) return 0; + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + direct_io = iocb->ki_flags & IOCB_DIRECT ? 
1 : 0; inode_lock(inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index 2e6fc6a23f91..7e39b510b7a4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -270,6 +270,7 @@ struct writeback_control; #define IOCB_DSYNC (1 << 4) #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) +#define IOCB_NOWAIT(1 << 7) struct kiocb { struct file *ki_filp; @@ -3053,7 +3054,7 @@ static inline int iocb_flags(struct file *file) static inline int
[PATCH 05/10] fs: return if direct write will trigger writeback
From: Goldwyn Rodrigues Find out if the write will trigger a wait due to writeback. If yes, return -EAGAIN. Return -EINVAL for buffered AIO: there are multiple causes of delay such as page locks, dirty throttling logic, page loading from disk etc. which cannot be taken care of. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Christoph Hellwig --- mm/filemap.c | 17 ++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index ca3031f505f2..fd7d175b3dee 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2673,6 +2673,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) pos = iocb->ki_pos; + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + if (limit != RLIM_INFINITY) { if (iocb->ki_pos >= limit) { send_sig(SIGXFSZ, current, 0); @@ -2742,9 +2745,17 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_SHIFT; - written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); - if (written) - goto out; + if (iocb->ki_flags & IOCB_NOWAIT) { + /* If there are pages to writeback, return */ + if (filemap_range_has_page(inode->i_mapping, pos, + pos + iov_iter_count(from))) + return -EAGAIN; + } else { + written = filemap_write_and_wait_range(mapping, pos, + pos + write_len - 1); + if (written) + goto out; + } /* * After a write we want buffered reads to be sure to go to disk to get -- 2.12.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/10] fs: Introduce IOMAP_NOWAIT
From: Goldwyn Rodrigues IOCB_NOWAIT translates to IOMAP_NOWAIT for iomaps. This is used by XFS in the XFS patch. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Christoph Hellwig --- fs/iomap.c| 2 ++ include/linux/iomap.h | 1 + 2 files changed, 3 insertions(+) diff --git a/fs/iomap.c b/fs/iomap.c index 141c3cd55a8b..d1c81753d411 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -885,6 +885,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } else { dio->flags |= IOMAP_DIO_WRITE; flags |= IOMAP_WRITE; + if (iocb->ki_flags & IOCB_NOWAIT) + flags |= IOMAP_NOWAIT; } if (mapping->nrpages) { diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 7291810067eb..53f6af89c625 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -51,6 +51,7 @@ struct iomap { #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ #define IOMAP_FAULT (1 << 3) /* mapping for page fault */ #define IOMAP_DIRECT (1 << 4) /* direct I/O */ +#define IOMAP_NOWAIT (1 << 5) /* Don't wait for writeback */ struct iomap_ops { /* -- 2.12.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/10] xfs: nowait aio support
From: Goldwyn RodriguesIf IOCB_NOWAIT is set, bail if the i_rwsem is not lockable immediately. IF IOMAP_NOWAIT is set, return EAGAIN in xfs_file_iomap_begin if it needs allocation either due to file extension, writing to a hole, or COW or waiting for other DIOs to finish. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_file.c | 19 ++- fs/xfs/xfs_iomap.c | 17 + 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 35703a801372..b307940e7d56 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -541,8 +541,11 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - xfs_ilock(ip, iolock); - + if (!xfs_ilock_nowait(ip, iolock)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, iolock); + } ret = xfs_file_aio_write_checks(iocb, from, ); if (ret) goto out; @@ -553,9 +556,15 @@ xfs_file_dio_aio_write( * otherwise demote the lock if we had to take the exclusive lock * for other reasons in xfs_file_aio_write_checks. */ - if (unaligned_io) - inode_dio_wait(inode); - else if (iolock == XFS_IOLOCK_EXCL) { + if (unaligned_io) { + /* If we are going to wait for other DIO to finish, bail */ + if (iocb->ki_flags & IOCB_NOWAIT) { + if (atomic_read(>i_dio_count)) + return -EAGAIN; + } else { + inode_dio_wait(inode); + } + } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 288ee5b840d7..9baa65eeae9e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1015,6 +1015,15 @@ xfs_file_iomap_begin( if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { if (flags & IOMAP_DIRECT) { + /* +* A reflinked inode will result in CoW alloc. +* FIXME: It could still overwrite on unshared extents +* and not need allocation. 
+*/ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; + } /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, , , ); @@ -1032,6 +1041,14 @@ xfs_file_iomap_begin( if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, , nimaps)) { /* +* If nowait is set bail since we are going to make +* allocations. +*/ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; + } + /* * We cap the maximum length we map here to MAX_WRITEBACK_PAGES * pages to keep the chunks of work done where somewhat symmetric * with the work writeback does. This is a completely arbitrary -- 2.12.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 08/10] ext4: nowait aio support
From: Goldwyn RodriguesReturn EAGAIN if any of the following checks fail for direct I/O: + i_rwsem is lockable + Writing beyond end of file (will trigger allocation) + Blocks are not allocated at the write location Signed-off-by: Goldwyn Rodrigues Reviewed-by: Jan Kara --- fs/ext4/file.c | 20 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index cefa9835f275..2efdc6d4d3e8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -216,7 +216,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_dax_write_iter(iocb, from); #endif - inode_lock(inode); + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -235,9 +241,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->private = /* Check whether we do a DIO overwrite or not */ - if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && - ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) - overwrite = 1; + if (o_direct && !unaligned_aio) { + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { + if (ext4_should_dioread_nolock(inode)) + overwrite = 1; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + } ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); -- 2.12.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/10] fs: return on congested block device
From: Goldwyn RodriguesA new bio operation flag REQ_NOWAIT is introduced to identify bio's orignating from iocb with IOCB_NOWAIT. This flag indicates to return immediately if a request cannot be made instead of retrying. Stacked devices such as md (the ones with make_request_fn hooks) currently are not supported because it may block for housekeeping. For example, an md can have a part of the device suspended. For this reason, only request based devices are supported. In the future, this feature will be expanded to stacked devices by teaching them how to handle the REQ_NOWAIT flags. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Christoph Hellwig --- block/blk-core.c | 24 ++-- block/blk-mq-sched.c | 3 +++ block/blk-mq.c| 4 fs/direct-io.c| 10 -- include/linux/bio.h | 6 ++ include/linux/blk_types.h | 2 ++ 6 files changed, 45 insertions(+), 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index d772c221cc17..effe934b806b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1232,6 +1232,11 @@ static struct request *get_request(struct request_queue *q, unsigned int op, if (!IS_ERR(rq)) return rq; + if (op & REQ_NOWAIT) { + blk_put_rl(rl); + return ERR_PTR(-EAGAIN); + } + if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; @@ -1870,6 +1875,17 @@ generic_make_request_checks(struct bio *bio) goto end_io; } + /* +* For a REQ_NOWAIT based request, return -EOPNOTSUPP +* if queue does not have QUEUE_FLAG_NOWAIT_SUPPORT set +* and if it is not a request based queue. 
+*/ + + if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) { + err = -EOPNOTSUPP; + goto end_io; + } + part = bio->bi_bdev->bd_part; if (should_fail_request(part, bio->bi_iter.bi_size) || should_fail_request(_to_disk(part)->part0, @@ -2021,7 +2037,7 @@ blk_qc_t generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - if (likely(blk_queue_enter(q, false) == 0)) { + if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ @@ -2046,7 +2062,11 @@ blk_qc_t generic_make_request(struct bio *bio) bio_list_merge(_list_on_stack[0], ); bio_list_merge(_list_on_stack[0], _list_on_stack[1]); } else { - bio_io_error(bio); + if (unlikely(!blk_queue_dying(q) && + (bio->bi_opf & REQ_NOWAIT))) + bio_wouldblock_error(bio); + else + bio_io_error(bio); } bio = bio_list_pop(_list_on_stack[0]); } while (bio); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c974a1bbf4cb..019d881d62b7 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -119,6 +119,9 @@ struct request *blk_mq_sched_get_request(struct request_queue *q, if (likely(!data->hctx)) data->hctx = blk_mq_map_queue(q, data->ctx->cpu); + if (op & REQ_NOWAIT) + data->flags |= BLK_MQ_REQ_NOWAIT; + if (e) { data->flags |= BLK_MQ_REQ_INTERNAL; diff --git a/block/blk-mq.c b/block/blk-mq.c index c7836a1ded97..d7613ae6a269 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1538,6 +1538,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, ); if (unlikely(!rq)) { __wbt_done(q->rq_wb, wb_acct); + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); return BLK_QC_T_NONE; } @@ -1662,6 +1664,8 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, ); if (unlikely(!rq)) { __wbt_done(q->rq_wb, wb_acct); + if 
(bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); return BLK_QC_T_NONE; } diff --git a/fs/direct-io.c b/fs/direct-io.c index a04ebea77de8..139ebd5ae1c7 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -480,8 +480,12 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) unsigned i; int err; - if (bio->bi_error) - dio->io_error = -EIO; + if (bio->bi_error) { +
[PATCH 02/10] fs: Introduce filemap_range_has_page()
From: Goldwyn Rodriguesfilemap_range_has_page() return true if the file's mapping has a page within the range mentioned. This function will be used to check if a write() call will cause a writeback of previous writes. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Christoph Hellwig --- include/linux/fs.h | 2 ++ mm/filemap.c | 33 + 2 files changed, 35 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 869c9a6fe58d..2e6fc6a23f91 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2513,6 +2513,8 @@ extern int filemap_fdatawait(struct address_space *); extern void filemap_fdatawait_keep_errors(struct address_space *); extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); +extern int filemap_range_has_page(struct address_space *, loff_t lstart, + loff_t lend); extern int filemap_write_and_wait(struct address_space *mapping); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); diff --git a/mm/filemap.c b/mm/filemap.c index 1694623a6289..fae5a361befb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -376,6 +376,39 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); +/** + * filemap_range_has_page - check if a page exists in range. + * @mapping: address space structure to wait for + * @start_byte:offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Find at least one page in the range supplied, usually used to check if + * direct writing in this range will trigger a writeback. 
+ */ +int filemap_range_has_page(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + pgoff_t index = start_byte >> PAGE_SHIFT; + pgoff_t end = end_byte >> PAGE_SHIFT; + struct pagevec pvec; + int ret; + + if (end_byte < start_byte) + return 0; + + if (mapping->nrpages == 0) + return 0; + + pagevec_init(, 0); + ret = pagevec_lookup(, mapping, index, 1); + if (!ret) + return 0; + ret = (pvec.pages[0]->index <= end); + pagevec_release(); + return ret; +} +EXPORT_SYMBOL(filemap_range_has_page); + static int __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { -- 2.12.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/10] btrfs: nowait aio support
From: Goldwyn RodriguesReturn EAGAIN if any of the following checks fail + i_rwsem is not lockable + NODATACOW or PREALLOC is not set + Cannot nocow at the desired location + Writing beyond end of file which is not allocated Signed-off-by: Goldwyn Rodrigues Acked-by: David Sterba --- fs/btrfs/file.c | 25 - fs/btrfs/inode.c | 3 +++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 520cb7230b2d..a870e5dd2b4d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1823,12 +1823,29 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, ssize_t num_written = 0; bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); ssize_t err; - loff_t pos; - size_t count; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); loff_t oldsize; int clean_page = 0; - inode_lock(inode); + if ((iocb->ki_flags & IOCB_NOWAIT) && + (iocb->ki_flags & IOCB_DIRECT)) { + /* Don't sleep on inode rwsem */ + if (!inode_trylock(inode)) + return -EAGAIN; + /* +* We will allocate space in case nodatacow is not set, +* so bail +*/ + if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) || + check_can_nocow(BTRFS_I(inode), pos, ) <= 0) { + inode_unlock(inode); + return -EAGAIN; + } + } else + inode_lock(inode); + err = generic_write_checks(iocb, from); if (err <= 0) { inode_unlock(inode); @@ -1862,8 +1879,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ update_time_for_write(inode); - pos = iocb->ki_pos; - count = iov_iter_count(from); start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); if (start_pos > oldsize) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5e71f1ea3391..47d3fcd86979 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8625,6 +8625,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.overwrite = 1; inode_unlock(inode); relock = true; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = 
-EAGAIN; + goto out; } ret = btrfs_delalloc_reserve_space(inode, offset, count); if (ret) -- 2.12.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/10] fs: Use RWF_* flags for AIO operations
From: Goldwyn Rodriguesaio_rw_flags is introduced in struct iocb (using aio_reserved1) which will carry the RWF_* flags. We cannot use aio_flags because they are not checked for validity which may break existing applications. Note, the only place RWF_HIPRI comes in effect is dio_await_one(). All the rest of the locations, aio code return -EIOCBQUEUED before the checks for RWF_HIPRI. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Christoph Hellwig --- fs/aio.c | 8 +++- include/uapi/linux/aio_abi.h | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index f52d925ee259..020fa0045e3c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1541,7 +1541,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, ssize_t ret; /* enforce forwards compatibility on users */ - if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { + if (unlikely(iocb->aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } @@ -1586,6 +1586,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->common.ki_flags |= IOCB_EVENTFD; } + ret = kiocb_set_rw_flags(>common, iocb->aio_rw_flags); + if (unlikely(ret)) { + pr_debug("EINVAL: aio_rw_flags\n"); + goto out_put_req; + } + ret = put_user(KIOCB_KEY, _iocb->aio_key); if (unlikely(ret)) { pr_debug("EFAULT: aio_key\n"); diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index bb2554f7fbd1..a2d4a8ac94ca 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -79,7 +79,7 @@ struct io_event { struct iocb { /* these are internal to the kernel/libc. 
*/ __u64 aio_data; /* data to be returned in event's data */ - __u32 PADDED(aio_key, aio_reserved1); + __u32 PADDED(aio_key, aio_rw_flags); /* the kernel sets aio_key to the req # */ /* common fields */ -- 2.12.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/10 v8] No wait AIO
Formerly known as non-blocking AIO. This series adds a nonblocking feature to asynchronous I/O writes. io_submit() can be delayed because of a number of reasons: - Block allocation for files - Data writebacks for direct I/O - Sleeping because of waiting to acquire i_rwsem - Congested block device The goal of the patch series is to return -EAGAIN/-EWOULDBLOCK if any of these conditions are met. This way userspace can push most of the write()s to the kernel to the best of its ability to complete and if it returns -EAGAIN, can defer it to another thread. In order to enable this, IOCB_RW_FLAG_NOWAIT is introduced in uapi/linux/aio_abi.h. If set for aio_rw_flags, it translates to IOCB_NOWAIT for struct iocb, REQ_NOWAIT for bio.bi_opf and IOMAP_NOWAIT for iomap. aio_rw_flags is a new flag replacing aio_reserved1. We could not use aio_flags because it is not currently checked for invalidity in the kernel. This feature is provided for direct I/O of asynchronous I/O only. I have tested it against xfs, ext4, and btrfs while I intend to add more filesystems. The nowait feature is for request based devices. In the future, I intend to add support to stacked devices such as md. Applications will have to check supportability by sending an async direct write and any other error besides -EAGAIN would mean it is not supported. First two patches are prep patches into nowait I/O. Changes since v1: + changed name from _NONBLOCKING to *_NOWAIT + filemap_range_has_page call moved closer to (just before) calling filemap_write_and_wait_range(). + BIO_NOWAIT limited to get_request() + XFS fixes - included reflink - use of xfs_ilock_nowait() instead of a XFS_IOLOCK_NONBLOCKING flag - Translate the flag through IOMAP_NOWAIT (iomap) to check for block allocation for the file. 
+ ext4 coding style Changes since v2: + Using aio_reserved1 as aio_rw_flags instead of aio_flags + blk-mq support + xfs uptodate with kernel and reflink changes Changes since v3: + Added FS_NOWAIT, which is set if the filesystem supports NOWAIT feature. + Checks in generic_make_request() to make sure BIO_NOWAIT comes in for async direct writes only. + Added QUEUE_FLAG_NOWAIT, which is set if the device supports BIO_NOWAIT. This is added (rather not set) to block devices such as dm/md currently. Changes since v4: + Ported AIO code to use RWF_* flags. Check for RWF_* flags in generic_file_write_iter(). + Changed IOCB_RW_FLAGS_NOWAIT to RWF_NOWAIT. Changes since v5: + BIO_NOWAIT to REQ_NOWAIT + Common helper for RWF flags. Changes since v6: + REQ_NOWAIT will be ignored for request based devices since they cannot block. So, removed QUEUE_FLAG_NOWAIT since it is not required in the current implementation. It will be resurrected when we program for stacked devices. + changed kiocb_rw_flags() to kiocb_set_rw_flags() in order to accommodate errors. Moved checks in the function. Changes since v7: + split patches into prep so the main patches are smaller and easier to understand + All patches are reviewed or acked! -- Goldwyn -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/10] fs: Separate out kiocb flags setup based on RWF_* flags
From: Goldwyn RodriguesSigned-off-by: Goldwyn Rodrigues Reviewed-by: Christoph Hellwig --- fs/read_write.c| 12 +++- include/linux/fs.h | 14 ++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index c4f88afbc67f..362f91cd8d66 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -678,16 +678,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, struct kiocb kiocb; ssize_t ret; - if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC)) - return -EOPNOTSUPP; - init_sync_kiocb(, filp); - if (flags & RWF_HIPRI) - kiocb.ki_flags |= IOCB_HIPRI; - if (flags & RWF_DSYNC) - kiocb.ki_flags |= IOCB_DSYNC; - if (flags & RWF_SYNC) - kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC); + ret = kiocb_set_rw_flags(, flags); + if (ret) + return ret; kiocb.ki_pos = *ppos; if (type == READ) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7251f7bb45e8..869c9a6fe58d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3049,6 +3049,20 @@ static inline int iocb_flags(struct file *file) return res; } +static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags) +{ + if (unlikely(flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))) + return -EOPNOTSUPP; + + if (flags & RWF_HIPRI) + ki->ki_flags |= IOCB_HIPRI; + if (flags & RWF_DSYNC) + ki->ki_flags |= IOCB_DSYNC; + if (flags & RWF_SYNC) + ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC); + return 0; +} + static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; -- 2.12.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH v1 00/30] fs: inode->i_version rework and optimization
On Wed, Apr 05, 2017 at 02:14:09PM -0400, J. Bruce Fields wrote: > On Wed, Apr 05, 2017 at 10:05:51AM +0200, Jan Kara wrote: > > 1) Keep i_version as is, make clients also check for i_ctime. > > That would be a protocol revision, which we'd definitely rather avoid. > > But can't we accomplish the same by using something like > > ctime * (some constant) + i_version > > ? > > >Pro: No on-disk format changes. > >Cons: After a crash, i_version can go backwards (but when file changes > >i_version, i_ctime pair should be still different) or not, data can be > >old or not. > > This is probably good enough for NFS purposes: typically on an NFS > filesystem, results of a read in the face of a concurrent write open are > undefined. And writers sync before close. > > So after a crash with a dirty inode, we're in a situation where an NFS > client still needs to resend some writes, sync, and close. I'm OK with > things being inconsistent during this window. > > I do expect things to return to normal once that client has resent its > writes--hence the worry about actually reusing old values after boot > (such as if i_version regresses on boot and then increments back to the > same value after further writes). Factoring in ctime fixes that. So for now I'm thinking of just doing something like the following. Only nfsd needs it for now, but it could be moved to a vfs helper for statx, or for individual filesystems that want to do something different. (The NFSv4 client will want to use the server's change attribute instead, I think. And other filesystems might want to try something more ambitious like Neil's proposal.) --b. 
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 12feac6ee2fd..9636c9a60aba 100644 diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index f84fe6bf9aee..14f09f1ef605 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -240,6 +240,16 @@ fh_clear_wcc(struct svc_fh *fhp) fhp->fh_pre_saved = false; } +static inline u64 nfsd4_change_attribute(struct inode *inode) +{ + u64 chattr; + + chattr = inode->i_ctime.tv_sec << 30; + chattr += inode->i_ctime.tv_nsec; + chattr += inode->i_version; + return chattr; +} + /* * Fill in the pre_op attr for the wcc data */ @@ -253,7 +263,7 @@ fill_pre_wcc(struct svc_fh *fhp) fhp->fh_pre_mtime = inode->i_mtime; fhp->fh_pre_ctime = inode->i_ctime; fhp->fh_pre_size = inode->i_size; - fhp->fh_pre_change = inode->i_version; + fhp->fh_pre_change = nfsd4_change_attribute(inode); fhp->fh_pre_saved = true; } } --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -260,7 +260,7 @@ void fill_post_wcc(struct svc_fh *fhp) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, >fh_post_attr); - fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version; + fhp->fh_post_change = nfsd4_change_attribute(d_inode(fhp->fh_dentry)); if (err) { fhp->fh_post_saved = false; /* Grab the ctime anyway - set_change_info might use it */ diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 26780d53a6f9..a09532d4a383 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1973,7 +1973,7 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time)); *p++ = 0; } else if (IS_I_VERSION(inode)) { - p = xdr_encode_hyper(p, inode->i_version); + p = xdr_encode_hyper(p, nfsd4_change_attribute(inode)); } else { *p++ = cpu_to_be32(stat->ctime.tv_sec); *p++ = cpu_to_be32(stat->ctime.tv_nsec); -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3] btrfs: relocation: Enhance kernel error output for relocation
On Wed, May 10, 2017 at 11:39:40AM +0800, Qu Wenruo wrote: > > > At 05/10/2017 01:29 AM, David Sterba wrote: > > On Wed, Feb 15, 2017 at 09:39:05AM +0800, Qu Wenruo wrote: > >> When balance(relocation) fails, btrfs-progs will report like: > >> > >> ERROR: error during balancing '/mnt/scratch': Input/output error > >> There may be more info in syslog - try dmesg | tail > >> > >> However kernel can't provide much useful info in many cases to locate the > >> problem. > >> > >> This patch will add error messages in relocation to help user and > >> developer to locate the problem. > > > > I think it's too verbose for a user, and not really helpful what to do > > after such error message appears in the log. The errors translate name > > of the last function that failed, so the user would need to be familiar > > with the inner workings of the balance to make sense of it. > > Yes, normal user may never need such verbose output. > > But it will help developers or support guys to wipe out some really easy > cases. > > > > > The messages may make sense to a developer, but then it's not necessary > > to print them as btrfs_err, but btrfs_debug. > > I also considered btrfs_debug, but the problem is btrfs_debug() depends > on either CONFIG_DYNAMIC_DEBUG or DEBUG. > > So when problem happens in real world, we're too late to ensure such output. Then I think we need some way that is not as noisy as btrfs_err but also compiled-in by default unlike the dynamic debug. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/8] nowait aio: return on congested block device
On 05/11/2017 02:44 AM, Christoph Hellwig wrote: > Looks fine, > > Reviewed-by: Christoph Hellwig> > Although lifting the make_request limit is something a lot of users > would appreciate in the near future.. > Yes, I understand. That will be on my todo list next on priority. -- Goldwyn -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Qgroup reserved space like in ZFS?
Hello everyone, I just wanted to ask a short question as I couldn't find a clear answer anywhere on the net, yet: Is it currently possible to reserve space for a BTRFS subvolume? Following example: I have two subvolumes, one for root and one for home. I'd like to reserve 2GB of space for my root volume so even if my home volume fills up, it can never starve my root subvolume. But I don't want to set a fixed limit on my home partition, I want both subvolumes to be able to grow and shrink dynamically. I use that functionality a lot on ZFS-based systems like FreeNAS. In the web-interface of FreeNAS one can easily configure things like that. If I get it right, qgroups actually already create things like a global reserve. Therefore the functionality seems to be there. Is there a way to use it, just like the limit functionality? Or is it planned? Thanks in advance and regards, Robert signature.asc Description: OpenPGP digital signature
Re: errno=-28 No space left, with kernel backtrace (blocking bug)
Up plz, I can work with this bug. On 05/11/17 01:39, alpha_one_x86 wrote: > Hi, this bug is very blocking for me: > > https://bugzilla.kernel.org/show_bug.cgi?id=195257 > > The server is backup server, I btrfs receive (with and without -p), and > of course btrfs subvolume delete > The volume is 70TB, then I use space_cache=v2 > > Cheers, > > -- alpha_one_x86/BRULE Herman Main developer of Supercopier/Ultracopier/CatchChallenger, Esourcing and server management IT, OS, technologies, research & development, security and business department -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Backing up BTRFS metadata
On 11/05/17 18:19, Chris Murphy wrote: btrfs-image Looks great, thank you! -- With Best Regards, Marat Khalili -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Backing up BTRFS metadata
On Thu, May 11, 2017 at 8:56 AM, Marat Khalili wrote: > Sorry if question sounds unorthodox, Is there some simple way to read (and > backup) all BTRFS metadata from volume? btrfs-image -- Chris Murphy -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Backing up BTRFS metadata
Sorry if the question sounds unorthodox. Is there some simple way to read (and backup) all BTRFS metadata from volume? The motivation of course is possibility to quickly recover from catastrophic filesystem failures on a logical level. Some small amount of actual data that this metadata references may be overwritten between backup and restore moments, but due to checksumming it can easily be caught (and either individually restored from backup or discarded). -- With Best Regards, Marat Khalili -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] Btrfs: use bio_clone_bioset_partial to simplify DIO submit
On Mon, Apr 17, 2017 at 06:16:23PM -0700, Liu Bo wrote: > Currently when mapping bio to limit bio to a single stripe length, we > split bio by adding page to bio one by one, but later we don't modify > the vector of bio at all, thus we can use bio_clone_fast to use the > original bio vector directly. > > Signed-off-by: Liu Bo> --- > fs/btrfs/extent_io.c | 15 +++ > fs/btrfs/extent_io.h | 1 + > fs/btrfs/inode.c | 122 > +++ > 3 files changed, 62 insertions(+), 76 deletions(-) > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > index 0d4aea4..1b7156c 100644 > --- a/fs/btrfs/extent_io.c > +++ b/fs/btrfs/extent_io.c > @@ -2726,6 +2726,21 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, > unsigned int nr_iovecs) > return bio; > } > > +struct bio *btrfs_bio_clone_partial(struct bio *orig, gfp_t gfp_mask, int > offset, int size) > +{ > + struct bio *bio; > + > + bio = bio_clone_fast(orig, gfp_mask, btrfs_bioset); > + if (bio) { Please switch that to bio = ...; if (!bio) return NULL; (the rest) return bio; > + struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); > + btrfs_bio->csum = NULL; > + btrfs_bio->csum_allocated = NULL; > + btrfs_bio->end_io = NULL; > + > + bio_trim(bio, (offset >> 9), (size >> 9)); Hm, so bio_trim also uses ints for the parameters, let's stick to that. 
> + } > + return bio; > +} > > static int __must_check submit_one_bio(struct bio *bio, int mirror_num, > unsigned long bio_flags) > diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h > index 3e4fad4..3b2bc88 100644 > --- a/fs/btrfs/extent_io.h > +++ b/fs/btrfs/extent_io.h > @@ -460,6 +460,7 @@ btrfs_bio_alloc(struct block_device *bdev, u64 > first_sector, int nr_vecs, > gfp_t gfp_flags); > struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs); > struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); > +struct bio *btrfs_bio_clone_partial(struct bio *orig, gfp_t gfp_mask, int > offset, int size); line over 80 chars > > struct btrfs_fs_info; > struct btrfs_inode; > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index a18510b..6215720 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -8230,16 +8230,6 @@ static void btrfs_end_dio_bio(struct bio *bio) > bio_put(bio); > } > > -static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, > -u64 first_sector, gfp_t gfp_flags) > -{ > - struct bio *bio; > - bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags); > - if (bio) > - bio_associate_current(bio); > - return bio; > -} > - > static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode, >struct btrfs_dio_private *dip, >struct bio *bio, > @@ -8329,24 +8319,22 @@ static int btrfs_submit_direct_hook(struct > btrfs_dio_private *dip, > struct btrfs_root *root = BTRFS_I(inode)->root; > struct bio *bio; > struct bio *orig_bio = dip->orig_bio; > - struct bio_vec *bvec; > u64 start_sector = orig_bio->bi_iter.bi_sector; > u64 file_offset = dip->logical_offset; > - u64 submit_len = 0; > u64 map_length; > - u32 blocksize = fs_info->sectorsize; > int async_submit = 0; > - int nr_sectors; > + int submit_len; > + int clone_offset = 0; > + int clone_len; > int ret; > - int i, j; > > - map_length = orig_bio->bi_iter.bi_size; > + submit_len = map_length = orig_bio->bi_iter.bi_size; Please do 2 separate 
initialization statements. > ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, > &map_length, NULL, 0); > if (ret) > return -EIO; > > - if (map_length >= orig_bio->bi_iter.bi_size) { > + if (map_length >= submit_len) { > bio = orig_bio; > dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; > goto submit; > @@ -8358,70 +8346,52 @@ static int btrfs_submit_direct_hook(struct > btrfs_dio_private *dip, > else > async_submit = 1; > > - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); > - if (!bio) > - return -ENOMEM; > - > - bio->bi_opf = orig_bio->bi_opf; > - bio->bi_private = dip; > - bio->bi_end_io = btrfs_end_dio_bio; > - btrfs_io_bio(bio)->logical = file_offset; > + /* bio split */ > atomic_inc(&dip->pending_bios); > + while (submit_len > 0) { > + /* map_length < submit_len, it's a int */ > + clone_len = min(submit_len, (int)map_length); The types are mixed, map_length is u64 and cannot be easily switched to
Re: runtime btrfsck
Roman Mamedov posted on Wed, 10 May 2017 13:52:55 +0500 as excerpted: > So even with a minor corruption (something wonky in just ONE block of a > multi-terabyte FS) the answer is way too often "nuke the entire thing > and restore from backups". Just another case where my "keep it small enough to be maintainable" policy triggers. If that double-digit-TB fs is instead broken along functional/logical lines into say a dozen 1 TB each fs and that single block is corrupted, it can only be corrupted in one of them, so 11 of the dozen will be fine, and nuking to restore from backups just the single 1 TB filesystem of a dozen, instead of the single 12-TB fs, isn't such a big deal -- it remains realistically maintainable. Of course if at your scale 12 TB... or 12000 TB... is considered maintainable, great, but then we'd be unlikely to be having this discussion... -- Duncan - List replies preferred. No HTML msgs. "Every nonfree program has a lord, a master -- and if you use the program, he is your master." Richard Stallman -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
btrfs list corruption and soft lockups while testing writeback error handling
I finally got my writeback error handling test to work on btrfs (thanks, Chris!), by making the filesystem stripe the data and mirror the metadata across two devices. The test passes now, but on one run, I got the following list corruption warning and then a soft lockup (which is probably fallout from the list corruption). I ran the test several times before and since then without this failure, so I don't have a clear reproducer. The kernel in this instance is basically a v4.11 kernel with my pile of writeback error handling patches on top: https://git.samba.org/?p=jlayton/linux.git;a=shortlog;h=refs/heads/wberr It may be that they are a contributing factor, but this smells more like a bug down in btrfs. Let me know if you need other info: --8<--- [ 438.341942] run fstests generic/999 at 2017-05-11 07:03:39 [ 439.453293] BTRFS: device fsid 08e9b22b-44a1-4954-a1b0-03c7c0537831 devid 1 transid 3 /dev/vda8 [ 439.465918] BTRFS: device fsid 08e9b22b-44a1-4954-a1b0-03c7c0537831 devid 2 transid 3 /dev/vda7 [ 439.603578] device-mapper: ioctl: device doesn't appear to be in the dev hash table. [ 439.762422] BTRFS info (device dm-4): disk space caching is enabled [ 439.763808] BTRFS info (device dm-4): has skinny extents [ 439.764979] BTRFS info (device dm-4): flagging fs with big metadata feature [ 439.785879] BTRFS info (device dm-4): creating UUID tree [ 439.974266] BTRFS info (device dm-4): disk space caching is enabled [ 439.975783] BTRFS info (device dm-4): has skinny extents [ 440.229263] Buffer I/O error on dev dm-4, logical block 2621424, async page read [ 440.239970] BTRFS error (device dm-4): bdev /dev/mapper/error-test errs: wr 1, rd 0, flush 0, corrupt 0, gen 0 [ 440.242459] [ cut here ] [ 440.243276] WARNING: CPU: 0 PID: 5162 at lib/list_debug.c:28 __list_add_valid+0x69/0xa0 [ 440.244338] list_add corruption. prev->next should be next (8dd531056b08), but was a93242807e90. (prev=a93242807e90). 
[ 440.245939] Modules linked in: btrfs xor raid6_pq binfmt_misc ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_broute bridge stp llc ebtable_nat ip6table_mangle ip6table_security ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_raw iptable_mangle iptable_security iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core crct10dif_pclmul crc32_pclmul nfsd ghash_clmulni_intel ppdev snd_hwdep snd_pcm acpi_cpufreq snd_timer tpm_tis snd parport_pc tpm_tis_core parport pcspkr tpm i2c_piix4 auth_rpcgss soundcore floppy joydev qemu_fw_cfg virtio_balloon nfs_acl lockd grace sunrpc xfs libcrc32c qxl drm_kms_helper virtio_net [ 440.254739] virtio_blk virtio_console virtio_rng ttm drm crc32c_intel virtio_pci virtio_ring ata_generic virtio serio_raw pata_acpi [ 440.256352] CPU: 0 PID: 5162 Comm: fsync-err Not tainted 4.11.0+ #52 [ 440.257534] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.fc25 04/01/2014 [ 440.258584] Call Trace: [ 440.259096] dump_stack+0x63/0x86 [ 440.259618] __warn+0xcb/0xf0 [ 440.260116] warn_slowpath_fmt+0x5a/0x80 [ 440.260798] ? check_parent_dirs_for_sync+0x109/0x140 [btrfs] [ 440.261755] __list_add_valid+0x69/0xa0 [ 440.262442] btrfs_log_inode_parent+0x25c/0x9f0 [btrfs] [ 440.263323] ? btrfs_releasepage+0x20/0x20 [btrfs] [ 440.264059] ? wait_current_trans+0x2e/0xf0 [btrfs] [ 440.264792] ? kmem_cache_alloc+0x195/0x1b0 [ 440.265455] ? 
join_transaction+0x27/0x420 [btrfs] [ 440.266175] btrfs_log_dentry_safe+0x60/0x80 [btrfs] [ 440.266965] btrfs_sync_file+0x2b7/0x400 [btrfs] [ 440.267655] vfs_fsync_range+0x49/0xb0 [ 440.268266] do_fsync+0x3d/0x70 [ 440.268806] SyS_fsync+0x10/0x20 [ 440.269347] entry_SYSCALL_64_fastpath+0x1a/0xa9 [ 440.270033] RIP: 0033:0x7f7983af1b70 [ 440.270607] RSP: 002b:7ffe13b3aa18 EFLAGS: 0246 ORIG_RAX: 004a [ 440.271661] RAX: ffda RBX: 0005 RCX: 7f7983af1b70 [ 440.272623] RDX: 00010401 RSI: 023ef030 RDI: 0004 [ 440.273696] RBP: 7ffe13b3cc77 R08: R09: 7f79845364c8 [ 440.274808] R10: 0008 R11: 0246 R12: 023ef030 [ 440.275873] R13: 0005 R14: R15: [ 440.276995] ---[ end trace 878ee9789ed2d63b ]--- [ 440.278476] BTRFS error (device dm-4): bdev /dev/mapper/error-test errs: wr 2, rd 0, flush 0, corrupt 0, gen 0 [ 440.282362] BTRFS error (device dm-4): bdev /dev/mapper/error-test errs: wr 3, rd 0, flush 0, corrupt 0, gen 0 [ 440.300180] BTRFS warning (device dm-4): lost page write due to IO error on /dev/mapper/error-test [ 440.301502] BTRFS error (device dm-4): bdev /dev/mapper/error-test errs: wr 4, rd 0, flush
parent transid verify failed
Hello, this is some btrfs-on-luks, USB hdd as blockdevice. I can't mount my btrfs anymore, getting continuously the same syslog error: - Last output repeated twice - May 11 07:58:25 [kernel] BTRFS error (device dm-3): failed to read block groups: -5 May 11 07:58:25 [kernel] BTRFS error (device dm-3): open_ctree failed May 11 07:58:31 [kernel] BTRFS info (device dm-3): use zlib compression May 11 07:58:31 [kernel] BTRFS info (device dm-3): enabling auto defrag May 11 07:58:31 [kernel] BTRFS info (device dm-3): disk space caching is enabled May 11 07:58:31 [kernel] BTRFS info (device dm-3): has skinny extents May 11 07:58:33 [kernel] BTRFS error (device dm-3): parent transid verify failed on 541635395584 wanted 10388 found 10385 This is the last part of btrfs check --repair (I know, highly experimental, but I didn't get an alternative solution on #btrfs) : rent transid verify failed on 541577035776 wanted 10388 found 10384 parent transid verify failed on 541577035776 wanted 10388 found 10384 parent transid verify failed on 541577035776 wanted 10388 found 10384 parent transid verify failed on 541577035776 wanted 10388 found 10384 parent transid verify failed on 541577035776 wanted 10388 found 10384 Chunk[256, 228, 429526089728]: length(1073741824), offset(429526089728), type(1) is not found in block group Chunk[256, 228, 430599831552]: length(1073741824), offset(430599831552), type(1) is not found in block group Chunk[256, 228, 431673573376]: length(1073741824), offset(431673573376), type(1) is not found in block group Chunk[256, 228, 434894798848]: length(1073741824), offset(434894798848), type(1) is not found in block group Chunk[256, 228, 435968540672]: length(1073741824), offset(435968540672), type(1) is not found in block group Chunk[256, 228, 437042282496]: length(1073741824), offset(437042282496), type(1) is not found in block group Chunk[256, 228, 438116024320]: length(1073741824), offset(438116024320), type(1) is not found in block group ref mismatch on 
[429497528320 40960] extent item 0, found 1 Backref 429497528320 parent 858210304 owner 0 offset 0 num_refs 0 not found in extent tree Incorrect local backref count on 429497528320 parent 858210304 owner 0 offset 0 found 1 wanted 0 back 0x37aaefc0 backpointer mismatch on [429497528320 40960] parent transid verify failed on 541635395584 wanted 10388 found 10385 Ignoring transid failure Failed to find [541635395584, 168, 16384] btrfs unable to find ref byte nr 541635395584 parent 0 root 2 owner 1 offset 0 failed to repair damaged filesystem, aborting How did that happen? Yesterday I sent a big snapshot from local drive to a slower USB drive via btrbk. That was already finished. However the USB drive was completely filled up to 99% and doing some IO apparently. Then I was not able to shutdown the machine. Shutdown was really slow, finally umounts were accomplished, services stopped, system shutdown almost finished, but no shutdown. I did a Sysreq- E I U S R B, no reboot. Sysreq-O did not even shut off. So as last consequence I disconnected power supply. The broken btrfs is actually only a snapshot receiver as backup. I would prefer to get it repaired. Seeing that btrfs is sensitive about filling up to 99% usage, I'm worried about my production btrfs. This is Gentoo-Linux, 4.10.14-ck, btrfs-progs-4.10.2. Best regards, Massimo -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH v3 5/6] btrfs: qgroup: Introduce extent changeset for qgroup reserve functions
At 05/11/2017 01:59 AM, Goldwyn Rodrigues wrote: On 05/09/2017 09:36 PM, Qu Wenruo wrote: Introduce a new parameter, struct extent_changeset for btrfs_qgroup_reserved_data() and its callers. Such extent_changeset was used in btrfs_qgroup_reserve_data() to record which range it reserved in current reserve, so it can free it at error path. The reason we need to export it to callers is, at buffered write error path, without knowing what exactly which range we reserved in current allocation, we can free space which is not reserved by us. This will lead to qgroup reserved space underflow. Reviewed-by: Chandan RajendraSigned-off-by: Qu Wenruo --- fs/btrfs/ctree.h | 6 -- fs/btrfs/extent-tree.c | 16 +++- fs/btrfs/extent_io.h | 34 ++ fs/btrfs/file.c| 12 +--- fs/btrfs/inode-map.c | 4 +++- fs/btrfs/inode.c | 18 ++ fs/btrfs/ioctl.c | 5 - fs/btrfs/qgroup.c | 41 + fs/btrfs/qgroup.h | 3 ++- fs/btrfs/relocation.c | 4 +++- 10 files changed, 113 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1e82516fe2d8..52a0147cd612 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2704,8 +2704,9 @@ enum btrfs_flush_state { COMMIT_TRANS= 6, }; -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); @@ -2723,7 +2724,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); +int btrfs_delalloc_reserve_space(struct 
inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4f62696131a6..782e0f5feb69 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3364,6 +3364,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; + struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; u64 num_pages = 0; @@ -3483,7 +3484,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, num_pages *= 16; num_pages *= PAGE_SIZE; - ret = btrfs_check_data_free_space(inode, 0, num_pages); + ret = btrfs_check_data_free_space(inode, _reserved, 0, num_pages); if (ret) goto out_put; @@ -3514,6 +3515,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, block_group->disk_cache_state = dcs; spin_unlock(_group->lock); + extent_changeset_free(data_reserved); return ret; } @@ -4282,7 +4284,8 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) * Will replace old btrfs_check_data_free_space(), but for patch split, * add a new function first and then replace it. */ -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret; @@ -4297,9 +4300,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. 
*/ - ret = btrfs_qgroup_reserve_data(inode, start, len); + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; return ret; } @@ -6140,11 +6145,12 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * Return
Re: [PATCH 7/8] nowait aio: xfs
Looks fine, Reviewed-by: Christoph Hellwig -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/8] nowait aio: return on congested block device
Looks fine, Reviewed-by: Christoph Hellwig. Although lifting the make_request limit is something a lot of users would appreciate in the near future.. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/8] nowait aio: Introduce RWF_NOWAIT
On Tue, May 09, 2017 at 07:22:13AM -0500, Goldwyn Rodrigues wrote: > From: Goldwyn Rodrigues> > This flag informs kernel to bail out if an AIO request will block > for reasons such as file allocations, or a writeback triggered, > or would block while allocating requests while performing > direct I/O. > > Unfortunately, aio_flags is not checked for validity, which would > break existing applications which have it set to anything besides zero > or IOCB_FLAG_RESFD. So, we are using aio_reserved1 and renaming it > to aio_rw_flags. > > RWF_NOWAIT is translated to IOCB_NOWAIT for iocb->ki_flags. > > The check for -EOPNOTSUPP is placed in generic_file_write_iter(). This > is called by most filesystems, either through fsops.write_iter() or through > the function defined by write_iter(). If not, we perform the check defined > by .write_iter() which is called for direct IO specifically. > > Filesystems xfs, btrfs and ext4 would be supported in the following patches. > > Signed-off-by: Goldwyn Rodrigues > --- > fs/9p/vfs_file.c| 3 +++ > fs/aio.c| 6 ++ > fs/ceph/file.c | 3 +++ > fs/cifs/file.c | 3 +++ > fs/fuse/file.c | 3 +++ > fs/nfs/direct.c | 3 +++ > fs/ocfs2/file.c | 3 +++ > include/linux/fs.h | 5 - > include/uapi/linux/fs.h | 1 + > mm/filemap.c| 3 +++ > 10 files changed, 32 insertions(+), 1 deletion(-) > > diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c > index 3de3b4a89d89..403681db7723 100644 > --- a/fs/9p/vfs_file.c > +++ b/fs/9p/vfs_file.c > @@ -411,6 +411,9 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter > *from) > loff_t origin; > int err = 0; > > + if (iocb->ki_flags & IOCB_NOWAIT) > + return -EOPNOTSUPP; > + > retval = generic_write_checks(iocb, from); > if (retval <= 0) > return retval; > diff --git a/fs/aio.c b/fs/aio.c > index 020fa0045e3c..ea9f8581d902 100644 > --- a/fs/aio.c > +++ b/fs/aio.c > @@ -1592,6 +1592,12 @@ static int io_submit_one(struct kioctx *ctx, struct > iocb __user *user_iocb, > goto out_put_req; > } > > + if 
((req->common.ki_flags & IOCB_NOWAIT) && > + !(req->common.ki_flags & IOCB_DIRECT)) { Weird indentation. Either align after the opening if brace: if ((req->common.ki_flags & IOCB_NOWAIT) && !(req->common.ki_flags & IOCB_DIRECT)) { or using two tabs: if ((req->common.ki_flags & IOCB_NOWAIT) && !(req->common.ki_flags & IOCB_DIRECT)) { if the first version looks confusing, but never using the same indentation level as the following code. Except for that the patch looks fine to me: Reviewed-by: Christoph Hellwig -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/8] nowait aio: return if direct write will trigger writeback
It might make sense to move filemap_range_has_page into a separate prep patch. Otherwise this looks fine: Reviewed-by: Christoph Hellwig -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/8] Use RWF_* flags for AIO operations
Please add subsystem prefixes to your subject lines, e.g. fs: for all the generic fs ones, xfs: for XFS, block: for block layer changes, etc. > > - if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC)) > - return -EOPNOTSUPP; > - > init_sync_kiocb(&kiocb, filp); > - if (flags & RWF_HIPRI) > - kiocb.ki_flags |= IOCB_HIPRI; > - if (flags & RWF_DSYNC) > - kiocb.ki_flags |= IOCB_DSYNC; > - if (flags & RWF_SYNC) > - kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC); > + ret = kiocb_set_rw_flags(&kiocb, flags); > + if (ret) > + return ret; And please split factoring out kiocb_set_rw_flags into a separate prep patch. Otherwise these changes look fine: Reviewed-by: Christoph Hellwig -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] btrfs: Convert fs_info->free_chunk_space to atomic64_t
The ->free_chunk_space variable is used to track the unallocated space and access to it is protected by a spinlock, which is not used for anything else. Make the code a bit self-explanatory by switching the variable to an atomic64_t type and kill the spinlock. Signed-off-by: Nikolay Borisov --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/disk-io.c | 3 +-- fs/btrfs/extent-tree.c | 4 +--- fs/btrfs/volumes.c | 26 +++--- 4 files changed, 10 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3e21211e99c3..2202dfdc7888 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -729,8 +729,7 @@ struct btrfs_fs_info { struct rb_root block_group_cache_tree; /* keep track of unallocated space */ - spinlock_t free_chunk_lock; - u64 free_chunk_space; + atomic64_t free_chunk_space; struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 061c1d1f774f..2ef80d562a54 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2626,7 +2626,6 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); - spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->qgroup_op_lock); @@ -2667,7 +2666,7 @@ int open_ctree(struct super_block *sb, fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; - fs_info->free_chunk_space = 0; + atomic64_set(&fs_info->free_chunk_space, 0); fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3ab1f88af038..f913c25b9a54 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4626,9 +4626,7 @@ static int can_overcommit(struct 
btrfs_root *root, used += space_info->bytes_may_use; - spin_lock(&fs_info->free_chunk_lock); - avail = fs_info->free_chunk_space; - spin_unlock(&fs_info->free_chunk_lock); + avail = atomic64_read(&fs_info->free_chunk_space); /* * If we have dup, raid1 or raid10 then only half of the free diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ab8a66d852f9..923a3591265c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2413,9 +2413,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path fs_info->fs_devices->total_devices++; fs_info->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += device->total_bytes; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(device->total_bytes, &fs_info->free_chunk_space); if (!blk_queue_nonrot(bdev_get_queue(bdev))) fs_info->fs_devices->rotating = 1; @@ -2850,9 +2848,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_bytes_used(device, device->bytes_used - dev_extent_len); - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space += dev_extent_len; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); } @@ -4379,9 +4375,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_device_set_total_bytes(device, new_size); if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; - spin_lock(&fs_info->free_chunk_lock); - fs_info->free_chunk_space -= diff; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_sub(diff, &fs_info->free_chunk_space); } mutex_unlock(&fs_info->chunk_mutex); @@ -4505,9 +4499,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_device_set_total_bytes(device, old_size); if (device->writeable) device->fs_devices->total_rw_bytes += diff; - spin_lock(&fs_info->free_chunk_lock); - 
fs_info->free_chunk_space += diff; - spin_unlock(&fs_info->free_chunk_lock); + atomic64_add(diff, &fs_info->free_chunk_space); mutex_unlock(&fs_info->chunk_mutex); } return ret; @@