[PATCH v5 4/4] btrfs: Refactor find_free_extent() loops update into find_free_extent_update_loop()

2018-11-01 Thread Qu Wenruo
We have a complex loop design in find_free_extent(): each loop has
different behavior, and some iterations even include new chunk allocation.

Instead of putting such a long piece of code into find_free_extent() and
making it harder to read, extract it into find_free_extent_update_loop().

With all the cleanups, the main find_free_extent() should be pretty
bare-bones:

find_free_extent()
|- Iterate through all block groups
|  |- Get a valid block group
|  |- Try to do clustered allocation in that block group
|  |- Try to do unclustered allocation in that block group
|  |- Check if the result is valid
|  |  |- If valid, then exit
|  |- Jump to next block group
|
|- Push harder to find free extents
   |- If not found, re-iterate all block groups
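
For reference, here is a rough, non-compilable C outline of the resulting
control flow (helper names are the ones introduced by this series; block
group validation, locking and error handling are omitted, see the actual
diff for the real code):

    /* Sketch only -- not the applied hunk. */
    static noinline int find_free_extent(...)
    {
            struct find_free_extent_ctl ffe_ctl = {0};
            int ret;

    search:
            list_for_each_entry(block_group,
                    &space_info->block_groups[ffe_ctl.index], list) {
                    /* get a valid block group (skip RO / wrong profile) */

                    ret = find_free_extent_clustered(block_group, last_ptr,
                                                     &ffe_ctl, &cluster_bg);
                    if (ret == -ENOENT)
                            ret = find_free_extent_unclustered(block_group,
                                            last_ptr, &ffe_ctl);
                    if (ret > 0)
                            continue;       /* nothing here, next block group */
                    if (ret == 0)
                            break;          /* found, check the result and exit */
                    /* -EAGAIN: re-search the same block group (not shown) */
            }

            /* push harder: kick caching, allocate a chunk, or give up */
            ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
                                               full_search, use_cluster);
            if (ret > 0)
                    goto search;    /* re-iterate all block groups */
            return ret;
    }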

Signed-off-by: Qu Wenruo 
Reviewed-by: Su Yue 
---
 fs/btrfs/extent-tree.c | 217 ++---
 1 file changed, 117 insertions(+), 100 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2c9f00cb8f26..c3f57dc30ab5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7262,7 +7262,9 @@ struct find_free_extent_ctl {
/* RAID index, converted from flags */
int index;
 
-   /* Current loop number */
+   /*
+* Current loop number, check find_free_extent_update_loop() for details
+*/
int loop;
 
/*
@@ -7463,6 +7465,117 @@ static int find_free_extent_unclustered(struct 
btrfs_block_group_cache *bg,
return 0;
 }
 
+/*
+ * Return >0 means caller needs to re-search for free extent
+ * Return 0 means we have the needed free extent.
+ * Return <0 means we failed to locate any free extent.
+ */
+static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
+   struct btrfs_free_cluster *last_ptr,
+   struct btrfs_key *ins,
+   struct find_free_extent_ctl *ffe_ctl,
+   int full_search, bool use_cluster)
+{
+   struct btrfs_root *root = fs_info->extent_root;
+   int ret;
+
+   if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
+   ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
+   ffe_ctl->orig_have_caching_bg = true;
+
+   if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
+ffe_ctl->have_caching_bg)
+   return 1;
+
+   if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
+   return 1;
+
+   if (ins->objectid) {
+   if (!use_cluster && last_ptr) {
+   spin_lock(&last_ptr->lock);
+   last_ptr->window_start = ins->objectid;
+   spin_unlock(&last_ptr->lock);
+   }
+   return 0;
+   }
+
+   /*
+* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
+*  caching kthreads as we move along
+* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+* LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+* LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+*  again
+*/
+   if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
+   ffe_ctl->index = 0;
+   if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
+   /*
+* We want to skip the LOOP_CACHING_WAIT step if we
+* don't have any uncached bgs and we've already done a
+* full search through.
+*/
+   if (ffe_ctl->orig_have_caching_bg || !full_search)
+   ffe_ctl->loop = LOOP_CACHING_WAIT;
+   else
+   ffe_ctl->loop = LOOP_ALLOC_CHUNK;
+   } else {
+   ffe_ctl->loop++;
+   }
+
+   if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
+   struct btrfs_trans_handle *trans;
+   int exist = 0;
+
+   trans = current->journal_info;
+   if (trans)
+   exist = 1;
+   else
+   trans = btrfs_join_transaction(root);
+
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   return ret;
+   }
+
+   ret = do_chunk_alloc(trans, ffe_ctl->flags,
+CHUNK_ALLOC_FORCE);
+
+   /*
+* If we can't allocate a new chunk we've already looped
+* through at least once, move on to the NO_EMPTY_SIZE
+* case.
+*/
+   if (ret == -ENOSPC)
+  

[PATCH v5 2/4] btrfs: Refactor clustered extent allocation into find_free_extent_clustered()

2018-11-01 Thread Qu Wenruo
We have two main methods to find free extents inside a block group:
1) clustered allocation
2) unclustered allocation

This patch will extract the clustered allocation into
find_free_extent_clustered() to make it a little easier to read.

Instead of jumping between different labels in find_free_extent(), the
helper function will use its return value to indicate different behavior.
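
Roughly, the call site in find_free_extent() then dispatches on that return
value along these lines (a sketch, not the literal hunk; the goto labels are
those already present in find_free_extent()):

    ret = find_free_extent_clustered(block_group, last_ptr,
                                     &ffe_ctl, &cluster_bg);
    if (ret == 0) {
            if (cluster_bg && cluster_bg != block_group) {
                    /* allocation came from the bg that owns the cluster */
                    btrfs_release_block_group(block_group, delalloc);
                    block_group = cluster_bg;
            }
            goto checks;                    /* ffe_ctl.found_offset is set */
    } else if (ret == -EAGAIN) {
            goto have_block_group;          /* re-search this block group */
    } else if (ret > 0) {
            goto loop;                      /* nothing found, next block group */
    }
    /* ret == -ENOENT: fall through to unclustered allocation */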

Signed-off-by: Qu Wenruo 
Reviewed-by: Su Yue 
Reviewed-by: Josef Bacik 
---
 fs/btrfs/extent-tree.c | 245 -
 1 file changed, 122 insertions(+), 123 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7b25e039fe80..7b0c93cf280e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7290,6 +7290,116 @@ struct find_free_extent_ctl {
u64 found_offset;
 };
 
+
+/*
+ * Helper function for find_free_extent().
+ *
+ * Return -ENOENT to inform caller that we need fallback to unclustered mode.
+ * Return -EAGAIN to inform caller that we need to re-search this block group
+ * Return >0 to inform caller that we find nothing
+ * Return 0 means we have found a location and set ffe_ctl->found_offset.
+ */
+static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
+   struct btrfs_free_cluster *last_ptr,
+   struct find_free_extent_ctl *ffe_ctl,
+   struct btrfs_block_group_cache **cluster_bg_ret)
+{
+   struct btrfs_fs_info *fs_info = bg->fs_info;
+   struct btrfs_block_group_cache *cluster_bg;
+   u64 aligned_cluster;
+   u64 offset;
+   int ret;
+
+   cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
+   if (!cluster_bg)
+   goto refill_cluster;
+   if (cluster_bg != bg && (cluster_bg->ro ||
+   !block_group_bits(cluster_bg, ffe_ctl->flags)))
+   goto release_cluster;
+
+   offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
+   ffe_ctl->num_bytes, cluster_bg->key.objectid,
+   &ffe_ctl->max_extent_size);
+   if (offset) {
+   /* we have a block, we're done */
+   spin_unlock(&last_ptr->refill_lock);
+   trace_btrfs_reserve_extent_cluster(cluster_bg,
+   ffe_ctl->search_start, ffe_ctl->num_bytes);
+   *cluster_bg_ret = cluster_bg;
+   ffe_ctl->found_offset = offset;
+   return 0;
+   }
+   WARN_ON(last_ptr->block_group != cluster_bg);
+release_cluster:
+   /* If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
+* lets just skip it and let the allocator find whatever block it can
+* find. If we reach this point, we will have tried the cluster
+* allocator plenty of times and not have found anything, so we are
+* likely way too fragmented for the clustering stuff to find anything.
+*
+* However, if the cluster is taken from the current block group,
+* release the cluster first, so that we stand a better chance of
+* succeeding in the unclustered allocation.
+*/
+   if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
+   spin_unlock(&last_ptr->refill_lock);
+   btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
+   return -ENOENT;
+   }
+
+   /* This cluster didn't work out, free it and start over */
+   btrfs_return_cluster_to_free_space(NULL, last_ptr);
+
+   if (cluster_bg != bg)
+   btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
+
+refill_cluster:
+   if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
+   spin_unlock(&last_ptr->refill_lock);
+   return -ENOENT;
+   }
+
+   aligned_cluster = max_t(u64,
+   ffe_ctl->empty_cluster + ffe_ctl->empty_size,
+   bg->full_stripe_len);
+   ret = btrfs_find_space_cluster(fs_info, bg, last_ptr,
+   ffe_ctl->search_start, ffe_ctl->num_bytes,
+   aligned_cluster);
+   if (ret == 0) {
+   /* now pull our allocation out of this cluster */
+   offset = btrfs_alloc_from_cluster(bg, last_ptr,
+   ffe_ctl->num_bytes,
+   ffe_ctl->search_start,
+   &ffe_ctl->max_extent_size);
+   if (offset) {
+   /* we found one, proceed */
+   spin_unlock(&last_ptr->refill_lock);
+   trace_btrfs_reserve_extent_cluster(bg,
+   ffe_ctl->search_start,
+   ffe_ctl->num_bytes);
+   ffe_ctl->found_offset = offset;
+   return 0;
+   }
+   } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
+  !ffe_ctl->retry_clustered) {
+   

[PATCH v5 3/4] btrfs: Refactor unclustered extent allocation into find_free_extent_unclustered()

2018-11-01 Thread Qu Wenruo
This patch will extract the unclustered extent allocation code into
find_free_extent_unclustered().

This helper function will use its return value to indicate what to do
next.

This should make find_free_extent() a little easier to read.
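
The caller side then becomes a small dispatch on the return value, roughly
like this (sketch only; the real hunk is in the diff below):

    ret = find_free_extent_unclustered(block_group, last_ptr, &ffe_ctl);
    if (ret == -EAGAIN)
            goto have_block_group;  /* waited for caching, re-search this bg */
    else if (ret > 0)
            goto loop;              /* nothing usable here, next block group */
    /* ret == 0: ffe_ctl.found_offset is valid, go on to the result checks */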

Signed-off-by: Qu Wenruo 
Reviewed-by: Su Yue 
Reviewed-by: Josef Bacik 
[Update merge conflict with fb5c39d7a887 ("btrfs: don't use ctl->free_space for 
max_extent_size")]
---
 fs/btrfs/extent-tree.c | 114 -
 1 file changed, 68 insertions(+), 46 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7b0c93cf280e..2c9f00cb8f26 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7400,6 +7400,69 @@ static int find_free_extent_clustered(struct 
btrfs_block_group_cache *bg,
return 1;
 }
 
+/*
+ * Return >0 to inform caller that we find nothing
+ * Return 0 when we found a free extent and set ffe_ctl->found_offset
+ * Return -EAGAIN to inform caller that we need to re-search this block group
+ */
+static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
+   struct btrfs_free_cluster *last_ptr,
+   struct find_free_extent_ctl *ffe_ctl)
+{
+   u64 offset;
+
+   /*
+* We are doing an unclustered alloc, set the fragmented flag so we
+* don't bother trying to setup a cluster again until we get more space.
+*/
+   if (unlikely(last_ptr)) {
+   spin_lock(&last_ptr->lock);
+   last_ptr->fragmented = 1;
+   spin_unlock(&last_ptr->lock);
+   }
+   if (ffe_ctl->cached) {
+   struct btrfs_free_space_ctl *free_space_ctl;
+
+   free_space_ctl = bg->free_space_ctl;
+   spin_lock(&free_space_ctl->tree_lock);
+   if (free_space_ctl->free_space <
+   ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
+   ffe_ctl->empty_size) {
+   ffe_ctl->total_free_space = max_t(u64,
+   ffe_ctl->total_free_space,
+   free_space_ctl->free_space);
+   spin_unlock(&free_space_ctl->tree_lock);
+   return 1;
+   }
+   spin_unlock(&free_space_ctl->tree_lock);
+   }
+
+   offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
+   ffe_ctl->num_bytes, ffe_ctl->empty_size,
+   &ffe_ctl->max_extent_size);
+
+   /*
+* If we didn't find a chunk, and we haven't failed on this block group
+* before, and this block group is in the middle of caching and we are
+* ok with waiting, then go ahead and wait for progress to be made, and
+* set @retry_unclustered to true.
+*
+* If @retry_unclustered is true then we've already waited on this block
+* group once and should move on to the next block group.
+*/
+   if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
+   ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
+   wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
+   ffe_ctl->empty_size);
+   ffe_ctl->retry_unclustered = true;
+   return -EAGAIN;
+   } else if (!offset) {
+   return 1;
+   }
+   ffe_ctl->found_offset = offset;
+   return 0;
+}
+
 /*
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
@@ -7602,54 +7665,13 @@ static noinline int find_free_extent(struct 
btrfs_fs_info *fs_info,
/* ret == -ENOENT case falls through */
}
 
-   /*
-* We are doing an unclustered alloc, set the fragmented flag so
-* we don't bother trying to setup a cluster again until we get
-* more space.
-*/
-   if (unlikely(last_ptr)) {
-   spin_lock(&last_ptr->lock);
-   last_ptr->fragmented = 1;
-   spin_unlock(&last_ptr->lock);
-   }
-   if (ffe_ctl.cached) {
-   struct btrfs_free_space_ctl *ctl =
-   block_group->free_space_ctl;
-
-   spin_lock(&ctl->tree_lock);
-   if (ctl->free_space <
-   num_bytes + ffe_ctl.empty_cluster + empty_size) {
-   ffe_ctl.total_free_space = max(ctl->free_space,
-   ffe_ctl.total_free_space);
-   spin_unlock(&ctl->tree_lock);
-   goto loop;
-   }
-   spin_unlock(&ctl->tree_lock);
-   }
-
-   ffe_ctl.found_offset = btrfs_find_space_for_alloc(block_group,
-   ffe_ctl.search_start, 

[PATCH v5 0/4] btrfs: Refactor find_free_extent()

2018-11-01 Thread Qu Wenruo
Can be fetched from github:
https://github.com/adam900710/linux/tree/refactor_find_free_extent

Which is based on david's misc-4.20 branch.
The base head is:

commit 9084cb6a24bf5838a665af92ded1af8363f9e563 (david/misc-4.20)
Author: Filipe Manana 
Date:   Mon Oct 22 10:43:06 2018 +0100

Btrfs: fix use-after-free when dumping free space


extent-tree.c::find_free_extent() could be one of the most
ill-structured functions: it has at least 6 non-exit labels and jumps
between them.

Refactor it into 4 parts:

1) find_free_extent()
   The main entrance, does the main work of block group iteration and
   block group selection.
   Now this function neither cares about nor handles the free extent
   search itself.

2) find_free_extent_clustered()
   Do clustered free extent search.
   May try to build/re-fill cluster.

3) find_free_extent_unclustered()
   Do unclustered free extent search.
   May try to fill free space cache.

4) find_free_extent_update_loop()
   Do the loop based black magic.
   May allocate new chunk.

With this patchset, find_free_extent() should at least be a little
easier to read, and it provides the basis for later work on this function.

The current refactor tries not to touch the original functionality, thus
the helper structure find_free_extent_ctl still contains a lot of
unrelated members.
But it should not change how find_free_extent() works at all.

changelog:
v2:
  Split into 4 patches.
  Minor comment newline change.
v3:
  Mostly cosmetic update.
  Rebased to v4.19-rc1
  Rename find_free_extent_ctrl to find_free_extent_ctl to keep the
  naming scheme the same.
  Fix some comment spelling error.
  Enhance comment for find_free_extent_unclustered().
  Add reviewed-by tag.
v4:
  Move the (ins->objectid) check to proper location of the last patch,
  so all (!ins->objectid) branches are in the same code block.
  Add reviewed-by tags from Josef.
v5:
  Fix all warnings reported by checkpatch.pl
  Rebase to misc-4.20.
  Solve conflicts with fb5c39d7a887 ("btrfs: don't use ctl->free_space for 
max_extent_size").
  This results in:
  * New member find_free_extent_ctl::total_free_space
  * New comment for find_free_extent_ctl::total_free_space
  * New comment for falling back to total_free_space if we can't find
valid find_free_extent_ctl::max_extent_size.

Qu Wenruo (4):
  btrfs: Introduce find_free_extent_ctl structure for later rework
  btrfs: Refactor clustered extent allocation into
find_free_extent_clustered()
  btrfs: Refactor unclustered extent allocation into
find_free_extent_unclustered()
  btrfs: Refactor find_free_extent() loops update into
find_free_extent_update_loop()

 fs/btrfs/extent-tree.c | 715 -
 1 file changed, 414 insertions(+), 301 deletions(-)

-- 
2.19.1



[PATCH v5 1/4] btrfs: Introduce find_free_extent_ctl structure for later rework

2018-11-01 Thread Qu Wenruo
Instead of using tons of different local variables in find_free_extent(),
extract them into the find_free_extent_ctl structure, and add better
explanations for them.

Some modifications may look redundant, but they will later greatly simplify
the function parameter lists during the find_free_extent() refactor.

Also add two comments to co-operate with fb5c39d7a887 ("btrfs: don't use
ctl->free_space for max_extent_size"), to make ffe_ctl->max_extent_size
update more reader-friendly.
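
To illustrate the point about parameter lists: without the wrapper a helper
would need a long tail of scalars. The first signature below is hypothetical,
the second is the one the following patches actually use:

    /* hypothetical, without the control structure */
    static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
                                          struct btrfs_free_cluster *last_ptr,
                                          u64 num_bytes, u64 empty_size,
                                          u64 empty_cluster, u64 *search_start,
                                          u64 *max_extent_size, u64 *found_offset);

    /* with find_free_extent_ctl, as done in patch 2 */
    static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
                                          struct btrfs_free_cluster *last_ptr,
                                          struct find_free_extent_ctl *ffe_ctl,
                                          struct btrfs_block_group_cache **cluster_bg_ret);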

Signed-off-by: Qu Wenruo 
Reviewed-by: Su Yue 
Reviewed-by: Josef Bacik 
[Updated with conflicts with fb5c39d7a887 ("btrfs: don't use ctl->free_space 
for max_extent_size")]
---
 fs/btrfs/extent-tree.c | 265 ++---
 1 file changed, 170 insertions(+), 95 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a1febf155747..7b25e039fe80 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7238,6 +7238,58 @@ btrfs_release_block_group(struct btrfs_block_group_cache 
*cache,
btrfs_put_block_group(cache);
 }
 
+/*
+ * Internal used structure for find_free_extent() function.
+ * Wraps needed parameters.
+ */
+struct find_free_extent_ctl {
+   /* Basic allocation info */
+   u64 ram_bytes;
+   u64 num_bytes;
+   u64 empty_size;
+   u64 flags;
+   int delalloc;
+
+   /* Where to start the search inside the bg */
+   u64 search_start;
+
+   /* For clustered allocation */
+   u64 empty_cluster;
+
+   bool have_caching_bg;
+   bool orig_have_caching_bg;
+
+   /* RAID index, converted from flags */
+   int index;
+
+   /* Current loop number */
+   int loop;
+
+   /*
+* Whether we're refilling a cluster, if true we need to re-search
+* current block group but don't try to refill the cluster again.
+*/
+   bool retry_clustered;
+
+   /*
+* Whether we're updating free space cache, if true we need to re-search
+* current block group but don't try updating free space cache again.
+*/
+   bool retry_unclustered;
+
+   /* If current block group is cached */
+   int cached;
+
+   /* Max contiguous hole found */
+   u64 max_extent_size;
+
+   /* Total free space from free space cache, not always contiguous */
+   u64 total_free_space;
+
+   /* Found result */
+   u64 found_offset;
+};
+
 /*
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
@@ -7258,21 +7310,26 @@ static noinline int find_free_extent(struct 
btrfs_fs_info *fs_info,
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
-   u64 search_start = 0;
-   u64 max_extent_size = 0;
-   u64 max_free_space = 0;
-   u64 empty_cluster = 0;
+   struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
-   int loop = 0;
-   int index = btrfs_bg_flags_to_raid_index(flags);
-   bool failed_cluster_refill = false;
-   bool failed_alloc = false;
bool use_cluster = true;
-   bool have_caching_bg = false;
-   bool orig_have_caching_bg = false;
bool full_search = false;
 
WARN_ON(num_bytes < fs_info->sectorsize);
+
+   ffe_ctl.ram_bytes = ram_bytes;
+   ffe_ctl.num_bytes = num_bytes;
+   ffe_ctl.empty_size = empty_size;
+   ffe_ctl.flags = flags;
+   ffe_ctl.search_start = 0;
+   ffe_ctl.retry_clustered = false;
+   ffe_ctl.retry_unclustered = false;
+   ffe_ctl.delalloc = delalloc;
+   ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
+   ffe_ctl.have_caching_bg = false;
+   ffe_ctl.orig_have_caching_bg = false;
+   ffe_ctl.found_offset = 0;
+
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
@@ -7308,7 +7365,8 @@ static noinline int find_free_extent(struct btrfs_fs_info 
*fs_info,
spin_unlock(&space_info->lock);
}
 
-   last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
+   last_ptr = fetch_cluster_info(fs_info, space_info,
+ &ffe_ctl.empty_cluster);
if (last_ptr) {
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
@@ -7325,10 +7383,12 @@ static noinline int find_free_extent(struct 
btrfs_fs_info *fs_info,
spin_unlock(&last_ptr->lock);
}
 
-   search_start = max(search_start, first_logical_byte(fs_info, 0));
-   search_start = max(search_start, hint_byte);
-   if (search_start == hint_byte) {
-   block_group = btrfs_lookup_block_group(fs_info, search_start);
+   ffe_ctl.search_start = max(ffe_ctl.search_start,
+  first_logical_byte(fs_info, 0));
+   ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
+   if 

Re: btrfs partition is broken, cannot restore anything

2018-11-01 Thread Qu Wenruo


On 2018/11/2 4:40 AM, Attila Vangel wrote:
> Hi,
> 
> Somehow my btrfs partition got broken. I use Arch, so my kernel is
> quite new (4.18.x).
> I don't remember exactly the sequence of events. At some point it was
> accessible in read-only, but unfortunately I did not take backup
> immediately. dmesg log from that time:
> 
> [ 62.602388] BTRFS warning (device nvme0n1p2): block group
> 103923318784 has wrong amount of free space
> [ 62.602390] BTRFS warning (device nvme0n1p2): failed to load free
> space cache for block group 103923318784, rebuilding it now
> [ 108.039188] BTRFS error (device nvme0n1p2): bad tree block start 0 
> 18812026880
> [ 108.039227] BTRFS: error (device nvme0n1p2) in
> __btrfs_free_extent:7010: errno=-5 IO failure

Normally we shouldn't call __btrfs_free_extent() during mount.
It looks like it's caused by log tree replay.

You could try to mount the fs with -o ro,nologreplay to see if you could
still mount the fs RO without log replay.

If it goes well, at least you could access the files.
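
For example (device name taken from your dmesg output, adjust the mount
point as needed):

  mount -o ro,nologreplay /dev/nvme0n1p2 /mnt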

> [ 108.039241] BTRFS info (device nvme0n1p2): forced readonly
> [ 108.039250] BTRFS: error (device nvme0n1p2) in
> btrfs_run_delayed_refs:3076: errno=-5 IO failure
> 
> At the next reboot it failed to mount. Problem may have been that at
> some point I booted to another distro with older kernel (4.15.x,
> 4.14.52) and unfortunately attempted some checks/repairs (?) e.g. from
> gparted, and at that time I did not know it could be destructive.>
> Anyway, currently it fails to mount (even with ro and/or recovery),
> btrfs check results in "checksum verify failed" and "bad tree block"
> errors,

Full btrfs check result please.

> btrfs restore resulted in "We have looped trying to restore
> files in" errors for a dozen of paths then exit.

So at least btrfs-restore is working.

This normally means the fs is not completely corrupted, and essential
trees are at least OK.

Still need that "btrfs check" output (along with "btrfs check
--mode=lowmem") to determine what to do.

Thanks,
Qu
> 
> Is there some hope to save data from the filesystem, and if so, how?
> 
> BTW I checked some diagnostics commands regarding my SSD with the nvme
> client and from that it seems there are no hardware problems.
> 
> Your help is highly appreciated.
> 
> Cheers,
> Attila
> 





Re: [PATCH v4 0/4] btrfs: Refactor find_free_extent()

2018-11-01 Thread Qu Wenruo


On 2018/11/2 2:54 AM, David Sterba wrote:
> On Wed, Oct 17, 2018 at 02:56:02PM +0800, Qu Wenruo wrote:
>> Can be fetched from github:
>> https://github.com/adam900710/linux/tree/refactor_find_free_extent
> 
> Can you please rebase it again, on top of current misc-4.20? Ie. the 2nd
> pull request. There were some fixes to the space infos, a new variable
> added to find_free_extent (conflict in the 1st patch) that was easy to
> fix, but further patches move code around and it was not a simple copy
> as the variable would probably need to be added to the free space ctl.
> 
> This is a change beyond the level I'm confident doing during rebases and
> don't want to introduce a bug to your code. This should be the last
> rebase of this series. Things that are ready will go from topic branches
> to misc-next after rc1 is out. Thanks for the help.

No problem at all.

Will sent out soon.

Thanks,
Qu





btrfs partition is broken, cannot restore anything

2018-11-01 Thread Attila Vangel
Hi,

Somehow my btrfs partition got broken. I use Arch, so my kernel is
quite new (4.18.x).
I don't remember exactly the sequence of events. At some point it was
accessible in read-only, but unfortunately I did not take backup
immediately. dmesg log from that time:

[ 62.602388] BTRFS warning (device nvme0n1p2): block group
103923318784 has wrong amount of free space
[ 62.602390] BTRFS warning (device nvme0n1p2): failed to load free
space cache for block group 103923318784, rebuilding it now
[ 108.039188] BTRFS error (device nvme0n1p2): bad tree block start 0 18812026880
[ 108.039227] BTRFS: error (device nvme0n1p2) in
__btrfs_free_extent:7010: errno=-5 IO failure
[ 108.039241] BTRFS info (device nvme0n1p2): forced readonly
[ 108.039250] BTRFS: error (device nvme0n1p2) in
btrfs_run_delayed_refs:3076: errno=-5 IO failure

At the next reboot it failed to mount. The problem may have been that at
some point I booted into another distro with an older kernel (4.15.x,
4.14.52) and unfortunately attempted some checks/repairs (?), e.g. from
gparted, and at that time I did not know it could be destructive.

Anyway, currently it fails to mount (even with ro and/or recovery),
btrfs check results in "checksum verify failed" and "bad tree block"
errors, and btrfs restore resulted in "We have looped trying to restore
files in" errors for a dozen paths and then exited.

Is there some hope to save data from the filesystem, and if so, how?

BTW I ran some diagnostic commands on my SSD with the nvme
client, and from that it seems there are no hardware problems.

Your help is highly appreciated.

Cheers,
Attila


Re: [PATCH 8/8] btrfs: Remove extent_io_ops::split_extent_hook callback

2018-11-01 Thread Josef Bacik
On Thu, Nov 01, 2018 at 02:09:53PM +0200, Nikolay Borisov wrote:
> This is the counterpart to merge_extent_hook, similarly, it's used only
> for data/freespace inodes so let's remove it, rename it and call it
> directly where necessary. No functional changes.
> 
> Signed-off-by: Nikolay Borisov 
> ---

Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 7/8] btrfs: Remove extent_io_ops::merge_extent_hook callback

2018-11-01 Thread Josef Bacik
On Thu, Nov 01, 2018 at 02:09:52PM +0200, Nikolay Borisov wrote:
> This callback is used only for data and free space inodes. Such inodes
> are guaranteed to have their extent_io_tree::private_data set to the
> inode struct. Exploit this fact to directly call the function. Also
> give it a more descriptive name. No functional changes.
> 
> Signed-off-by: Nikolay Borisov 
> ---

Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 6/8] btrfs: Remove extent_io_ops::clear_bit_hook callback

2018-11-01 Thread Josef Bacik
On Thu, Nov 01, 2018 at 02:09:51PM +0200, Nikolay Borisov wrote:
> This is the counterpart to ex-set_bit_hook (now btrfs_set_delalloc_extent);
> similarly to what was done before, remove clear_bit_hook and rename the
> function. No functional changes.
> 
> Signed-off-by: Nikolay Borisov 

Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 5/8] btrfs: Remove extent_io_ops::set_bit_hook extent_io callback

2018-11-01 Thread Josef Bacik
On Thu, Nov 01, 2018 at 02:09:50PM +0200, Nikolay Borisov wrote:
> This callback is used to properly account delalloc extents for
> data inodes (ordinary file inodes and freespace v1 inodes). Those can
> be easily identified since they have their extent_io trees
> ->private_data member point to the inode. Let's exploit this fact to
> remove the needless indirection through extent_io_hooks and directly
> call the function. Also give the function a name which reflects its
> purpose - btrfs_set_delalloc_extent.
> 
> This patch also modified test_find_delalloc so that the extent_io_tree
> used for testing doesn't have its ->private_data set which would have
> caused a crash in btrfs_set_delalloc_extent due to the
> btrfs_inode->root member not being initialised. The old version of the
> code also didn't call set_bit_hook since the extent_io ops weren't set
> for the inode.  No functional changes.
> 
> Signed-off-by: Nikolay Borisov 

Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 4/8] btrfs: Remove extent_io_ops::check_extent_io_range callback

2018-11-01 Thread Josef Bacik
On Thu, Nov 01, 2018 at 02:09:49PM +0200, Nikolay Borisov wrote:
> This callback was only used in debug builds by btrfs_leak_debug_check.
> A better approach is to move its implementation in
> btrfs_leak_debug_check and ensure the latter is only executed for
> extent tree which have ->private_data set i.e. relate to a data node and
> not the btree one. No functional changes.
> 
> Signed-off-by: Nikolay Borisov 

Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 3/8] btrfs: Remove extent_io_ops::writepage_end_io_hook

2018-11-01 Thread Josef Bacik
On Thu, Nov 01, 2018 at 02:09:48PM +0200, Nikolay Borisov wrote:
> This callback is only ever called for data page writeout so
> there is no need to actually abstract it via extent_io_ops. Let's just
> export it, remove the definition of the callback and call it directly
> in the functions that invoke the callback. Also rename the function to
> btrfs_writepage_endio_finish_ordered since what it really does is
> account finished io in the ordered extent data structures.
> No functional changes.
> 
> Signed-off-by: Nikolay Borisov 

Could send another cleanup patch to remove the struct extent_state *state from
the arg list as well.

Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 2/8] btrfs: Remove extent_io_ops::writepage_start_hook

2018-11-01 Thread Josef Bacik
On Thu, Nov 01, 2018 at 02:09:47PM +0200, Nikolay Borisov wrote:
> This hook is called only from __extent_writepage_io which is already
> called only from the data page writeout path. So there is no need to
> make an indirect call via extent_io_ops. This patch just removes the
> callback definition, exports the callback function and calls it
> directly at the only call site. Also give the function a more descriptive
> name. No functional changes.
> 
> Signed-off-by: Nikolay Borisov 

Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 1/8] btrfs: Remove extent_io_ops::fill_delalloc

2018-11-01 Thread Josef Bacik
On Thu, Nov 01, 2018 at 02:09:46PM +0200, Nikolay Borisov wrote:
> This callback is called only from writepage_delalloc which in turn
> is guaranteed to be called from the data page writeout path. In the end
> there is no reason to have the call to this function indirected
> via the extent_io_ops structure. This patch removes the callback
> definition, exports the function and calls it directly. No functional
> changes.
> 
> Signed-off-by: Nikolay Borisov 

Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH v4 0/4] btrfs: Refactor find_free_extent()

2018-11-01 Thread David Sterba
On Wed, Oct 17, 2018 at 02:56:02PM +0800, Qu Wenruo wrote:
> Can be fetched from github:
> https://github.com/adam900710/linux/tree/refactor_find_free_extent

Can you please rebase it again, on top of current misc-4.20? Ie. the 2nd
pull request. There were some fixes to the space infos, a new variable
added to find_free_extent (conflict in the 1st patch) that was easy to
fix, but further patches move code around and it was not a simple copy
as the variable would probably need to be added to the free space ctl.

This is a change beyond the level I'm confident doing during rebases and
don't want to introduce a bug to your code. This should be the last
rebase of this series. Things that are ready will go from topic branches
to misc-next after rc1 is out. Thanks for the help.


Re: [PATCH] btrfs: use tagged writepage to mitigate livelock of snapshot

2018-11-01 Thread David Sterba
On Thu, Nov 01, 2018 at 02:49:03PM +0800, Ethan Lien wrote:
> Snapshot is expected to be fast. But if there are writers steadily
> create dirty pages in our subvolume, the snapshot may take a very long
> time to complete. To fix the problem, we use tagged writepage for snapshot
> flusher as we do in the generic write_cache_pages(), so we can omit pages
> dirtied after the snapshot command.
> 
> We do a simple snapshot speed test on an Intel D-1531 box:
> 
> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
> --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
> 
> original: 1m58sec
> patched:  6.54sec
> 
> This is the best case for this patch since for a sequential write case,
> we omit nearly all pages dirtied after the snapshot command.
> 
> For a multi-writer, random write test:
> 
> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
> --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
> 
> original: 15.83sec
> patched:  10.35sec
> 
> The improvement is less compared with the sequential write case, since
> we omit only half of the pages dirtied after snapshot command.
> 
> Signed-off-by: Ethan Lien 

This looks nice, thanks. I agree with the suggestions from Nikolay,
please update and resend.

I was a bit curious about the 'livelock'; what you describe does not seem
to be one. A system under heavy IO can make the snapshot dead slow but can
recover from that once the IO stops.

Regarding the sync semantics, there's AFAIK no change to the current
state where the sync is done before the snapshot but without further
guarantees. From that point I think it's safe to select only a subset of
pages and make things faster.

As the requested changes are not functional I'll add the patch to
for-next for testing.
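
For readers not familiar with the mechanism: the tagging trick borrowed from
write_cache_pages() looks roughly like the sketch below. Pages that are dirty
at the start are re-tagged TOWRITE and only those are written back, so pages
dirtied afterwards are ignored by this flush. This is a sketch of the generic
pattern, not the btrfs hunk; the patch wires an equivalent tag choice into
btrfs' extent_write_cache_pages().

    int tag;

    if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
            /* freeze the set of pages to write: dirty -> towrite */
            tag_pages_for_writeback(mapping, start_index, end_index);
            tag = PAGECACHE_TAG_TOWRITE;
    } else {
            tag = PAGECACHE_TAG_DIRTY;
    }

    while (pagevec_lookup_range_tag(&pvec, mapping, &index, end_index, tag)) {
            /* write back only the pages tagged above; pages dirtied
             * after the tagging pass are simply not visited */
    }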


Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread Chris Mason
On 1 Nov 2018, at 12:00, Omar Sandoval wrote:

> On Thu, Nov 01, 2018 at 04:29:48PM +0100, David Sterba wrote:
>>>
>>> How is that? kthread_stop() frees the task struct, so 
>>> wake_up_process()
>>> would be a use-after-free.
>>
>> That was an assumption for the demonstration purposes, the wording 
>> was
>> confusing sorry.
>
> Oh, well in that case, that's exactly what kthread_park() is ;) Stop 
> the
> thread and make wake_up a noop, and then we don't need to add special
> cases everywhere else.

This is how Omar won me over to kthread_park().  I think Dave's 
alternatives can be made correct, but Omar's way solves a whole class of 
potential problems.  The park() makes the thread safely ignore future 
work, and we won't have to chase down weird bugs where people are 
sending future work at the wrong time.

-chris
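
For context, the pattern being advocated is roughly the following (a generic
sketch, not btrfs code; cleaner_worker, do_some_cleaning and cleaner_task are
made-up names). Once kthread_park() returns, the thread is sitting in
kthread_parkme() and cannot be in the middle of doing work, and a stray
wake_up_process() won't make it resume:

    static int cleaner_worker(void *arg)
    {
            while (!kthread_should_stop()) {
                    if (kthread_should_park()) {
                            kthread_parkme();       /* wait until unparked or stopped */
                            continue;               /* re-check stop/park state */
                    }
                    do_some_cleaning(arg);          /* hypothetical work item */
                    schedule_timeout_interruptible(HZ);
            }
            return 0;
    }

    /* shutdown side */
    kthread_park(cleaner_task);     /* returns only once the thread is parked */
    /* ... safely pick up or finish any remaining work here ... */
    kthread_stop(cleaner_task);     /* also fine on a parked thread */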


Re: [PATCH] Btrfs: fix missing data checksums after a ranged fsync (msync)

2018-11-01 Thread David Sterba
On Mon, Oct 29, 2018 at 09:42:06AM +, fdman...@kernel.org wrote:
> From: Filipe Manana 
> 
> Recently we got a massive simplification for fsync, where for the fast
> path we no longer log new extents while their respective ordered extents
> are still running.
> 
> However that simplification introduced a subtle regression for the case
> where we use a ranged fsync (msync). Consider the following example:
> 
>CPU 0CPU 1
> 
> mmap write to range [2Mb, 4Mb[
>   mmap write to range [512Kb, 1Mb[
>   msync range [512K, 1Mb[
> --> triggers fast fsync
> (BTRFS_INODE_NEEDS_FULL_SYNC
>  not set)
> --> creates extent map A for this
> range and adds it to list of
> modified extents
> --> starts ordered extent A for
> this range
> --> waits for it to complete
> 
> writeback triggered for range
> [2Mb, 4Mb[
>   --> create extent map B and
>   adds it to the list of
>   modified extents
>   --> creates ordered extent B
> 
> --> start looking for and logging
> modified extents
> --> logs extent maps A and B
> --> finds checksums for extent A
> in the csum tree, but not for
> extent B
>   fsync (msync) finishes
> 
>   --> ordered extent B
>   finishes and its
>   checksums are added
>   to the csum tree
> 
> 
> 
> After replaying the log, we have the extent covering the range [2Mb, 4Mb[
> but do not have the data checksum items covering that file range.
> 
> This happens because at the very beginning of an fsync (btrfs_sync_file())
> we start and wait for IO in the given range [512Kb, 1Mb[ and therefore
> wait for any ordered extents in that range to complete before we start
> logging the extents. However if right before we start logging the extent
> in our range [512Kb, 1Mb[, writeback is started for any other dirty range,
> such as the range [2Mb, 4Mb[ due to memory pressure or a concurrent fsync
> or msync (btrfs_sync_file() starts writeback before acquiring the inode's
> lock), an ordered extent is created for that other range and a new extent
> map is created to represent that range and added to the inode's list of
> modified extents.
> 
> That means that we will see that other extent in that list when collecting
> extents for logging (done at btrfs_log_changed_extents()) and log the
> extent before the respective ordered extent finishes - namely before the
> checksum items are added to the checksums tree, which is where
> log_extent_csums() looks for the checksums, therefore making us log an
> extent without logging its checksums. Before that massive simplification
> of fsync, this wasn't a problem because besides looking for checksums in
> the checksums tree, we also looked for them in any ordered extent still
> running.
> 
> The consequence of data checksums missing for a file range is that users
> attempting to read the affected file range will get -EIO errors and dmesg
> reports the following:
> 
>  [10188.358136] BTRFS info (device sdc): no csum found for inode 297 start 
> 57344
>  [10188.359278] BTRFS warning (device sdc): csum failed root 5 ino 297 off 
> 57344 csum 0x98f94189 expected csum 0x mirror 1
> 
> So fix this by skipping any extents outside of our logging range at
> btrfs_log_changed_extents() and leaving them on the list of modified
> extents so that any subsequent ranged fsync may collect them if needed.
> Also, if we find a hole extent outside of the range still log it, just
> to prevent having gaps between extent items after replaying the log,
> otherwise fsck will complain when we are not using the NO_HOLES feature
> (fstest btrfs/056 triggers such case).
> 
> Fixes: e7175a692765 ("btrfs: remove the wait ordered logic in the 
> log_one_extent path")
> CC: sta...@vger.kernel.org # 4.19+
> Signed-off-by: Filipe Manana 

Thanks, added to misc-next and on the way to 4.20.
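
The shape of the fix, for readers following along (a sketch of the idea with
made-up variable names, not the applied hunk; the real change lives in
btrfs_log_changed_extents()):

    list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
            const bool is_hole = (em->block_start == EXTENT_MAP_HOLE);

            /* Outside the fsync range [start, end) and not a hole: leave it
             * on the modified list so a later (ranged) fsync picks it up,
             * instead of logging it before its ordered extent finishes. */
            if (!is_hole &&
                (em->start >= end || em->start + em->len <= start))
                    continue;

            /* in range, or a hole: log it as before */
    }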


Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread David Sterba
On Thu, Nov 01, 2018 at 06:50:03PM +0200, Nikolay Borisov wrote:
> 
> 
> On 1.11.18 г. 18:44 ч., David Sterba wrote:
> > On Thu, Nov 01, 2018 at 09:00:56AM -0700, Omar Sandoval wrote:
> >>> That was an assumption for the demonstration purposes, the wording was
> >>> confusing sorry.
> >>
> >> Oh, well in that case, that's exactly what kthread_park() is ;) Stop the
> >> thread and make wake_up a noop, and then we don't need to add special
> >> cases everywhere else.
> > 
> > The end result is equivalent and should be, I'm now weighing both
> > approaches. Explicit state tracking outside of the thread structures
> > seems more clear and in case we want to query the status, we can't after
> > kthread_stop is called. My current version below.
> > 
> 
> 
> 
> > +static inline void btrfs_wake_up_transaction(struct btrfs_fs_info *fs_info)
> > +{
> > +   if (!test_bit(BTRFS_FS_CLOSING_SINGLE, &fs_info->flags)) {
> 
> Why do we need a separate flag for a predicate? If we use
> FS_CLOSING_START then waking up becomes a noop from the moment
> close_ctree is called.

Yeah it's not strictly necessary, it's a leftover from when I had a
WARN_ON in the wakeup helpers that triggered too often so another flag
was added just after the threads were stopped.


Re: [PATCH RFC RESEND] btrfs: harden against duplicate fsid

2018-11-01 Thread David Sterba
On Mon, Oct 15, 2018 at 10:45:17AM +0800, Anand Jain wrote:
> (Thanks for the comments on requiring to warn_on if we fail the device 
> change.)
> (This fixes an ugly bug, I appreciate if you have any further comments).
> 
> It's not impossible to imagine that a device OR a btrfs image is
> copied just by using the dd or the cp command, in which case both
> copies of the btrfs will have the same fsid. If automount is enabled
> on the system, the copied FS gets scanned.
> 
> We have a known bug in btrfs, that we let the device path be changed
> after the device has been mounted. So using this loophole the newly
> copied device would appear as if it is mounted immediately after it
> has been copied.
> 
> For example:
> 
> Initially.. /dev/mmcblk0p4 is mounted as /
> 
> lsblk
> NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
> mmcblk0 179:00 29.2G  0 disk
> |-mmcblk0p4 179:404G  0 part /
> |-mmcblk0p2 179:20  500M  0 part /boot
> |-mmcblk0p3 179:30  256M  0 part [SWAP]
> `-mmcblk0p1 179:10  256M  0 part /boot/efi
> 
> btrfs fi show
>Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
>Total devices 1 FS bytes used 1.40GiB
>devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4
> 
> Copy mmcblk0 to sda
>dd if=/dev/mmcblk0 of=/dev/sda
> 
> And immediately after the copy completes, the change in the device
> superblock is noticed, automount scans it using
> btrfs device scan, and the new device sda becomes the mounted root
> device.
> 
> lsblk
> NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
> sda   8:01 14.9G  0 disk
> |-sda48:414G  0 part /
> |-sda28:21  500M  0 part
> |-sda38:31  256M  0 part
> `-sda18:11  256M  0 part
> mmcblk0 179:00 29.2G  0 disk
> |-mmcblk0p4 179:404G  0 part
> |-mmcblk0p2 179:20  500M  0 part /boot
> |-mmcblk0p3 179:30  256M  0 part [SWAP]
> `-mmcblk0p1 179:10  256M  0 part /boot/efi
> 
> btrfs fi show /
>  Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
>  Total devices 1 FS bytes used 1.40GiB
>  devid1 size 4.00GiB used 3.00GiB path /dev/sda4
> 
> The bug is quite nasty in that you can't unmount either /dev/sda4 or
> /dev/mmcblk0p4. And the problem does not get solved until you take
> sda out of the system to another system to change its fsid
> using the 'btrfstune -u' command.
> 
> Signed-off-by: Anand Jain 

The fix for that case looks ok to me, block devices with duplicate UUIDs
are problematic. I'll add the patch to for-next, thanks.
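
For anyone who ends up in the situation described above, the manual recovery
Anand mentions would look roughly like this (assuming the copy is /dev/sda as
in the example; the copy must be unmounted, ideally with the original device
not attached):

  btrfstune -u /dev/sda4      # give the copied filesystem a new random fsid
  btrfs device scan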


Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread Nikolay Borisov



On 1.11.18 г. 18:44 ч., David Sterba wrote:
> On Thu, Nov 01, 2018 at 09:00:56AM -0700, Omar Sandoval wrote:
>>> That was an assumption for the demonstration purposes, the wording was
>>> confusing sorry.
>>
>> Oh, well in that case, that's exactly what kthread_park() is ;) Stop the
>> thread and make wake_up a noop, and then we don't need to add special
>> cases everywhere else.
> 
> The end result is equivalent and should be, I'm now weighing both
> approaches. Explicit state tracking outside of the thread structures
> seems more clear and in case we want to query the status, we can't after
> kthread_stop is called. My current version below.
> 



> +static inline void btrfs_wake_up_transaction(struct btrfs_fs_info *fs_info)
> +{
> + if (!test_bit(BTRFS_FS_CLOSING_SINGLE, &fs_info->flags)) {

Why do we need a separate flag for a predicate? If we use
FS_CLOSING_START then waking up becomes a noop from the moment
close_ctree is called.

> + wake_up_process(fs_info->transaction_kthread);
> + } else {
> + btrfs_debug(fs_info,
> + "attempt to wake up transaction kthread during 
> shutdown");
> + }
> +}
> +
>  /*
>   * Try to commit transaction asynchronously, so this is safe to call
>   * even holding a spinlock.
> @@ -210,7 +229,7 @@ static inline void btrfs_commit_transaction_locksafe(
>   struct btrfs_fs_info *fs_info)
>  {
> + set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
> - wake_up_process(fs_info->transaction_kthread);
> + btrfs_wake_up_transaction(fs_info);
>  }
>  int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
>  int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
> 


Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread David Sterba
On Thu, Nov 01, 2018 at 09:00:56AM -0700, Omar Sandoval wrote:
> > That was an assumption for the demonstration purposes, the wording was
> > confusing sorry.
> 
> Oh, well in that case, that's exactly what kthread_park() is ;) Stop the
> thread and make wake_up a noop, and then we don't need to add special
> cases everywhere else.

The end result is equivalent and should be, I'm now weighing both
approaches. Explicit state tracking outside of the thread structures
seems more clear and in case we want to query the status, we can't after
kthread_stop is called. My current version below.

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 68ca41dbbef3..cc076ae45ced 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -715,6 +715,7 @@ struct btrfs_delayed_root;
 #define BTRFS_FS_BARRIER   1
 #define BTRFS_FS_CLOSING_START 2
 #define BTRFS_FS_CLOSING_DONE  3
+#define BTRFS_FS_CLOSING_SINGLE19
 #define BTRFS_FS_LOG_RECOVERING4
 #define BTRFS_FS_OPEN  5
 #define BTRFS_FS_QUOTA_ENABLED 6
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0ab41da91d1..570096fb3f35 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1801,7 +1801,7 @@ static int transaction_kthread(void *arg)
btrfs_end_transaction(trans);
}
 sleep:
-   wake_up_process(fs_info->cleaner_kthread);
+   btrfs_wake_up_cleaner(fs_info);
mutex_unlock(&fs_info->transaction_kthread_mutex);
 
if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
@@ -3914,7 +3914,7 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
mutex_lock(&fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
-   wake_up_process(fs_info->cleaner_kthread);
+   btrfs_wake_up_cleaner(fs_info);
 
/* wait until ongoing cleanup work done */
down_write(&fs_info->cleanup_work_sem);
@@ -3956,11 +3956,24 @@ void close_ctree(struct btrfs_fs_info *fs_info)
 
cancel_work_sync(&fs_info->async_reclaim_work);
 
+   if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
+   test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
+   btrfs_error_commit_super(fs_info);
+
+   kthread_stop(fs_info->transaction_kthread);
+   kthread_stop(fs_info->cleaner_kthread);
+
+   /*
+* btrfs_delete_unused_bgs or btrfs_commit_super can still try to wake
+* up the threads but will become a no-op
+*/
+   set_bit(BTRFS_FS_CLOSING_SINGLE, &fs_info->flags);
+
if (!sb_rdonly(fs_info->sb)) {
/*
-* If the cleaner thread is stopped and there are
-* block groups queued for removal, the deletion will be
-* skipped when we quit the cleaner thread.
+* The cleaner thread is now stopped and if there are block
+* groups queued for removal, we have to pick up the work here
+* so there are no delayed iputs triggered.
 */
btrfs_delete_unused_bgs(fs_info);
 
@@ -3969,13 +3982,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
 
-   if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
-   test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
-   btrfs_error_commit_super(fs_info);
-
-   kthread_stop(fs_info->transaction_kthread);
-   kthread_stop(fs_info->cleaner_kthread);
-
ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a990a9045139..a5786b6e8d37 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -5915,7 +5915,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 * namely it pokes the cleaner kthread that will start
 * processing uncleaned subvols.
 */
-   wake_up_process(fs_info->transaction_kthread);
+   btrfs_wake_up_transaction(fs_info);
return ret;
}
case BTRFS_IOC_START_SYNC:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b362b45dd757..9e73e35b94ea 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1896,7 +1896,7 @@ static int btrfs_remount(struct super_block *sb, int 
*flags, char *data)
set_bit(BTRFS_FS_OPEN, _info->flags);
}
 out:
-   wake_up_process(fs_info->transaction_kthread);
+   btrfs_wake_up_transaction(fs_info);
btrfs_remount_cleanup(fs_info, old_opts);
return 0;
 
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 3717c864ba23..d00311dc76d2 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -154,7 +154,7 @@ static ssize_t btrfs_feature_attr_store(struct kobject 
*kobj,
   

Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread Omar Sandoval
On Thu, Nov 01, 2018 at 04:29:48PM +0100, David Sterba wrote:
> On Thu, Nov 01, 2018 at 08:24:25AM -0700, Omar Sandoval wrote:
> > On Thu, Nov 01, 2018 at 04:22:29PM +0100, David Sterba wrote:
> > > On Thu, Nov 01, 2018 at 04:08:32PM +0100, David Sterba wrote:
> > > > On Thu, Nov 01, 2018 at 01:31:18PM +, Chris Mason wrote:
> > > > > On 1 Nov 2018, at 6:15, David Sterba wrote:
> > > > > 
> > > > > > On Wed, Oct 31, 2018 at 10:06:08AM -0700, Omar Sandoval wrote:
> > > > > >> From: Omar Sandoval 
> > > > > >>
> > > > > >> There's a race between close_ctree() and cleaner_kthread().
> > > > > >> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when 
> > > > > >> it
> > > > > >> sees it set, but this is racy; the cleaner might have already 
> > > > > >> checked
> > > > > >> the bit and could be cleaning stuff. In particular, if it deletes 
> > > > > >> unused
> > > > > >> block groups, it will create delayed iputs for the free space cache
> > > > > >> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
> > > > > >> longer running delayed iputs after a commit. Therefore, if the 
> > > > > >> cleaner
> > > > > >> creates more delayed iputs after delayed iputs are run in
> > > > > >> btrfs_commit_super(), we will leak inodes on unmount and get a busy
> > > > > >> inode crash from the VFS.
> > > > > >>
> > > > > >> Fix it by parking the cleaner
> > > > > >
> > > > > > Ouch, that's IMO wrong way to fix it. The bug is on a higher level,
> > > > > > we're missing a commit or clean up data structures. Messing with 
> > > > > > state
> > > > > > of a thread would be the last thing I'd try after proving that it's 
> > > > > > not
> > > > > > possible to fix in the logic of btrfs itself.
> > > > > >
> > > > > > The shutdown sequence in close_tree is quite tricky and we've had 
> > > > > > bugs
> > > > > > there. The interdependencies of thread and data structures and other
> > > > > > subsystems cannot have loops that could not find an ordering that 
> > > > > > will
> > > > > > not leak something.
> > > > > >
> > > > > > It's not a big problem if some step is done more than once, like
> > > > > > committing or cleaning up some other structures if we know that
> > > > > > it could create new.
> > > > > 
> > > > > The problem is the cleaner thread needs to be told to stop doing new 
> > > > > work, and we need to wait for the work it's already doing to be 
> > > > > finished.  We're getting "stop doing new work" already because the 
> > > > > cleaner thread checks to see if the FS is closing, but we don't have 
> > > > > a 
> > > > > way today to wait for him to finish what he's already doing.
> > > > > 
> > > > > kthread_park() is basically the same as adding another mutex or 
> > > > > synchronization point.  I'm not sure how we could change close_tree() 
> > > > > or 
> > > > > the final commit to pick this up more effectively?
> > > > 
> > > > The idea is:
> > > > 
> > > > cleaner close_ctree thread
> > > > 
> > > > tell cleaner to stop
> > > > wait
> > > > start work
> > > > if should_stop, then exit
> > > > cleaner is stopped
> > > > 
> > > > [does not run: finish work]
> > > > [does not run: loop]
> > > > pick up the work or make
> > > > sure there's nothing in
> > > > progress anymore
> > > > 
> > > > 
> > > > A simplified version in code:
> > > > 
> > > >   set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
> > > > 
> > > >   wait for defrag - could be started from cleaner but next iteration 
> > > > will
> > > > see the fs closed and will not continue
> > > > 
> > > >   kthread_stop(transaction_kthread)
> > > > 
> > > >   kthread_stop(cleaner_kthread)
> > > > 
> > > >   /* next, everything that could be left from cleaner should be 
> > > > finished */
> > > > 
> > > >   btrfs_delete_unused_bgs();
> > > >   assert there are no defrags
> > > >   assert there are no delayed iputs
> > > >   commit if necessary
> > > > 
> > > > IOW the unused block groups are removed from close_ctree too early,
> > > > moving that after the threads stop AND making sure that it does not need
> > > > either of them should work.
> > > > 
> > > > The "AND" above is not currently implemented as btrfs_delete_unused_bgs
> > > > calls plain btrfs_end_transaction that wakes up transaction ktread, so
> > > > there would need to be an argument passed to tell it to do full commit.
> > > 
> > > Not perfect, relies on the fact that wake_up_process(thread) on a stopped
> > > thread is a no-op,
> > 
> > How is that? kthread_stop() frees the task struct, so wake_up_process()
> > would be a use-after-free.
> 
> That was an assumption for the demonstration purposes, the wording was
> confusing sorry.

Oh, well 

Re: [PATCH v2] btrfs: add zstd compression level support

2018-11-01 Thread Nick Terrell


> On Nov 1, 2018, at 5:57 AM, Timofey Titovets  wrote:
> 
Wed, 31 Oct 2018 at 21:12, Nick Terrell :
>> 
>> From: Jennifer Liu 
>> 
>> Adds zstd compression level support to btrfs. Zstd requires
>> different amounts of memory for each level, so the design had
>> to be modified to allow set_level() to allocate memory. We
>> preallocate one workspace of the maximum size to guarantee
>> forward progress. This feature is expected to be useful for
>> read-mostly filesystems, or when creating images.
>> 
>> Benchmarks run in qemu on Intel x86 with a single core.
>> The benchmark measures the time to copy the Silesia corpus [0] to
>> a btrfs filesystem 10 times, then read it back.
>> 
>> The two important things to note are:
>> - The decompression speed and memory remains constant.
>>  The memory required to decompress is the same as level 1.
>> - The compression speed and ratio will vary based on the source.
>> 
>> Level   Ratio   Compression Decompression   Compression Memory
>> 1   2.59153 MB/s112 MB/s0.8 MB
>> 2   2.67136 MB/s113 MB/s1.0 MB
>> 3   2.72106 MB/s115 MB/s1.3 MB
>> 4   2.7886  MB/s109 MB/s0.9 MB
>> 5   2.8369  MB/s109 MB/s1.4 MB
>> 6   2.8953  MB/s110 MB/s1.5 MB
>> 7   2.9140  MB/s112 MB/s1.4 MB
>> 8   2.9234  MB/s110 MB/s1.8 MB
>> 9   2.9327  MB/s109 MB/s1.8 MB
>> 10  2.9422  MB/s109 MB/s1.8 MB
>> 11  2.9517  MB/s114 MB/s1.8 MB
>> 12  2.9513  MB/s113 MB/s1.8 MB
>> 13  2.9510  MB/s111 MB/s2.3 MB
>> 14  2.997   MB/s110 MB/s2.6 MB
>> 15  3.036   MB/s110 MB/s2.6 MB
>> 
>> [0] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
>> 
>> Signed-off-by: Jennifer Liu 
>> Signed-off-by: Nick Terrell 
>> Reviewed-by: Omar Sandoval 



> 
> You didn't mention, so:
> Did you test compression ratio/performance with compress-force or just 
> compress?

I tested with compress-force, since I just reused my script from before [0].

I reran some levels with compress and got these numbers:

Level   Ratio   Compression Decompression
1   2.21158 MB/s113 MB/s
3   2.28117 MB/s113 MB/s
5   2.3281  MB/s112 MB/s
7   2.3747  MB/s116 MB/s
15  2.417   MB/s115 MB/s

Using compress probably makes sense with lower levels, to get higher write
speeds, but if you're using higher compression levels, you'll likely want to
use compress-force, since you likely don't care too much about write speeds.
Obviously this will depend on the data you're compressing.

[0] https://gist.github.com/terrelln/51ed3c9da6f94d613c01fcdae60567e8

-Nick
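
For completeness, mounting with a specific level would look something like
the lines below, assuming the compress=zstd:<level> / compress-force=zstd:<level>
mount option syntax this series proposes (check the patch itself for the exact
option format):

  mount -o compress-force=zstd:3 /dev/sdX /mnt   # force compression, level 3
  mount -o compress=zstd:1 /dev/sdX /mnt         # favour write speed at level 1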

> 
> Thanks.
> 
> -- 
> Have a nice day,
> Timofey.



Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread David Sterba
On Thu, Nov 01, 2018 at 08:24:25AM -0700, Omar Sandoval wrote:
> On Thu, Nov 01, 2018 at 04:22:29PM +0100, David Sterba wrote:
> > On Thu, Nov 01, 2018 at 04:08:32PM +0100, David Sterba wrote:
> > > On Thu, Nov 01, 2018 at 01:31:18PM +, Chris Mason wrote:
> > > > On 1 Nov 2018, at 6:15, David Sterba wrote:
> > > > 
> > > > > On Wed, Oct 31, 2018 at 10:06:08AM -0700, Omar Sandoval wrote:
> > > > >> From: Omar Sandoval 
> > > > >>
> > > > >> There's a race between close_ctree() and cleaner_kthread().
> > > > >> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
> > > > >> sees it set, but this is racy; the cleaner might have already checked
> > > > >> the bit and could be cleaning stuff. In particular, if it deletes 
> > > > >> unused
> > > > >> block groups, it will create delayed iputs for the free space cache
> > > > >> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
> > > > >> longer running delayed iputs after a commit. Therefore, if the 
> > > > >> cleaner
> > > > >> creates more delayed iputs after delayed iputs are run in
> > > > >> btrfs_commit_super(), we will leak inodes on unmount and get a busy
> > > > >> inode crash from the VFS.
> > > > >>
> > > > >> Fix it by parking the cleaner
> > > > >
> > > > > Ouch, that's IMO wrong way to fix it. The bug is on a higher level,
> > > > > we're missing a commit or clean up data structures. Messing with state
> > > > > of a thread would be the last thing I'd try after proving that it's 
> > > > > not
> > > > > possible to fix in the logic of btrfs itself.
> > > > >
> > > > > The shutdown sequence in close_tree is quite tricky and we've had bugs
> > > > > there. The interdependencies of thread and data structures and other
> > > > > subsystems cannot have loops that could not find an ordering that will
> > > > > not leak something.
> > > > >
> > > > > It's not a big problem if some step is done more than once, like
> > > > > committing or cleaning up some other structures if we know that
> > > > > it could create new.
> > > > 
> > > > The problem is the cleaner thread needs to be told to stop doing new 
> > > > work, and we need to wait for the work it's already doing to be 
> > > > finished.  We're getting "stop doing new work" already because the 
> > > > cleaner thread checks to see if the FS is closing, but we don't have a 
> > > > way today to wait for him to finish what he's already doing.
> > > > 
> > > > kthread_park() is basically the same as adding another mutex or 
> > > > synchronization point.  I'm not sure how we could change close_tree() 
> > > > or 
> > > > the final commit to pick this up more effectively?
> > > 
> > > The idea is:
> > > 
> > > cleaner close_ctree thread
> > > 
> > > tell cleaner to stop
> > >   wait
> > > start work
> > > if should_stop, then exit
> > > cleaner is stopped
> > > 
> > > [does not run: finish work]
> > > [does not run: loop]
> > > pick up the work or make
> > >   sure there's nothing in
> > >   progress anymore
> > > 
> > > 
> > > A simplified version in code:
> > > 
> > >   set_bit(BTRFS_FS_CLOSING_START, _info->flags);
> > > 
> > >   wait for defrag - could be started from cleaner but next iteration will
> > >   see the fs closed and will not continue
> > > 
> > >   kthread_stop(transaction_kthread)
> > > 
> > >   kthread_stop(cleaner_kthread)
> > > 
> > >   /* next, everything that could be left from cleaner should be finished 
> > > */
> > > 
> > >   btrfs_delete_unused_bgs();
> > >   assert there are no defrags
> > >   assert there are no delayed iputs
> > >   commit if necessary
> > > 
> > > IOW the unused block groups are removed from close_ctree too early,
> > > moving that after the threads stop AND makins sure that it does not need
> > > either of them should work.
> > > 
> > > The "AND" above is not currently implemented as btrfs_delete_unused_bgs
> > > calls plain btrfs_end_transaction that wakes up transaction ktread, so
> > > there would need to be an argument passed to tell it to do full commit.
> > 
> > Not perfect, relies on the fact that wake_up_process(thread) on a stopped
> > thread is a no-op,
> 
> How is that? kthread_stop() frees the task struct, so wake_up_process()
> would be a use-after-free.

That was an assumption for demonstration purposes; the wording was
confusing, sorry.


Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread David Sterba
On Thu, Nov 01, 2018 at 08:23:17AM -0700, Omar Sandoval wrote:
> On Thu, Nov 01, 2018 at 04:08:32PM +0100, David Sterba wrote:
> > On Thu, Nov 01, 2018 at 01:31:18PM +, Chris Mason wrote:
> > > On 1 Nov 2018, at 6:15, David Sterba wrote:
> > > 
> > > > On Wed, Oct 31, 2018 at 10:06:08AM -0700, Omar Sandoval wrote:
> > > >> From: Omar Sandoval 
> > > >>
> > > >> There's a race between close_ctree() and cleaner_kthread().
> > > >> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
> > > >> sees it set, but this is racy; the cleaner might have already checked
> > > >> the bit and could be cleaning stuff. In particular, if it deletes 
> > > >> unused
> > > >> block groups, it will create delayed iputs for the free space cache
> > > >> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
> > > >> longer running delayed iputs after a commit. Therefore, if the 
> > > >> cleaner
> > > >> creates more delayed iputs after delayed iputs are run in
> > > >> btrfs_commit_super(), we will leak inodes on unmount and get a busy
> > > >> inode crash from the VFS.
> > > >>
> > > >> Fix it by parking the cleaner
> > > >
> > > > Ouch, that's IMO wrong way to fix it. The bug is on a higher level,
> > > > we're missing a commit or clean up data structures. Messing with state
> > > > of a thread would be the last thing I'd try after proving that it's 
> > > > not
> > > > possible to fix in the logic of btrfs itself.
> > > >
> > > > The shutdown sequence in close_tree is quite tricky and we've had bugs
> > > > there. The interdependencies of thread and data structures and other
> > > > subsystems cannot have loops that could not find an ordering that will
> > > > not leak something.
> > > >
> > > > It's not a big problem if some step is done more than once, like
> > > > committing or cleaning up some other structures if we know that
> > > > it could create new.
> > > 
> > > The problem is the cleaner thread needs to be told to stop doing new 
> > > work, and we need to wait for the work it's already doing to be 
> > > finished.  We're getting "stop doing new work" already because the 
> > > cleaner thread checks to see if the FS is closing, but we don't have a 
> > > way today to wait for him to finish what he's already doing.
> > > 
> > > kthread_park() is basically the same as adding another mutex or 
> > > synchronization point.  I'm not sure how we could change close_tree() or 
> > > the final commit to pick this up more effectively?
> > 
> > The idea is:
> > 
> > cleaner close_ctree thread
> > 
> > tell cleaner to stop
> > wait
> > start work
> > if should_stop, then exit
> > cleaner is stopped
> > 
> > [does not run: finish work]
> > [does not run: loop]
> > pick up the work or make
> > sure there's nothing in
> > progress anymore
> > 
> > 
> > A simplified version in code:
> > 
> >   set_bit(BTRFS_FS_CLOSING_START, _info->flags);
> > 
> >   wait for defrag - could be started from cleaner but next iteration will
> > see the fs closed and will not continue
> > 
> >   kthread_stop(transaction_kthread)
> > 
> >   kthread_stop(cleaner_kthread)
> > 
> >   /* next, everything that could be left from cleaner should be finished */
> > 
> >   btrfs_delete_unused_bgs();
> >   assert there are no defrags
> >   assert there are no delayed iputs
> >   commit if necessary
> > 
> > IOW the unused block groups are removed from close_ctree too early,
> > moving that after the threads stop AND makins sure that it does not need
> > either of them should work.
> > 
> > The "AND" above is not currently implemented as btrfs_delete_unused_bgs
> > calls plain btrfs_end_transaction that wakes up transaction ktread, so
> > there would need to be an argument passed to tell it to do full commit.
> 
> It's too fragile to run around in the filesystem with these threads
> freed. We can probably make it now, but I'm worried that we'll add
> another wakeup somewhere and blow up.

So functions that are called from close_ctree will have to be careful to
either not wake the threads up at all, or to check before trying to wake them
up. We can also wrap the wakeups like:

static void wake_up_cleaner(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_fs_closing(fs_info))
		wake_up_process(fs_info->cleaner_kthread);
	else
		BUG();
}

Same for the transaction kthread.


Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread Omar Sandoval
On Thu, Nov 01, 2018 at 08:24:25AM -0700, Omar Sandoval wrote:
> On Thu, Nov 01, 2018 at 04:22:29PM +0100, David Sterba wrote:
> > On Thu, Nov 01, 2018 at 04:08:32PM +0100, David Sterba wrote:
> > > On Thu, Nov 01, 2018 at 01:31:18PM +, Chris Mason wrote:
> > > > On 1 Nov 2018, at 6:15, David Sterba wrote:
> > > > 
> > > > > On Wed, Oct 31, 2018 at 10:06:08AM -0700, Omar Sandoval wrote:
> > > > >> From: Omar Sandoval 
> > > > >>
> > > > >> There's a race between close_ctree() and cleaner_kthread().
> > > > >> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
> > > > >> sees it set, but this is racy; the cleaner might have already checked
> > > > >> the bit and could be cleaning stuff. In particular, if it deletes 
> > > > >> unused
> > > > >> block groups, it will create delayed iputs for the free space cache
> > > > >> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
> > > > >> longer running delayed iputs after a commit. Therefore, if the 
> > > > >> cleaner
> > > > >> creates more delayed iputs after delayed iputs are run in
> > > > >> btrfs_commit_super(), we will leak inodes on unmount and get a busy
> > > > >> inode crash from the VFS.
> > > > >>
> > > > >> Fix it by parking the cleaner
> > > > >
> > > > > Ouch, that's IMO wrong way to fix it. The bug is on a higher level,
> > > > > we're missing a commit or clean up data structures. Messing with state
> > > > > of a thread would be the last thing I'd try after proving that it's 
> > > > > not
> > > > > possible to fix in the logic of btrfs itself.
> > > > >
> > > > > The shutdown sequence in close_tree is quite tricky and we've had bugs
> > > > > there. The interdependencies of thread and data structures and other
> > > > > subsystems cannot have loops that could not find an ordering that will
> > > > > not leak something.
> > > > >
> > > > > It's not a big problem if some step is done more than once, like
> > > > > committing or cleaning up some other structures if we know that
> > > > > it could create new.
> > > > 
> > > > The problem is the cleaner thread needs to be told to stop doing new 
> > > > work, and we need to wait for the work it's already doing to be 
> > > > finished.  We're getting "stop doing new work" already because the 
> > > > cleaner thread checks to see if the FS is closing, but we don't have a 
> > > > way today to wait for him to finish what he's already doing.
> > > > 
> > > > kthread_park() is basically the same as adding another mutex or 
> > > > synchronization point.  I'm not sure how we could change close_tree() 
> > > > or 
> > > > the final commit to pick this up more effectively?
> > > 
> > > The idea is:
> > > 
> > > cleaner close_ctree thread
> > > 
> > > tell cleaner to stop
> > >   wait
> > > start work
> > > if should_stop, then exit
> > > cleaner is stopped
> > > 
> > > [does not run: finish work]
> > > [does not run: loop]
> > > pick up the work or make
> > >   sure there's nothing in
> > >   progress anymore
> > > 
> > > 
> > > A simplified version in code:
> > > 
> > >   set_bit(BTRFS_FS_CLOSING_START, _info->flags);
> > > 
> > >   wait for defrag - could be started from cleaner but next iteration will
> > >   see the fs closed and will not continue
> > > 
> > >   kthread_stop(transaction_kthread)
> > > 
> > >   kthread_stop(cleaner_kthread)
> > > 
> > >   /* next, everything that could be left from cleaner should be finished 
> > > */
> > > 
> > >   btrfs_delete_unused_bgs();
> > >   assert there are no defrags
> > >   assert there are no delayed iputs
> > >   commit if necessary
> > > 
> > > IOW the unused block groups are removed from close_ctree too early,
> > > moving that after the threads stop AND makins sure that it does not need
> > > either of them should work.
> > > 
> > > The "AND" above is not currently implemented as btrfs_delete_unused_bgs
> > > calls plain btrfs_end_transaction that wakes up transaction ktread, so
> > > there would need to be an argument passed to tell it to do full commit.
> > 
> > Not perfect, relies on the fact that wake_up_process(thread) on a stopped
> > thread is a no-op,
> 
> How is that? kthread_stop() frees the task struct, so wake_up_process()
> would be a use-after-free.

(Indirectly, through the kthread calling do_exit() -> do_task_dead() ->
being put in the scheduler)


Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread Omar Sandoval
On Thu, Nov 01, 2018 at 04:22:29PM +0100, David Sterba wrote:
> On Thu, Nov 01, 2018 at 04:08:32PM +0100, David Sterba wrote:
> > On Thu, Nov 01, 2018 at 01:31:18PM +, Chris Mason wrote:
> > > On 1 Nov 2018, at 6:15, David Sterba wrote:
> > > 
> > > > On Wed, Oct 31, 2018 at 10:06:08AM -0700, Omar Sandoval wrote:
> > > >> From: Omar Sandoval 
> > > >>
> > > >> There's a race between close_ctree() and cleaner_kthread().
> > > >> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
> > > >> sees it set, but this is racy; the cleaner might have already checked
> > > >> the bit and could be cleaning stuff. In particular, if it deletes 
> > > >> unused
> > > >> block groups, it will create delayed iputs for the free space cache
> > > >> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
> > > >> longer running delayed iputs after a commit. Therefore, if the 
> > > >> cleaner
> > > >> creates more delayed iputs after delayed iputs are run in
> > > >> btrfs_commit_super(), we will leak inodes on unmount and get a busy
> > > >> inode crash from the VFS.
> > > >>
> > > >> Fix it by parking the cleaner
> > > >
> > > > Ouch, that's IMO wrong way to fix it. The bug is on a higher level,
> > > > we're missing a commit or clean up data structures. Messing with state
> > > > of a thread would be the last thing I'd try after proving that it's 
> > > > not
> > > > possible to fix in the logic of btrfs itself.
> > > >
> > > > The shutdown sequence in close_tree is quite tricky and we've had bugs
> > > > there. The interdependencies of thread and data structures and other
> > > > subsystems cannot have loops that could not find an ordering that will
> > > > not leak something.
> > > >
> > > > It's not a big problem if some step is done more than once, like
> > > > committing or cleaning up some other structures if we know that
> > > > it could create new.
> > > 
> > > The problem is the cleaner thread needs to be told to stop doing new 
> > > work, and we need to wait for the work it's already doing to be 
> > > finished.  We're getting "stop doing new work" already because the 
> > > cleaner thread checks to see if the FS is closing, but we don't have a 
> > > way today to wait for him to finish what he's already doing.
> > > 
> > > kthread_park() is basically the same as adding another mutex or 
> > > synchronization point.  I'm not sure how we could change close_tree() or 
> > > the final commit to pick this up more effectively?
> > 
> > The idea is:
> > 
> > cleaner close_ctree thread
> > 
> > tell cleaner to stop
> > wait
> > start work
> > if should_stop, then exit
> > cleaner is stopped
> > 
> > [does not run: finish work]
> > [does not run: loop]
> > pick up the work or make
> > sure there's nothing in
> > progress anymore
> > 
> > 
> > A simplified version in code:
> > 
> >   set_bit(BTRFS_FS_CLOSING_START, _info->flags);
> > 
> >   wait for defrag - could be started from cleaner but next iteration will
> > see the fs closed and will not continue
> > 
> >   kthread_stop(transaction_kthread)
> > 
> >   kthread_stop(cleaner_kthread)
> > 
> >   /* next, everything that could be left from cleaner should be finished */
> > 
> >   btrfs_delete_unused_bgs();
> >   assert there are no defrags
> >   assert there are no delayed iputs
> >   commit if necessary
> > 
> > IOW the unused block groups are removed from close_ctree too early,
> > moving that after the threads stop AND makins sure that it does not need
> > either of them should work.
> > 
> > The "AND" above is not currently implemented as btrfs_delete_unused_bgs
> > calls plain btrfs_end_transaction that wakes up transaction ktread, so
> > there would need to be an argument passed to tell it to do full commit.
> 
> Not perfect, relies on the fact that wake_up_process(thread) on a stopped
> thread is a no-op,

How is that? kthread_stop() frees the task struct, so wake_up_process()
would be a use-after-free.


Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread Omar Sandoval
On Thu, Nov 01, 2018 at 04:08:32PM +0100, David Sterba wrote:
> On Thu, Nov 01, 2018 at 01:31:18PM +, Chris Mason wrote:
> > On 1 Nov 2018, at 6:15, David Sterba wrote:
> > 
> > > On Wed, Oct 31, 2018 at 10:06:08AM -0700, Omar Sandoval wrote:
> > >> From: Omar Sandoval 
> > >>
> > >> There's a race between close_ctree() and cleaner_kthread().
> > >> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
> > >> sees it set, but this is racy; the cleaner might have already checked
> > >> the bit and could be cleaning stuff. In particular, if it deletes 
> > >> unused
> > >> block groups, it will create delayed iputs for the free space cache
> > >> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
> > >> longer running delayed iputs after a commit. Therefore, if the 
> > >> cleaner
> > >> creates more delayed iputs after delayed iputs are run in
> > >> btrfs_commit_super(), we will leak inodes on unmount and get a busy
> > >> inode crash from the VFS.
> > >>
> > >> Fix it by parking the cleaner
> > >
> > > Ouch, that's IMO wrong way to fix it. The bug is on a higher level,
> > > we're missing a commit or clean up data structures. Messing with state
> > > of a thread would be the last thing I'd try after proving that it's 
> > > not
> > > possible to fix in the logic of btrfs itself.
> > >
> > > The shutdown sequence in close_tree is quite tricky and we've had bugs
> > > there. The interdependencies of thread and data structures and other
> > > subsystems cannot have loops that could not find an ordering that will
> > > not leak something.
> > >
> > > It's not a big problem if some step is done more than once, like
> > > committing or cleaning up some other structures if we know that
> > > it could create new.
> > 
> > The problem is the cleaner thread needs to be told to stop doing new 
> > work, and we need to wait for the work it's already doing to be 
> > finished.  We're getting "stop doing new work" already because the 
> > cleaner thread checks to see if the FS is closing, but we don't have a 
> > way today to wait for him to finish what he's already doing.
> > 
> > kthread_park() is basically the same as adding another mutex or 
> > synchronization point.  I'm not sure how we could change close_tree() or 
> > the final commit to pick this up more effectively?
> 
> The idea is:
> 
> cleaner close_ctree thread
> 
> tell cleaner to stop
>   wait
> start work
> if should_stop, then exit
> cleaner is stopped
> 
> [does not run: finish work]
> [does not run: loop]
> pick up the work or make
>   sure there's nothing in
>   progress anymore
> 
> 
> A simplified version in code:
> 
>   set_bit(BTRFS_FS_CLOSING_START, _info->flags);
> 
>   wait for defrag - could be started from cleaner but next iteration will
>   see the fs closed and will not continue
> 
>   kthread_stop(transaction_kthread)
> 
>   kthread_stop(cleaner_kthread)
> 
>   /* next, everything that could be left from cleaner should be finished */
> 
>   btrfs_delete_unused_bgs();
>   assert there are no defrags
>   assert there are no delayed iputs
>   commit if necessary
> 
> IOW the unused block groups are removed from close_ctree too early,
> moving that after the threads stop AND makins sure that it does not need
> either of them should work.
> 
> The "AND" above is not currently implemented as btrfs_delete_unused_bgs
> calls plain btrfs_end_transaction that wakes up transaction ktread, so
> there would need to be an argument passed to tell it to do full commit.

It's too fragile to run around in the filesystem with these threads freed.
We can probably make it work now, but I'm worried that we'll add another
wakeup somewhere and blow up.


Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread David Sterba
On Thu, Nov 01, 2018 at 04:08:32PM +0100, David Sterba wrote:
> On Thu, Nov 01, 2018 at 01:31:18PM +, Chris Mason wrote:
> > On 1 Nov 2018, at 6:15, David Sterba wrote:
> > 
> > > On Wed, Oct 31, 2018 at 10:06:08AM -0700, Omar Sandoval wrote:
> > >> From: Omar Sandoval 
> > >>
> > >> There's a race between close_ctree() and cleaner_kthread().
> > >> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
> > >> sees it set, but this is racy; the cleaner might have already checked
> > >> the bit and could be cleaning stuff. In particular, if it deletes 
> > >> unused
> > >> block groups, it will create delayed iputs for the free space cache
> > >> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
> > >> longer running delayed iputs after a commit. Therefore, if the 
> > >> cleaner
> > >> creates more delayed iputs after delayed iputs are run in
> > >> btrfs_commit_super(), we will leak inodes on unmount and get a busy
> > >> inode crash from the VFS.
> > >>
> > >> Fix it by parking the cleaner
> > >
> > > Ouch, that's IMO wrong way to fix it. The bug is on a higher level,
> > > we're missing a commit or clean up data structures. Messing with state
> > > of a thread would be the last thing I'd try after proving that it's 
> > > not
> > > possible to fix in the logic of btrfs itself.
> > >
> > > The shutdown sequence in close_tree is quite tricky and we've had bugs
> > > there. The interdependencies of thread and data structures and other
> > > subsystems cannot have loops that could not find an ordering that will
> > > not leak something.
> > >
> > > It's not a big problem if some step is done more than once, like
> > > committing or cleaning up some other structures if we know that
> > > it could create new.
> > 
> > The problem is the cleaner thread needs to be told to stop doing new 
> > work, and we need to wait for the work it's already doing to be 
> > finished.  We're getting "stop doing new work" already because the 
> > cleaner thread checks to see if the FS is closing, but we don't have a 
> > way today to wait for him to finish what he's already doing.
> > 
> > kthread_park() is basically the same as adding another mutex or 
> > synchronization point.  I'm not sure how we could change close_tree() or 
> > the final commit to pick this up more effectively?
> 
> The idea is:
> 
> cleaner close_ctree thread
> 
> tell cleaner to stop
>   wait
> start work
> if should_stop, then exit
> cleaner is stopped
> 
> [does not run: finish work]
> [does not run: loop]
> pick up the work or make
>   sure there's nothing in
>   progress anymore
> 
> 
> A simplified version in code:
> 
>   set_bit(BTRFS_FS_CLOSING_START, _info->flags);
> 
>   wait for defrag - could be started from cleaner but next iteration will
>   see the fs closed and will not continue
> 
>   kthread_stop(transaction_kthread)
> 
>   kthread_stop(cleaner_kthread)
> 
>   /* next, everything that could be left from cleaner should be finished */
> 
>   btrfs_delete_unused_bgs();
>   assert there are no defrags
>   assert there are no delayed iputs
>   commit if necessary
> 
> IOW the unused block groups are removed from close_ctree too early,
> moving that after the threads stop AND makins sure that it does not need
> either of them should work.
> 
> The "AND" above is not currently implemented as btrfs_delete_unused_bgs
> calls plain btrfs_end_transaction that wakes up transaction ktread, so
> there would need to be an argument passed to tell it to do full commit.

Not perfect; it relies on the fact that wake_up_process(thread) on a stopped
thread is a no-op, and arguments would need to be added to skip that in
btrfs_delete_unused_bgs and btrfs_commit_super.

--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3956,11 +3956,18 @@ void close_ctree(struct btrfs_fs_info *fs_info)
 
cancel_work_sync(&fs_info->async_reclaim_work);

+   if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
+   test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
+   btrfs_error_commit_super(fs_info);
+
+   kthread_stop(fs_info->transaction_kthread);
+   kthread_stop(fs_info->cleaner_kthread);
+
if (!sb_rdonly(fs_info->sb)) {
/*
-* If the cleaner thread is stopped and there are
-* block groups queued for removal, the deletion will be
-* skipped when we quit the cleaner thread.
+* The cleaner thread is now stopped and if there are block
+* groups queued for removal, we have to pick up the work here
+* so there are no delayed iputs triggered.
 */

Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread Nikolay Borisov



On 1.11.18 16:35, Nikolay Borisov wrote:
> 
> 
On 31.10.18 19:06, Omar Sandoval wrote:
>> From: Omar Sandoval 
>>
>> There's a race between close_ctree() and cleaner_kthread().
>> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
>> sees it set, but this is racy; the cleaner might have already checked
>> the bit and could be cleaning stuff. In particular, if it deletes unused
>> block groups, it will create delayed iputs for the free space cache
>> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
>> longer running delayed iputs after a commit. Therefore, if the cleaner
>> creates more delayed iputs after delayed iputs are run in
>> btrfs_commit_super(), we will leak inodes on unmount and get a busy
>> inode crash from the VFS.
>>
>> Fix it by parking the cleaner before we actually close anything. Then,
>> any remaining delayed iputs will always be handled in
>> btrfs_commit_super(). This also ensures that the commit in close_ctree()
>> is really the last commit, so we can get rid of the commit in
>> cleaner_kthread().
>>
>> Fixes: 30928e9baac2 ("btrfs: don't run delayed_iputs in commit")
>> Signed-off-by: Omar Sandoval 
> 
> Also I believe this patch renders the wake_up_process in
> btrfs_commit_super a null op so it can also be removed, which leaves a
> single place that could wake up the cleaner - transaction_kthread.
> 
> So can't we stop transaction and cleaner thread right after setting
> CLOSING_FS. And commit the transaction in close_ctree whenever we deem
> necessary (in btrfs_commit_super for example) ?

OK, that won't work because commit_super is called in other contexts where it
can genuinely wake up the trans kthread, blimey. Why can't we stop the
transaction kthread first thing in close_ctree?



> 
>> ---
>> Changes from v1:
>>
>> - Add a comment explaining why it needs to be a kthread_park(), not
>>   kthread_stop()
>> - Update later comment now that the cleaner thread is definitely stopped
>>
>>  fs/btrfs/disk-io.c | 51 ++
>>  1 file changed, 15 insertions(+), 36 deletions(-)
>>
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index b0ab41da91d1..40bcc45d827d 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -1664,9 +1664,8 @@ static int cleaner_kthread(void *arg)
>>  struct btrfs_root *root = arg;
>>  struct btrfs_fs_info *fs_info = root->fs_info;
>>  int again;
>> -struct btrfs_trans_handle *trans;
>>  
>> -do {
>> +while (1) {
>>  again = 0;
>>  
>>  /* Make the cleaner go to sleep early. */
>> @@ -1715,42 +1714,16 @@ static int cleaner_kthread(void *arg)
>>   */
>>  btrfs_delete_unused_bgs(fs_info);
>>  sleep:
>> +if (kthread_should_park())
>> +kthread_parkme();
>> +if (kthread_should_stop())
>> +return 0;
>>  if (!again) {
>>  set_current_state(TASK_INTERRUPTIBLE);
>> -if (!kthread_should_stop())
>> -schedule();
>> +schedule();
>>  __set_current_state(TASK_RUNNING);
>>  }
>> -} while (!kthread_should_stop());
>> -
>> -/*
>> - * Transaction kthread is stopped before us and wakes us up.
>> - * However we might have started a new transaction and COWed some
>> - * tree blocks when deleting unused block groups for example. So
>> - * make sure we commit the transaction we started to have a clean
>> - * shutdown when evicting the btree inode - if it has dirty pages
>> - * when we do the final iput() on it, eviction will trigger a
>> - * writeback for it which will fail with null pointer dereferences
>> - * since work queues and other resources were already released and
>> - * destroyed by the time the iput/eviction/writeback is made.
>> - */
>> -trans = btrfs_attach_transaction(root);
>> -if (IS_ERR(trans)) {
>> -if (PTR_ERR(trans) != -ENOENT)
>> -btrfs_err(fs_info,
>> -  "cleaner transaction attach returned %ld",
>> -  PTR_ERR(trans));
>> -} else {
>> -int ret;
>> -
>> -ret = btrfs_commit_transaction(trans);
>> -if (ret)
>> -btrfs_err(fs_info,
>> -  "cleaner open transaction commit returned %d",
>> -  ret);
>>  }
>> -
>> -return 0;
>>  }
>>  
>>  static int transaction_kthread(void *arg)
>> @@ -3931,6 +3904,13 @@ void close_ctree(struct btrfs_fs_info *fs_info)
>>  int ret;
>>  
>>  set_bit(BTRFS_FS_CLOSING_START, _info->flags);
>> +/*
>> + * We don't want the cleaner to start new transactions, add more delayed
>> + * iputs, etc. while we're closing. We can't use kthread_stop() yet
>> + * because that frees the 

Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread David Sterba
On Thu, Nov 01, 2018 at 01:31:18PM +, Chris Mason wrote:
> On 1 Nov 2018, at 6:15, David Sterba wrote:
> 
> > On Wed, Oct 31, 2018 at 10:06:08AM -0700, Omar Sandoval wrote:
> >> From: Omar Sandoval 
> >>
> >> There's a race between close_ctree() and cleaner_kthread().
> >> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
> >> sees it set, but this is racy; the cleaner might have already checked
> >> the bit and could be cleaning stuff. In particular, if it deletes 
> >> unused
> >> block groups, it will create delayed iputs for the free space cache
> >> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
> >> longer running delayed iputs after a commit. Therefore, if the 
> >> cleaner
> >> creates more delayed iputs after delayed iputs are run in
> >> btrfs_commit_super(), we will leak inodes on unmount and get a busy
> >> inode crash from the VFS.
> >>
> >> Fix it by parking the cleaner
> >
> > Ouch, that's IMO wrong way to fix it. The bug is on a higher level,
> > we're missing a commit or clean up data structures. Messing with state
> > of a thread would be the last thing I'd try after proving that it's 
> > not
> > possible to fix in the logic of btrfs itself.
> >
> > The shutdown sequence in close_tree is quite tricky and we've had bugs
> > there. The interdependencies of thread and data structures and other
> > subsystems cannot have loops that could not find an ordering that will
> > not leak something.
> >
> > It's not a big problem if some step is done more than once, like
> > committing or cleaning up some other structures if we know that
> > it could create new.
> 
> The problem is the cleaner thread needs to be told to stop doing new 
> work, and we need to wait for the work it's already doing to be 
> finished.  We're getting "stop doing new work" already because the 
> cleaner thread checks to see if the FS is closing, but we don't have a 
> way today to wait for him to finish what he's already doing.
> 
> kthread_park() is basically the same as adding another mutex or 
> synchronization point.  I'm not sure how we could change close_tree() or 
> the final commit to pick this up more effectively?

The idea is:

cleaner                             close_ctree thread

                                    tell cleaner to stop
                                      wait
start work
if should_stop, then exit
                                    cleaner is stopped

[does not run: finish work]
[does not run: loop]
                                    pick up the work or make
                                      sure there's nothing in
                                      progress anymore


A simplified version in code:

  set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);

  wait for defrag - could be started from cleaner but next iteration will
see the fs closed and will not continue

  kthread_stop(transaction_kthread)

  kthread_stop(cleaner_kthread)

  /* next, everything that could be left from cleaner should be finished */

  btrfs_delete_unused_bgs();
  assert there are no defrags
  assert there are no delayed iputs
  commit if necessary

IOW the unused block groups are removed from close_ctree too early; moving
that to after the threads stop AND making sure that it does not need either
of them should work.

The "AND" above is not currently implemented, as btrfs_delete_unused_bgs
calls plain btrfs_end_transaction, which wakes up the transaction kthread,
so there would need to be an argument passed to tell it to do a full commit.
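
To make that concrete, a rough, untested sketch of the ordering proposed here
(this is only the idea under discussion, not the fix that was applied; the
helper names and the delayed_iputs list are assumed to look roughly as they do
in this era of the code):

	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);

	/* defrag started from the cleaner sees the closing bit and stops */
	btrfs_cleanup_defrag_inodes(fs_info);

	/* stop both kthreads before doing any further cleanup */
	kthread_stop(fs_info->transaction_kthread);
	kthread_stop(fs_info->cleaner_kthread);

	/*
	 * Pick up whatever the cleaner may have left behind. These calls
	 * would have to learn not to wake the (now gone) kthreads.
	 */
	btrfs_delete_unused_bgs(fs_info);
	ASSERT(list_empty(&fs_info->delayed_iputs));
	btrfs_commit_super(fs_info);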


Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread Nikolay Borisov



On 31.10.18 19:06, Omar Sandoval wrote:
> From: Omar Sandoval 
> 
> There's a race between close_ctree() and cleaner_kthread().
> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
> sees it set, but this is racy; the cleaner might have already checked
> the bit and could be cleaning stuff. In particular, if it deletes unused
> block groups, it will create delayed iputs for the free space cache
> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
> longer running delayed iputs after a commit. Therefore, if the cleaner
> creates more delayed iputs after delayed iputs are run in
> btrfs_commit_super(), we will leak inodes on unmount and get a busy
> inode crash from the VFS.
> 
> Fix it by parking the cleaner before we actually close anything. Then,
> any remaining delayed iputs will always be handled in
> btrfs_commit_super(). This also ensures that the commit in close_ctree()
> is really the last commit, so we can get rid of the commit in
> cleaner_kthread().
> 
> Fixes: 30928e9baac2 ("btrfs: don't run delayed_iputs in commit")
> Signed-off-by: Omar Sandoval 

Also I believe this patch renders the wake_up_process in btrfs_commit_super
a no-op, so it can also be removed, which leaves a single place that could
wake up the cleaner - transaction_kthread.

So can't we stop the transaction and cleaner threads right after setting
CLOSING_FS, and commit the transaction in close_ctree whenever we deem
necessary (in btrfs_commit_super, for example)?

> ---
> Changes from v1:
> 
> - Add a comment explaining why it needs to be a kthread_park(), not
>   kthread_stop()
> - Update later comment now that the cleaner thread is definitely stopped
> 
>  fs/btrfs/disk-io.c | 51 ++
>  1 file changed, 15 insertions(+), 36 deletions(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index b0ab41da91d1..40bcc45d827d 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -1664,9 +1664,8 @@ static int cleaner_kthread(void *arg)
>   struct btrfs_root *root = arg;
>   struct btrfs_fs_info *fs_info = root->fs_info;
>   int again;
> - struct btrfs_trans_handle *trans;
>  
> - do {
> + while (1) {
>   again = 0;
>  
>   /* Make the cleaner go to sleep early. */
> @@ -1715,42 +1714,16 @@ static int cleaner_kthread(void *arg)
>*/
>   btrfs_delete_unused_bgs(fs_info);
>  sleep:
> + if (kthread_should_park())
> + kthread_parkme();
> + if (kthread_should_stop())
> + return 0;
>   if (!again) {
>   set_current_state(TASK_INTERRUPTIBLE);
> - if (!kthread_should_stop())
> - schedule();
> + schedule();
>   __set_current_state(TASK_RUNNING);
>   }
> - } while (!kthread_should_stop());
> -
> - /*
> -  * Transaction kthread is stopped before us and wakes us up.
> -  * However we might have started a new transaction and COWed some
> -  * tree blocks when deleting unused block groups for example. So
> -  * make sure we commit the transaction we started to have a clean
> -  * shutdown when evicting the btree inode - if it has dirty pages
> -  * when we do the final iput() on it, eviction will trigger a
> -  * writeback for it which will fail with null pointer dereferences
> -  * since work queues and other resources were already released and
> -  * destroyed by the time the iput/eviction/writeback is made.
> -  */
> - trans = btrfs_attach_transaction(root);
> - if (IS_ERR(trans)) {
> - if (PTR_ERR(trans) != -ENOENT)
> - btrfs_err(fs_info,
> -   "cleaner transaction attach returned %ld",
> -   PTR_ERR(trans));
> - } else {
> - int ret;
> -
> - ret = btrfs_commit_transaction(trans);
> - if (ret)
> - btrfs_err(fs_info,
> -   "cleaner open transaction commit returned %d",
> -   ret);
>   }
> -
> - return 0;
>  }
>  
>  static int transaction_kthread(void *arg)
> @@ -3931,6 +3904,13 @@ void close_ctree(struct btrfs_fs_info *fs_info)
>   int ret;
>  
>   set_bit(BTRFS_FS_CLOSING_START, _info->flags);
> + /*
> +  * We don't want the cleaner to start new transactions, add more delayed
> +  * iputs, etc. while we're closing. We can't use kthread_stop() yet
> +  * because that frees the task_struct, and the transaction kthread might
> +  * still try to wake up the cleaner.
> +  */
> + kthread_park(fs_info->cleaner_kthread);
>  
>   /* wait for the qgroup rescan worker to stop */
>   btrfs_qgroup_wait_for_completion(fs_info, false);
> @@ -3958,9 +3938,8 @@ void close_ctree(struct 

Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread Chris Mason
On 1 Nov 2018, at 6:15, David Sterba wrote:

> On Wed, Oct 31, 2018 at 10:06:08AM -0700, Omar Sandoval wrote:
>> From: Omar Sandoval 
>>
>> There's a race between close_ctree() and cleaner_kthread().
>> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
>> sees it set, but this is racy; the cleaner might have already checked
>> the bit and could be cleaning stuff. In particular, if it deletes 
>> unused
>> block groups, it will create delayed iputs for the free space cache
>> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
>> longer running delayed iputs after a commit. Therefore, if the 
>> cleaner
>> creates more delayed iputs after delayed iputs are run in
>> btrfs_commit_super(), we will leak inodes on unmount and get a busy
>> inode crash from the VFS.
>>
>> Fix it by parking the cleaner
>
> Ouch, that's IMO wrong way to fix it. The bug is on a higher level,
> we're missing a commit or clean up data structures. Messing with state
> of a thread would be the last thing I'd try after proving that it's 
> not
> possible to fix in the logic of btrfs itself.
>
> The shutdown sequence in close_tree is quite tricky and we've had bugs
> there. The interdependencies of thread and data structures and other
> subsystems cannot have loops that could not find an ordering that will
> not leak something.
>
> It's not a big problem if some step is done more than once, like
> committing or cleaning up some other structures if we know that
> it could create new.

The problem is the cleaner thread needs to be told to stop doing new 
work, and we need to wait for the work it's already doing to be 
finished.  We're getting "stop doing new work" already because the 
cleaner thread checks to see if the FS is closing, but we don't have a 
way today to wait for him to finish what he's already doing.

kthread_park() is basically the same as adding another mutex or 
synchronization point.  I'm not sure how we could change close_tree() or 
the final commit to pick this up more effectively?
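
As a generic illustration of that synchronization point (not btrfs code, just
the park handshake the patch relies on): kthread_park() on the shutdown side
does not return until the worker has reached kthread_parkme(), so whatever the
worker was in the middle of is finished, while its task_struct stays alive and
can still be woken or stopped later, unlike after kthread_stop().

#include <linux/kthread.h>
#include <linux/sched.h>

static int cleaner_like_worker(void *arg)
{
	while (!kthread_should_stop()) {
		/* ... one round of cleanup work, possibly queueing new state ... */

		if (kthread_should_park())
			kthread_parkme();	/* sleeps here until unparked or stopped */

		set_current_state(TASK_INTERRUPTIBLE);
		if (!kthread_should_stop() && !kthread_should_park())
			schedule();
		__set_current_state(TASK_RUNNING);
	}
	return 0;
}

/* shutdown side: after this returns the worker is quiescent but still wakeable */
static void quiesce_worker(struct task_struct *worker)
{
	kthread_park(worker);
}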

-chris



Re: [PATCH] btrfs: use tagged writepage to mitigate livelock of snapshot

2018-11-01 Thread Chris Mason
On 1 Nov 2018, at 6:21, ethanlien wrote:

> Nikolay Borisov wrote on 2018-11-01 18:01:
>> On 1.11.18 11:56, ethanlien wrote:
>>> Nikolay Borisov wrote on 2018-11-01 16:59:
 On 1.11.18 8:49, Ethan Lien wrote:
> Snapshot is expected to be fast. But if there are writers steadily
> create dirty pages in our subvolume, the snapshot may take a very 
> long
> time to complete. To fix the problem, we use tagged writepage for
> snapshot
> flusher as we do in the generic write_cache_pages(), so we can 
> ommit
> pages
> dirtied after the snapshot command.

 So the gist of this commit really is that you detect when 
 filemap_flush
 has been called from snapshot context and tag all pages at *that* 
 time
 as PAGECACHE_TAG_TOWRITE and then write them, ignoring any pages 
 that
 might have been dirtied beforehand. Your description kind of dances
 around this idea without really saying it explicitly. Those 
 semantics
 make sense, however i'd like to be stated more explicitly in the 
 change
  log.

 However, this is done at the expense of consistency, so we have 
 faster
 snapshots but depending the file which are stored in them they 
 might be
 broken (i.e a database) since they will be missing pages.

>>>
>>> tag_pages_for_writeback() will tag all pages with 
>>> PAGECACHE_TAG_DIRTY.
>>> If a dirty
>>> page get missed here, it means someone has initiated the flush 
>>> before
>>> us, so we
>>> will wait that dirty page to complete in create_snapshot() ->
>>> btrfs_wait_ordered_extents().
>>
>> And what happens in the scenario where you have  someone dirtying 
>> pages,
>> you issue the snapshot ioctl, mark all currently dirtied pages as
>> TOWRITE and then you have more delalloc pages being dirtied following
>> initial call to tag_pages_for_writeback , you will miss those, no ?
>>
>
> We don't freeze the filesystem when doing snapshot, so there is no 
> guarantee
> the page dirtied right after we send the ioctl, will be included in 
> snapshot.
> If a page is dirtied while we scan the dirty pages and its offset is 
> before our
> current index, we miss it in our current snapshot implementation too.

This looks like a pretty good compromise to me between btrfs v0's
don't-flush-at-all-on-snapshot and today's full sync on snapshot. The full
sync is always going to be a little racy wrt concurrent writes.

-chris
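
For reference, a minimal sketch of the tagged-writeback pattern being discussed
above, modeled on write_cache_pages() rather than taken from the btrfs patch
itself: pages that are dirty at the moment of the tagging pass are moved from
the DIRTY tag to the TOWRITE tag and only those are written, so pages dirtied
afterwards are left for the next writeback cycle instead of being chased
forever.

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/writeback.h>

static void flush_tagged_range(struct address_space *mapping,
			       pgoff_t index, pgoff_t end,
			       struct writeback_control *wbc)
{
	struct pagevec pvec;

	/* mark every page that is dirty *now*; later dirtiers are ignored */
	tag_pages_for_writeback(mapping, index, end);
	pagevec_init(&pvec);

	while (index <= end) {
		unsigned int i, nr;

		nr = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
					      PAGECACHE_TAG_TOWRITE);
		if (!nr)
			break;

		for (i = 0; i < nr; i++) {
			struct page *page = pvec.pages[i];

			lock_page(page);
			if (page->mapping == mapping &&
			    clear_page_dirty_for_io(page)) {
				/* ->writepage() unlocks the page on return */
				mapping->a_ops->writepage(page, wbc);
			} else {
				unlock_page(page);
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}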


[PATCH 2/8] btrfs: Remove extent_io_ops::writepage_start_hook

2018-11-01 Thread Nikolay Borisov
This hook is called only from __extent_writepage_io, which is itself called
only from the data page writeout path, so there is no need to make an indirect
call via extent_io_ops. This patch removes the callback definition, exports the
callback function and calls it directly at the only call site. It also gives
the function a more descriptive name. No functional changes.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/ctree.h |  1 +
 fs/btrfs/extent_io.c | 23 ++-
 fs/btrfs/extent_io.h |  1 -
 fs/btrfs/inode.c |  3 +--
 4 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dbeb5b2486d5..8e73301eaa65 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3189,6 +3189,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
 int run_delalloc_range(void *private_data, struct page *locked_page, u64 start,
   u64 end, int *page_started, unsigned long *nr_written,
   struct writeback_control *wbc);
+int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2e6191aa25f3..c5c713ebb9b1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3321,20 +3321,17 @@ static noinline_for_stack int 
__extent_writepage_io(struct inode *inode,
int nr = 0;
bool compressed;
 
-   if (tree->ops && tree->ops->writepage_start_hook) {
-   ret = tree->ops->writepage_start_hook(page, start,
- page_end);
-   if (ret) {
-   /* Fixup worker will requeue */
-   if (ret == -EBUSY)
-   wbc->pages_skipped++;
-   else
-   redirty_page_for_writepage(wbc, page);
+   ret = btrfs_writepage_cow_fixup(page, start, page_end);
+   if (ret) {
+   /* Fixup worker will requeue */
+   if (ret == -EBUSY)
+   wbc->pages_skipped++;
+   else
+   redirty_page_for_writepage(wbc, page);
 
-   update_nr_written(wbc, nr_written);
-   unlock_page(page);
-   return 1;
-   }
+   update_nr_written(wbc, nr_written);
+   unlock_page(page);
+   return 1;
}
 
/*
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ca48187b86ba..4275a1061f5a 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -106,7 +106,6 @@ struct extent_io_ops {
/*
 * Optional hooks, called if the pointer is not NULL
 */
-   int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
  struct extent_state *state, int uptodate);
void (*set_bit_hook)(void *private_data, struct extent_state *state,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index aa6d6b64a70a..5c07c6f9f7db 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2154,7 +2154,7 @@ static void btrfs_writepage_fixup_worker(struct 
btrfs_work *work)
  * to fix it up.  The async helper will wait for ordered extents, set
  * the delalloc bit and make it safe to write the page.
  */
-static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
 {
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -10523,7 +10523,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = 
{
 
/* optional callbacks */
.writepage_end_io_hook = btrfs_writepage_end_io_hook,
-   .writepage_start_hook = btrfs_writepage_start_hook,
.set_bit_hook = btrfs_set_bit_hook,
.clear_bit_hook = btrfs_clear_bit_hook,
.merge_extent_hook = btrfs_merge_extent_hook,
-- 
2.7.4



[PATCH 3/8] btrfs: Remove extent_io_ops::writepage_end_io_hook

2018-11-01 Thread Nikolay Borisov
This callback is only ever called for data page writeout, so there is no need
to abstract it via extent_io_ops. Let's just export it, remove the callback
definition and call it directly from the functions that used to invoke the
callback. Also rename the function to btrfs_writepage_endio_finish_ordered,
since what it really does is account for finished IO in the ordered extent
data structures. No functional changes.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/compression.c | 11 +--
 fs/btrfs/ctree.h   |  3 +++
 fs/btrfs/extent_io.c   | 31 +--
 fs/btrfs/extent_io.h   |  2 --
 fs/btrfs/inode.c   |  9 -
 5 files changed, 25 insertions(+), 31 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8703ce68fe9d..2f2b63efe5bb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -250,12 +250,11 @@ static void end_compressed_bio_write(struct bio *bio)
inode = cb->inode;
tree = &BTRFS_I(inode)->io_tree;
cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
-   tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
-cb->start,
-cb->start + cb->len - 1,
-NULL,
-bio->bi_status ?
-BLK_STS_OK : BLK_STS_NOTSUPP);
+   btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
+cb->start,
+cb->start + cb->len - 1, NULL,
+bio->bi_status ?
+BLK_STS_OK : BLK_STS_NOTSUPP);
cb->compressed_pages[0]->mapping = NULL;
 
end_compressed_writeback(inode, cb);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8e73301eaa65..10f4dad8b16d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3190,6 +3190,9 @@ int run_delalloc_range(void *private_data, struct page 
*locked_page, u64 start,
   u64 end, int *page_started, unsigned long *nr_written,
   struct writeback_control *wbc);
 int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
+void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+ u64 end, struct extent_state *state,
+ int uptodate);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c5c713ebb9b1..05cbd6b3aeec 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2417,9 +2417,7 @@ void end_extent_writepage(struct page *page, int err, u64 
start, u64 end)
 
tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-   if (tree->ops && tree->ops->writepage_end_io_hook)
-   tree->ops->writepage_end_io_hook(page, start, end, NULL,
-   uptodate);
+   btrfs_writepage_endio_finish_ordered(page, start, end, NULL, uptodate);
 
if (!uptodate) {
ClearPageUptodate(page);
@@ -3342,9 +3340,8 @@ static noinline_for_stack int 
__extent_writepage_io(struct inode *inode,
 
end = page_end;
if (i_size <= start) {
-   if (tree->ops && tree->ops->writepage_end_io_hook)
-   tree->ops->writepage_end_io_hook(page, start,
-page_end, NULL, 1);
+   btrfs_writepage_endio_finish_ordered(page, start, page_end,
+NULL, 1);
goto done;
}
 
@@ -3355,9 +3352,9 @@ static noinline_for_stack int 
__extent_writepage_io(struct inode *inode,
u64 offset;
 
if (cur >= i_size) {
-   if (tree->ops && tree->ops->writepage_end_io_hook)
-   tree->ops->writepage_end_io_hook(page, cur,
-page_end, NULL, 1);
+   btrfs_writepage_endio_finish_ordered(page, cur,
+page_end, NULL,
+1);
break;
}
em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
@@ -3391,11 +3388,10 @@ static noinline_for_stack int 
__extent_writepage_io(struct inode *inode,
 * end_io notification does not happen here for
 * compressed extents
 */
-   if (!compressed && tree->ops &&
-   tree->ops->writepage_end_io_hook)
-   tree->ops->writepage_end_io_hook(page, cur,
-

[PATCH 5/8] btrfs: Remove extent_io_ops::set_bit_hook extent_io callback

2018-11-01 Thread Nikolay Borisov
This callback is used to properly account delalloc extents for data inodes
(ordinary file inodes and freespace v1 inodes). Those can be easily identified
since they have their extent_io tree's ->private_data member pointing to the
inode. Let's exploit this fact to remove the needless indirection through
extent_io_hooks and directly call the function. Also give the function a name
which reflects its purpose - btrfs_set_delalloc_extent.

This patch also modifies test_find_delalloc so that the extent_io_tree used
for testing doesn't have its ->private_data set, which would have caused a
crash in btrfs_set_delalloc_extent due to the btrfs_inode->root member not
being initialised. The old version of the code also didn't call set_bit_hook,
since the extent_io ops weren't set for the inode. No functional changes.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/ctree.h |  2 ++
 fs/btrfs/extent_io.c | 11 +++
 fs/btrfs/extent_io.h |  2 --
 fs/btrfs/inode.c | 12 
 fs/btrfs/tests/extent-io-tests.c |  2 +-
 5 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 10f4dad8b16d..8fd9db7aa8de 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3150,6 +3150,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle 
*trans,
 struct btrfs_root *new_root,
 struct btrfs_root *parent_root,
 u64 new_dirid);
+void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
+  unsigned *bits);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 size_t size, struct bio *bio,
 unsigned long bio_flags);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index de171ae2ef20..31bdc596e623 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -404,13 +404,6 @@ static void merge_state(struct extent_io_tree *tree,
}
 }
 
-static void set_state_cb(struct extent_io_tree *tree,
-struct extent_state *state, unsigned *bits)
-{
-   if (tree->ops && tree->ops->set_bit_hook)
-   tree->ops->set_bit_hook(tree->private_data, state, bits);
-}
-
 static void clear_state_cb(struct extent_io_tree *tree,
   struct extent_state *state, unsigned *bits)
 {
@@ -809,7 +802,9 @@ static void set_state_bits(struct extent_io_tree *tree,
unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
int ret;
 
-   set_state_cb(tree, state, bits);
+   if (tree->private_data)
+   btrfs_set_delalloc_extent(tree->private_data, state, bits);
+
if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 3cb84a0fbaab..b3235d46b5c3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -106,8 +106,6 @@ struct extent_io_ops {
/*
 * Optional hooks, called if the pointer is not NULL
 */
-   void (*set_bit_hook)(void *private_data, struct extent_state *state,
-unsigned *bits);
void (*clear_bit_hook)(void *private_data,
struct extent_state *state,
unsigned *bits);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7110686e9b0e..d1c56c56a413 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1757,15 +1757,12 @@ static void btrfs_del_delalloc_inode(struct btrfs_root 
*root,
 }
 
 /*
- * extent_io.c set_bit_hook, used to track delayed allocation
- * bytes in this file, and to maintain the list of inodes that
- * have pending delalloc work to be done.
+ * Properly track delayed allocation bytes in the inode and to maintain the
+ * list of inodes that have pending delalloc work to be done.
  */
-static void btrfs_set_bit_hook(void *private_data,
-  struct extent_state *state, unsigned *bits)
+void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
+  unsigned *bits)
 {
-   struct inode *inode = private_data;
-
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 
if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
@@ -10508,7 +10505,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = 
{
.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
 
/* optional callbacks */
-   .set_bit_hook = btrfs_set_bit_hook,
.clear_bit_hook = btrfs_clear_bit_hook,
.merge_extent_hook = btrfs_merge_extent_hook,
.split_extent_hook = btrfs_split_extent_hook,
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 9e0f4a01be14..ac8b5e35797d 100644
--- 

[PATCH 7/8] btrfs: Remove extent_io_ops::merge_extent_hook callback

2018-11-01 Thread Nikolay Borisov
This callback is used only for data and free space inodes. Such inodes
are guaranteed to have their extent_io_tree::private_data set to the
inode struct. Exploit this fact to directly call the function. Also
give it a more descriptive name. No functional changes.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/ctree.h |  2 ++
 fs/btrfs/extent_io.c | 15 ++-
 fs/btrfs/extent_io.h |  3 ---
 fs/btrfs/inode.c | 16 ++--
 4 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 49d52180e14c..1f0025a0268b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3154,6 +3154,8 @@ void btrfs_set_delalloc_extent(struct inode *inode, 
struct extent_state *state,
   unsigned *bits);
 void btrfs_clear_delalloc_extent(struct inode *inode,
 struct extent_state *state, unsigned *bits);
+void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+struct extent_state *other);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 size_t size, struct bio *bio,
 unsigned long bio_flags);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a3081c0ca3e3..ee7e6ec3878d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -353,13 +353,6 @@ static inline struct rb_node *tree_search(struct 
extent_io_tree *tree,
return tree_search_for_insert(tree, offset, NULL, NULL);
 }
 
-static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
-struct extent_state *other)
-{
-   if (tree->ops && tree->ops->merge_extent_hook)
-   tree->ops->merge_extent_hook(tree->private_data, new, other);
-}
-
 /*
  * utility function to look for merge candidates inside a given range.
  * Any extents with matching state are merged together into a single
@@ -383,7 +376,9 @@ static void merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->end == state->start - 1 &&
other->state == state->state) {
-   merge_cb(tree, state, other);
+   if (tree->private_data)
+   btrfs_merge_delalloc_extent(tree->private_data,
+   state, other);
state->start = other->start;
rb_erase(&other->rb_node, &tree->state);
RB_CLEAR_NODE(&other->rb_node);
@@ -395,7 +390,9 @@ static void merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->start == state->end + 1 &&
other->state == state->state) {
-   merge_cb(tree, state, other);
+   if (tree->private_data)
+   btrfs_merge_delalloc_extent(tree->private_data,
+   state, other);
state->end = other->end;
rb_erase(&other->rb_node, &tree->state);
RB_CLEAR_NODE(&other->rb_node);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a3a3302f3625..7d181a378d90 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -106,9 +106,6 @@ struct extent_io_ops {
/*
 * Optional hooks, called if the pointer is not NULL
 */
-   void (*merge_extent_hook)(void *private_data,
- struct extent_state *new,
- struct extent_state *other);
void (*split_extent_hook)(void *private_data,
  struct extent_state *orig, u64 split);
 };
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 41c655cd9d23..448fad3a8b74 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1627,7 +1627,7 @@ static void btrfs_split_extent_hook(void *private_data,
u64 new_size;
 
/*
-* See the explanation in btrfs_merge_extent_hook, the same
+* See the explanation in btrfs_merge_delalloc_extent, the same
 * applies here, just in reverse.
 */
new_size = orig->end - split + 1;
@@ -1644,16 +1644,13 @@ static void btrfs_split_extent_hook(void *private_data,
 }
 
 /*
- * extent_io.c merge_extent_hook, used to track merged delayed allocation
- * extents so we can keep track of new extents that are just merged onto old
- * extents, such as when we are doing sequential writes, so we can properly
- * account for the metadata space we'll need.
+ * Handle merged delayed allocation extents so we can keep track of new extents
+ * that are just merged onto old extents, such as when we are doing sequential
+ * writes, so we can properly account for the 

[PATCH 8/8] btrfs: Remove extent_io_ops::split_extent_hook callback

2018-11-01 Thread Nikolay Borisov
This is the counterpart to merge_extent_hook. Similarly, it's used only for
data/freespace inodes, so let's remove it, rename it and call it directly
where necessary. No functional changes.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/ctree.h |  2 ++
 fs/btrfs/extent_io.c | 10 ++
 fs/btrfs/extent_io.h |  6 --
 fs/btrfs/inode.c |  8 ++--
 4 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1f0025a0268b..c20ef001f9af 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3156,6 +3156,8 @@ void btrfs_clear_delalloc_extent(struct inode *inode,
 struct extent_state *state, unsigned *bits);
 void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
 struct extent_state *other);
+void btrfs_split_delalloc_extent(struct inode *inode,
+struct extent_state *orig, u64 split);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 size_t size, struct bio *bio,
 unsigned long bio_flags);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ee7e6ec3878d..8c9073f28f6f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -443,13 +443,6 @@ static int insert_state(struct extent_io_tree *tree,
return 0;
 }
 
-static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
-u64 split)
-{
-   if (tree->ops && tree->ops->split_extent_hook)
-   tree->ops->split_extent_hook(tree->private_data, orig, split);
-}
-
 /*
  * split a given extent state struct in two, inserting the preallocated
  * struct 'prealloc' as the newly created second half.  'split' indicates an
@@ -469,7 +462,8 @@ static int split_state(struct extent_io_tree *tree, struct 
extent_state *orig,
 {
struct rb_node *node;
 
-   split_cb(tree, orig, split);
+   if (tree->private_data)
+   btrfs_split_delalloc_extent(tree->private_data, orig, split);
 
prealloc->start = orig->start;
prealloc->end = split - 1;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7d181a378d90..d96fd534f144 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -102,12 +102,6 @@ struct extent_io_ops {
struct page *page, u64 start, u64 end,
int mirror);
int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
-
-   /*
-* Optional hooks, called if the pointer is not NULL
-*/
-   void (*split_extent_hook)(void *private_data,
- struct extent_state *orig, u64 split);
 };
 
 struct extent_io_tree {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 448fad3a8b74..c38733c44b98 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1611,10 +1611,9 @@ int run_delalloc_range(void *private_data, struct page 
*locked_page,
return ret;
 }
 
-static void btrfs_split_extent_hook(void *private_data,
-   struct extent_state *orig, u64 split)
+void btrfs_split_delalloc_extent(struct inode *inode,
+struct extent_state *orig, u64 split)
 {
-   struct inode *inode = private_data;
u64 size;
 
/* not delalloc, ignore it */
@@ -10500,9 +10499,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = 
{
.submit_bio_hook = btrfs_submit_bio_hook,
.readpage_end_io_hook = btrfs_readpage_end_io_hook,
.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
-
-   /* optional callbacks */
-   .split_extent_hook = btrfs_split_extent_hook,
 };
 
 /*
-- 
2.7.4



[PATCH 1/8] btrfs: Remove extent_io_ops::fill_delalloc

2018-11-01 Thread Nikolay Borisov
This callback is called only from writepage_delalloc which in turn
is guaranteed to be called from the data page writeout path. So there
is no reason to have the call to this function indirected
via the extent_io_ops structure. This patch removes the callback
definition, exports the function and calls it directly. No functional
changes.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/ctree.h |  3 +++
 fs/btrfs/extent_io.c | 14 ++
 fs/btrfs/extent_io.h |  5 -
 fs/btrfs/inode.c | 10 +-
 4 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 68ca41dbbef3..dbeb5b2486d5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3186,6 +3186,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
+int run_delalloc_range(void *private_data, struct page *locked_page, u64 start,
+  u64 end, int *page_started, unsigned long *nr_written,
+  struct writeback_control *wbc);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6877a74c7469..2e6191aa25f3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3205,7 +3205,7 @@ static void update_nr_written(struct writeback_control 
*wbc,
 /*
  * helper for __extent_writepage, doing all of the delayed allocation setup.
  *
- * This returns 1 if our fill_delalloc function did all the work required
+ * This returns 1 if run_delalloc_range function did all the work required
  * to write the page (copy into inline extent).  In this case the IO has
  * been started and the page is already unlocked.
  *
@@ -3226,7 +3226,7 @@ static noinline_for_stack int writepage_delalloc(struct 
inode *inode,
int ret;
int page_started = 0;
 
-   if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
+   if (epd->extent_locked)
return 0;
 
while (delalloc_end < page_end) {
@@ -3239,15 +3239,13 @@ static noinline_for_stack int writepage_delalloc(struct 
inode *inode,
delalloc_start = delalloc_end + 1;
continue;
}
-   ret = tree->ops->fill_delalloc(inode, page,
-  delalloc_start,
-  delalloc_end,
-  &page_started,
-  nr_written, wbc);
+   ret = run_delalloc_range(inode, page, delalloc_start,
+delalloc_end, &page_started,
+nr_written, wbc);
/* File system has been set read-only */
if (ret) {
SetPageError(page);
-   /* fill_delalloc should be return < 0 for error
+   /* run_delalloc_range should return < 0 for error
 * but just in case, we use > 0 here meaning the
 * IO is started, so we don't want to return > 0
 * unless things are going well.
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 369daa5d4f73..ca48187b86ba 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -106,11 +106,6 @@ struct extent_io_ops {
/*
 * Optional hooks, called if the pointer is not NULL
 */
-   int (*fill_delalloc)(void *private_data, struct page *locked_page,
-u64 start, u64 end, int *page_started,
-unsigned long *nr_written,
-struct writeback_control *wbc);
-
int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
  struct extent_state *state, int uptodate);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f4d31fd62eed..aa6d6b64a70a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -109,8 +109,8 @@ static void __endio_write_update_ordered(struct inode 
*inode,
  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
  * and EXTENT_DELALLOC simultaneously, because that causes the reserved 
metadata
  * to be released, which we want to happen only when finishing the ordered
- * extent (btrfs_finish_ordered_io()). Also note that the caller of the
- * fill_delalloc() callback already does proper cleanup for the first page of
+ * extent (btrfs_finish_ordered_io()). Also note that the caller of
+ * run_delalloc_range already does proper cleanup for the first page of
  * the range, that is, it invokes the 

[PATCH 6/8] btrfs: Remove extent_io_ops::clear_bit_hook callback

2018-11-01 Thread Nikolay Borisov
This is the counterpart to the ex-set_bit_hook (now btrfs_set_delalloc_extent).
Similar to what was done before, remove clear_bit_hook and rename the
function. No functional changes.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/ctree.h |  2 ++
 fs/btrfs/extent_io.c | 12 
 fs/btrfs/extent_io.h |  3 ---
 fs/btrfs/inode.c | 13 ++---
 4 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8fd9db7aa8de..49d52180e14c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3152,6 +3152,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle 
*trans,
 u64 new_dirid);
 void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
   unsigned *bits);
+void btrfs_clear_delalloc_extent(struct inode *inode,
+struct extent_state *state, unsigned *bits);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 size_t size, struct bio *bio,
 unsigned long bio_flags);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 31bdc596e623..a3081c0ca3e3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -404,13 +404,6 @@ static void merge_state(struct extent_io_tree *tree,
}
 }
 
-static void clear_state_cb(struct extent_io_tree *tree,
-  struct extent_state *state, unsigned *bits)
-{
-   if (tree->ops && tree->ops->clear_bit_hook)
-   tree->ops->clear_bit_hook(tree->private_data, state, bits);
-}
-
 static void set_state_bits(struct extent_io_tree *tree,
   struct extent_state *state, unsigned *bits,
   struct extent_changeset *changeset);
@@ -525,7 +518,10 @@ static struct extent_state *clear_state_bit(struct 
extent_io_tree *tree,
WARN_ON(range > tree->dirty_bytes);
tree->dirty_bytes -= range;
}
-   clear_state_cb(tree, state, bits);
+
+   if (tree->private_data)
+   btrfs_clear_delalloc_extent(tree->private_data, state, bits);
+
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
state->state &= ~bits_to_clear;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b3235d46b5c3..a3a3302f3625 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -106,9 +106,6 @@ struct extent_io_ops {
/*
 * Optional hooks, called if the pointer is not NULL
 */
-   void (*clear_bit_hook)(void *private_data,
-   struct extent_state *state,
-   unsigned *bits);
void (*merge_extent_hook)(void *private_data,
  struct extent_state *new,
  struct extent_state *other);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d1c56c56a413..41c655cd9d23 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1808,14 +1808,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, 
struct extent_state *state,
 }
 
 /*
- * extent_io.c clear_bit_hook, see set_bit_hook for why
+ * Once a range is no longer delalloc this function ensures proper accounting
+ * happens.
  */
-static void btrfs_clear_bit_hook(void *private_data,
-struct extent_state *state,
-unsigned *bits)
+void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
+struct extent_state *state, unsigned *bits)
 {
-   struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
-   struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
+   struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+   struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
u64 len = state->end + 1 - state->start;
u32 num_extents = count_max_extents(len);
 
@@ -10505,7 +10505,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = 
{
.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
 
/* optional callbacks */
-   .clear_bit_hook = btrfs_clear_bit_hook,
.merge_extent_hook = btrfs_merge_extent_hook,
.split_extent_hook = btrfs_split_extent_hook,
 };
-- 
2.7.4



[PATCH 4/8] btrfs: Remove extent_io_ops::check_extent_io_range callback

2018-11-01 Thread Nikolay Borisov
This callback was only used in debug builds by btrfs_leak_debug_check.
A better approach is to move its implementation into
btrfs_leak_debug_check and ensure the latter is only executed for
extent trees which have ->private_data set, i.e. those that belong to a
data inode and not the btree one. No functional changes.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/extent_io.c | 15 ---
 fs/btrfs/extent_io.h |  2 --
 fs/btrfs/inode.c | 15 ---
 3 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 05cbd6b3aeec..de171ae2ef20 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -89,9 +89,18 @@ void btrfs_leak_debug_check(void)
 static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree, u64 start, u64 end)
 {
-   if (tree->ops && tree->ops->check_extent_io_range)
-   tree->ops->check_extent_io_range(tree->private_data, caller,
-start, end);
+   struct inode *inode = tree->private_data;
+   u64 isize;
+
+   if (!inode)
+   return;
+
+   isize = i_size_read(inode);
+   if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+   btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+   "%s: ino %llu isize %llu odd range [%llu,%llu]",
+   caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+   }
 }
 #else
 #define btrfs_leak_debug_add(new, head)do {} while (0)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 415ea7c2b058..3cb84a0fbaab 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -116,8 +116,6 @@ struct extent_io_ops {
  struct extent_state *other);
void (*split_extent_hook)(void *private_data,
  struct extent_state *orig, u64 split);
-   void (*check_extent_io_range)(void *private_data, const char *caller,
- u64 start, u64 end);
 };
 
 struct extent_io_tree {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9e0f2728b9de..7110686e9b0e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -10447,20 +10447,6 @@ static int btrfs_readpage_io_failed_hook(struct page 
*page, int failed_mirror)
return -EAGAIN;
 }
 
-static void btrfs_check_extent_io_range(void *private_data, const char *caller,
-   u64 start, u64 end)
-{
-   struct inode *inode = private_data;
-   u64 isize;
-
-   isize = i_size_read(inode);
-   if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
-   btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
-   "%s: ino %llu isize %llu odd range [%llu,%llu]",
-   caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
-   }
-}
-
 void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
struct inode *inode = tree->private_data;
@@ -10526,7 +10512,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = 
{
.clear_bit_hook = btrfs_clear_bit_hook,
.merge_extent_hook = btrfs_merge_extent_hook,
.split_extent_hook = btrfs_split_extent_hook,
-   .check_extent_io_range = btrfs_check_extent_io_range,
 };
 
 /*
-- 
2.7.4



[PATCH 0/8] Removal of optional hooks from struct extent_io_ops

2018-11-01 Thread Nikolay Borisov
extent_io_ops has a set of 8 optional hooks which are set only for data and
freespace inodes. The majority of them actually deal with delalloc in one way
or another. Inspecting the code, it transpired that there is actually no need
to have them as function pointers in a structure. Data/freespace inodes can
easily be distinguished from the btree_inode (which is pending removal anyway)
by inspecting extent_io_tree::private_data, a member that is set by all
data/freespace inodes. This series exploits this fact to remove the majority
of the hooks. Others, such as fill_delalloc, writepage_start_hook and
writepage_end_io_hook, are always called from the data writeout path and can
be called directly without having to check whether the respective pointers are
set.

This series has undergone multiple xfstests runs and no regressions were
identified. Additionally, all functions except run_delalloc_range are given
more descriptive names that reflect their actual intent.
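
As a minimal sketch of the resulting pattern (reconstructed from the
individual patches, not verbatim), each optional hook goes from an indirect
call through extent_io_ops to a direct call gated on the tree's private_data:

	/* before: indirect call through an optional extent_io_ops pointer */
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->private_data, new, other);

	/* after: direct call; private_data is only set for data/freespace inodes */
	if (tree->private_data)
		btrfs_merge_delalloc_extent(tree->private_data, new, other);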

Nikolay Borisov (8):
  btrfs: Remove extent_io_ops::fill_delalloc
  btrfs: Remove extent_io_ops::writepage_start_hook
  btrfs: Remove extent_io_ops::writepage_end_io_hook
  btrfs: Remove extent_io_ops::check_extent_io_range callback
  btrfs: Remove extent_io_ops::set_bit_hook extent_io callback
  btrfs: Remove extent_io_ops::clear_bit_hook callback
  btrfs: Remove extent_io_ops::merge_extent_hook callback
  btrfs: Remove extent_io_ops::split_extent_hook callback

 fs/btrfs/compression.c   |  11 ++--
 fs/btrfs/ctree.h |  15 +
 fs/btrfs/extent_io.c | 131 +--
 fs/btrfs/extent_io.h |  24 ---
 fs/btrfs/inode.c |  86 +
 fs/btrfs/tests/extent-io-tests.c |   2 +-
 6 files changed, 105 insertions(+), 164 deletions(-)

-- 
2.7.4



BTRFS RAID5 disk failed while balancing

2018-11-01 Thread Oliver R.

If you clicked on the link to this topic: Thank you!

I have the following setup:

6x 500GB HDD-Drives
1x 32GB NVME-SSD (Intel Optane)

I used bcache to set up the SSD as the caching device and the six other
drives as backing devices. After all that was in place, I formatted the
six HDDs with btrfs in RAID5. Everything has worked as expected for the
last 7 months.


By now I have six spare 2TB HDD drives and I want to replace the old
500GB disks one by one. So I started with the first one by deleting it
from the btrfs filesystem. This worked fine, I had no issues there. After
that I cleanly detached the empty disk from bcache, still everything was
fine, so I removed it. Here are the command lines for this:


sudo btrfs device delete /dev/bcacheX /media/raid
cat /sys/block/bcacheX/bcache/state
cat /sys/block/bcacheX/bcache/dirty_data
sudo sh -c "echo 1 > /sys/block/bcacheX/bcache/detach"
cat /sys/block/bcacheX/bcache/state

After that I installed one of the 2TB drives, attached it to bcache and
added it to the RAID. The next step was to balance the data over to the
new drive. Please see the command lines:


sudo make-bcache -B /dev/sdY
sudo sh -c "echo '60a63f7c-2e68-4503-9f25-71b6b00e47b2' > 
/sys/block/bcacheY/bcache/attach"

sudo sh -c "echo writeback > /sys/block/bcacheY/bcache/cache_mode"
sudo btrfs device add /dev/bcacheY /media/raid
sudo btrfs fi ba start /media/raid/

The balance worked fine until ~164GB had been written to the new drive,
which is about 50% of the data to be balanced. Then write errors suddenly
appeared on the disk. The RAID slowly became unusable (I was running 3 VMs
off the RAID while balancing). I think it kept working for some time due
to the SSD committing the writes. At some point the balancing stopped and
I was only able to kill the VMs. I checked the I/O on the disks and the
SSD spit out a constant 1.2 GB/s of reads. I think bcache somehow delivered
data to btrfs and it got rejected there and requested again, but this is
just a guess. Anyway, I ended up resetting the host and I physically
disconnected the broken disk and put a new one in its place. I also created
a bcache backing device on it and issued the following command to
replace the faulty disk:


sudo btrfs replace start -r 7 /dev/bcache5 /media/raid

The filesystem needs to be mounted read/write for this command to work.
It is now doing its work, but very slowly, at about 3.5 MB/s. Unfortunately
the syslog reports a lot of these messages:


...
scrub_missing_raid56_worker: 62 callbacks suppressed
BTRFS error (device bcache0): failed to rebuild valid logical 
4929143865344 for dev (null)

...
BTRFS error (device bcache0): failed to rebuild valid logical 
4932249866240 for dev (null)

scrub_missing_raid56_worker: 1 callbacks suppressed
BTRFS error (device bcache0): failed to rebuild valid logical 
4933254250496 for dev (null)



If I try to read a file from the filesystem, the read fails
with a simple I/O error and the syslog shows entries similar
to this:


BTRFS warning (device bcache0): csum failed root 5 ino 1143 off 
7274496 csum 0xf554 expected csum 0x6340b527 mirror 2


So far, so good (or bad). It has taken about 6 hours for 4.3% of the
replacement so far. No read or write errors have been reported for the
replacement procedure ("btrfs replace status"). I will let it do its
thing until it finishes. Before the first 2TB disk failed, 164 GB of data
had been written to it according to "btrfs filesystem show". If I check the
amount of data written to the new drive, the 4.3% represent about 82 GB
(according to /proc/diskstats). I don't know how to interpret this, but
anyway.


And now finally my questions: If the replace command finishes
successfully, what should I do next? A scrub? A balance? Another backup? ;-)
Do you see anything that I did wrong in this procedure? Do the
warnings and errors reported by btrfs mean that the data is lost? :-(


Here is some additional info (**edited**):

$ sudo btrfs fi sh
Label: none  uuid: 9f765025-5354-47e4-afcc-a601b2a52703
Total devices 7 FS bytes used 1.56TiB
devid0 size 1.82TiB used 164.03GiB path /dev/bcache5
devid1 size 465.76GiB used 360.03GiB path /dev/bcache4
devid3 size 465.76GiB used 360.00GiB path /dev/bcache3
devid4 size 465.76GiB used 359.03GiB path /dev/bcache1
devid5 size 465.76GiB used 360.00GiB path /dev/bcache0
devid6 size 465.76GiB used 360.03GiB path /dev/bcache2
*** Some devices missing

$ sudo btrfs dev stats /media/raid/
[/dev/bcache5].write_io_errs0
[/dev/bcache5].read_io_errs 0
[/dev/bcache5].flush_io_errs0
[/dev/bcache5].corruption_errs  0
[/dev/bcache5].generation_errs  0
[/dev/bcache4].write_io_errs0
[/dev/bcache4].read_io_errs 0
[/dev/bcache4].flush_io_errs0
[/dev/bcache4].corruption_errs  0

Re: [PATCH] btrfs: use tagged writepage to mitigate livelock of snapshot

2018-11-01 Thread Nikolay Borisov



On 1.11.18 at 8:49, Ethan Lien wrote:
> Snapshot is expected to be fast. But if there are writers steadily
> create dirty pages in our subvolume, the snapshot may take a very long
> time to complete. To fix the problem, we use tagged writepage for snapshot
> flusher as we do in the generic write_cache_pages(), so we can ommit pages
> dirtied after the snapshot command.
> 
> We do a simple snapshot speed test on a Intel D-1531 box:
> 
> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
> --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
> 
> original: 1m58sec
> patched:  6.54sec
> 
> This is the best case for this patch since for a sequential write case,
> we omit nearly all pages dirtied after the snapshot command.
> 
> For a multi writers, random write test:
> 
> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
> --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
> 
> original: 15.83sec
> patched:  10.35sec
> 
> The improvement is less compared with the sequential write case, since
> we omit only half of the pages dirtied after snapshot command.
> 
> Signed-off-by: Ethan Lien 
> ---
>  fs/btrfs/btrfs_inode.h |  1 +
>  fs/btrfs/ctree.h   |  2 +-
>  fs/btrfs/extent_io.c   | 16 ++--
>  fs/btrfs/inode.c   | 10 ++
>  fs/btrfs/ioctl.c   |  2 +-
>  5 files changed, 23 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 1343ac57b438..4182bfbb56be 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -29,6 +29,7 @@ enum {
>   BTRFS_INODE_IN_DELALLOC_LIST,
>   BTRFS_INODE_READDIO_NEED_LOCK,
>   BTRFS_INODE_HAS_PROPS,
> + BTRFS_INODE_TAGGED_FLUSH,
>  };
>  
>  /* in memory btrfs inode */
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 2cddfe7806a4..82682da5a40d 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct 
> btrfs_trans_handle *trans,
>  struct inode *inode, u64 new_size,
>  u32 min_type);
>  
> -int btrfs_start_delalloc_inodes(struct btrfs_root *root);
> +int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
>  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
>  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
> unsigned int extra_bits,
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 4dd6faab02bb..c21d8a0e010a 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -3928,12 +3928,24 @@ static int extent_write_cache_pages(struct 
> address_space *mapping,
>   range_whole = 1;
>   scanned = 1;
>   }
> - if (wbc->sync_mode == WB_SYNC_ALL)
> +
> + /*
> +  * We don't care if we are the one who set BTRFS_INODE_TAGGED_FLUSH in
> +  * start_delalloc_inodes(). We do the tagged writepage as long as we are
> +  * the first one who do the filemap_flush() on this inode.
> +  */
> + if (range_whole && wbc->nr_to_write == LONG_MAX &&
> + wbc->sync_mode == WB_SYNC_NONE &&
> + test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
> + &BTRFS_I(inode)->runtime_flags))
Actually this check can be simplified to:

range_whole && test_and_clear_bit. filemap_flush triggers range_whole =
1, and then you care about TAGGED_FLUSH (or whatever it's going to be named)
being set. The nr_to_write and sync_mode checks just make it a tad more
difficult to reason about the code.
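
A minimal sketch of that simplification (keeping the flag name from the
posted patch, which may still be renamed):

	if (range_whole &&
	    test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
			       &BTRFS_I(inode)->runtime_flags))
		wbc->tagged_writepages = 1;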


> + wbc->tagged_writepages = 1;
> +
> + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
>   tag = PAGECACHE_TAG_TOWRITE;
>   else
>   tag = PAGECACHE_TAG_DIRTY;
>  retry:
> - if (wbc->sync_mode == WB_SYNC_ALL)
> + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
>   tag_pages_for_writeback(mapping, index, end);
>   done_index = index;
>   while (!done && !nr_to_write_done && (index <= end) &&
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 3ea5339603cf..3df3cbbe91c5 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -9975,7 +9975,7 @@ static struct btrfs_delalloc_work 
> *btrfs_alloc_delalloc_work(struct inode *inode
>   * some fairly slow code that needs optimization. This walks the list
>   * of all the inodes with pending delalloc and forces them to disk.
>   */
> -static int start_delalloc_inodes(struct btrfs_root *root, int nr)
> +static int start_delalloc_inodes(struct btrfs_root *root, int nr, int 
> snapshot)
>  {
>   struct btrfs_inode *binode;
>   struct inode 

Re: [PATCH] btrfs: use tagged writepage to mitigate livelock of snapshot

2018-11-01 Thread ethanlien

Nikolay Borisov wrote on 2018-11-01 18:01:

On 1.11.18 at 11:56, ethanlien wrote:

Nikolay Borisov wrote on 2018-11-01 16:59:

On 1.11.18 at 8:49, Ethan Lien wrote:

Snapshot is expected to be fast. But if there are writers steadily
create dirty pages in our subvolume, the snapshot may take a very 
long

time to complete. To fix the problem, we use tagged writepage for
snapshot
flusher as we do in the generic write_cache_pages(), so we can ommit
pages
dirtied after the snapshot command.


So the gist of this commit really is that you detect when 
filemap_flush
has been called from snapshot context and tag all pages at *that* 
time

as PAGECACHE_TAG_TOWRITE and then write them, ignoring any pages that
might have been dirtied beforehand. Your description kind of dances
around this idea without really saying it explicitly. Those semantics
make sense, however i'd like to be stated more explicitly in the 
change

 log.

However, this is done at the expense of consistency, so we have 
faster
snapshots but depending the file which are stored in them they might 
be

broken (i.e a database) since they will be missing pages.



tag_pages_for_writeback() will tag all pages that are currently marked
PAGECACHE_TAG_DIRTY. If a dirty page gets missed here, it means someone
initiated the flush before us, so we will wait for that dirty page to
complete in create_snapshot() -> btrfs_wait_ordered_extents().


And what happens in the scenario where you have  someone dirtying 
pages,

you issue the snapshot ioctl, mark all currently dirtied pages as
TOWRITE and then you have more delalloc pages being dirtied following
initial call to tag_pages_for_writeback , you will miss those, no ?



We don't freeze the filesystem when doing a snapshot, so there is no
guarantee that a page dirtied right after we send the ioctl will be
included in the snapshot. If a page is dirtied while we scan the dirty
pages and its offset is before our current index, we miss it in our
current snapshot implementation too.





We do a simple snapshot speed test on a Intel D-1531 box:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
--direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
--filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 
5;

time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 1m58sec
patched:  6.54sec


Nice.



This is the best case for this patch since for a sequential write 
case,

we omit nearly all pages dirtied after the snapshot command.

For a multi writers, random write test:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
--direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
--filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 
5;

time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 15.83sec
patched:  10.35sec

The improvement is less compared with the sequential write case, 
since

we omit only half of the pages dirtied after snapshot command.

Signed-off-by: Ethan Lien 
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/ctree.h   |  2 +-
 fs/btrfs/extent_io.c   | 16 ++--
 fs/btrfs/inode.c   | 10 ++
 fs/btrfs/ioctl.c   |  2 +-
 5 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..4182bfbb56be 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -29,6 +29,7 @@ enum {
 BTRFS_INODE_IN_DELALLOC_LIST,
 BTRFS_INODE_READDIO_NEED_LOCK,
 BTRFS_INODE_HAS_PROPS,
+    BTRFS_INODE_TAGGED_FLUSH,


IMO the name of the flag should be different. Something like "lie 
flush"


I meant "lite flush" or "strict flush" or something like that, alluding
to the fact we are restricting the set of pages being flushed.

or whatever. Because all flushes are tagged i.e TAG_DIRTY or 
TAG_TOWRITE.




OK I'll use another name.


 };

 /* in memory btrfs inode */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..82682da5a40d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct
btrfs_trans_handle *trans,
    struct inode *inode, u64 new_size,
    u32 min_type);

-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int 
nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 
end,

   unsigned int extra_bits,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dd6faab02bb..c21d8a0e010a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3928,12 +3928,24 @@ static int extent_write_cache_pages(struct
address_space *mapping,
 range_whole = 1;
 scanned = 1;
 }
-    if (wbc->sync_mode == WB_SYNC_ALL)
+
+    /*
+ * We don't care if we are the one who set
BTRFS_INODE_TAGGED_FLUSH in
+ * start_delalloc_inodes(). We do the tagged 

Re: [PATCH v2] Btrfs: fix missing delayed iputs on unmount

2018-11-01 Thread David Sterba
On Wed, Oct 31, 2018 at 10:06:08AM -0700, Omar Sandoval wrote:
> From: Omar Sandoval 
> 
> There's a race between close_ctree() and cleaner_kthread().
> close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
> sees it set, but this is racy; the cleaner might have already checked
> the bit and could be cleaning stuff. In particular, if it deletes unused
> block groups, it will create delayed iputs for the free space cache
> inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
> longer running delayed iputs after a commit. Therefore, if the cleaner
> creates more delayed iputs after delayed iputs are run in
> btrfs_commit_super(), we will leak inodes on unmount and get a busy
> inode crash from the VFS.
> 
> Fix it by parking the cleaner

Ouch, that's IMO the wrong way to fix it. The bug is on a higher level,
we're missing a commit or a cleanup of data structures. Messing with the
state of a thread would be the last thing I'd try, after proving that it's
not possible to fix it in the logic of btrfs itself.

The shutdown sequence in close_ctree is quite tricky and we've had bugs
there. The interdependencies of threads, data structures and other
subsystems cannot have loops for which we could not find an ordering that
will not leak something.

It's not a big problem if some step is done more than once, like
committing or cleaning up some other structures, if we know that
it could create new ones.
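
Restating the race from the quoted changelog as a rough timeline (simplified,
for orientation only):

	unmount path                          cleaner_kthread()
	------------                          -----------------
	                                      sees btrfs_fs_closing() == 0,
	                                      keeps cleaning
	close_ctree()
	  sets the closing flag
	  btrfs_commit_super()
	    runs delayed iputs
	                                      deletes unused block groups,
	                                      queues new delayed iputs
	  (delayed iputs never run again)
	VFS hits busy inodes on unmount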


Re: [PATCH] btrfs: use tagged writepage to mitigate livelock of snapshot

2018-11-01 Thread Nikolay Borisov



On 1.11.18 at 11:56, ethanlien wrote:
> Nikolay Borisov wrote on 2018-11-01 16:59:
>> On 1.11.18 at 8:49, Ethan Lien wrote:
>>> Snapshot is expected to be fast. But if there are writers steadily
>>> create dirty pages in our subvolume, the snapshot may take a very long
>>> time to complete. To fix the problem, we use tagged writepage for
>>> snapshot
>>> flusher as we do in the generic write_cache_pages(), so we can ommit
>>> pages
>>> dirtied after the snapshot command.
>>
>> So the gist of this commit really is that you detect when filemap_flush
>> has been called from snapshot context and tag all pages at *that* time
>> as PAGECACHE_TAG_TOWRITE and then write them, ignoring any pages that
>> might have been dirtied beforehand. Your description kind of dances
>> around this idea without really saying it explicitly. Those semantics
>> make sense, however i'd like to be stated more explicitly in the change
>>  log.
>>
>> However, this is done at the expense of consistency, so we have faster
>> snapshots but depending the file which are stored in them they might be
>> broken (i.e a database) since they will be missing pages.
>>
> 
> tag_pages_for_writeback() will tag all pages with PAGECACHE_TAG_DIRTY.
> If a dirty
> page get missed here, it means someone has initiated the flush before
> us, so we
> will wait that dirty page to complete in create_snapshot() ->
> btrfs_wait_ordered_extents().

And what happens in the scenario where you have  someone dirtying pages,
you issue the snapshot ioctl, mark all currently dirtied pages as
TOWRITE and then you have more delalloc pages being dirtied following
initial call to tag_pages_for_writeback , you will miss those, no ?

> 
>>>
>>> We do a simple snapshot speed test on a Intel D-1531 box:
>>>
>>> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
>>> --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
>>> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
>>> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
>>>
>>> original: 1m58sec
>>> patched:  6.54sec
>>
>> Nice.
>>
>>>
>>> This is the best case for this patch since for a sequential write case,
>>> we omit nearly all pages dirtied after the snapshot command.
>>>
>>> For a multi writers, random write test:
>>>
>>> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
>>> --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
>>> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
>>> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
>>>
>>> original: 15.83sec
>>> patched:  10.35sec
>>>
>>> The improvement is less compared with the sequential write case, since
>>> we omit only half of the pages dirtied after snapshot command.
>>>
>>> Signed-off-by: Ethan Lien 
>>> ---
>>>  fs/btrfs/btrfs_inode.h |  1 +
>>>  fs/btrfs/ctree.h   |  2 +-
>>>  fs/btrfs/extent_io.c   | 16 ++--
>>>  fs/btrfs/inode.c   | 10 ++
>>>  fs/btrfs/ioctl.c   |  2 +-
>>>  5 files changed, 23 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
>>> index 1343ac57b438..4182bfbb56be 100644
>>> --- a/fs/btrfs/btrfs_inode.h
>>> +++ b/fs/btrfs/btrfs_inode.h
>>> @@ -29,6 +29,7 @@ enum {
>>>  BTRFS_INODE_IN_DELALLOC_LIST,
>>>  BTRFS_INODE_READDIO_NEED_LOCK,
>>>  BTRFS_INODE_HAS_PROPS,
>>> +    BTRFS_INODE_TAGGED_FLUSH,
>>
>> IMO the name of the flag should be different. Something like "lie flush"

I meant "lite flush" or "strict flush" or something like that, alluding
to the fact we are restricting the set of pages being flushed.

>> or whatever. Because all flushes are tagged i.e TAG_DIRTY or TAG_TOWRITE.
>>
> 
> OK I'll use another name.
> 
>>>  };
>>>
>>>  /* in memory btrfs inode */
>>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>>> index 2cddfe7806a4..82682da5a40d 100644
>>> --- a/fs/btrfs/ctree.h
>>> +++ b/fs/btrfs/ctree.h
>>> @@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct
>>> btrfs_trans_handle *trans,
>>>     struct inode *inode, u64 new_size,
>>>     u32 min_type);
>>>
>>> -int btrfs_start_delalloc_inodes(struct btrfs_root *root);
>>> +int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
>>>  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
>>>  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
>>>    unsigned int extra_bits,
>>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>>> index 4dd6faab02bb..c21d8a0e010a 100644
>>> --- a/fs/btrfs/extent_io.c
>>> +++ b/fs/btrfs/extent_io.c
>>> @@ -3928,12 +3928,24 @@ static int extent_write_cache_pages(struct
>>> address_space *mapping,
>>>  range_whole = 1;
>>>  scanned = 1;
>>>  }
>>> -    if (wbc->sync_mode == WB_SYNC_ALL)
>>> +
>>> +    /*
>>> + * We don't care if we are the one who set
>>> BTRFS_INODE_TAGGED_FLUSH in
>>> + * start_delalloc_inodes(). We do the 

Re: [PATCH v2] btrfs: add zstd compression level support

2018-11-01 Thread Timofey Titovets
On Wed, 31 Oct 2018 at 21:12, Nick Terrell wrote:
>
> From: Jennifer Liu 
>
> Adds zstd compression level support to btrfs. Zstd requires
> different amounts of memory for each level, so the design had
> to be modified to allow set_level() to allocate memory. We
> preallocate one workspace of the maximum size to guarantee
> forward progress. This feature is expected to be useful for
> read-mostly filesystems, or when creating images.
>
> Benchmarks run in qemu on Intel x86 with a single core.
> The benchmark measures the time to copy the Silesia corpus [0] to
> a btrfs filesystem 10 times, then read it back.
>
> The two important things to note are:
> - The decompression speed and memory remains constant.
>   The memory required to decompress is the same as level 1.
> - The compression speed and ratio will vary based on the source.
>
> Level   Ratio   Compression   Decompression   Compression Memory
> 1       2.59    153 MB/s      112 MB/s        0.8 MB
> 2       2.67    136 MB/s      113 MB/s        1.0 MB
> 3       2.72    106 MB/s      115 MB/s        1.3 MB
> 4       2.78     86 MB/s      109 MB/s        0.9 MB
> 5       2.83     69 MB/s      109 MB/s        1.4 MB
> 6       2.89     53 MB/s      110 MB/s        1.5 MB
> 7       2.91     40 MB/s      112 MB/s        1.4 MB
> 8       2.92     34 MB/s      110 MB/s        1.8 MB
> 9       2.93     27 MB/s      109 MB/s        1.8 MB
> 10      2.94     22 MB/s      109 MB/s        1.8 MB
> 11      2.95     17 MB/s      114 MB/s        1.8 MB
> 12      2.95     13 MB/s      113 MB/s        1.8 MB
> 13      2.95     10 MB/s      111 MB/s        2.3 MB
> 14      2.99      7 MB/s      110 MB/s        2.6 MB
> 15      3.03      6 MB/s      110 MB/s        2.6 MB
>
> [0] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
>
> Signed-off-by: Jennifer Liu 
> Signed-off-by: Nick Terrell 
> Reviewed-by: Omar Sandoval 
> ---
> v1 -> v2:
> - Don't reflow the unchanged line.
>
>  fs/btrfs/compression.c | 169 +
>  fs/btrfs/compression.h |  18 +++--
>  fs/btrfs/lzo.c |   5 +-
>  fs/btrfs/super.c   |   7 +-
>  fs/btrfs/zlib.c|  33 
>  fs/btrfs/zstd.c|  74 +-
>  6 files changed, 202 insertions(+), 104 deletions(-)
>
> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> index 2955a4ea2fa8..b46652cb653e 100644
> --- a/fs/btrfs/compression.c
> +++ b/fs/btrfs/compression.c
> @@ -822,9 +822,12 @@ void __init btrfs_init_compress(void)
>
> /*
>  * Preallocate one workspace for each compression type so
> -* we can guarantee forward progress in the worst case
> +* we can guarantee forward progress in the worst case.
> +* Provide the maximum compression level to guarantee large
> +* enough workspace.
>  */
> -   workspace = btrfs_compress_op[i]->alloc_workspace();
> +   workspace = btrfs_compress_op[i]->alloc_workspace(
> +   btrfs_compress_op[i]->max_level);
> if (IS_ERR(workspace)) {
> pr_warn("BTRFS: cannot preallocate compression 
> workspace, will try later\n");
> } else {
> @@ -835,23 +838,78 @@ void __init btrfs_init_compress(void)
> }
>  }
>
> +/*
> + * put a workspace struct back on the list or free it if we have enough
> + * idle ones sitting around
> + */
> +static void __free_workspace(int type, struct list_head *workspace,
> +bool heuristic)
> +{
> +   int idx = type - 1;
> +   struct list_head *idle_ws;
> +   spinlock_t *ws_lock;
> +   atomic_t *total_ws;
> +   wait_queue_head_t *ws_wait;
> +   int *free_ws;
> +
> +   if (heuristic) {
> +   idle_ws  = &btrfs_heuristic_ws.idle_ws;
> +   ws_lock  = &btrfs_heuristic_ws.ws_lock;
> +   total_ws = &btrfs_heuristic_ws.total_ws;
> +   ws_wait  = &btrfs_heuristic_ws.ws_wait;
> +   free_ws  = &btrfs_heuristic_ws.free_ws;
> +   } else {
> +   idle_ws  = &btrfs_comp_ws[idx].idle_ws;
> +   ws_lock  = &btrfs_comp_ws[idx].ws_lock;
> +   total_ws = &btrfs_comp_ws[idx].total_ws;
> +   ws_wait  = &btrfs_comp_ws[idx].ws_wait;
> +   free_ws  = &btrfs_comp_ws[idx].free_ws;
> +   }
> +
> +   spin_lock(ws_lock);
> +   if (*free_ws <= num_online_cpus()) {
> +   list_add(workspace, idle_ws);
> +   (*free_ws)++;
> +   spin_unlock(ws_lock);
> +   goto wake;
> +   }
> +   spin_unlock(ws_lock);
> +
> +   if (heuristic)
> +   free_heuristic_ws(workspace);
> +   else
> +   btrfs_compress_op[idx]->free_workspace(workspace);
> +   atomic_dec(total_ws);
> +wake:
> +   cond_wake_up(ws_wait);
> +}
> +
> +static void free_workspace(int type, struct list_head *ws)
> +{
> +   

Re: [PATCH] btrfs: use tagged writepage to mitigate livelock of snapshot

2018-11-01 Thread ethanlien

Nikolay Borisov wrote on 2018-11-01 16:59:

On 1.11.18 at 8:49, Ethan Lien wrote:

Snapshot is expected to be fast. But if there are writers steadily
create dirty pages in our subvolume, the snapshot may take a very long
time to complete. To fix the problem, we use tagged writepage for 
snapshot
flusher as we do in the generic write_cache_pages(), so we can ommit 
pages

dirtied after the snapshot command.


So the gist of this commit really is that you detect when filemap_flush
has been called from snapshot context and tag all pages at *that* time
as PAGECACHE_TAG_TOWRITE and then write them, ignoring any pages that
might have been dirtied beforehand. Your description kind of dances
around this idea without really saying it explicitly. Those semantics
make sense, however i'd like to be stated more explicitly in the change
 log.

However, this is done at the expense of consistency, so we have faster
snapshots but depending the file which are stored in them they might be
broken (i.e a database) since they will be missing pages.



tag_pages_for_writeback() will tag all pages that are currently marked
PAGECACHE_TAG_DIRTY. If a dirty page gets missed here, it means someone
initiated the flush before us, so we will wait for that dirty page to
complete in create_snapshot() -> btrfs_wait_ordered_extents().




We do a simple snapshot speed test on a Intel D-1531 box:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
--direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
--filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 1m58sec
patched:  6.54sec


Nice.



This is the best case for this patch since for a sequential write 
case,

we omit nearly all pages dirtied after the snapshot command.

For a multi writers, random write test:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
--direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
--filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 15.83sec
patched:  10.35sec

The improvement is less compared with the sequential write case, since
we omit only half of the pages dirtied after snapshot command.

Signed-off-by: Ethan Lien 
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/ctree.h   |  2 +-
 fs/btrfs/extent_io.c   | 16 ++--
 fs/btrfs/inode.c   | 10 ++
 fs/btrfs/ioctl.c   |  2 +-
 5 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..4182bfbb56be 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -29,6 +29,7 @@ enum {
BTRFS_INODE_IN_DELALLOC_LIST,
BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
+   BTRFS_INODE_TAGGED_FLUSH,


IMO the name of the flag should be different. Something like "lie 
flush"
or whatever. Because all flushes are tagged i.e TAG_DIRTY or 
TAG_TOWRITE.




OK I'll use another name.


 };

 /* in memory btrfs inode */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..82682da5a40d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct 
btrfs_trans_handle *trans,

   struct inode *inode, u64 new_size,
   u32 min_type);

-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int 
nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 
end,

  unsigned int extra_bits,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dd6faab02bb..c21d8a0e010a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3928,12 +3928,24 @@ static int extent_write_cache_pages(struct 
address_space *mapping,

range_whole = 1;
scanned = 1;
}
-   if (wbc->sync_mode == WB_SYNC_ALL)
+
+   /*
+	 * We don't care if we are the one who set BTRFS_INODE_TAGGED_FLUSH 
in
+	 * start_delalloc_inodes(). We do the tagged writepage as long as we 
are

+* the first one who do the filemap_flush() on this inode.
+*/


I think the first sentence of this comment could be removed.



OK.


+   if (range_whole && wbc->nr_to_write == LONG_MAX &&
+   wbc->sync_mode == WB_SYNC_NONE &&
+   test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
+   &BTRFS_I(inode)->runtime_flags))
+   wbc->tagged_writepages = 1;
+
+   if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
 retry:
-   if (wbc->sync_mode == WB_SYNC_ALL)
+   if (wbc->sync_mode == WB_SYNC_ALL || 

Re: [PATCH] btrfs: use tagged writepage to mitigate livelock of snapshot

2018-11-01 Thread Nikolay Borisov



On 1.11.18 at 8:49, Ethan Lien wrote:
> Snapshot is expected to be fast. But if there are writers steadily
> create dirty pages in our subvolume, the snapshot may take a very long
> time to complete. To fix the problem, we use tagged writepage for snapshot
> flusher as we do in the generic write_cache_pages(), so we can ommit pages
> dirtied after the snapshot command.

So the gist of this commit really is that you detect when filemap_flush
has been called from snapshot context and tag all pages at *that* time
as PAGECACHE_TAG_TOWRITE and then write them, ignoring any pages that
might have been dirtied beforehand. Your description kind of dances
around this idea without really saying it explicitly. Those semantics
make sense, however i'd like to be stated more explicitly in the change
 log.

However, this is done at the expense of consistency, so we have faster
snapshots but depending the file which are stored in them they might be
broken (i.e a database) since they will be missing pages.

> 
> We do a simple snapshot speed test on a Intel D-1531 box:
> 
> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
> --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
> 
> original: 1m58sec
> patched:  6.54sec

Nice.

> 
> This is the best case for this patch since for a sequential write case,
> we omit nearly all pages dirtied after the snapshot command.
> 
> For a multi writers, random write test:
> 
> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
> --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
> 
> original: 15.83sec
> patched:  10.35sec
> 
> The improvement is less compared with the sequential write case, since
> we omit only half of the pages dirtied after snapshot command.
> 
> Signed-off-by: Ethan Lien 
> ---
>  fs/btrfs/btrfs_inode.h |  1 +
>  fs/btrfs/ctree.h   |  2 +-
>  fs/btrfs/extent_io.c   | 16 ++--
>  fs/btrfs/inode.c   | 10 ++
>  fs/btrfs/ioctl.c   |  2 +-
>  5 files changed, 23 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 1343ac57b438..4182bfbb56be 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -29,6 +29,7 @@ enum {
>   BTRFS_INODE_IN_DELALLOC_LIST,
>   BTRFS_INODE_READDIO_NEED_LOCK,
>   BTRFS_INODE_HAS_PROPS,
> + BTRFS_INODE_TAGGED_FLUSH,

IMO the name of the flag should be different. Something like "lie flush"
or whatever. Because all flushes are tagged i.e TAG_DIRTY or TAG_TOWRITE.

>  };
>  
>  /* in memory btrfs inode */
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 2cddfe7806a4..82682da5a40d 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct 
> btrfs_trans_handle *trans,
>  struct inode *inode, u64 new_size,
>  u32 min_type);
>  
> -int btrfs_start_delalloc_inodes(struct btrfs_root *root);
> +int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
>  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
>  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
> unsigned int extra_bits,
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 4dd6faab02bb..c21d8a0e010a 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -3928,12 +3928,24 @@ static int extent_write_cache_pages(struct 
> address_space *mapping,
>   range_whole = 1;
>   scanned = 1;
>   }
> - if (wbc->sync_mode == WB_SYNC_ALL)
> +
> + /*
> +  * We don't care if we are the one who set BTRFS_INODE_TAGGED_FLUSH in
> +  * start_delalloc_inodes(). We do the tagged writepage as long as we are
> +  * the first one who do the filemap_flush() on this inode.
> +  */

I think the first sentence of this comment could be removed.

> + if (range_whole && wbc->nr_to_write == LONG_MAX &&
> + wbc->sync_mode == WB_SYNC_NONE &&
> + test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
> + &BTRFS_I(inode)->runtime_flags))
> + wbc->tagged_writepages = 1;
> +
> + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
>   tag = PAGECACHE_TAG_TOWRITE;
>   else
>   tag = PAGECACHE_TAG_DIRTY;
>  retry:
> - if (wbc->sync_mode == WB_SYNC_ALL)
> + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
>   tag_pages_for_writeback(mapping, index, end);
>   done_index = index;
>   while (!done && !nr_to_write_done && (index <= end) &&
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> 



[PATCH] btrfs: use tagged writepage to mitigate livelock of snapshot

2018-11-01 Thread Ethan Lien
Snapshot is expected to be fast. But if there are writers steadily
creating dirty pages in our subvolume, the snapshot may take a very long
time to complete. To fix the problem, we use tagged writepage for the snapshot
flusher, as we do in the generic write_cache_pages(), so we can omit pages
dirtied after the snapshot command.

We do a simple snapshot speed test on an Intel D-1531 box:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
--direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
--filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 1m58sec
patched:  6.54sec

This is the best case for this patch since for a sequential write case,
we omit nearly all pages dirtied after the snapshot command.

For a multi-writer, random write test:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
--direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
--filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 15.83sec
patched:  10.35sec

The improvement is smaller than in the sequential write case, since
we omit only half of the pages dirtied after the snapshot command.
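
For orientation, the control flow this patch relies on is roughly the
following (reconstructed from the hunks below and simplified; exact call
sites may differ):

create_snapshot()
|- btrfs_start_delalloc_snapshot()
|  |- start_delalloc_inodes(root, nr, snapshot = 1)
|     |- set_bit(BTRFS_INODE_TAGGED_FLUSH, &binode->runtime_flags)
|     |- filemap_flush()  (via the queued delalloc work)
|        |- extent_write_cache_pages()
|           |- sees the bit, sets wbc->tagged_writepages
|           |- tag_pages_for_writeback()  (tags only pages dirty right now)
|- btrfs_wait_ordered_extents()

Pages dirtied after tag_pages_for_writeback() only carry PAGECACHE_TAG_DIRTY,
so the snapshot flush no longer chases them, which is what avoids the
livelock.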

Signed-off-by: Ethan Lien 
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/ctree.h   |  2 +-
 fs/btrfs/extent_io.c   | 16 ++--
 fs/btrfs/inode.c   | 10 ++
 fs/btrfs/ioctl.c   |  2 +-
 5 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..4182bfbb56be 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -29,6 +29,7 @@ enum {
BTRFS_INODE_IN_DELALLOC_LIST,
BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
+   BTRFS_INODE_TAGGED_FLUSH,
 };
 
 /* in memory btrfs inode */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..82682da5a40d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle 
*trans,
   struct inode *inode, u64 new_size,
   u32 min_type);
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
  unsigned int extra_bits,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dd6faab02bb..c21d8a0e010a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3928,12 +3928,24 @@ static int extent_write_cache_pages(struct 
address_space *mapping,
range_whole = 1;
scanned = 1;
}
-   if (wbc->sync_mode == WB_SYNC_ALL)
+
+   /*
+* We don't care if we are the one who set BTRFS_INODE_TAGGED_FLUSH in
+* start_delalloc_inodes(). We do the tagged writepage as long as we are
+* the first one who do the filemap_flush() on this inode.
+*/
+   if (range_whole && wbc->nr_to_write == LONG_MAX &&
+   wbc->sync_mode == WB_SYNC_NONE &&
+   test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
+   &BTRFS_I(inode)->runtime_flags))
+   wbc->tagged_writepages = 1;
+
+   if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
 retry:
-   if (wbc->sync_mode == WB_SYNC_ALL)
+   if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
while (!done && !nr_to_write_done && (index <= end) &&
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ea5339603cf..3df3cbbe91c5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9975,7 +9975,7 @@ static struct btrfs_delalloc_work 
*btrfs_alloc_delalloc_work(struct inode *inode
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-static int start_delalloc_inodes(struct btrfs_root *root, int nr)
+static int start_delalloc_inodes(struct btrfs_root *root, int nr, int snapshot)
 {
struct btrfs_inode *binode;
struct inode *inode;
@@ -10003,6 +10003,8 @@ static int start_delalloc_inodes(struct btrfs_root 
*root, int nr)
}
spin_unlock(&root->delalloc_lock);
 
+   if (snapshot)
+   set_bit(BTRFS_INODE_TAGGED_FLUSH, &binode->runtime_flags);
work = btrfs_alloc_delalloc_work(inode);
if (!work) {
iput(inode);
@@ -10036,7 +10038,7 @@ static int start_delalloc_inodes(struct btrfs_root 
*root, int nr)