Nikolay Borisov wrote on 2018-11-01 16:59:
On 1.11.18 at 8:49, Ethan Lien wrote:
Snapshot is expected to be fast. But if there are writers steadily
creating dirty pages in our subvolume, the snapshot may take a very long
time to complete. To fix the problem, we use tagged writepage for the
snapshot flusher, as we do in the generic write_cache_pages(), so we can
omit pages dirtied after the snapshot command.
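
(For reference, this mirrors the tag selection the generic
write_cache_pages() in mm/page-writeback.c already does; roughly, in the
4.19-era source:

	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
retry:
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);

tag_pages_for_writeback() converts the existing DIRTY tags to TOWRITE in
one pass, so pages dirtied afterwards never carry the TOWRITE tag and
the tagged writeback loop has a bounded amount of work.)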

So the gist of this commit really is that you detect when filemap_flush
has been called from snapshot context, tag all pages dirty at *that*
time as PAGECACHE_TAG_TOWRITE, and then write only those, ignoring any
pages that might be dirtied afterwards. Your description kind of dances
around this idea without really saying it explicitly. Those semantics
make sense, however I'd like them to be stated more explicitly in the
change log.

However, this is done at the expense of consistency, so we have faster
snapshots, but depending on the files stored in them they might be
broken (e.g. a database) since they will be missing pages.


tag_pages_for_writeback() will tag all pages that carry
PAGECACHE_TAG_DIRTY. If a dirty page gets missed here, it means someone
initiated the flush before us, so we will wait for that dirty page to
complete in create_snapshot() -> btrfs_wait_ordered_extents().
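
In other words, a page can escape the tagged pass only if another
flusher already started writeback on it, and by then it has an ordered
extent that the snapshot waits for anyway. Roughly, with this patch the
ordering in create_snapshot() (fs/btrfs/ioctl.c, 4.19-era code; see the
last hunk of this patch) is:

	/* flush delalloc using tagged writepage (this patch) */
	ret = btrfs_start_delalloc_snapshot(root);
	if (ret)
		goto dec_and_free;

	/*
	 * Wait for all ordered extents already in flight, including
	 * any page the tagged pass skipped because another flusher
	 * had already started writing it.
	 */
	btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);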


We do a simple snapshot speed test on an Intel D-1531 box:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
--direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
--filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 1m58sec
patched:  6.54sec

Nice.


This is the best case for this patch, since for a sequential write
workload we omit nearly all pages dirtied after the snapshot command.

For a multi-writer, random write test:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
--direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
--filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 15.83sec
patched:  10.35sec

The improvement is smaller than in the sequential write case, since we
omit only about half of the pages dirtied after the snapshot command.

Signed-off-by: Ethan Lien <ethanl...@synology.com>
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/extent_io.c   | 16 ++++++++++++++--
 fs/btrfs/inode.c       | 10 ++++++----
 fs/btrfs/ioctl.c       |  2 +-
 5 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..4182bfbb56be 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -29,6 +29,7 @@ enum {
        BTRFS_INODE_IN_DELALLOC_LIST,
        BTRFS_INODE_READDIO_NEED_LOCK,
        BTRFS_INODE_HAS_PROPS,
+       BTRFS_INODE_TAGGED_FLUSH,

IMO the name of the flag should be different, something like "lie flush" or whatever, because all flushes are tagged, i.e. either TAG_DIRTY or TAG_TOWRITE.


OK, I'll use another name.

 };

 /* in memory btrfs inode */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..82682da5a40d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               struct inode *inode, u64 new_size,
                               u32 min_type);

-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
                              unsigned int extra_bits,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dd6faab02bb..c21d8a0e010a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3928,12 +3928,24 @@ static int extent_write_cache_pages(struct address_space *mapping,
                        range_whole = 1;
                scanned = 1;
        }
-       if (wbc->sync_mode == WB_SYNC_ALL)
+
+       /*
+        * We don't care if we are the one who set BTRFS_INODE_TAGGED_FLUSH in
+        * start_delalloc_inodes(). We do the tagged writepage as long as we are
+        * the first one who do the filemap_flush() on this inode.
+        */

I think the first sentence of this comment could be removed.


OK.

+       if (range_whole && wbc->nr_to_write == LONG_MAX &&
+                       wbc->sync_mode == WB_SYNC_NONE &&
+                       test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
+                               &BTRFS_I(inode)->runtime_flags))
+               wbc->tagged_writepages = 1;
+
+       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag = PAGECACHE_TAG_TOWRITE;
        else
                tag = PAGECACHE_TAG_DIRTY;
 retry:
-       if (wbc->sync_mode == WB_SYNC_ALL)
+       if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, index, end);
        done_index = index;
        while (!done && !nr_to_write_done && (index <= end) &&
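
For context on the new condition: the snapshot path reaches this
function through filemap_flush(), which (4.19-era mm/filemap.c, roughly)
issues a WB_SYNC_NONE writeback over the whole file with an unlimited
page budget:

	int filemap_flush(struct address_space *mapping)
	{
		return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
	}

	/* ...which ends up building, via __filemap_fdatawrite_range(): */
	struct writeback_control wbc = {
		.sync_mode   = WB_SYNC_NONE,
		.nr_to_write = LONG_MAX,
		.range_start = 0,		/* whole file, so range_whole */
		.range_end   = LLONG_MAX,
	};

so the range_whole && nr_to_write == LONG_MAX && WB_SYNC_NONE checks
together single out exactly that caller.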
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ea5339603cf..3df3cbbe91c5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9975,7 +9975,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-static int start_delalloc_inodes(struct btrfs_root *root, int nr)
+static int start_delalloc_inodes(struct btrfs_root *root, int nr, int snapshot)

snapshot is a boolean value, so define it as such and not as an int. I
know the codebase is inconsistent in that manner, but long-term all
boolean 'int's shall be converted to bools.


OK, I'll use a boolean here.

 {
        struct btrfs_inode *binode;
        struct inode *inode;
@@ -10003,6 +10003,8 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr)
                }
                spin_unlock(&root->delalloc_lock);

+               if (snapshot)
+                       set_bit(BTRFS_INODE_TAGGED_FLUSH,
+                               &binode->runtime_flags);
                work = btrfs_alloc_delalloc_work(inode);
                if (!work) {
                        iput(inode);
@@ -10036,7 +10038,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr)
        return ret;
 }

-int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
@@ -10044,7 +10046,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
                return -EROFS;

-       ret = start_delalloc_inodes(root, -1);
+       ret = start_delalloc_inodes(root, -1, 1);
        if (ret > 0)
                ret = 0;
        return ret;
@@ -10073,7 +10075,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
                               &fs_info->delalloc_roots);
                spin_unlock(&fs_info->delalloc_root_lock);

-               ret = start_delalloc_inodes(root, nr);
+               ret = start_delalloc_inodes(root, nr, 0);
                btrfs_put_fs_root(root);
                if (ret < 0)
                        goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d60b6caf09e8..d1293b6c31f6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -775,7 +775,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
        wait_event(root->subv_writers->wait,
                   percpu_counter_sum(&root->subv_writers->counter) == 0);

-       ret = btrfs_start_delalloc_inodes(root);
+       ret = btrfs_start_delalloc_snapshot(root);
        if (ret)
                goto dec_and_free;


