On 2019/06/13 23:33, Josef Bacik wrote:
> On Fri, Jun 07, 2019 at 10:10:24PM +0900, Naohiro Aota wrote:
>> Currently, dev-replace copies all the device extents on the source device to
>> the target device, and it also clones new incoming write I/Os from users to
>> the source device into the target device.
>>
>> Cloning incoming I/Os can break the sequential write rule on the target
>> device. When a write is mapped to the middle of a block group, that I/O is
>> directed to the middle of a zone on the target device, which breaks the
>> sequential write rule.
>>
>> However, the cloning function cannot be simply disabled since incoming I/Os
>> targeting already copied device extents must be cloned so that the I/O is
>> executed on the target device.
>>
>> We cannot use dev_replace->cursor_{left,right} to determine whether a bio
>> is going to a not-yet-copied region, because there is a time gap between
>> finishing btrfs_scrub_dev() and rewriting the mapping tree in
>> btrfs_dev_replace_finishing(). During that gap we can have a newly allocated
>> device extent which is neither cloned (by handle_ops_on_dev_replace) nor
>> copied (by the dev-replace process).
>>
>> So the point is to copy only the already existing device extents. This patch
>> introduces mark_block_group_to_copy() to mark each existing block group as a
>> target of copying. Then, handle_ops_on_dev_replace() and dev-replace can
>> check the flag to do their job.
>>
>> This patch also handles the empty regions between used extents. Since
>> dev-replace is smart enough to copy only the used extents on the source
>> device, we have to fill the gaps to honor the sequential write rule on the
>> target device.
>>
>> Signed-off-by: Naohiro Aota <naohiro.a...@wdc.com>
>> ---
>>   fs/btrfs/ctree.h       |   1 +
>>   fs/btrfs/dev-replace.c |  96 +++++++++++++++++++++++
>>   fs/btrfs/extent-tree.c |  32 +++++++-
>>   fs/btrfs/scrub.c       | 169 +++++++++++++++++++++++++++++++++++++++++
>>   fs/btrfs/volumes.c     |  27 ++++++-
>>   5 files changed, 319 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index dad8ea5c3b99..a0be2b96117a 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -639,6 +639,7 @@ struct btrfs_block_group_cache {
>>      unsigned int has_caching_ctl:1;
>>      unsigned int removed:1;
>>      unsigned int wp_broken:1;
>> +    unsigned int to_copy:1;
>>   
>>      int disk_cache_state;
>>   
>> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
>> index fbe5ea2a04ed..5011b5ce0e75 100644
>> --- a/fs/btrfs/dev-replace.c
>> +++ b/fs/btrfs/dev-replace.c
>> @@ -263,6 +263,13 @@ static int btrfs_init_dev_replace_tgtdev(struct 
>> btrfs_fs_info *fs_info,
>>      device->dev_stats_valid = 1;
>>      set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
>>      device->fs_devices = fs_info->fs_devices;
>> +    if (bdev_is_zoned(bdev)) {
>> +            ret = btrfs_get_dev_zonetypes(device);
>> +            if (ret) {
>> +                    mutex_unlock(&fs_info->fs_devices->device_list_mutex);
>> +                    goto error;
>> +            }
>> +    }
>>      list_add(&device->dev_list, &fs_info->fs_devices->devices);
>>      fs_info->fs_devices->num_devices++;
>>      fs_info->fs_devices->open_devices++;
>> @@ -396,6 +403,88 @@ static char* btrfs_dev_name(struct btrfs_device *device)
>>              return rcu_str_deref(device->name);
>>   }
>>   
>> +static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
>> +                                struct btrfs_device *src_dev)
>> +{
>> +    struct btrfs_path *path;
>> +    struct btrfs_key key;
>> +    struct btrfs_key found_key;
>> +    struct btrfs_root *root = fs_info->dev_root;
>> +    struct btrfs_dev_extent *dev_extent = NULL;
>> +    struct btrfs_block_group_cache *cache;
>> +    struct extent_buffer *l;
>> +    int slot;
>> +    int ret;
>> +    u64 chunk_offset, length;
>> +
>> +    path = btrfs_alloc_path();
>> +    if (!path)
>> +            return -ENOMEM;
>> +
>> +    path->reada = READA_FORWARD;
>> +    path->search_commit_root = 1;
>> +    path->skip_locking = 1;
>> +
>> +    key.objectid = src_dev->devid;
>> +    key.offset = 0ull;
>> +    key.type = BTRFS_DEV_EXTENT_KEY;
>> +
>> +    while (1) {
>> +            ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
>> +            if (ret < 0)
>> +                    break;
>> +            if (ret > 0) {
>> +                    if (path->slots[0] >=
>> +                        btrfs_header_nritems(path->nodes[0])) {
>> +                            ret = btrfs_next_leaf(root, path);
>> +                            if (ret < 0)
>> +                                    break;
>> +                            if (ret > 0) {
>> +                                    ret = 0;
>> +                                    break;
>> +                            }
>> +                    } else {
>> +                            ret = 0;
>> +                    }
>> +            }
>> +
>> +            l = path->nodes[0];
>> +            slot = path->slots[0];
>> +
>> +            btrfs_item_key_to_cpu(l, &found_key, slot);
>> +
>> +            if (found_key.objectid != src_dev->devid)
>> +                    break;
>> +
>> +            if (found_key.type != BTRFS_DEV_EXTENT_KEY)
>> +                    break;
>> +
>> +            if (found_key.offset < key.offset)
>> +                    break;
>> +
>> +            dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
>> +            length = btrfs_dev_extent_length(l, dev_extent);
>> +
>> +            chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
>> +
>> +            cache = btrfs_lookup_block_group(fs_info, chunk_offset);
>> +            if (!cache)
>> +                    goto skip;
>> +
>> +            cache->to_copy = 1;
>> +
>> +            btrfs_put_block_group(cache);
>> +
>> +skip:
>> +            key.offset = found_key.offset + length;
>> +            btrfs_release_path(path);
>> +    }
>> +
>> +    btrfs_free_path(path);
>> +
>> +    return ret;
>> +}
>> +
>>   static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
>>              const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
>>              int read_src)
>> @@ -439,6 +528,13 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info 
>> *fs_info,
>>      }
>>   
>>      need_unlock = true;
>> +
>> +    mutex_lock(&fs_info->chunk_mutex);
>> +    ret = mark_block_group_to_copy(fs_info, src_device);
>> +    mutex_unlock(&fs_info->chunk_mutex);
>> +    if (ret)
>> +            return ret;
>> +
>>      down_write(&dev_replace->rwsem);
>>      switch (dev_replace->replace_state) {
>>      case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index ff4d55d6ef04..268365dd9a5d 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -29,6 +29,7 @@
>>   #include "qgroup.h"
>>   #include "ref-verify.h"
>>   #include "rcu-string.h"
>> +#include "dev-replace.h"
>>   
>>   #undef SCRAMBLE_DELAYED_REFS
>>   
>> @@ -2022,7 +2023,31 @@ int btrfs_discard_extent(struct btrfs_fs_info 
>> *fs_info, u64 bytenr,
>>                      if (btrfs_dev_is_sequential(stripe->dev,
>>                                                  stripe->physical) &&
>>                          stripe->length == stripe->dev->zone_size) {
>> -                            ret = blkdev_reset_zones(stripe->dev->bdev,
>> +                            struct btrfs_device *dev = stripe->dev;
>> +
>> +                            ret = blkdev_reset_zones(dev->bdev,
>> +                                                     stripe->physical >>
>> +                                                             SECTOR_SHIFT,
>> +                                                     stripe->length >>
>> +                                                             SECTOR_SHIFT,
>> +                                                     GFP_NOFS);
>> +                            if (!ret)
>> +                                    discarded_bytes += stripe->length;
>> +                            else
>> +                                    break;
>> +                            set_bit(stripe->physical >>
>> +                                    dev->zone_size_shift,
>> +                                    dev->empty_zones);
>> +
>> +                            if (!btrfs_dev_replace_is_ongoing(
>> +                                        &fs_info->dev_replace) ||
>> +                                stripe->dev != fs_info->dev_replace.srcdev)
>> +                                    continue;
>> +
>> +                            /* send to target as well */
>> +                            dev = fs_info->dev_replace.tgtdev;
>> +
>> +                            ret = blkdev_reset_zones(dev->bdev,
> 
> This is unrelated to dev replace, isn't it?  Please make this its own patch,
> and its own helper while you are at it.  Thanks,
> 
> Josef
> 

Actually, patch 0015 introduced the zone reset here, and this patch extends that
code to also reset the corresponding zone on the target device while dev-replace
is ongoing. The diff just looks messed up here.

I'll add the reset helper in the next version.
Thanks,

Reply via email to